****************************************************
* Part I.  Regression modeling and partial effects *
*          Based on healthcare.csv                 *
****************************************************

* Tell Stata that id identifies the panel unit
xtset id
* Descriptive statistics for every variable in memory
summarize _all
* The analysis works with the natural log of income
generate logincome = ln(income)
* Kernel density estimate of log income, with a normal density overlaid.
* The data change from year to year, so a single year (1994) is used.
kdensity logincome if year == 1994, normal
* Box plots by year suggest the year-to-year changes. They are drawn for
* female headed households, and income is trimmed at 2 because there
* are some outliers.
graph box income if income <= 2 & female == 1, over(year)
* The education variable is of interest. It is not coded in full years,
* so a truncated (whole-year) copy is used for some computations.
histogram educ if year == 1994
generate yearseduc = trunc(educ)
kdensity yearseduc
histogram yearseduc if year == 1994
* Variable lists that are used repeatedly are stored in global macros.
* The list of year dummies leaves out 1994, which makes 1994 the base year.
global demographic age female married
global years year1984 year1985 year1986 year1987 year1988 year1991
regress logincome $demographic $years
* Same kind of specification, with the year effects written in
* factor-variable notation. (The original line read "regrlogincome",
* which is not a command; the missing space stopped the do-file here.)
regress logincome $demographic i.year
* R-squared of the restricted model (education excluded)
scalar r20 = e(r2)
* Unrestricted model: education added
regress logincome $demographic i.year educ
test educ
scalar r21 = e(r2)
scalar df1 = e(df_r)
* This is the F statistic for this hypothesis, computed from the two
* R-squareds with one restriction. It is the square of the t ratio on educ.
display ((r21 - r20)/1) / ((1 - r21)/df1)
* Joint tests in the model that includes education
test educ female
test $demographic
* Joint test of the year effects in the model without education
regress logincome $demographic $years
test $years
* Wald statistic for the year effects, built by hand from e(b) and e(V).
* The three demographic variables come first in the regressor list, so
* the six year dummies occupy positions 4 through 9.
regress logincome $demographic $years educ
matrix vy = e(V)
matrix vy = vy[4..9,4..9]
matrix by = e(b)
matrix by = by[1..1,4..9]
matrix wald = by*invsym(vy)*by'
matrix list wald
* Partial effects in a regression that is nonlinear in the variables:
* age, education and female enter with a squared term and interactions.
regress logincome c.age c.educ i.female      ///
        c.age#c.educ c.age#c.age             ///
        c.age#i.female c.educ#i.female
* Average partial effect of each variable
margins, dydx(age)
margins, dydx(educ)
margins, dydx(female)
* Partial effect of age at education levels 12, 14, ..., 20
margins, dydx(age) at(educ=(12(2)20))
* Partial effect of education at ages 25, 28, ..., 64, then plotted
margins, dydx(educ) at(age=(25(3)64))
marginsplot
* Effect of female over a grid of ages and three education levels, plotted
margins, dydx(i.female) at(age=(25(5)65) educ=(12,16,20))
marginsplot

* Examining a categorical variable in detail
* Four education-level indicators. The "if !missing(educ)" qualifier is
* needed because Stata treats a missing value as larger than any number:
* without it, educ > 16 is true when educ is missing and such
* observations would be classified as grad.
generate lths = educ < 12              if !missing(educ)
generate hs   = educ == 12             if !missing(educ)
generate coll = educ > 12 & educ <= 16 if !missing(educ)
generate grad = educ > 16              if !missing(educ)
global degree lths hs coll grad
* Single categorical variable coded 0, 1, 2, 3 for the four levels
generate edlevel = 0*lths + 1*hs + 2*coll + 3*grad
* grad is left out of the regressor list, so it is the base category
regress logincome c.age i.lths i.hs i.coll i.female c.age#i.lths c.age#i.hs c.age#i.coll ///
        c.age#c.age c.age#i.female i.lths#i.female i.hs#i.female i.coll#i.female
margins, dydx(i.hs)
margins, dydx(i.coll)
* This does not work because grad is not in the model. grad is the base.
* capture noisily displays the error message but lets the do-file continue.
capture noisily margins, dydx(i.grad)


* Full regressor list: demographics plus the year dummies
global all $demographic $years
* Pooled OLS with the conventional covariance matrix; its diagonal
* (the estimated variances) is kept as a column vector.
regress logincome $all
matrix mols = e(V)
matrix mols = vecdiag(mols)'
* Same regression with the covariance matrix clustered on id
regress logincome $all, vce(cluster id)
matrix mcluster = e(V)
matrix mcluster = vecdiag(mcluster)'
* Show the two sets of variances
matrix list mols
matrix list mcluster


* Random effects and fixed effects estimators for the same equation
xtreg logincome $all, re
xtreg logincome $all, fe


****************************************************
* Part II.  Probit Models with Cross Section Data  *
*           Based on panelprobit.csv               *
****************************************************
* Declare firm as the panel identifier
xtset firm
* Variable lists reused throughout Part II
global sector rawmtl invgood consgood
global x im imum fdium sp prod logsales
global allx $x $sector
* Compare probit to logit, coefficients then partial effects.
* The command is written out as "margins" here, matching the rest of the
* file; "margin" is not a documented abbreviation.
probit ip $x
margins, dydx($x)
logit ip $x
margins, dydx($x)

* Linear probability model; its coefficients are listed as a column
regress ip $x
margins, dydx($x)
matrix bols = e(b)
matrix bols = bols'
matrix list bols
* Probit partial effects again, for comparison with the linear model
probit ip $x
margins, dydx($x)

* Three covariance estimators for the same probit. For each one the
* diagonal of e(V) is stored as a column vector.

* Conventional estimator
probit ip $x
matrix vmle = e(V)
matrix vmle = vecdiag(vmle)'
* 'Robust' estimator
probit ip $x, vce(robust)
matrix vrobust = e(V)
matrix vrobust = vecdiag(vrobust)'
* 'Cluster corrected' estimator, clustering on firm
probit ip $x, vce(cluster firm)
matrix vcluster = e(V)
matrix vcluster = vecdiag(vcluster)'
* List the three sets of estimated variances
matrix list vmle
matrix list vrobust
matrix list vcluster

* Plot predicted probabilities against logsales.
* The probit is fit on observations with t == 1, and the predictions are
* evaluated at logsales = 3.5, 4.2, ..., up to 18.
quietly probit ip im imum fdium sp prod logsales if t == 1
quietly margins, at(logsales=(3.5(0.7)18))
marginsplot

* A fit measure based on the model predictions
probit ip $x
* Default prediction after probit: the fitted probability
predict p
* Predict ip = 1 when the fitted probability exceeds .5. The qualifier
* leaves iphat missing where p is missing; without it, p > .5 is true
* for missing p (missing is larger than any number) and such
* observations would be counted as predicted 1.
generate iphat = p > .5 if !missing(p)
* Cross tabulation of actual against predicted outcomes
table ip iphat
* Replay the estimation results
probit

* Partial effects in a probit that is nonlinear in the variables.
* First specification: logsales enters with its square.
probit ip im imum fdium sp prod c.logsales c.logsales#c.logsales
margins, dydx(c.logsales)
* NOTE(review): dyex() is used for fdium here while dydx() is used
* everywhere else in the file - confirm that this is intended.
margins, dyex(fdium)
* Partial effect of logsales at fdium = .05, .10, ..., 1, then plotted
margins, dydx(c.logsales) at(fdium=(.05(.05)1))
marginsplot
* Second specification: sp is also interacted with imum and fdium
probit ip im c.imum c.fdium c.sp prod c.logsales c.logsales#c.logsales ///
       c.imum#c.sp c.fdium#c.sp
* Partial effect of sp over a grid of sp values, then plotted
quietly margins, dydx(sp) at(sp=(.05(.05)1.0))
marginsplot

* Partial effects for categories - sectors
* food is appended to the existing sector list before estimation
global sector "$sector food"
probit ip $x $sector
margins, dydx($sector)

* Chow style test for structural change
* Pooled probit first; the accumulator starts at minus its log likelihood.
probit ip $x
scalar ll = e(ll)
scalar loglsum = -ll
* Add the log likelihood of a separate probit for each of t = 1, ..., 5
forvalues period = 1/5 {
    quietly probit ip $x if t == `period'
    scalar loglsum = loglsum + e(ll)
}
* Twice the difference between the summed and the pooled log likelihoods
display 2*loglsum


* Testing for and estimating a heteroscedastic probit model.
* All three models use the observations with t == 5.

* Base probit
probit ip $x if t == 5
scalar logl0 = e(ll)
* Probit with the sector variables added to the index function:
* Wald test of the sector variables, then the likelihood ratio statistic
probit ip $x $sector if t == 5
test $sector
scalar logl1 = e(ll)
display 2*(logl1 - logl0)
* Heteroscedastic probit with the sector variables in the variance
* function, and its likelihood ratio statistic against the base probit
hetprob ip $x if t == 5, het($sector)
scalar loglh = e(ll)
display 2*(loglh - logl0)

* Predicted probabilities from the probit at logsales = 5, 6, ..., 15,
* followed by a plot of those predictions
probit ip $x
margins, at(logsales=(5(1)15))
marginsplot