Statistical Inference (Chapter 4)

setwd("...")

library("foreign")

# 4.1
# http://ideas.repec.org/p/boc/bocins/wage1.html
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/wage1.dta','wage1.dta',mode="wb")
wage1 <- read.dta('wage1.dta')

lm.1 <- lm(lwage ~ educ + exper + tenure, data=wage1)
s.1 <- summary(lm.1)
s.1

In order to be able to use the values of the coefficients for further calculations, you can use copy and paste with the numbers provided in the summary s.1. Alternatively, and for the sake of more precision, you can access the coefficients directly. If you use the names function and run names(s.1), you will notice a subdirectory called “coefficients” which contains the coefficients, standard errors, t-statistics and p-values of the regression. Entering s.1$coefficients gives those values for all coefficients, but you can also extract them separately using the [row,column] signs, where the first value defines the row index and the second the column index. If no value is specified for a row or column, R will use all available values. This happens in the third line of the following code, where the t-values are calculated from the betas and their respective standard errors. The forth line of the code specifies a certain row which means that the t-value is calculated only for the parameter listed third in the coefficient list.

names(s.1)
s.1$coefficients
s.1$coefficient[,1]/s.1$coefficient[,2]
s.1$coefficient[3,1]/s.1$coefficient[3,2]

The following code illustrates the differences in statistical significance that can result from the use of log-values.

# 4.2
rm(list=ls())
# http://ideas.repec.org/p/boc/bocins/meap93.html
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/meap93.dta','meap93.dta',mode='wb')
meap93 <- read.dta('meap93.dta')

lm.1 <- lm(math10 ~ totcomp + staff + enroll, data=meap93)
summary(lm.1)

lm.2 <- lm(math10 ~ ltotcomp + lstaff + lenroll, data=meap93)
summary(lm.2)

# 4.3
rm(list=ls())
# http://ideas.repec.org/p/boc/bocins/gpa1.html
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/gpa1.dta','gpa1.dta',mode='wb')
gpa1 <- read.dta('gpa1.dta')

lm.1 <- lm(colGPA ~ hsGPA + ACT + skipped, data=gpa1)
summary(lm.1)

The next examples illustrate how the t-statistic is calculated in R if you want to know whether a coefficient is different from a value other than zero. This works by obtaining the coefficients from the regression summary, subtracting the value against which it should be tested and diving the result by the standard error.

# 4.4
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/campus.dta','campus.dta',mode='wb')
campus <- read.dta('campus.dta')

lm.1 <- lm(lcrime ~ lenroll, data=campus)
s.1 <- summary(lm.1)
s.1

(s.1$coefficients[2,1]-1)/s.1$coefficients[2,2]

# 4.5
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/hprice2.dta','hprice2.dta',mode='wb')
hprice2 <- read.dta('hprice2.dta')

names(hprice2)
ldist <- log(dist)

lm.1 <- lm(lprice ~ lnox + ldist + rooms + stratio, data=hprice2)
s.1 <- summary(lm.1)
s.1

(s.1$coefficients[2,1]+1)/s.1$coefficients[2,2]

# 4.6
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/401k.dta','401k.dta',mode='wb')
a401k <- read.dta('401k.dta')

lm.1 <- lm(prate ~ mrate + age + totemp, data=a401k)
summary(lm.1)

The following code limits the used data to those observations, where the year is 1987 and the value for union is zero. The specification of data= tells R to take only those observations into consideration which meet these conditions. You could read this command in a way that R takes data from “jtrain”, where in each row – note that the second part of the “[,]”-sign is empty! – the conditions x and y have to hold.

# 4.7
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/jtrain.dta','jtrain.dta',mode='wb')
jtrain <- read.dta('jtrain.dta')

lm.1 <- lm(lscrap ~ hrsemp + lsales + lemploy, data=jtrain[jtrain$year==1987 & jtrain$union==0,])
summary(lm.1)

The next example shows how you can calculate confidence intervals from estimation results. For this purpose you can use the confint function which needs the stored regression results (here lm.1) and the level of confidence specified by the option “level=…”.

# 4.8
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/rdchem.dta','rdchem.dta',mode='wb')
rdchem <- read.dta('rdchem.dta')

lm.1 <- lm(lrd ~ lsales + profmarg, data=rdchem)
summary(lm.1)

# 10% confidence
confint(lm.1,level=.90)
# 5% confidence
confint(lm.1,level=.95)
# Equations 4.21 & 4.27
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/twoyear.dta','twoyear.dta',mode='wb')
twoyear <- read.dta('twoyear.dta')

lm.1 <- lm(lwage ~ jc + univ + exper, data=twoyear)
s.1 <- summary(lm.1)

lm.2 <- lm(lwage ~ jc + totcoll + exper, data=twoyear)
s.2 <- summary(lm.2)

(s.1$coefficient[2,1]-s.1$coefficient[3,1])/s.2$coefficient[2,2]

Next, we want to calculate the F-test to evaluate the joint significance of independent variables. For this purpose you have to exert the regression of the restricted and the unrestricted model and store the results. Then, use the anova function. Specify which two regressions have to be used and let R do the rest.

# Equations 4.31 & 4.33
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/mlb1.dta','mlb1.dta',mode='wb')
mlb1 <- read.dta('mlb1.dta')

# unrestricted model
lm.1 <- lm(lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr, data=mlb1)
# restricted model
lm.2 <- lm(lsalary ~ years + gamesyr, data=mlb1)
# f-test
anova(lm.1,lm.2)

# 4.9
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/bwght.dta','bwght.dta',mode='wb')
bwght <- read.dta('bwght.dta')

lm.1 <- lm(bwght ~ cigs + parity + faminc + motheduc + fatheduc,data=bwght)
lm.2 <- lm(bwght ~ cigs + parity + faminc,data=bwght[is.na(bwght$motheduc)!=TRUE & is.na(bwght$fatheduc)!=TRUE,])
anova(lm.1,lm.2)

# Equation 4.48
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/hprice1.dta','hprice1.dta',mode='wb')
hprice1 <- read.dta('hprice1.dta')

lm.1 <- lm(lprice ~ lassess + llotsize + lsqrft + bdrms, data=hprice1)

lprice <- hprice1$lprice-hprice1$lassess
lm.2 <- lm(lprice ~ 1)
anova(lm.1,lm.2)

# 4.10
rm(list=ls())
download.file('http://fmwww.bc.edu/ec-p/data/wooldridge/meap93.dta','meap93.dta',mode='wb')
meap93 <- read.dta('meap93.dta')

bs <- meap93$benefits/meap93$salary
lm.1 <- lm(lsalary ~ bs, data=meap93)
lm.2 <- lm(lsalary ~ bs + lenroll + lstaff, data=meap93)
lm.3 <- lm(lsalary ~ bs + lenroll + lstaff + droprate + gradrate, data=meap93)

summary(lm.1)
summary(lm.2)
summary(lm.3)

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s