Chapter 7: Multiple Regression Analysis with Qualitative Regressors (Dummy Variables)

Dummy-Interact-Sep.R
# Dummy-Interact-Sep.R: estimate the same wage-GPA model separately for
# males and females instead of using interaction terms.
data(gpa3, package='wooldridge')

# Estimate model for males (& spring data)
# subset= restricts estimation to rows where the condition is TRUE
lm(cumgpa~sat+hsperc+tothrs, data=gpa3, subset=(spring==1&female==0))

# Estimate model for females (& spring data)
lm(cumgpa~sat+hsperc+tothrs, data=gpa3, subset=(spring==1&female==1))
Dummy-Interact.R
# Dummy-Interact.R: one model with full female interactions — equivalent to
# separate male/female regressions, but allows a joint test of equality.
data(gpa3, package='wooldridge')

# Model with full interactions with female dummy (only for spring data)
# female*(...) expands to female + sat + hsperc + tothrs + female:sat + ...
reg<-lm(cumgpa~female*(sat+hsperc+tothrs), data=gpa3, subset=(spring==1))
summary(reg)

# F-Test from package "car". H0: the interaction coefficients are zero
# matchCoefs(...) selects all coeffs with names containing "female"
library(car)
linearHypothesis(reg, matchCoefs(reg, "female"))
Example-7-1-logical.R
# Example-7-1-logical.R: dummy regression using a logical (TRUE/FALSE)
# regressor; lm() treats TRUE as 1, so results match the 0/1 dummy version.
data(wage1, package='wooldridge')

# replace "female" with logical variable
wage1$female <- as.logical(wage1$female)
table(wage1$female)
  
# regression with logical variable
lm(wage ~ female+educ+exper+tenure, data=wage1)
Example-7-1.R
# Example 7.1: hourly wage on a 0/1 female dummy plus education, experience
# and tenure; the female coefficient is the estimated wage differential.
data(wage1, package='wooldridge')

lm(wage ~ female+educ+exper+tenure, data=wage1)
Example-7-5.R
# Example 7.5: log wage equation with a female dummy and quadratics in
# experience and tenure; I(...) protects ^ from formula interpretation.
data(wage1, package='wooldridge')

lm(log(wage) ~ female+educ+exper+I(exper^2)+tenure+I(tenure^2), data=wage1)
Example-7-6.R
# Example 7.6: interaction of two dummies — married*female expands to
# married + female + married:female, giving separate intercept shifts
# for all four marital-status/gender groups.
data(wage1, package='wooldridge')

lm(log(wage)~married*female+educ+exper+I(exper^2)+tenure+I(tenure^2),
                                                           data=wage1)
Example-7-8.R
# Example 7.8: dummies for ordinal information — law school rank is turned
# into a factor of rank ranges which lm() dummy codes automatically.
data(lawsch85, package='wooldridge')

# Define cut points for the rank
cutpts <- c(0,10,25,40,60,100,175)

# Create factor variable containing ranges for the rank
# (cut's default right=TRUE gives intervals (0,10], (10,25], ..., (100,175])
lawsch85$rankcat <- cut(lawsch85$rank, cutpts)

# Display frequencies
table(lawsch85$rankcat)

# Choose reference category
# (lowest-ranked group becomes the base level, so the other coefficients
# measure salary premia relative to it)
lawsch85$rankcat <- relevel(lawsch85$rankcat,"(100,175]")

# Run regression
# (outer parentheses print the result while also storing it in res)
(res <- lm(log(salary)~rankcat+LSAT+GPA+log(libvol)+log(cost), data=lawsch85))

# ANOVA table
# car::Anova reports an F test per regressor, testing all
# rankcat dummies jointly in one row
car::Anova(res)
Regr-Factors-Anova.R
# Regr-Factors-Anova.R: regression with factor regressors from the AER
# CPS1985 data, followed by joint significance tests per regressor.
data(CPS1985,package="AER")

# Regression
# (factor columns gender and occupation are dummy coded automatically)
res <- lm(log(wage) ~ education+experience+gender+occupation, data=CPS1985)

# ANOVA table
# (one F test per variable, incl. all occupation dummies jointly)
car::Anova(res)
Regr-Factors.R
# Regr-Factors.R: using factor variables in lm() formulas; relevel()
# changes which category serves as the omitted reference level.
data(CPS1985,package="AER")

# Table of categories and frequencies for two factor variables:
table(CPS1985$gender)
table(CPS1985$occupation)

# Directly using factor variables in regression formula:
lm(log(wage) ~ education+experience+gender+occupation, data=CPS1985)

# Manually redefine the reference category:
CPS1985$gender <- relevel(CPS1985$gender,"female")
CPS1985$occupation <- relevel(CPS1985$occupation,"management")

# Rerun regression:
lm(log(wage) ~ education+experience+gender+occupation, data=CPS1985)
Dummy-Interact-Sep.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

gpa3 = woo.dataWoo('gpa3')

# estimate model for males (& spring data):
mask_m = (gpa3['spring'] == 1) & (gpa3['female'] == 0)
results_m = smf.ols(formula='cumgpa ~ sat + hsperc + tothrs',
                    data=gpa3, subset=mask_m).fit()

# print regression table:
stats_m = {'b': results_m.params, 'se': results_m.bse,
           't': results_m.tvalues, 'pval': results_m.pvalues}
table_m = pd.DataFrame({name: round(vals, 4)
                        for name, vals in stats_m.items()})
print(f'table_m: \n{table_m}\n')

# estimate model for females (& spring data):
mask_f = (gpa3['spring'] == 1) & (gpa3['female'] == 1)
results_f = smf.ols(formula='cumgpa ~ sat + hsperc + tothrs',
                    data=gpa3, subset=mask_f).fit()

# print regression table:
stats_f = {'b': results_f.params, 'se': results_f.bse,
           't': results_f.tvalues, 'pval': results_f.pvalues}
table_f = pd.DataFrame({name: round(vals, 4)
                        for name, vals in stats_f.items()})
print(f'table_f: \n{table_f}\n')
Dummy-Interact.py
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

gpa3 = woo.dataWoo('gpa3')

# model with full interactions with female dummy (only for spring data):
# female * (...) expands to female + sat + hsperc + tothrs + female:x terms
reg = smf.ols(formula='cumgpa ~ female * (sat + hsperc + tothrs)',
              data=gpa3, subset=(gpa3['spring'] == 1))
results = reg.fit()

# print regression table:
table = pd.DataFrame({'b': round(results.params, 4),
                      'se': round(results.bse, 4),
                      't': round(results.tvalues, 4),
                      'pval': round(results.pvalues, 4)})
print(f'table: \n{table}\n')

# F-Test for H0 (the interaction coefficients of 'female' are zero):
hypotheses = ['female = 0', 'female:sat = 0',
              'female:hsperc = 0', 'female:tothrs = 0']
ftest = results.f_test(hypotheses)
# ftest.statistic was a 1x1 array in old statsmodels but is a scalar in
# statsmodels >= 0.13; np.squeeze + float handles both cases, whereas the
# old indexing [0][0] crashes on modern versions.
fstat = float(np.squeeze(ftest.statistic))
fpval = ftest.pvalue

print(f'fstat: {fstat}\n')
print(f'fpval: {fpval}\n')
Example-7-1-Boolean.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

wage1 = woo.dataWoo('wage1')

# regression with boolean variable:
# the 0/1 dummy becomes a True/False column; patsy dummy codes it as 0/1
wage1['isfemale'] = wage1['female'].astype(bool)
results = smf.ols(formula='wage ~ isfemale + educ + exper + tenure',
                  data=wage1).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table = pd.DataFrame({name: round(vals, 4) for name, vals in stats.items()})
print(f'table: \n{table}\n')
Example-7-1.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

wage1 = woo.dataWoo('wage1')

# wage on a 0/1 female dummy plus human-capital controls:
results = smf.ols(formula='wage ~ female + educ + exper + tenure',
                  data=wage1).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table = pd.DataFrame({name: round(vals, 4) for name, vals in stats.items()})
print(f'table: \n{table}\n')
Example-7-5.py
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

wage1 = woo.dataWoo('wage1')

# log wage equation with quadratics in experience and tenure
# (I(...) keeps ** as arithmetic inside the formula):
results = smf.ols(
    formula='np.log(wage) ~ female + educ + exper + I(exper**2) + tenure + I(tenure**2)',
    data=wage1).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table = pd.DataFrame({name: round(vals, 4) for name, vals in stats.items()})
print(f'table: \n{table}\n')
Example-7-6.py
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

wage1 = woo.dataWoo('wage1')

# married*female expands to married + female + married:female:
results = smf.ols(formula='np.log(wage) ~ married*female + educ + exper +'
                          'I(exper**2) + tenure + I(tenure**2)',
                  data=wage1).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table = pd.DataFrame({name: round(vals, 4) for name, vals in stats.items()})
print(f'table: \n{table}\n')
Example-7-8.py
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

lawsch85 = woo.dataWoo('lawsch85')

# define cut points for the rank:
cutpts = [0, 10, 25, 40, 60, 100, 175]

# create categorical variable containing ranges for the rank;
# labels spell out the half-open intervals (lower, upper]:
intervals = [f'({lo},{hi}]' for lo, hi in zip(cutpts[:-1], cutpts[1:])]
lawsch85['rc'] = pd.cut(lawsch85['rank'], bins=cutpts, labels=intervals)

# display frequencies:
freq = pd.crosstab(lawsch85['rc'], columns='count')
print(f'freq: \n{freq}\n')

# run regression (Treatment(...) makes the lowest-ranked group the
# reference category):
results = smf.ols(formula='np.log(salary) ~ C(rc, Treatment("(100,175]")) +'
                          'LSAT + GPA + np.log(libvol) + np.log(cost)',
                  data=lawsch85).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table_reg = pd.DataFrame({name: round(vals, 4)
                          for name, vals in stats.items()})
print(f'table_reg: \n{table_reg}\n')

# ANOVA table:
table_anova = sm.stats.anova_lm(results, typ=2)
print(f'table_anova: \n{table_anova}\n')
Regr-Categorical-Anova.py
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

CPS1985 = pd.read_csv('data/CPS1985.csv')

# run regression (string columns are dummy coded automatically by patsy):
results = smf.ols(
    formula='np.log(wage) ~ education + experience + gender + occupation',
    data=CPS1985).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table_reg = pd.DataFrame({name: round(vals, 4)
                          for name, vals in stats.items()})
print(f'table_reg: \n{table_reg}\n')

# ANOVA table:
table_anova = sm.stats.anova_lm(results, typ=2)
print(f'table_anova: \n{table_anova}\n')
Regr-Categorical.py
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

CPS1985 = pd.read_csv('data/CPS1985.csv')
# rename variable to make outputs more compact:
CPS1985['oc'] = CPS1985['occupation']

# table of categories and frequencies for two categorical variables:
freq_gender = pd.crosstab(CPS1985['gender'], columns='count')
print(f'freq_gender: \n{freq_gender}\n')

freq_occupation = pd.crosstab(CPS1985['oc'], columns='count')
print(f'freq_occupation: \n{freq_occupation}\n')

# directly using categorical variables in regression formula:
results = smf.ols(formula='np.log(wage) ~ education +'
                          'experience + C(gender) + C(oc)',
                  data=CPS1985).fit()

# print regression table:
stats = {'b': results.params, 'se': results.bse,
         't': results.tvalues, 'pval': results.pvalues}
table = pd.DataFrame({name: round(vals, 4) for name, vals in stats.items()})
print(f'table: \n{table}\n')

# rerun regression with different reference category:
results_newref = smf.ols(formula='np.log(wage) ~ education + experience + '
                                 'C(gender, Treatment("male")) + '
                                 'C(oc, Treatment("technical"))',
                         data=CPS1985).fit()

# print results:
stats_newref = {'b': results_newref.params, 'se': results_newref.bse,
                't': results_newref.tvalues, 'pval': results_newref.pvalues}
table_newref = pd.DataFrame({name: round(vals, 4)
                             for name, vals in stats_newref.items()})
print(f'table_newref: \n{table_newref}\n')
Dummy-Interact-Sep.jl
# Dummy-Interact-Sep.jl: estimate the GPA model separately for
# males and females instead of using interaction terms.
using WooldridgeDatasets, GLM, DataFrames

gpa3 = DataFrame(wooldridge("gpa3"))

# estimate model for males (& spring data):
# (subset keeps rows where all column predicates hold)
reg_m = lm(@formula(cumgpa ~ sat + hsperc + tothrs),
    subset(gpa3, :spring => ByRow(==(1)), :female => ByRow(==(0))))
table_reg_m = coeftable(reg_m)
println("table_reg_m: \n$table_reg_m")

# estimate model for females (& spring data):
reg_f = lm(@formula(cumgpa ~ sat + hsperc + tothrs),
    subset(gpa3, :spring => ByRow(==(1)), :female => ByRow(==(1))))
table_reg_f = coeftable(reg_f)
println("table_reg_f: \n$table_reg_f")
Dummy-Interact.jl
# Dummy-Interact.jl: full-interaction model plus an F test that all
# female-related coefficients are jointly zero.
using WooldridgeDatasets, GLM, DataFrames

gpa3 = DataFrame(wooldridge("gpa3"))

# model with full interactions with female dummy (only for spring data):
# female * (...) expands to female + the regressors + all female & x terms
reg_ur = lm(@formula(cumgpa ~ female * (sat + hsperc + tothrs)),
    subset(gpa3, :spring => ByRow(==(1))))
table_reg_ur = coeftable(reg_ur)
println("table_reg_ur: \n$table_reg_ur\n")

# F test for H0 (the interaction coefficients of "female" are zero):
# restricted model drops all female terms, fit on the same subset
reg_r = lm(@formula(cumgpa ~ sat + hsperc + tothrs),
    subset(gpa3, :spring => ByRow(==(1))))

# ftest compares the nested models; entry 2 is the comparison involving
# the unrestricted model
ftest_res = ftest(reg_r.model, reg_ur.model)
fstat = ftest_res.fstat[2]
fpval = ftest_res.pval[2]
println("fstat = $fstat\n")
println("fpval = $fpval")
Example-7-1-Boolean.jl
# Example-7-1-Boolean.jl: dummy regression using a Bool regressor;
# results match the original 0/1 dummy specification.
using WooldridgeDatasets, GLM, DataFrames

wage1 = DataFrame(wooldridge("wage1"))

# regression with boolean variable:
# convert the 0/1 dummy column into true/false values
wage1.isfemale = Bool.(wage1.female)
# DummyCoding makes the Bool regressor enter as a 0/1 dummy
# (NOTE(review): base level is presumably false — verify against output)
reg = lm(@formula(wage ~ isfemale + educ + exper + tenure), wage1,
    contrasts=Dict(:isfemale => DummyCoding()))

table_reg = coeftable(reg)
println("table_reg: \n$table_reg")
Example-7-1.jl
# Example 7.1: wage on a 0/1 female dummy plus education, experience and
# tenure; the female coefficient is the estimated wage differential.
using WooldridgeDatasets, GLM, DataFrames

wage1 = DataFrame(wooldridge("wage1"))

reg = lm(@formula(wage ~ female + educ + exper + tenure), wage1)
table_reg = coeftable(reg)
println("table_reg: \n$table_reg")
Example-7-6.jl
# Example 7.6: log wage with a married*female dummy interaction and
# quadratics in experience and tenure; note the parentheses protecting
# the squared terms inside the @formula macro.
using WooldridgeDatasets, GLM, DataFrames

wage1 = DataFrame(wooldridge("wage1"))

reg = lm(@formula(log(wage) ~
        married * female + educ + exper + (exper^2) +
        tenure + (tenure^2)), wage1)
table_reg = coeftable(reg)
println("table_reg: \n$table_reg")
Example-7-8.jl
# Example 7.8: ordinal rank information coded as a categorical variable of
# rank ranges, dummy coded in the regression.
using WooldridgeDatasets, GLM, DataFrames, CategoricalArrays, FreqTables

lawsch85 = DataFrame(wooldridge("lawsch85"))

# define cut points for the rank:
cutpts = [1, 11, 26, 41, 61, 101, 176]
# note that "cut" takes intervals only in the form of [lower, upper)

# create categorical variable containing ranges for the rank:
lawsch85.rc = cut(lawsch85.rank, cutpts,
        labels=["[1,11)", "[11,26)", "[26,41)",
                "[41,61)", "[61,101)", "[101,176)"])

# display frequencies:
freq = freqtable(lawsch85.rc)
println("freq: \n$freq\n")

# run regression:
# base="[101,176)" makes the lowest-ranked schools the reference category;
# levels fixes the ordering of the dummy coefficients in the output
reg = lm(@formula(log(salary) ~ rc + LSAT + GPA + log(libvol) + log(cost)),
        lawsch85,
        contrasts=Dict(:rc => DummyCoding(base="[101,176)",
                levels=["[1,11)", "[11,26)", "[26,41)",
                        "[41,61)", "[61,101)", "[101,176)"])))
table_reg = coeftable(reg)
println("table_reg: \n$table_reg")
Regr-Categorical.jl
# Regr-Categorical.jl: categorical (string) regressors in GLM formulas and
# how to change the reference category via DummyCoding contrasts.
using WooldridgeDatasets, GLM, DataFrames, FreqTables, CSV

CPS1985 = DataFrame(CSV.File("data/CPS1985.csv"))
# rename variable to make outputs more compact:
rename!(CPS1985, :occupation => :oc)

# table of categories and frequencies for two categorical variables:
freq_gender = freqtable(CPS1985.gender)
println("freq_gender: \n$freq_gender\n")

freq_occupation = freqtable(CPS1985.oc)
println("freq_occupation: \n$freq_occupation\n")

# directly using categorical variables in regression formula
# (the formula automatically interprets string
# columns as categorical variables and dummy codes them):
reg = lm(@formula(log(wage) ~ education + experience + gender + oc), CPS1985)
table_reg = coeftable(reg)
println("table_reg: \n$table_reg")

# rerun regression with different reference category:
# base= picks the omitted level for each factor
reg_newref = lm(@formula(log(wage) ~ education + experience + gender + oc),
    CPS1985,
    contrasts=Dict(:gender => DummyCoding(base="male"),
        :oc => DummyCoding(base="technical")))
table_newref = coeftable(reg_newref)
println("table_newref: \n$table_newref")