# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from os.path import exists
from scipy.stats import norm, t, chi2, logistic
We use pylab to control the appearance of figures.
params = {'legend.fontsize': 'x-large',
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)
Logistic regression uses the logit link $h(x) = \log \frac{x}{1 - x}$, giving the model:
$$ \log \frac{ P(Y = 1 | X) }{P(Y = 0|X)} = X\beta, \quad \textrm{or} \quad E[Y | X] = \frac{e^{X\beta}}{1 + e^{X\beta}}, $$
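As a quick numerical check on the link function (a minimal sketch; scipy.special.expit is an assumption beyond the imports above), the inverse logit matches $e^{X\beta} / (1 + e^{X\beta})$:
import numpy as np
from scipy.special import expit

xb = np.linspace(-3, 3, 7)  # hypothetical linear predictor values
assert np.allclose(expit(xb), np.exp(xb) / (1 + np.exp(xb)))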
Use the 2015 RECS data to examine features associated with homes that have finished basements.
# raw data files
stem = 'https://www.eia.gov/consumption/residential/data/'
recs15_file = stem + '2015/csv/recs2015_public_v4.csv'
recs15_local = 'recs2015_public_v4.csv'
# download once, then use the cached local copy
if exists(recs15_local):
    recs15 = pd.read_csv(recs15_local)
else:
    recs15 = pd.read_csv(recs15_file)
    recs15.to_csv(recs15_local, index=False)
# columns
cols = {
    'DOEID': 'id',
    'REGIONC': 'region',
    'TYPEHUQ': 'home_type',
    'UATYP10': 'urban',
    'BASEFIN': 'finished_basement'
}
dat0 = recs15[list(cols.keys())].copy()
dat0.rename(columns=cols, inplace=True)
# category labels
cat_vars = {
    'region': {1: 'Northeast', 2: 'Midwest', 3: 'South', 4: 'West'},
    'home_type': {
        1: 'Mobile Home',
        2: 'Single-Family Detached',
        3: 'Single-Family Attached',
        4: 'Apartment in Building with 2 - 4 Units',
        5: 'Apartment in Building with 5+ Units'
    },
    'urban': {'U': 'urban', 'C': 'urban cluster', 'R': 'rural'}
}
for c in cat_vars.keys():
    dat0[c] = dat0[c].replace(cat_vars[c])
    dat0[c] = pd.Categorical(dat0[c])
# response variable
dep_vars = {'finished_basement': {1: 1, 0: 0, -2: np.nan}}
for c in dep_vars.keys():
    dat0[c] = dat0[c].replace(dep_vars[c])
dat0.head()
(dat0
    #.dropna()
    .groupby(
        ['region', 'home_type', 'urban', 'finished_basement'],
        #['region', 'home_type', 'urban'],
        as_index=False
    )
    .size()
    #.query('size > 0')
)
Some home types have no valid responses to the basement question, so if we kept those levels the design matrix (mod0.exog) would be less than full-rank.
# cases for this analysis
dat0 = dat0.dropna()
dat0['home_type'] = dat0['home_type'].cat.remove_unused_categories()
# initial model
mod0 = smf.logit('finished_basement ~ home_type', data=dat0)
#mod0.exog
res0 = mod0.fit(disp=True)
res0.summary()
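To confirm the rank concern above is resolved, a quick sketch compares the number of columns in the model's design matrix to its rank; the two agree when the matrix is full-rank:
# full-rank check: column count should equal the matrix rank
x0 = mod0.exog
print(x0.shape[1], np.linalg.matrix_rank(x0))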
# odds ratio for the reference level relative to this level (note the sign flip)
odds_ratio = np.exp(-1 * res0.params.iloc[1])
ci = np.exp(-1 * res0.conf_int().iloc[1]).values
# negation reverses the interval, so the bounds swap positions
'{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(odds_ratio, ci[1], ci[0])
mod1 = smf.logit('finished_basement ~ region + home_type', data=dat0)
res1 = mod1.fit()
res1.summary()
mod2 = smf.logit('finished_basement ~ region*home_type', data=dat0)
res2 = mod2.fit()
res2.summary()
print([np.round(r.aic, 1) for r in (res0, res1, res2)])
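BIC, which penalizes model complexity more heavily than AIC, can be compared the same way; a small addition using the bic attribute of the results objects:
# BIC for the same three fits; lower is better
print([np.round(r.bic, 1) for r in (res0, res1, res2)])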
# LRT
delta = 2 * (mod2.loglike(res2.params) - mod1.loglike(res1.params))
df = mod2.df_model - mod1.df_model
p = chi2.sf(delta, df=df)  # survival function avoids round-off in 1 - cdf
r'$\chi^2$ = {0:4.2f}, df = {1:1g}, p = {2:4.2f}'.format(delta, df, p)
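The same statistic can be read off the stored log-likelihoods; a minimal equivalent sketch using the llf attribute of the results objects:
# equivalent LRT from the fitted results
delta_llf = 2 * (res2.llf - res1.llf)
assert np.isclose(delta, delta_llf)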
# unique values to predict
region_home = (dat0
    .groupby(['region', 'home_type'], as_index=False)
    .size()
)
# an OLS model set up only to construct the design matrix for these profiles
modx = smf.ols('size ~ region + home_type', data=region_home)
x = modx.exog
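statsmodels builds design matrices with patsy, so the same matrix can be produced directly; a sketch assuming patsy is importable on its own:
from patsy import dmatrix

# build the design matrix straight from the right-hand side of the formula
x_alt = dmatrix('region + home_type', data=region_home, return_type='dataframe')
assert np.allclose(x, x_alt.values)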
# predictions
region_home['est'] = mod1.predict(params=res1.params, exog=modx.exog)
# confidence bounds
b = res1.params.values
est = np.dot(x, b)  # linear predictor on the log-odds scale
v = res1.cov_params()
se = np.sqrt(np.diag(np.dot(np.dot(x, v), x.T)))  # standard errors of the linear predictor
z = norm.ppf(0.975)
region_home['lwr'] = logistic.cdf(est - z * se)
region_home['upr'] = logistic.cdf(est + z * se)
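As a consistency check, the inverse logit of the hand-built linear predictor should reproduce the probabilities from predict() above:
# manual computation agrees with mod1.predict()
assert np.allclose(logistic.cdf(est), region_home['est'])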
Next we plot the estimates with error bars, passing the distances from each estimate to its interval endpoints as xerr.
fig0, ax0 = plt.subplots(nrows=1)
colors = ('darkblue', 'magenta')
for i in range(2):
    # label for this home type (renamed to avoid shadowing the builtin `type`)
    home = cat_vars['home_type'][(2, 3)[i]]
    df = region_home.query('home_type == @home')
    # asymmetric error bars: distance from the estimate to each bound
    err = df['est'] - df['lwr'], df['upr'] - df['est']
    ax0.errorbar(
        x=df['est'],
        y=df['region'],
        xerr=err,
        color=colors[i],
        capsize=6,
        fmt="s",
        label=home,
        alpha=0.5
    )
ax0.legend(loc='upper left')
_ = ax0.set_xlim((0, 1))
Poisson regression with a log link $h(x) = \log(x)$ is often used for a count-valued response:
$$ \log E[Y | X] = X\beta, \qquad \textrm{or} \qquad E[Y | X] = e^{X\beta}. $$
It is common to use an exposure to model count-valued responses as rates:
$$ \log \frac{E[Y | X]}{N} = X\beta \iff \log E[Y | X] = X\beta + \log N. $$
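The identity is easy to verify numerically with hypothetical values:
import numpy as np

xb, n = 0.7, 8.0  # hypothetical linear predictor and exposure
assert np.isclose(np.exp(xb + np.log(n)), n * np.exp(xb))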
recs15.columns[[c[0] == 'N' for c in recs15.columns]] #NUMCFAN
recs15.columns[[c[0] == 'T' for c in recs15.columns]] #TOTROOMS
recs15[['NUMCFAN', 'TOTROOMS']].min()
Is the popularity of ceiling fans similar across regions of the US? To answer, we estimate the rate of ceiling fans per room in residences of all types for each Census region.
fan_vars = {'NUMCFAN': 'n_ceil_fans', 'TOTROOMS': 'n_rooms'}
cols2 = cols.copy()
cols2.update(fan_vars)
_ = cols2.pop('BASEFIN')
dat1 = recs15[list(cols2.keys())].copy()
dat1.rename(columns=cols2, inplace=True)
for c in cat_vars.keys():
    dat1[c] = dat1[c].replace(cat_vars[c])
    dat1[c] = pd.Categorical(dat1[c])
fan_mod0 = smf.poisson(
    'n_ceil_fans ~ region',
    exposure=dat1['n_rooms'],
    data=dat1
)
fan_res0 = fan_mod0.fit()
fan_res0.summary()
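To illustrate the exposure/offset equivalence derived earlier, the sketch below refits the same model passing offset=log(n_rooms) instead and checks that the coefficients agree:
# an exposure of N is equivalent to an offset of log(N)
fan_mod0_off = smf.poisson(
    'n_ceil_fans ~ region',
    offset=np.log(dat1['n_rooms']),
    data=dat1
)
fan_res0_off = fan_mod0_off.fit(disp=False)
assert np.allclose(fan_res0.params, fan_res0_off.params)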
# design matrix for predictions: one row per region
# (treatment coding; the intercept level is Midwest, rows are MW, NE, S, W)
x_new = np.array([[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]])
# predicted rates
y_hat = fan_mod0.predict(params=fan_res0.params, exog=x_new, linear=False)
# key values
b = fan_res0.params.values
v = fan_res0.cov_params()
lp = np.dot(x_new, b)  # linear predictor on the log scale
y_hat = np.exp(lp)     # same rates as the predict() call above
# confidence bounds / margins of error
se = np.sqrt(np.diag(np.dot(np.dot(x_new, v), x_new.T)))
z = norm.ppf(.975)
lwr, upr = np.exp(lp - z * se), np.exp(lp + z * se)
fans_per_room = pd.DataFrame({'est': y_hat, 'lwr': lwr, 'upr': upr})
fans_per_room['region'] = ('MW', 'NE', 'S', 'W')
fans_per_room = fans_per_room.sort_values('est')
# asymmetric error bars on the rate scale (z * se alone is on the log scale)
xerr = np.stack([
    fans_per_room['est'] - fans_per_room['lwr'],
    fans_per_room['upr'] - fans_per_room['est']
])
ax = (
    fans_per_room
    .plot
    .scatter(x='est', y='region', xerr=xerr)
)
_ = ax.set_xlabel('Average Ceiling Fans per Room')
The statsmodels.formula.api module, imported as smf, gives us top-level functions for key regression extensions, including common GLMs.
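For instance, the first logistic model above can also be fit through the generic smf.glm interface; a sketch reusing dat0 from above, whose coefficients should match the smf.logit fit:
# same model via the GLM interface; Binomial defaults to the logit link
glm_mod0 = smf.glm(
    'finished_basement ~ home_type',
    data=dat0,
    family=sm.families.Binomial()
)
glm_res0 = glm_mod0.fit()
print(glm_res0.params)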