statsmodels¶

Stats 507, Fall 2021

James Henderson, PhD
October 14, 2021

Overview¶

  • Generalized Linear Models
  • Logistic regression
  • Logistic Example
  • Poisson Regression
  • Offsets & Exposures
  • Poisson Example
  • Takeaways

Imports¶

  • Here are the imports we'll use in the examples.
In [ ]:
# imports 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from os.path import exists
from scipy.stats import norm, t, chi2, logistic

Figures¶

  • Let's use pylab to control the appearance of figures.
In [ ]:
params = {'legend.fontsize': 'x-large',
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}
pylab.rcParams.update(params)

GLM¶

  • As the name implies, generalized linear models generalize the linear model through the use of a link function relating the expected or mean outcome to a linear predictor.

GLM¶

  • A GLM relating a dependent variable $Y$ to independent variables $X$ and coefficients $\beta$ has the form:
$$ h(E[Y| X]) = X\beta. $$
  • $h(\cdot)$ is called the link function.
  • The matrix product $X\beta$ is known as the linear predictor.

GLM¶

  • We often make use of the inverse of $h(\cdot)$, denoted $g(\cdot)$.
  • An equivalent specification is:
$$ E[Y | X] = g(X\beta). $$
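  • For example, with the logit link used later, $g(\cdot)$ is the inverse logit (expit); here is a quick numerical check, a sketch using scipy.special (not part of the original slides):
In [ ]:
# verify that g = expit inverts h = logit
from scipy.special import expit, logit

p = np.linspace(0.01, 0.99, 5)
np.allclose(p, expit(logit(p)))  # True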

OLS as GLM¶

  • Ordinary linear regression is a GLM with the identity link $h(x) = g(x) = x$,
$$ E[Y | X] = X\beta. $$
  • $Y | X \sim N(X\beta, \sigma^2)$
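  • As a quick sketch (simulated data, not part of the RECS example), smf.glm with a Gaussian family reproduces smf.ols:
In [ ]:
# OLS and a Gaussian GLM with the identity link give the same coefficients
rng = np.random.default_rng(42)
df_sim = pd.DataFrame({'x': rng.normal(size=100)})
df_sim['y'] = 1 + 2 * df_sim['x'] + rng.normal(size=100)

ols_fit = smf.ols('y ~ x', data=df_sim).fit()
glm_fit = smf.glm('y ~ x', data=df_sim, family=sm.families.Gaussian()).fit()
np.allclose(ols_fit.params, glm_fit.params)  # True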

Common GLMs¶

  • Binary response: logistic or probit regression,
  • Count-valued response: (quasi-)Poisson or Negative Binomial regression,
  • Real-valued, positive response: Gamma regression.
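  • The formula API (smf) provides top-level functions for each of these; a quick reference, with call signatures shown as comments (these models are fit on real data later in the slides):
In [ ]:
# binary response:   smf.logit('y ~ x', data=df) or smf.probit('y ~ x', data=df)
# count response:    smf.poisson('y ~ x', data=df)
# overdispersed:     smf.negativebinomial('y ~ x', data=df)
# positive response: smf.glm('y ~ x', data=df, family=sm.families.Gamma())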

Logistic Regression¶

  • Logistic regression for a binary response treats $Y | X$ as Bernoulli with success probability $g(X\beta)$.
  • Logistic regression uses the logit link $h(x) = \log(x/(1-x))$.
  • This gives the model:

    $$ \log \frac{ P(Y = 1 | X) }{P(Y = 0|X)} = X\beta, \quad \textrm{or} \quad E[Y | X] = \frac{e^{X\beta}}{1 + e^{X\beta}}. $$

Logistic Regression Example¶

Use the 2015 RECS data to examine features associated with homes that have finished basements.

  • We'll look at only homes with basements.
  • We'll pretend the data are iid and ignore the weights for teaching purposes.

Download Data¶

  • Read the data locally or from the remote URL.
In [ ]:
# raw data files
stem = 'https://www.eia.gov/consumption/residential/data/'
recs15_file = stem + '2015/csv/recs2015_public_v4.csv'
recs15_local = 'recs2015_public_v4.csv'

if exists(recs15_local):
    recs15 = pd.read_csv(recs15_local)
else:
    recs15 = pd.read_csv(recs15_file)
    # write without the index so a later read matches the original columns
    recs15.to_csv(recs15_local, index=False)

Clean the Data¶

  • Limit to the columns we need and label the categorical variables.
In [ ]:
# columns 
cols = {
    'DOEID': 'id',
    'REGIONC': 'region',
    'TYPEHUQ': 'home_type',
    'UATYP10': 'urban',
    'BASEFIN': 'finished_basement'
}
dat0 = recs15[cols.keys()].copy()
dat0.rename(columns=cols, inplace=True)

# category labels
cat_vars = {
    'region': {1:'Northeast', 2: 'Midwest', 3: 'South', 4: 'West'},
    'home_type': {
        1: 'Mobile Home',
        2: 'Single-Family Detached',
        3: 'Single-Family Attached',
        4: 'Apartment in Building with 2 - 4 Units',
        5: 'Apartment in Building with 5+ Units'
    },
    'urban': {'U': 'urban', 'C': 'urban cluster', 'R': 'rural'}
}

for c in cat_vars.keys():
    dat0[c] = dat0[c].replace(cat_vars[c])
    dat0[c] = pd.Categorical(dat0[c])

# response variable: 1 = finished, 0 = unfinished, -2 = not applicable
dep_vars = {'finished_basement': {1: 1, 0: 0, -2: np.nan}}
for c in dep_vars.keys():
    dat0[c] = dat0[c].replace(dep_vars[c])

dat0.head()

Examine cell sizes¶

  • Grouping by categorical variables includes levels with no observations.
  • Only single-family homes have a non-missing basement variable.
In [ ]:
(dat0
 #.dropna()
 .groupby(
   ['region', 'home_type', 'urban', 'finished_basement'],
   #['region', 'home_type', 'urban'],
   as_index=False
  )
 .size()
 #.query('size > 0')
)

Model 0¶

  • Let's start by comparing home types.
  • We need to drop unused levels, or the design matrix X (exog) will be less than full rank.
In [ ]:
# cases for this analysis
dat0 = dat0.dropna()
dat0['home_type'] = dat0['home_type'].cat.remove_unused_categories()

# initial model
mod0 = smf.logit('finished_basement ~ home_type', data=dat0)
#mod0.exog
res0 = mod0.fit(disp=True)
res0.summary()

Odds Ratio¶

  • In logistic regression, exponentiating the coefficients gives odds ratios (OR).
  • Here is the OR comparing single-family attached and detached homes.
In [ ]:
# exponentiate the negated coefficient to invert the comparison;
# negation also swaps the lower and upper confidence bounds
odds_ratio = np.exp(-1 * res0.params[1])
ci = np.exp(-1 * res0.conf_int().iloc[1, ]).values
'{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(odds_ratio, ci[1], ci[0])

Model 1¶

  • Adding region to the model.
In [ ]:
mod1 = smf.logit('finished_basement ~ region + home_type', data=dat0)
res1 = mod1.fit()
res1.summary()

Model 2¶

  • Adding an interaction between region and home type to the model.
  • How many additional parameters will this have?
In [ ]:
mod2 = smf.logit('finished_basement ~ region*home_type', data=dat0)
res2 = mod2.fit()
res2.summary()

Model Comparison¶

  • First, we compare the three models using AIC.
  • Then, we carry out a likelihood ratio test comparing the models with and without the interaction.
In [ ]:
print([np.round(r.aic, 1) for r in (res0, res1, res2)])

# LRT
delta = 2 * (mod2.loglike(res2.params) - mod1.loglike(res1.params))
df = mod2.df_model - mod1.df_model
p = 1 - chi2.cdf(delta, df=df)
'Chi^2 = {0:4.2f}, df = {1:1g}, p = {2:4.2f}'.format(delta, df, p)
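  • The same test is available directly from the results object; compare_lr_test returns the statistic, p-value, and degrees of freedom:
In [ ]:
# equivalent LRT using the built-in method
lr_stat, p_value, df_diff = res2.compare_lr_test(res1)
'Chi^2 = {0:4.2f}, df = {1:1g}, p = {2:4.2f}'.format(lr_stat, df_diff, p_value)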

Predictions¶

  • Let's compute predictions for each unique region and home type.
  • Then we'll compute standard errors.
In [ ]:
# unique values to predict
region_home = (dat0
 #.dropna()
 .groupby(
   ['region', 'home_type'],
   as_index=False
  )
 .size()
)
# use a formula (via ols) only to construct the design matrix for prediction
modx = smf.ols('size ~ region + home_type', data=region_home)
x = modx.exog

# predictions
region_home['est'] = mod1.predict(params=res1.params, exog=modx.exog)

# confidence bounds: compute on the linear predictor scale, then map through the inverse link
b = res1.params.values
est = np.dot(x, b)
v = res1.cov_params()
se = np.sqrt(np.diag(np.dot(np.dot(x, v), x.T)))
z = norm.ppf(0.975)
region_home['lwr'] = logistic.cdf(est - z * se)
region_home['upr'] = logistic.cdf(est + z * se)

Visual¶

  • Now we can visualize the estimates.
  • Recall that asymmetric error bars can be drawn using a tuple passed to xerr.
In [ ]:
fig0, ax0 = plt.subplots(nrows=1)
colors = ('darkblue', 'magenta')
for i in range(2):
    home = cat_vars['home_type'][(2, 3)[i]]
    df = region_home.query('home_type == @home')
#    ax0.scatter(x=df['est'], y=df['region'], color=colors[i], label=home)
    err = df['est'] - df['lwr'], df['upr'] - df['est']
    ax0.errorbar(
        x=df['est'],
        y=df['region'],
        xerr=err,
        color=colors[i],
        capsize=6,
        fmt="s",
        label=home,
        alpha=0.5
    )
ax0.legend(loc='upper left')
ax0.set_xlim((0, 1))

Poisson Regression¶

  • Poisson regression with a log link $h(x) = \log(x)$ is often used for a count-valued response

    $$ \log E[Y | X] = X\beta, \qquad \textrm{or} \qquad E[Y | X] = e^{X\beta}. $$

Mean Variance Relationship¶

  • As with logistic regression, the variance in Poisson regression is determined by the mean.
    • Quasi-Poisson introduces a dispersion parameter,
    • Negative Binomial regression has a dispersion parameter controlling how the variance changes with the mean (see the sketch below).
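  • A minimal sketch with simulated overdispersed counts (hypothetical data, not from RECS) fitting both alternatives:
In [ ]:
# simulate counts whose variance exceeds their mean
rng = np.random.default_rng(0)
df_od = pd.DataFrame({'x': rng.normal(size=500)})
mu = np.exp(0.5 + 0.3 * df_od['x'])
df_od['y'] = rng.negative_binomial(n=2, p=2 / (2 + mu))

# quasi-Poisson: a Poisson GLM with dispersion estimated from Pearson's X2
quasi = smf.glm('y ~ x', data=df_od, family=sm.families.Poisson()).fit(scale='X2')

# Negative Binomial regression estimates the dispersion parameter directly
nb = smf.negativebinomial('y ~ x', data=df_od).fit()
(quasi.scale, nb.params['alpha'])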

Offsets and Exposures¶

  • An offset is a term in a regression model (linear predictor) whose coefficient is fixed at 1.
  • The term exposure refers to a variable whose logarithm enters the model as an offset.
  • It is common to use an exposure to model count-valued responses as rates:

    $$ \log \frac{E[Y | X]}{N} = X\beta \iff \log E[Y | X] = X\beta + \log N. $$

Poisson Regression Example¶

  • Let's identify a count-valued variable by searching for variables that start with 'N'.
  • Find an appropriate offset or exposure such as the number of rooms.
In [ ]:
recs15.columns[[c[0] == 'N' for c in recs15.columns]] #NUMCFAN
recs15.columns[[c[0] == 'T' for c in recs15.columns]]  #TOTROOMS
recs15[['NUMCFAN', 'TOTROOMS']].min()

Poisson Regression Example¶

Is the popularity of ceiling fans similar across regions of the US? To answer, we estimate the rate of ceiling fans per room in residences of all types for each Census region.

Data Prep¶

  • Select the needed columns, give them memorable names, and create categorical types where needed.
In [ ]:
fan_vars = {'NUMCFAN': 'n_ceil_fans', 'TOTROOMS': 'n_rooms'}
cols2 = cols.copy()
cols2.update(fan_vars)
_ = cols2.pop('BASEFIN')

dat1 = recs15[cols2.keys()].copy()
dat1.rename(columns=cols2, inplace=True)

for c in cat_vars.keys():
    dat1[c] = dat1[c].replace(cat_vars[c])
    dat1[c] = pd.Categorical(dat1[c])

Model¶

  • The log number of rooms is used as an offset.
  • Using number of rooms as an exposure is equivalent.
In [ ]:
fan_mod0 = smf.poisson(
  'n_ceil_fans ~ region',
  exposure=dat1['n_rooms'],
  data=dat1
)
fan_res0 = fan_mod0.fit()
fan_res0.summary()
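  • To check the equivalence (a quick sketch, not part of the original example), refit with the log room count as an explicit offset and compare the coefficients:
In [ ]:
# exposure=N fits the same model as offset=log(N)
fan_mod0_off = smf.poisson(
  'n_ceil_fans ~ region',
  offset=np.log(dat1['n_rooms']),
  data=dat1
)
fan_res0_off = fan_mod0_off.fit()
np.allclose(fan_res0.params, fan_res0_off.params)  # True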

Predicted Rates¶

  • We can use contrasts to estimate predicted rates of ceiling fans per room.
In [ ]:
# design matrix for predictions; rows are MW (reference), NE, S, W
x_new = np.array([[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]])

# predicted rates
y_hat = fan_mod0.predict(params=fan_res0.params, exog=x_new, linear=False)
#?fan_mod0.predict

# key values
b = fan_res0.params.values
v = fan_res0.cov_params()
lp = np.dot(x_new, b)
y_hat = np.exp(lp)  # matches the predict() call above

# confidence bounds / margins of error 
se = np.sqrt(np.diag(np.dot(np.dot(x_new, v), x_new.T)))
z = norm.ppf(.975)
lwr, upr = np.exp(lp - z * se), np.exp(lp + z * se)
# delta-method margin of error on the rate scale (z * se alone is on the log scale)
fans_per_room = pd.DataFrame({'est': y_hat, 'moe': z * se * y_hat})
fans_per_room['region'] = ('MW', 'NE', 'S', 'W')

Visualize Rates¶

  • And then visualize the predicted rates of ceiling fans per room.
In [ ]:
ax = (
    fans_per_room
    .sort_values('est')
    .plot
    .scatter(x='est', y='region', xerr='moe')
)
_ = ax.set_xlabel('Average Ceiling Fans per Room')

Takeaways¶

  • Generalized linear models greatly expand the types of response variables suitable for regression.
  • The conditional distribution $Y | X$ can be a member of the exponential family other than Gaussian.
  • The conditional mean $E[Y|X]$ and linear predictor $X\beta$ are related through a link function and its inverse.
  • The formula API imported as smf gives us top-level functions for key regression extensions, including common GLMs.