statsmodels¶

Stats 507, Fall 2021

James Henderson, PhD
October 12, 2021

Overview¶

  • statsmodels
  • regression
  • OLS
  • Example
  • Formulas
  • Interactions
  • Contrasts
  • Takeaways

statsmodels¶

statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration.

-- statsmodels documentation

statsmodels¶

  • For general use, the canonical way to import statsmodels is to import the api submodule with the handle sm.
  • You will usually want to import the formula API as well.
In [ ]:
# imports 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from os.path import exists
from scipy.stats import t
sm.__version__

Regression¶

  • We will focus on using statsmodels for regression problems:
    • linear models using ordinary least squares,
    • generalized linear models.
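  • Generalized linear models use a similar interface; as a minimal sketch (not from these notes), a logistic regression fit on simulated, illustrative data:
In [ ]:
# sketch only: a GLM (logistic regression) fit on simulated data;
# the data frame `sim` and its variables are illustrative
rng = np.random.default_rng(42)
sim = pd.DataFrame({'x': rng.normal(size=100)})
sim['y'] = rng.binomial(1, 1 / (1 + np.exp(-sim['x'])))
smf.glm('y ~ x', data=sim, family=sm.families.Binomial()).fit().summary()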

Regression¶

  • A regression problem is one in which our focus is on the conditional mean of a dependent or endogenous variable ($Y$) ...
  • ... given some set of independent or exogenous variables ($X$).
  • statsmodels refers to these as endog and exog, respectively.
  • This has the form: $$ \mathbb{E}[Y|X = x] = f(x; \beta). $$

Linear Regression¶

  • In linear regression using ordinary least squares (OLS), $f(x; \beta)$ is linear in $\beta$ and $Y|X$ is (usually) assumed to be Gaussian with covariance $\Sigma = \sigma^2 I_{n \times n}$.
$$ \mathbb{E}[Y|X = x] = x\beta = \sum_{j=1}^{p} x_j \times \beta_j, \qquad Y | X \sim N(X\beta, \sigma^2 I). $$
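  • For reference, the standard OLS estimates (stated here without derivation) are: $$ \hat{\beta} = (X^TX)^{-1}X^TY, \qquad \widehat{\textrm{Var}}(\hat{\beta}) = \hat{\sigma}^2(X^TX)^{-1}. $$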

Design Matrix¶

  • The design matrix $X$ forms a basis for the conditional mean and is determined by our independent variables (IVs).
  • We assume $X \in \mathbb{R}^{n \times p}$ where $n$ is the number of samples (rows) and $p$ the dimension of the basis (columns).
  • The formula interface provided by statsmodels (using patsy) allows for a concise, flexible way to construct $X$ from a data frame.
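  • A quick illustration using patsy directly (the toy data frame below is illustrative, not part of these notes):
In [ ]:
# sketch: construct a design matrix with patsy's dmatrix();
# note the intercept and the treatment coding of the categorical column
from patsy import dmatrix
toy = pd.DataFrame({'a': [1., 2., 3.], 'g': ['x', 'y', 'x']})
dmatrix('a + g', data=toy)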

Example - ToothGrowth¶

  • Let's revisit the ToothGrowth data from the Resampling Methods notes.
In [ ]:
# tooth growth data
file = 'tooth_growth.feather'
if exists(file):
    tg_data = pd.read_feather(file)
else: 
    tooth_growth = sm.datasets.get_rdataset('ToothGrowth')
    #print(tooth_growth.__doc__)
    tg_data = tooth_growth.data
    tg_data.to_feather(file)

(tg_data
 .groupby(['dose', 'supp'])
 .apply(lambda gdf: gdf.iloc[0:2, :])
)

Example - endog, exog¶

  • To begin, let's compare supplementation methods within each dose separately.
  • To be concrete, we'll start with the dose of 0.5 mg/day of Vitamin C.
  • Here, we create the outcome and design matrices directly.
In [ ]:
tg_05 = tg_data.query('dose == 0.5')
Y = tg_05['len']
X = pd.get_dummies(tg_05['supp'])['OJ']
X = sm.add_constant(X) 
(Y, X)

Example - OLS¶

  • Now we can set up the model object using sm.OLS() ...
  • ... and then call its .fit() method to estimate the parameters.
  • The summary() method prints key information.
In [ ]:
mod0_05 = sm.OLS(Y, X)
res0_05 = mod0_05.fit()
res0_05.summary()

Example - OLS Attributes¶

  • The results object res0_05 contains much of the summary information in attributes (properties).
  • Here we explore those properties while reviewing the relationships in the coefficient table.
In [ ]:
print(type(res0_05))
print((res0_05.rsquared_adj, res0_05.aic))
(res0_05.params, res0_05.tvalues)
#res0_05.  # use tab complete to see methods
?res0_05.conf_int

Example - Coefficient table¶

  • Let's review the coefficient table from the summary results and construct it using attributes and methods from the results object.
In [ ]:
b = res0_05.params                     # estimated parameters
v = res0_05.cov_params()               # variance-covariance
se = np.sqrt(np.diag(v))               # std errors
t_stats = res0_05.params / se          # t statistics
assert all(t_stats == res0_05.tvalues) # same as stored in object     

df = res0_05.df_resid                  # degrees of freedom
p = 2 * (1 - t.cdf(np.abs(t_stats), df=df))  # two-sided p-values
assert all(np.round(p, 6) == np.round(res0_05.pvalues, 6))

tt = t.ppf(.975, df=df)                # multiplier
lwr, upr = b - tt * se, b + tt * se    # CI bounds
ci = res0_05.conf_int()
ci['lwr'], ci['upr'] = lwr, upr
ci

Example - OLS Methods¶

  • The results object res0_05 also has a number of useful methods.
  • On the previous slides, we used the .summary(), .cov_params(), and .conf_int() methods.
  • You can use the .save() method and the sm.load() function to store results on disk (as pickle).
In [ ]:
file_name = 'res0_05.pickle'
# notebook needs to be trusted
res0_05.save(file_name)
del res0_05
res0_05 = sm.load(file_name)
res0_05.summary2()

Example - Diagnostics¶

  • Let's use the .predict() method and then create some plots.
  • We'll also use the .add_gridspec() method of our figure object.
In [ ]:
y_hat, r = res0_05.predict(), res0_05.resid
b0, b1 = res0_05.params
col = [('darkgreen', 'orange')[x] for x in X['OJ'].values]

fig0 = plt.figure(tight_layout=True)
gs = fig0.add_gridspec(2, 2)

f0_ax0 = fig0.add_subplot(gs[0, 0])
_ = f0_ax0.set_title('Observed ~ Predicted')
_ = f0_ax0.scatter(x=y_hat, y=Y, color=col)

f0_ax1 = fig0.add_subplot(gs[0, 1])
_ = f0_ax1.set_title('Residual ~ Predicted')
_ = f0_ax1.scatter(x=y_hat, y=r, color=col)

f0_ax2 = fig0.add_subplot(gs[1, :])
_ = f0_ax2.set_title('QQ Plot')
_ = sm.qqplot(ax=f0_ax2, data=r, line='s')

Example¶

  • We can compare the ratio of geometric means by regressing the log outcome on supplement type: a difference in mean log values exponentiates to a ratio of geometric means.
  • This is almost identical to our t-test analysis from the Resampling Methods notes, except we use a pooled estimate of the (residual) variance.
In [ ]:
tg_data['log_len'] = tg_data[['len']].transform(np.log)
tg_data['OJ'] = pd.get_dummies(tg_data['supp'])['OJ']
doses = [0.5, 1.0, 2.0]
cis = []
for dose in doses:
    # subset dose
    dat = tg_data.query('dose == @dose')
    
    # fit OLS model
    Y, X = dat['log_len'], sm.add_constant(dat['OJ'])
    m = sm.OLS(Y, X)
    res = m.fit()
    ci = res.conf_int()
    
    # format confidence interval
    ci.columns = ('lwr', 'upr')
    ci['est'] = res.params
    ci.index.name = 'term'
    ci = ci.transform(np.exp)
    ci['CI'] = (
        ci.groupby('term').apply(lambda gdf: 
                         '{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(
                             gdf['est'].values[0],
                             gdf['lwr'].values[0], 
                             gdf['upr'].values[0]
                         )
                        )
    )
    cis.append(ci.loc['OJ', 'CI'])
pd.DataFrame({'dose': doses, 'Ratio of Means (95% CI)': cis})

Formula Method¶

  • Formulas are a convenient way to construct the design matrix $X$ from a dataset.
  • Formulas are not only convenient, but also concise and expressive, allowing us to focus on the modeling and iterate quickly without being distracted by setup.
  • In a formula string, a tilde ~ separates the DV(s) from the IVs.
In [ ]:
mod1 = sm.OLS.from_formula('log_len ~ OJ + dose', data=tg_data)
res1 = mod1.fit()
res1.summary2()

Design Inspection¶

  • If you're ever unsure about how a formula is interpreted by statsmodels, take a look at (select) rows of the design matrix.
  • The model object has endog and exog attributes, along with the corresponding endog_names and exog_names.
In [ ]:
X = mod1.exog
print(mod1.exog_names)
X

Formula API¶

  • The formula API was imported as smf.
  • This allows us to call the previous model and others more concisely.
  • Use smf.ols() with a formula when fitting linear regression models.
In [ ]:
# smf.
mod1 = smf.ols('log_len ~ OJ + dose', data=tg_data)
res1 = mod1.fit()
res1.summary2()

Interactions¶

  • Interaction terms in the design matrix are formed by multiplying values from two (or more) variables together.
  • Here we specify an interaction between the indicator OJ and dose treated as a continuous variable.
In [ ]:
mod2 = sm.OLS.from_formula('log_len ~ OJ:dose', data=tg_data)
res2 = mod2.fit()
res2.summary2()

Interaction Inspection¶

  • What do you expect X to look like below? How many zeros will it have?
  • Note also how the interaction is named.
In [ ]:
X = mod2.exog
print(mod2.exog_names)
X

Main Effects & Interactions¶

  • We rarely use interactions without also including the associated main effects for the variables that make up the interaction.
  • a + b + a:b can be concisely written as a*b.
In [ ]:
mod3 = smf.ols('log_len ~ OJ*dose', data=tg_data)
res3 = mod3.fit()
res3.summary2()

Interaction Inspection¶

  • How many columns do you expect X to have?
  • What are their names?
In [ ]:
#mod3 = smf.ols('log_len ~ OJ*dose', data=tg_data)
X = mod3.exog
print(mod3.exog_names)
X

Categorical Interactions¶

  • Let's run an analysis with the same mean structure as our dose-by-dose analysis.
  • Having the same mean structure means the models have the same predictions or fitted values, but not (necessarily) the same variance structure.
  • We can let each supplement and dose combination have its own mean by making dose categorical and interacting this with supp (or OJ).
In [ ]:
tg_data['dose_cat'] = pd.Categorical(tg_data['dose'])
mod4 = smf.ols('log_len ~ OJ*dose_cat', data=tg_data)
res4 = mod4.fit()
res4.summary2()

Interaction Inspection¶

  • How many columns do you expect X to have?
  • What are their names?
In [ ]:
#mod4 = smf.ols('log_len ~ OJ*dose_cat', data=tg_data)
X = mod4.exog
print(mod4.exog_names)
X

AIC¶

  • The Akaike Information Criterion (AIC) is one way to compare competing models fit to the same data.
  • The AIC trades off model fit against model complexity: it is -2 times the log-likelihood (a.k.a. the deviance) plus 2 times the number of parameters (columns in $X$): $$ \textrm{AIC} = -2\ell(\hat{\beta}) + 2p. $$
  • The log-likelihood is a method of the model object.
In [ ]:
print(np.round(-2 * mod4.loglike(res4.params) + 2 * len(res4.params), 2))
[np.round(r.aic, 1) for r in [res1, res2, res3, res4]]

Contrasts¶

  • Linear combinations of regression coefficients are known as contrasts.
  • We often express contrasts using matrix notation -- each row of L below specifies a contrast.
In [ ]:
b = res4.params
v = res4.cov_params()
L = np.zeros((3, len(b)))
L[0, 3] = 1      # OJ
L[1, (3, 4)] = 1 # OJ + OJ:dose_cat[T.1.0]
L[2, (3, 5)] = 1 # OJ + OJ:dose_cat[T.2.0]
est_log = np.dot(L, b)                 # contrasts on the log scale
se = np.sqrt(np.diag(np.dot(np.dot(L, v), L.T)))
tt = t.ppf(.975, df=res4.df_resid)     # multiplier
# form the CI on the log scale, then exponentiate
est = np.exp(est_log)
lwr, upr = np.exp(est_log - tt * se), np.exp(est_log + tt * se)
np.round(np.array([est, lwr, upr]).T, 2)

Contrasts¶

  • Contrasts can also be estimated using the .t_test() method of the results object.
  • In models with interactions, we need contrasts to estimate effects (here, the OJ effect) within the non-reference groups.
In [ ]:
contrasts = res4.t_test(L)
print(np.round(np.exp(contrasts.effect), 2))
contrasts

Contrasts¶

  • Setting up the contrasts matrix L takes care.
  • The .t_test() method also accepts a string with contrasts specified using the parameter (variable) names.
In [ ]:
contrasts = res4.t_test(
    'OJ = 0, OJ + OJ:dose_cat[T.1.0] = 0, OJ + OJ:dose_cat[T.2.0] = 0'
)
#contrasts
#contrasts.summary()
est_ci4 = np.zeros((3, 3))
est_ci4[:, 0] = np.exp(contrasts.effect)
est_ci4[:, 1:3] = np.exp(contrasts.conf_int())
np.round(est_ci4, 2)

Takeaways¶

  • Regression is among the most important techniques in the data science toolkit.
  • OLS and many related (statistical) regression models can be fit using statsmodels.
  • The formula API allows for flexible and concise specification of the design matrix.
  • In a formula, : indicates an interaction and a*b = a + b + a:b.
  • Estimate contrasts using the .t_test() method.