statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration.
--[statsmodels][sm]
api
submodule with the handle sm
.# imports
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from os.path import exists
from scipy.stats import t
sm.__version__
# tooth growth data
file = 'tooth_growth.feather'
if exists(file):
tg_data = pd.read_feather(file)
else:
tooth_growth = sm.datasets.get_rdataset('ToothGrowth')
#print(tooth_growth.__doc__)
tg_data = tooth_growth.data
tg_data.to_feather(file)
(tg_data
.groupby(['dose', 'supp'])
.apply(lambda gdf: gdf.iloc[0:2, :])
)
tg_05 = tg_data.query('dose == 0.5')
Y = tg_05['len']
X = pd.get_dummies(tg_05['supp'])['OJ']
X = sm.add_constant(X)
(Y, X)
sm.OLS()
...summary()
method prints key information. mod0_05 = sm.OLS(Y, X)
res0_05 = mod0_05.fit()
res0_05.summary()
res0_05
contains much of the summary information
in attributes (properties).print(type(res0_05))
print((res0_05.rsquared_adj, res0_05.aic))
(res0_05.params, res0_05.tvalues)
#res0_05. # use tab complete to see methods
?res0_05.conf_int
b = res0_05.params # estimated parameters
v = res0_05.cov_params() # variance-covariance
se = np.sqrt(np.diag(v)) # std errors
t_stats = res0_05.params / se # t statistics
assert all(t_stats == res0_05.tvalues) # same as stored in object
df = res0_05.df_resid # degrees of freedom
p = 2 * (1 - t.cdf(t_stats, df=df)) # p-values
assert all(np.round(p, 6) == np.round(res0_05.pvalues, 6))
tt = t.ppf(.975, df=df) # multiplier
lwr, upr = b - tt * se, b + tt * se # CI bounds
ci = res0_05.conf_int()
ci['lwr'], ci['upr'] = lwr, upr
ci
file_name = 'res0_05.pickle'
# notebook needs to be trusted
res0_05.save(file_name)
del res0_05
res0_05 = sm.load(file_name)
res0_05.summary2()
y_hat, r = res0_05.predict(), res0_05.resid
b0, b1 = res0_05.params
col = [('darkgreen', 'orange')[x] for x in X['OJ'].values]
fig0 = plt.figure(tight_layout=True)
gs = fig0.add_gridspec(2, 2)
f0_ax0 = fig0.add_subplot(gs[0, 0])
_ = f0_ax0.set_title('Observed ~ Predicted')
_ = f0_ax0.scatter(x=y_hat, y=Y, color=col)
f0_ax1 = fig0.add_subplot(gs[0, 1])
_ = f0_ax1.set_title('Residual ~ Predicted')
_ = f0_ax1.scatter(x=y_hat, y=r, color=col)
f0_ax2 = fig0.add_subplot(gs[1, :])
f0_ax2.set_title('QQ Plot')
_ = sm.qqplot(ax=f0_ax2, data=r, line='s')
tg_data['log_len'] = tg_data[['len']].transform(np.log)
tg_data['OJ'] = pd.get_dummies(tg_data['supp'])['OJ']
doses = [0.5, 1.0, 2.0]
cis = []
for dose in doses:
# subset dose
dat = tg_data.query('dose == @dose')
# fit OLS model
Y, X = dat['log_len'], sm.add_constant(dat['OJ'])
m = sm.OLS(Y, X)
res = m.fit()
ci = res.conf_int()
# format confidence interval
ci.columns = ('lwr', 'upr')
ci['est'] = res.params
ci.index.name = 'term'
ci = ci.transform(np.exp)
ci['CI'] = (
ci.groupby('term').apply(lambda gdf:
'{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(
gdf['est'].values[0],
gdf['lwr'].values[0],
gdf['upr'].values[0]
)
)
)
cis.append(ci.loc['OJ', 'CI'])
pd.DataFrame({'dose': doses, 'Ratio of Means (95% CI)': cis})
~
separates DV(s) from the IVs. mod1 = sm.OLS.from_formula('log_len ~ OJ + dose', data=tg_data)
res1 = mod1.fit()
res1.summary2()
endog
and exog
and *_names
. X = mod1.exog
print(mod1.exog_names)
X
smf
. smf.ols()
with a formula when fitting linear regression models. # smf.
mod1 = smf.ols('log_len ~ OJ + dose', data=tg_data)
res1 = mod1.fit()
res1.summary2()
OJ
and dose
treated as a continuous variable. mod2 = sm.OLS.from_formula('log_len ~ OJ:dose', data=tg_data)
res2 = mod2.fit()
res2.summary2()
X
to look like below. How many zeros will it
have? X = mod2.exog
print(mod2.exog_names)
X
a + b + a:b
can be concisely written as a*b
. mod3 = smf.ols('log_len ~ OJ*dose', data=tg_data)
res3 = mod3.fit()
res3.summary2()
X
to have? #mod3 = smf.ols('log_len ~ OJ*dose', data=tg_data)
X = mod3.exog
print(mod3.exog_names)
X
supp
(or OJ
).tg_data['dose_cat'] = pd.Categorical(tg_data['dose'])
mod4 = smf.ols('log_len ~ OJ*dose_cat', data=tg_data)
res4 = mod4.fit()
res4.summary2()
X
to have? #mod4 = smf.ols('log_len ~ OJ*dose_cat', data=tg_data)
X = mod4.exog
print(mod4.exog_names)
X
print(np.round(-2 * mod4.loglike(res4.params) + 2 * len(res4.params), 2))
[np.round(r.aic, 1) for r in [res1, res2, res3, res4]]
L
below
specifies a contrast. b = res4.params
v = res4.cov_params()
L = np.zeros((3, len(b)))
L[0, 3] = 1 # OJ
L[1, (3, 4)] = 1 # OJ + OJ:dose_cat[T.1.0]
L[2, (3, 5)] = 1 # OJ + OJ:dose_cat[T.2.0]
est = np.exp(np.dot(L, b))
se = np.sqrt(np.diag(np.dot(np.dot(L, v), L.T)))
tt = t.ppf(.975, df=res4.df_resid)
lwr, upr = est - tt * se, est + tt * se
np.round(np.array([est, lwr, upr]).T, 2)
.t_test()
method of the
results object. contrasts = res4.t_test(L)
print(np.round(np.exp(contrasts.effect), 2))
contrasts
L
takes care..t_test()
method also accepts a string with contrasts specified using
the parameter (variable) names. contrasts = res4.t_test(
'OJ = 0, OJ + OJ:dose_cat[T.1.0] = 0, OJ + OJ:dose_cat[T.2.0] = 0'
)
#contrasts
#contrasts.summary()
est_ci4 = np.zeros((3, 3))
est_ci4[:, 0] = np.exp(contrasts.effect)
est_ci4[:, 1:3] = np.exp(contrasts.conf_int())
np.round(est_ci4, 2)
:
indicates an interaction and a*b = a + b + a:b
..t_test()
method.