Techniques that fall under the resampling umbrella include:
The utility of all of these techniques is greatly enhanced by the ability to automate the resampling process and subsequent computations.
# model imports
import numpy as np
import pandas as pd
from scipy.stats import t, bootstrap
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import statsmodels.api as sm
from os.path import exists
params = {'legend.fontsize': 'x-large',
'figure.figsize': (15, 5),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)
# tooth growth data
file = 'tooth_growth.feather'
if exists(file):
tg_data = pd.read_feather(file)
else:
tooth_growth = sm.datasets.get_rdataset('ToothGrowth')
#print(tooth_growth.__doc__)
tg_data = tooth_growth.data
tg_data.to_feather(file)
#tg_data
mean_ratio = (tg_data
.groupby(['supp', 'dose']) # orders by supplement
.mean()
.groupby(['dose'])
.agg(lambda x: x[0] / x[1])
)
mean_ratio
.sample()
.seed = 2021 * 10 * 3
(tg_data
.groupby(['supp', 'dose'])
.sample(frac=1, replace=True, random_state=seed)
)
def ratio_of_means(df, n_boot, x, invert=False):
"""
Compute ratios of means across replicate datasets.
The column `df[x]` is reshaped into `2 * n_boot` replicates and
the means of the first `n_boot` replicates are compared (pairwise) to
the means of the second block of `n_boot` using a ratio.
Parameters
----------
df : pandas DataFrame
A pandas DataFrame in which to find the column x.
n_boot : int
The number of bootstrap replications.
x : str
A numeric column in df.
invert : bool, optional
If True (False) use means from the second half of x as denominators
(numerators). The default is False.
Returns
-------
None.
"""
x = np.array(df[x])
x = x.reshape((2, n_boot, int(len(x) / (2 * n_boot))))
rom = np.mean(x[0, :, :], axis=1) / np.mean(x[1, :, :], axis=1)
if invert:
rom = 1 / rom
return(pd.DataFrame({'rom': rom}))
est_ratios = (tg_data
.groupby('dose')
.apply(lambda gdf: ratio_of_means(gdf, 1, x='len', invert=True))
)
est_ratios.reset_index(level=1, drop=True, inplace=True)
est_ratios
invert
to
specify the desired order. est_ratios = (tg_data
.sort_values(['dose', 'supp'])
.groupby('dose')
.apply(lambda gdf: ratio_of_means(gdf, 1, x='len', invert=False))
)
est_ratios.reset_index(level=1, drop=True, inplace=True)
est_ratios
n_boot = 1000
# ratios get inverted b/c OJ < VC alphabetically
ratios = (tg_data
.groupby(['supp', 'dose'])
.sample(frac=n_boot, replace=True, random_state = seed)
.groupby('dose')
.apply(lambda gdf: ratio_of_means(gdf, n_boot, x='len', invert=False))
)
ratios.groupby('dose').size()
fig0, ax0 = plt.subplots(nrows=3, sharex=True)
_ = fig0.set_size_inches(8, 8)
fig0.tight_layout()
for i, d in enumerate([0.5, 1.0, 2.0]):
lab = 'dose = {0:3.1f}'.format(d)
_ = (ratios
.query('dose == @d')['rom']
.plot
.hist(ax=ax0[i], legend=False, label=lab)
)
_ = ax0[i].legend()
fig1, ax1 = plt.subplots(nrows=1)
_ = fig1.set_size_inches(8, 4)
col = ['darkred', 'darkblue', 'darkgreen']
for i, d in enumerate([0.5, 1.0, 2.0]):
lab = 'dose = {0:3.1f}'.format(d)
_ = (ratios.
query('dose == @d')['rom']
.plot
.hist(ax=ax1, color=col[i], alpha=0.5, label=lab)
)
_ = ax1.legend()
bse = (ratios
.groupby(['dose'])
.quantile((.025, .975))
)
bse.index.names = ['dose', 'quantile']
bse = bse.reset_index().pivot(index='dose', columns='quantile', values=['rom'])
bse.columns = ('lwr', 'upr')
erb = est_ratios.join(bse)
erb['Ratio of Means (95% CI)'] = (erb
.groupby(['dose'])
.apply(lambda x:
'{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(
x['rom'].values[0],
x['lwr'].values[0],
x['upr'].values[0])
)
)
erb[['Ratio of Means (95% CI)']]
plt.axvline()
to illustrate this in our plot. fig0, ax0 = plt.subplots(nrows=3, sharex=True)
_ = fig0.set_size_inches(8, 8)
fig0.tight_layout()
for i, d in enumerate([0.5, 1.0, 2.0]):
lab = 'dose = {0:3.1f}'.format(d)
_ = (ratios
.query('dose == @d')['rom']
.plot
.hist(ax=ax0[i], legend=False, label=lab)
)
_ = ax0[i].legend()
ax0[i].axvline(x=erb['lwr'].values[i], color='black')
ax0[i].axvline(x=erb['upr'].values[i], color='black')
# Welch's (unequal variance) t-test on log values
ert = {}
for dose in [0.5, 1, 2]:
# extract data
a = tg_data.query('dose == @dose and supp == "OJ"')['len'].values
b = tg_data.query('dose == @dose and supp == "VC"')['len'].values
# diff on a log scale
a, b = np.log(a), np.log(b)
a_bar, b_bar = np.mean(a), np.mean(b)
d = a_bar - b_bar
# std error
v_a, v_b = np.var(a, ddof=1), np.var(b, ddof=1)
n_a, n_b = len(a), len(b)
se = np.sqrt(v_a / n_a + v_b / n_b)
# degrees of freedom using Welch-Satterthwhaite approximation
df = (v_a / n_a + v_b / n_b) ** 2
df = df / (v_a ** 2 / n_a ** 2 / (n_a - 1) + v_b ** 2 / n_b ** 2 / (n_b - 1))
tt = t.ppf(.975, df=df)
lwr, upr = d - tt * se, d + tt * se
d, lwr, upr = np.exp(d), np.exp(lwr), np.exp(upr)
ci = '{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(d, lwr, upr)
ert.update({dose: ci})
welch = pd.Series(ert)
erb['Ratio of Geometric Means (95% CI)'] = welch
erb.iloc[:, 3:5]
rng = np.random.default_rng(10 + 7 + 2021)
erb_scipy = {}
for dose in [0.5, 1, 2]:
# extract data
a = tg_data.query('dose == @dose and supp == "OJ"')['len'].values
b = tg_data.query('dose == @dose and supp == "VC"')['len'].values
res = bootstrap(
(a, b),
lambda a, b, axis: np.mean(a, axis=axis) / np.mean(b, axis=axis),
method='percentile',
axis=0,
random_state=rng
)
lwr, upr = res.confidence_interval
ci = '{0:4.2f} ({1:4.2f}, {2:4.2f})'.format(np.mean(a) / np.mean(b), lwr, upr)
erb_scipy.update({dose: ci})
erb['Ratio of Means (95% CI-Scipy)'] = pd.Series(erb_scipy)
erb.iloc[:, [3, 5]]
def perm_rom(a, b, n_perm=1000, alternative='two-sided', rng=None):
"""
Compute a permutation test p-value for the null that $\mu_a / \mu_b = 1$.
Parameters
----------
a, b - ndarray or convertible to such using np.asarray().
Sequences of observations from two independent groups.
n_perm - integer,
The number of permutations to use in the approximation.
Returns
-------
A float between 0 and 1 estimating the p-value.
"""
assert alternative in ['two-sided', 'greater', 'less']
if rng is None:
rng = np.random.default_rng()
elif isinstance(rng, int):
rng = np.random.default_rng(rng)
# convert of ndarray if needed
a, b = np.asarray(a), np.asarray(b)
# the observed ratio
obs = np.mean(a) / np.mean(b)
# length
n_a = len(a)
# concatenate
ab = np.array([a, b]).reshape((n_a + len(b), ))
# permutations
two = 0
for i in range(n_perm):
rng.shuffle(ab)
mean_a, mean_b = np.mean(ab[0:n_a]), np.mean(ab[n_a:])
if max(mean_a, mean_b) / min(mean_a, mean_b) >= obs:
two += 1
# construct p-value
p = (1 + two) / (n_perm + 1)
return(p)
assert 0 < perm_rom([2] * 7 + [1.9] * 3, [1.9] * 6 + [2] * 4, rng=None) < 1
assert 0 < perm_rom([2] * 7 + [1.9] * 3, [1.9] * 6 + [2] * 4, rng=42) < 1
erb['Permutation p-value'] = (tg_data
.groupby(['dose'])
.apply(lambda gdf: perm_rom(
gdf.query('supp == "OJ"')['len'].values,
gdf.query('supp == "VC"')['len'].values,
n_perm=1000,
rng=rng),
)
).transform(lambda x: np.round(x, 3))
erb.iloc[:, 3:8]