import numpy as np
import pandas as pd
# Simulate a small two-arm data set.  The draws are made in a fixed order
# (arm indicator, noise for b, noise for c) so the seed reproduces the data.
rng = np.random.default_rng(seed=9 * 2021 * 28)
n = 100

# `a` is a Bernoulli(0.5) arm indicator.
a = rng.binomial(1, 0.5, n)

# `b` is centred at 1 for a == 0 and at 0.5 for a == 1.
noise_b = rng.normal(size=n)
b = (1 - 0.5 * a) + noise_b

# `c` is centred at 0 for a == 0 and at 0.8 for a == 1.
noise_c = rng.normal(size=n)
c = 0.8 * a + noise_c

df = pd.DataFrame(dict(a=a, b=b, c=c))
df
# `pd.Categorical()` can be used to create a categorical type.  Here the 0/1
# indicator in `a` is relabelled as 'control' / 'treatment' and stored as a
# categorical column.
df['a'] = pd.Categorical(df['a'].replace({0: 'control', 1: 'treatment'}))

# Inspect the result: the column dtype, the category labels, and the integer
# code stored for each row.
(df['a'].dtype, df['a'].values.categories, df['a'].values.codes)
# Grouping is done with the `.groupby()` method; `.size()` then reports the
# number of rows in each group.
df.groupby('a').size()

# After an aggregation the grouping key forms the index of the result and the
# remaining columns are summarised.  The group means are computed once and
# reused for both attributes.
group_means = df.groupby('a').mean()
[group_means.index, group_means.columns]
# Pass `as_index=False` to keep the grouping key as a regular column of the
# result rather than moving it into the index.
df.groupby('a', as_index=False).mean().columns
# Attach a group-level summary to every row by aggregating and re-merging:
# compute the per-group maximum of `b`, then join it back onto the original
# rows using the grouping key as the index.
# (This computation previously appeared twice verbatim; the second copy
# produced the same `df_max_a` and `df2` and has been dropped.)
df_max_a = (
    df
    .groupby('a')[['b']]
    .max()
    .rename(columns={'b': 'b_max'})
)
df2 = df.set_index('a').join(df_max_a)

# Show the first two rows of each group.
df2.groupby(level='a').head(n=2)
# The same kind of result can be obtained in one step by using `.groupby()`
# with `.transform()`: the transformed frame has one row per original row, so
# it can be assigned straight into new columns.
df3 = df.copy()
df3[['b_max', 'c_max']] = (
    df3
    .groupby('a')
    # Use the string alias rather than `np.max`: pandas resolves NumPy
    # reductions to its own group-wise implementation and has deprecated
    # that implicit aliasing.
    .transform('max')
)

# Show the first two rows of each group.
df3.groupby('a').head(n=2)
# Blank out the value at row position 1, column position 1, so that df3
# contains a missing value before counting group rows.
df3.iat[1, 1] = np.nan
df3.groupby(by='a').size()
# Alternative kept for comparison:
# df3.groupby('a').count()
# The `.agg()` (`.aggregate()`) method supports more general functions.
# Here each column is reduced to its interquartile range within each group,
# and the result columns are prefixed with 'iqr_'.
(
    df
    .groupby('a')
    .agg(lambda x: np.quantile(x, .75) - np.quantile(x, .25))
    .add_prefix('iqr_')
)
# Several aggregations can be passed to `.agg()` at once as a list of
# (name, function) pairs; each name labels the corresponding result column.
# Built-in reductions are given by string alias rather than as NumPy
# callables, whose implicit aliasing pandas has deprecated.
f_list = [
    ('min', 'min'),
    ('max', 'max'),
    ('iqr', lambda x: np.quantile(x, .75) - np.quantile(x, .25)),
]
df.groupby('a').agg(f_list)

# Group-wise quantiles for several probabilities at once.
df.groupby('a').quantile((.025, .975))
# The `.apply()` method operates on each subset of the data and then puts
# the pieces back together.
def tail_values(df, columns=None, lwr=.025, upr=.975):
    """
    Subset a DataFrame df to find rows with values in the distributional tail.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to be subset.
    columns : string or list of strings. Optional.
        Names of columns in which to look for tail values. If None use all
        columns. The default is None.
    lwr, upr : float. Optional.
        Sample quantiles (inclusive) demarking the lower and upper tails,
        respectively. The defaults are .025 and .975.

    Returns
    -------
    A subset of df with rows taking values in the distributional tail of any
    column in columns.

    Raises
    ------
    TypeError
        If columns is not a str, a list, or None.
    """
    # Normalise `columns` to the frame whose columns are inspected; the mask
    # logic below is then shared by all three accepted forms.
    if columns is None:
        inspected = df
    elif isinstance(columns, list):
        inspected = df[columns]
    elif isinstance(columns, str):
        inspected = df[[columns]]
    else:
        raise TypeError("columns should be a str, list or None.")

    def in_tail(x):
        # True where a value is at or beyond either tail quantile of its
        # own column.
        return np.logical_or(
            x >= np.quantile(x, upr), x <= np.quantile(x, lwr)
        )

    # A row is kept if any inspected column has a tail value in that row.
    tail = inspected.transform(in_tail).any(axis=1)
    return df[tail]
# Apply `tail_values` within each group.  The numeric columns are selected
# explicitly so the categorical grouping key `a` is never handed to the
# quantile computation; relying on `.apply()` to drop or include the
# grouping column implicitly is version-dependent behaviour in pandas.
df.groupby('a')[['b', 'c']].apply(tail_values)
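# Takeaways
# - Split-apply-combine operations use the `.groupby()` method of the pandas
#   DataFrame (Series) class.
# - The `as_index` argument controls whether the grouping keys become the
#   index of the result.
# - Add group-level summaries to each row with `.groupby()` and
#   `.transform()`, or by re-merging.
# - Use built-in aggregation methods when available and the `.agg()` method
#   when not.
# - The `.apply()` method operates on each group and then puts the pieces
#   back together.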