# Common DataFrame operations include filtering to create subsets of
# cases (rows) and performing aggregations by group.
# The .columns attribute of a DataFrame is an Index object.
import numpy as np
import pandas as pd

print(pd.__version__)

# Toy DataFrame: one numeric column and two cyclically repeating
# string columns.
df = pd.DataFrame(
    {
        "a": range(5),
        "b": [("red", "black")[i % 2] for i in range(5)],
        "c": [("x", "y", "z")[i % 3] for i in range(5)]
    }
)
df.columns
# Build an Index with pd.Index(). Index objects are immutable; to use an
# existing column as the index instead, see .set_index() below.
# Note the deliberately duplicated labels: 's' and 't' each appear twice.
idx = pd.Index(list("stats"))
df.index = idx
df
# .set_index() promotes an existing column to the row index.
# First take a row subset (positions 3, 2, 1) to work with.
df1 = df.iloc[[3, 2, 1]]
df_alt = df1.set_index('a')
df_alt
# Label-based selection uses the .loc indexer.  .reindex() will not work
# when the index contains duplicate label values, but .loc will.
try:
    df.reindex(['a', 't', 's'])
except ValueError:  # pandas raises ValueError when reindexing a duplicate-labeled axis
    print('Duplicate label error.')
df.loc[['a', 't', 's'], 'a':'b']
# An index can be materialized as data via .to_series() or .to_frame(),
# and moved back into the columns with .reset_index().
df['idx'] = df.index.to_frame()  # single-column frame aligns on the index
df.reset_index(inplace=True)
df
# Element-wise transformations:
#   .map()       - transform a Series element-wise
#   .applymap()  - element-wise transformation of DataFrames
#                  (deprecated in pandas 2.1+ in favor of DataFrame.map)
#   .transform() - vectorized transformations
#   .replace()   - map existing values to new values
df['a_sq'] = df['a'].map(lambda x: x ** 2)
df['a2'] = df['a'].transform(lambda x: np.power(x, 2))
df['a3'] = df[['a']].applymap(lambda x: x ** 3)
# Assign the result back rather than calling replace(..., inplace=True)
# on the column: that is chained assignment, which warns and becomes a
# no-op under pandas copy-on-write.
df['c'] = df['c'].replace('z', 'w')
df
# Vectorized string methods live on the .str accessor.
df['b'] = df['b'].str.title()
df['b'].str[:2]
# Hierarchical columns via the MultiIndex class and its constructors.
# Label each column 'object' or 'numeric' from its dtype, then use that
# grouping as the outer column level.
num = [('object', 'numeric')[v != np.dtype('O')] for v in df.dtypes.values]
df.columns = [num, df.columns]
print(df.columns)
df.loc[:, 'numeric']

# Indexing with a scalar label drops the outer level; indexing with a
# list keeps it.  .to_flat_index() collapses the MultiIndex to tuples.
print([df.loc[:, 'numeric'].columns, df.loc[:, ['numeric']].columns])
print(df.columns.to_flat_index())
df.loc[:, ('object', 'b')]
# Center the numeric columns with .transform().  Indexing with the list
# ['numeric'] (not the scalar 'numeric') keeps the outer column level,
# so the assignment aligns with the original columns.
df.loc[:, ['numeric']] = (df.loc[:, ['numeric']].
                          transform(lambda x: x - np.mean(x)))
df.columns = df.columns.droplevel()
df
# pd.merge() joins two DataFrames:
#   on  - column(s) for matching rows; defaults to the shared column names
#   how - join type: 'left', 'inner', 'outer', 'right'
# Prefer left joins when in doubt.
df0 = df.loc[:, 'a':'c'].reset_index()
df1 = df.loc[:, 'a':'c'].reset_index().query('b == "Red"')
df0['old'] = 0
df1['new'] = 1
pd.merge(df0, df1, on=['index', 'b', 'c'], how='left', indicator=True)

# Shared columns not listed in `on` are renamed with suffixes.
pd.merge(df0, df1, on=['b', 'c'], how='right', suffixes=('_0', '_1'))

# Joining on the row index instead of columns:
pd.merge(
    df0.set_index('index'),
    df1.set_index('index'),
    left_index=True,
    right_index=True
)
# The .join() method lets (left) joins be written more compactly; it can
# also take a list of DataFrames to join.
(df0
 .set_index(['b', 'c'])
 .join(df1.set_index(['b', 'c']), rsuffix='_1')
)
# Reshaping tools: .melt()/pd.melt() (wide -> long), .pivot()/pd.pivot()
# (long -> wide), and .stack()/.unstack().
# Think of a1 and b1 as two values measured at time 1, and a2 and b2 as
# the same values measured at time 2.
dat = pd.DataFrame({
    'id': range(1, 5),
    'trt': 2 * [0] + 2 * [1],
    'a1': np.random.uniform(size=4),
    'a2': np.random.uniform(size=4),
    'b1': np.random.uniform(size=4),
    'b2': np.random.uniform(size=4)
})
# Round the measurements to 2 decimals for readability.
dat.loc[:, 'a1':'b2'] = (dat.loc[:, 'a1':'b2']
                         .transform(lambda x: np.round(x, 2))
                         )
dat
# .melt() / pd.melt() reshapes wide -> long:
#   id_vars    - replicated for each row in the longer data
#   value_vars - pivoted, each variable to a new row
dat_long0 = dat.melt(
    id_vars=['id', 'trt'],
    value_vars=['a1', 'a2', 'b1', 'b2']
)
# Same reshape via the top-level function; value_vars defaults to every
# column not named in id_vars.
dat_long = pd.melt(dat, id_vars=['id', 'trt'], value_name='val')
# all(dat_long0 == dat_long)  # differs only in the value column's name
dat_long
# .pivot() / pd.pivot() reshapes long -> wide:
#   index   - column(s) that become the row index in the reshaped data
#   columns - column(s) that become the (Multi)Index of columns
#   values  - column supplying the cell values
dat_wide = (dat_long
            .pivot(index=['id', 'trt'], columns='variable', values='val')
            )
# dat_wide.columns.name = None
# dat_wide.reset_index(inplace=True)
dat_wide
# Target shape: columns a and b with separate rows for each time (1, 2).
dat

# Strategy 1: melt a and b separately, then merge the results.
a_long = dat.melt(
    id_vars=['id', 'trt'],
    value_vars=['a1', 'a2'],
    var_name='time',
    value_name='a'
)
# Strip the leading letter ('a1' -> 1) and store time as a small int.
a_long['time'] = np.int8(a_long['time'].str[1:].values)
a_long
# Same melt for the b measurements.
b_long = dat.melt(
    id_vars=['id', 'trt'],
    value_vars=['b1', 'b2'],
    var_name='time',
    value_name='b'
)
b_long['time'] = np.int8(b_long['time'].str[1:].values)
b_long
# Merge the two long frames on the shared keys.
d0 = pd.merge(a_long, b_long, on=['id', 'trt', 'time'])
d0
# .stack() moves a column index level to the row index, making the
# conformed data longer; .unstack() moves a row index level to the
# columns, making it wider.
# Strategy 2: give the measurement columns a two-level column index
# (variable, time) and stack the time level.
d1 = dat.set_index(['id', 'trt'])
d1.columns = [2 * ['a'] + 2 * ['b'], 2 * ['1', '2']]
d1.columns.names = (None, 'time')
d1 = d1.stack()
d1.reset_index(inplace=True)
d1
# Variant of the stacking approach: put time on the outer column level
# and the variable name ('a'/'b') on the inner level, then stack the
# variable level instead.
d2 = dat.set_index(['id', 'trt'])
# d2.columns = reversed([2 * ['a'] + 2 * ['b'], 2 * ['tm1', 'tm2']])
outer = [col[1] for col in d2.columns]  # '1' / '2' (time digit)
inner = [col[0] for col in d2.columns]  # 'a' / 'b' (variable letter)
d2.columns = [outer, inner]
d2.columns.names = (None, 'var')
d2 = d2.stack()
d2.reset_index(inplace=True)
d2
# Strategy 3 (left as an exercise, steps kept below): stack all measure
# columns, split the combined name ('a1') into variable ('a') and time
# (1), then unstack the time level.
d3 = dat.set_index(['id', 'trt'])
#print(type(d3.stack()))
#d3.columns = [len(d3.columns) * ['value'], d3.columns]
#d3.columns.names = ('value', 'var')
#d3 = d3.stack().reset_index(2)
#d3.insert(1, column='time', value=np.int8(d3['var'].str[1]))
#d3['var'] = d3['var'].str[0]
#d3 = d3.reset_index().set_index(['id', 'trt', 'var', 'time'])
#d3 = d3.unstack(level=3)
d3
# Summary: .melt() and .pivot() reshape between long and wide layouts;
# .stack() and .unstack() move levels between the row and column indexes.