filtering to create subsets of cases (rows),
performing aggregations by group.
The .columns attribute of a DataFrame is an Index object.
import numpy as np
import pandas as pd

print(pd.__version__)

# Build a small demo frame: one integer column plus two string columns
# whose values cycle with periods 2 and 3 respectively.
color_cycle = ['red', 'black']
letter_cycle = ['x', 'y', 'z']
df = pd.DataFrame(
    {
        "a": range(5),
        "b": [color_cycle[i % 2] for i in range(5)],
        "c": [letter_cycle[i % 3] for i in range(5)],
    }
)
df.columns
An Index can be created directly with pd.Index(). Index objects are
immutable. A column can also be promoted to the index with .set_index()
. idx = pd.Index(list("stats"))
# Replace the default RangeIndex; note 's' appears twice, so the index
# now contains duplicate labels (this matters for reindexing below).
df.index = idx
df
Because Index objects are immutable, an index cannot be edited in place;
instead build a new frame, e.g. by promoting a column with .set_index()
. df1 = df.iloc[[3, 2, 1], ]
# Promote column 'a' to be the row index of the reordered subset.
df_alt = df1.set_index('a')
df_alt
Selection by label uses the .loc indexer, while .reindex()
# .reindex() won't work with duplicate index values.
try:
    df.reindex(['a', 't', 's'])
except ValueError:
    # pandas raises ValueError ("cannot reindex on an axis with duplicate
    # labels"); catch it specifically — a bare `except:` would also hide
    # unrelated errors such as NameError or KeyboardInterrupt.
    print('Duplicate label error.')
# .loc, unlike .reindex(), accepts duplicate labels and returns every match.
df.loc[['a', 't', 's'], 'a':'b']
The index can be converted to a column with .to_series() or .to_frame(),
or moved back into the frame (restoring a default index)
using .reset_index()
. df['idx'] = df.index.to_frame()
# Move the index into a regular column (named 'index') and restore the
# default RangeIndex.
df.reset_index(inplace=True)
df
Use .map() to transform a Series element-wise, .applymap() for
element-wise transformation of DataFrames, .transform() for vectorized
transformations, and .replace() to map existing values to new values.
df['a_sq'] = df['a'].map(lambda x: x ** 2)
df['a2'] = df['a'].transform(lambda x: np.power(x, 2))
# .applymap() applies a function element-wise over a whole DataFrame
# (renamed DataFrame.map in pandas 2.1+).
df['a3'] = df[['a']].applymap(lambda x: x ** 3)
# Assign the result back rather than calling .replace(..., inplace=True) on
# a column selection: that form triggers a chained-assignment warning and
# becomes a silent no-op under pandas Copy-on-Write.
df['c'] = df['c'].replace('z', 'w')
df
String methods are applied element-wise through a Series's .str attribute.
df['b'] = df['b'].str.title()
# .str also supports slicing: the first two characters of each value.
df['b'].str[:2]
Hierarchical row or column labels use the MultiIndex
class and associated constructor. num = [('object', 'numeric')[v != np.dtype('O')] for v in df.dtypes.values]
# Two-level columns: the top level tags each column 'object' or 'numeric'
# by dtype; the bottom level keeps the original column names.
df.columns = [num, df.columns]
print(df.columns)
# Partial indexing on the first level selects all numeric columns.
df.loc[:, 'numeric']
A MultiIndex can be collapsed into a flat Index of tuples with .to_flat_index()
. print([df.loc[:, 'numeric'].columns, df.loc[:, ['numeric']].columns])
# Flatten the two-level column index into tuples like ('object', 'b').
print(df.columns.to_flat_index())
# A full tuple addresses one column across both levels.
df.loc[:, ('object', 'b')]
Center the numeric columns using .transform().
What happens if we don't index with a list?
# Center each numeric column by subtracting its column mean; indexing with
# a list keeps the selection a DataFrame so the assignment aligns by label.
df.loc[:, ['numeric']] = (df.loc[:, ['numeric']].
transform(lambda x: x - np.mean(x)))
# Drop the top ('object'/'numeric') level, restoring flat column labels.
df.columns = df.columns.droplevel()
df
Joins are performed with pd.merge(). Use `on` to choose the columns used
for matching rows; it defaults to the shared column names. Use `how` to
determine the join type: left, inner, outer, or right. Prefer left
joins when in doubt. df0 = df.loc[:, 'a':'c'].reset_index()
# Keep only the 'Red' rows for the right-hand frame ('b' was title-cased
# earlier in the script).
df1 = df.loc[:, 'a':'c'].reset_index().query('b == "Red"')
df0['old'] = 0
df1['new'] = 1
# indicator=True adds a '_merge' column recording each row's source.
pd.merge(df0, df1, on=['index', 'b', 'c'], how='left', indicator=True)
Overlapping columns not listed in `on`
are renamed with suffixes. pd.merge(df0, df1, on=['b', 'c'], how='right', suffixes=('_0', '_1'))
# Merges can also align on the row index of both frames.
pd.merge(
df0.set_index('index'),
df1.set_index('index'),
left_index=True,
right_index=True
)
The .join() method lets (left) joins on the index be written more
compactly; a list of DataFrames can be passed to join several at once.
(df0
.set_index(['b', 'c'])
# Left-join on the shared ('b', 'c') index; rsuffix disambiguates the
# overlapping columns coming from df1.
.join(df1.set_index(['b', 'c']), rsuffix='_1')
)
Make data longer with .melt() or pd.melt(); make it wider with .pivot()
or pd.pivot(). .stack() and .unstack() move labels between the row and
column indexes. Think of a1 and b1 as two measured values at time 1,
and a2 and b2
as the same values measured at time 2. dat = pd.DataFrame({
'id': range(1, 5),
'trt': 2 * [0] + 2 * [1],
'a1': np.random.uniform(size=4),
'a2': np.random.uniform(size=4),
'b1': np.random.uniform(size=4),
'b2': np.random.uniform(size=4)
})
# Round the measurement columns to two decimals for readable display.
dat.loc[:, 'a1':'b2'] = (dat.loc[:, 'a1':'b2']
.transform(lambda x: np.round(x, 2))
)
dat
Make data longer with .melt() or pd.melt(). Columns named in id_vars are
replicated for each row of the longer data; columns named in value_vars
should be pivoted, each variable to a new row. dat_long0 = dat.melt(
id_vars=['id', 'trt'],
value_vars=['a1', 'a2', 'b1', 'b2']
)
# Equivalent call: value_vars defaults to every non-id column; value_name
# renames the resulting value column from 'value' to 'val'.
dat_long = pd.melt(dat, id_vars = ['id', 'trt'], value_name='val')
#all(dat_long0 == dat_long)
dat_long
Make data wider with .pivot() or pd.pivot(). `index` names the columns to
become the row index in the reshaped data; `columns` names the column(s)
that become the (Multi)Index of columns; `values` names the
column to use as values in the reshaped data. dat_wide = (dat_long
.pivot(index=['id', 'trt'], columns='variable', values='val')
)
# Optional cleanup: drop the columns' name and flatten the row index.
#dat_wide.columns.name = None
#dat_wide.reset_index(inplace=True)
dat_wide
Goal: columns a and b with separate rows for each time (1, 2).
Melt dat separately for a and b
and then merge the result.a_long = dat.melt(
id_vars=['id', 'trt'],
value_vars=['a1', 'a2'],
var_name='time',
value_name='a'
)
# Strip the leading 'a' from 'a1'/'a2', keeping the digit as an int8 time.
a_long['time'] = np.int8(a_long['time'].str[1:].values)
a_long
Melt dat separately for a and b
and then merge the result.b_long = dat.melt(
id_vars=['id', 'trt'],
value_vars=['b1', 'b2'],
var_name='time',
value_name='b'
)
# Strip the leading 'b' from 'b1'/'b2', keeping the digit as an int8 time.
b_long['time'] = np.int8(b_long['time'].str[1:].values)
b_long
Melt dat separately for a and b
and then merge the result.d0 = pd.merge(a_long, b_long, on=['id', 'trt', 'time'])
# The inner join on (id, trt, time) pairs each a value with its b value.
d0
The .stack() method moves a column index level to the row index, making
the conformed data longer. The .unstack() method moves a row index level
to the column index, making
the conformed data wider. d1 = dat.set_index(['id', 'trt'])
# Two-level columns: variable ('a'/'b') over time ('1'/'2').
d1.columns = [2 * ['a'] + 2 * ['b'], 2 * ['1', '2']]
d1.columns.names = (None, 'time')
# Stacking moves the innermost column level ('time') into the row index.
d1 = d1.stack()
d1.reset_index(inplace=True)
d1
d2 = dat.set_index(['id', 'trt'])
# d2.columns = reversed([2 * ['a'] + 2 * ['b'], 2 * ['tm1', 'tm2']])
# Two-level columns with time outermost this time, built by splitting each
# original label like 'a1' into its two characters.
d2.columns = [[i[1] for i in d2.columns], [i[0] for i in d2.columns]]
d2.columns.names = (None, 'var')
# Stacking the innermost level now moves the variable names into the rows.
d2 = d2.stack()
d2.reset_index(inplace=True)
d2
d3 = dat.set_index(['id', 'trt'])
# The commented-out steps below sketch a stack/unstack route to the same
# wide-by-time layout; as written, d3 is simply dat indexed by (id, trt).
#print(type(d3.stack()))
#d3.columns = [len(d3.columns) * ['value'], d3.columns]
#d3.columns.names = ('value', 'var')
#d3 = d3.stack().reset_index(2)
#d3.insert(1, column='time', value=np.int8(d3['var'].str[1]))
#d3['var'] = d3['var'].str[0]
#d3 = d3.reset_index().set_index(['id', 'trt', 'var', 'time'])
#d3 = d3.unstack(level=3)
d3
Summary: reshape between long and wide form with .melt() and .pivot(),
or move levels between the row and column indexes with .stack()
and .unstack().