Stack and Unstack
import pandas as pd
import numpy as np
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'], name='number'))
data
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
# stacking
res = data.stack()
res
state number Ohio one 0 two 1 three 2 Colorado one 3 two 4 three 5 dtype: int32
res.unstack()
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
# By default the innermost level is unstacked (same with stack). You can unstack a different
# level by passing a level number or name
res.unstack(0)
state | Ohio | Colorado |
---|---|---|
number | ||
one | 0 | 3 |
two | 1 | 4 |
three | 2 | 5 |
res.unstack(1)
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
res.unstack('state')
state | Ohio | Colorado |
---|---|---|
number | ||
one | 0 | 3 |
two | 1 | 4 |
three | 2 | 5 |
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2
one a 0 b 1 c 2 d 3 two c 4 d 5 e 6 dtype: int64
data2.unstack()
a | b | c | d | e | |
---|---|---|---|---|---|
one | 0.0 | 1.0 | 2.0 | 3.0 | NaN |
two | NaN | NaN | 4.0 | 5.0 | 6.0 |
data2.unstack(0)
one | two | |
---|---|---|
a | 0.0 | NaN |
b | 1.0 | NaN |
c | 2.0 | 4.0 |
d | 3.0 | 5.0 |
e | NaN | 6.0 |
# Unstacking might introduce missing data if all of the values in the level aren’t found in
# each of the subgroups
# Stacking filters out missing data by default, so the operation is easily invertible
data2.unstack().stack()
one a 0.0 b 1.0 c 2.0 d 3.0 two c 4.0 d 5.0 e 6.0 dtype: float64
data2.unstack().stack(dropna=False)
one a 0.0 b 1.0 c 2.0 d 3.0 e NaN two a NaN b NaN c 4.0 d 5.0 e 6.0 dtype: float64
# When unstacking in a DataFrame, the level unstacked becomes the lowest level in the result
df = pd.DataFrame({'left': res, 'right': res + 5},
columns=pd.Index(['left', 'right'], name='side'))
df
side | left | right | |
---|---|---|---|
state | number | ||
Ohio | one | 0 | 5 |
two | 1 | 6 | |
three | 2 | 7 | |
Colorado | one | 3 | 8 |
two | 4 | 9 | |
three | 5 | 10 |
df.unstack('state')
side | left | right | ||
---|---|---|---|---|
state | Ohio | Colorado | Ohio | Colorado |
number | ||||
one | 0 | 3 | 5 | 8 |
two | 1 | 4 | 6 | 9 |
three | 2 | 5 | 7 | 10 |
df.unstack('state').stack('side')
state | Ohio | Colorado | |
---|---|---|---|
number | side | ||
one | left | 0 | 3 |
right | 5 | 8 | |
two | left | 1 | 4 |
right | 6 | 9 | |
three | left | 2 | 5 |
right | 7 | 10 |
df.stack('side')
state number side Ohio one left 0 right 5 two left 1 right 6 three left 2 right 7 Colorado one left 3 right 8 two left 4 right 9 three left 5 right 10 dtype: int32
df.stack('side').unstack('state')
state | Ohio | Colorado | |
---|---|---|---|
number | side | ||
one | left | 0 | 3 |
right | 5 | 8 | |
two | left | 1 | 4 |
right | 6 | 9 | |
three | left | 2 | 5 |
right | 7 | 10 |
df = pd.read_table("dataset/stack.txt", sep='|')
df
date | item | value | |
---|---|---|---|
0 | 1959-03-31 | realgdp | 2710.349 |
1 | 1959-03-31 | infl | 0.000 |
2 | 1959-03-31 | unemp | 5.800 |
3 | 1959-06-30 | realgdp | 2778.801 |
4 | 1959-06-30 | infl | 2.340 |
5 | 1959-06-30 | unemp | 5.100 |
6 | 1959-09-30 | realgdp | 2775.488 |
7 | 1959-09-30 | infl | 2.740 |
8 | 1959-09-30 | unemp | 5.300 |
9 | 1959-12-31 | realgdp | 2785.204 |
pivoted = df.pivot('date', 'item', 'value')
pivoted
item | infl | realgdp | unemp |
---|---|---|---|
date | |||
1959-03-31 | 0.00 | 2710.349 | 5.8 |
1959-06-30 | 2.34 | 2778.801 | 5.1 |
1959-09-30 | 2.74 | 2775.488 | 5.3 |
1959-12-31 | NaN | 2785.204 | NaN |
df['value2'] = np.random.randn(len(df))
df
date | item | value | value2 | |
---|---|---|---|---|
0 | 1959-03-31 | realgdp | 2710.349 | -1.083851 |
1 | 1959-03-31 | infl | 0.000 | 0.644205 |
2 | 1959-03-31 | unemp | 5.800 | 1.590181 |
3 | 1959-06-30 | realgdp | 2778.801 | 0.550000 |
4 | 1959-06-30 | infl | 2.340 | 0.149167 |
5 | 1959-06-30 | unemp | 5.100 | 1.316712 |
6 | 1959-09-30 | realgdp | 2775.488 | -0.026140 |
7 | 1959-09-30 | infl | 2.740 | -0.054482 |
8 | 1959-09-30 | unemp | 5.300 | -0.167150 |
9 | 1959-12-31 | realgdp | 2785.204 | -0.728173 |
pivoted = df.pivot('date', 'item')
pivoted
value | value2 | |||||
---|---|---|---|---|---|---|
item | infl | realgdp | unemp | infl | realgdp | unemp |
date | ||||||
1959-03-31 | 0.00 | 2710.349 | 5.8 | 0.644205 | -1.083851 | 1.590181 |
1959-06-30 | 2.34 | 2778.801 | 5.1 | 0.149167 | 0.550000 | 1.316712 |
1959-09-30 | 2.74 | 2775.488 | 5.3 | -0.054482 | -0.026140 | -0.167150 |
1959-12-31 | NaN | 2785.204 | NaN | NaN | -0.728173 | NaN |
# Note that pivot is just a shortcut for creating a hierarchical index using set_index and
# reshaping with unstack:
unstacked = df.set_index(['date', 'item']).unstack('item')
unstacked
value | value2 | |||||
---|---|---|---|---|---|---|
item | infl | realgdp | unemp | infl | realgdp | unemp |
date | ||||||
1959-03-31 | 0.00 | 2710.349 | 5.8 | 0.644205 | -1.083851 | 1.590181 |
1959-06-30 | 2.34 | 2778.801 | 5.1 | 0.149167 | 0.550000 | 1.316712 |
1959-09-30 | 2.74 | 2775.488 | 5.3 | -0.054482 | -0.026140 | -0.167150 |
1959-12-31 | NaN | 2785.204 | NaN | NaN | -0.728173 | NaN |