# import
import pandas as pd
import numpy as np
Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form.
# A Series whose index is built from two parallel label lists:
# the first list is the outer level, the second the inner level.
outer_keys = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
inner_keys = [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(10), index=[outer_keys, inner_keys])
data
a 1 -1.318755 2 0.856719 3 0.507809 b 1 0.705543 2 -2.011981 3 1.714807 c 1 -0.177709 2 -0.940776 d 2 0.587829 3 0.712596 dtype: float64
# Getting the index: a two-level MultiIndex built from the two label lists
data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
# Partial indexing: selecting on the outer level alone returns the
# sub-Series keyed by the inner level
data['b']
1 0.705543 2 -2.011981 3 1.714807 dtype: float64
# Chained selection: first the 'b' group, then inner label 1
data['b'][1]
0.70554263020800034
# Passing both level labels at once selects the same scalar in one lookup
data['b', 1]
0.70554263020800034
# Label slices on the outer level include both endpoints ('b' and 'c')
data['b':'c']
b 1 0.705543 2 -2.011981 3 1.714807 c 1 -0.177709 2 -0.940776 dtype: float64
# .ix is deprecated and was removed in pandas 1.0; .loc performs the same
# label-based, end-inclusive slice on the outer level
data.loc['b':'d']
b 1 0.705543 2 -2.011981 3 1.714807 c 1 -0.177709 2 -0.940776 d 2 0.587829 3 0.712596 dtype: float64
# Selection on the inner level: every outer key that has an entry labelled 2
data[:, 2]
a 0.856719 b -2.011981 c -0.940776 d 0.587829 dtype: float64
# Hierarchical indexing plays a critical role in reshaping data and
# group-based operations like forming a pivot table.
# unstack() pivots the inner index level into columns; label combinations
# that do not exist in the Series show up as NaN.
data.unstack()
1 | 2 | 3 | |
---|---|---|---|
a | -1.318755 | 0.856719 | 0.507809 |
b | 0.705543 | -2.011981 | 1.714807 |
c | -0.177709 | -0.940776 | NaN |
d | NaN | 0.587829 | 0.712596 |
# stack() is the inverse operation; the NaN placeholders introduced by
# unstack() are dropped again, giving back the original Series
data.unstack().stack()
a 1 -1.318755 2 0.856719 3 0.507809 b 1 0.705543 2 -2.011981 3 1.714807 c 1 -0.177709 2 -0.940776 d 2 0.587829 3 0.712596 dtype: float64
# Both the rows and the columns of a DataFrame may carry a hierarchical index.
row_levels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_levels = [['Ohio', 'Ohio', 'Colorado'],
                 ['Green', 'Red', 'Green']]
cell_values = np.arange(12).reshape((4, 3))
frame = pd.DataFrame(cell_values, index=row_levels, columns=column_levels)
frame
Ohio | Colorado | |||
---|---|---|---|---|
Green | Red | Green | ||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
# Each level of a hierarchical index can be given a name; the names then
# appear in the displayed output
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
# Partial column indexing: pick the 'Ohio' group from the outer column level
frame['Ohio']
color | Green | Red | |
---|---|---|---|
key1 | key2 | ||
a | 1 | 0 | 1 |
2 | 3 | 4 | |
b | 1 | 6 | 7 |
2 | 9 | 10 |
# The row MultiIndex now carries the level names 'key1' and 'key2'
frame.index
MultiIndex(levels=[['a', 'b'], [1, 2]], labels=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['key1', 'key2'])
# swaplevel takes two level numbers or names and returns a new object
# with the levels interchanged; the row order itself is left as it was
frame.swaplevel('key1', 'key2')
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
2 | a | 3 | 4 | 5 |
1 | b | 6 | 7 | 8 |
2 | b | 9 | 10 | 11 |
# sort_index(level=...), on the other hand, sorts the data (stably) by the
# values of the given index level. It replaces the sortlevel method, which
# was deprecated and later removed from pandas.
frame.sort_index(level=1)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
b | 1 | 6 | 7 | 8 |
a | 2 | 3 | 4 | 5 |
b | 2 | 9 | 10 | 11 |
# sortlevel was removed from pandas; sort_index(level=0) sorts by the outer level
frame.sort_index(level=0)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
# Swap the two row levels, then sort on the new outer level.
# sort_index(level=0) replaces the removed sortlevel(0).
frame.swaplevel(0, 1).sort_index(level=0)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
b | 6 | 7 | 8 | |
2 | a | 3 | 4 | 5 |
b | 9 | 10 | 11 |
# Aggregate by one index level. The level= argument of sum() was deprecated
# and removed in pandas 2.0; grouping on the level gives the same result.
frame.groupby(level='key2').sum()
state | Ohio | Colorado | |
---|---|---|---|
color | Green | Red | Green |
key2 | |||
1 | 6 | 8 | 10 |
2 | 12 | 14 | 16 |
# Aggregate across columns by the 'color' column level. sum(level=..., axis=1)
# was removed in pandas 2.0 and groupby(axis=1) is deprecated as well, so
# group the transposed frame by level and transpose the result back.
frame.T.groupby(level='color').sum().T
color | Green | Red | |
---|---|---|---|
key1 | key2 | ||
a | 1 | 2 | 1 |
2 | 8 | 4 | |
b | 1 | 14 | 7 |
2 | 20 | 10 |
# A flat DataFrame; columns 'c' and 'd' will later serve as index levels.
column_data = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
}
frame = pd.DataFrame(column_data)
frame
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 |
2 | 2 | 5 | one | 2 |
3 | 3 | 4 | two | 0 |
4 | 4 | 3 | two | 1 |
5 | 5 | 2 | two | 2 |
6 | 6 | 1 | two | 3 |
# set_index builds a new DataFrame that uses columns 'c' and 'd' as a
# two-level row index
frame2 = frame.set_index(['c', 'd'])
frame2
a | b | ||
---|---|---|---|
c | d | ||
one | 0 | 0 | 7 |
1 | 1 | 6 | |
2 | 2 | 5 | |
two | 0 | 3 | 4 |
1 | 4 | 3 | |
2 | 5 | 2 | |
3 | 6 | 1 |
# By default the columns are removed from the DataFrame, though we can
# keep them as well by passing drop=False
frame.set_index(['c', 'd'], drop=False)
a | b | c | d | ||
---|---|---|---|---|---|
c | d | ||||
one | 0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 | |
2 | 2 | 5 | one | 2 | |
two | 0 | 3 | 4 | two | 0 |
1 | 4 | 3 | two | 1 | |
2 | 5 | 2 | two | 2 | |
3 | 6 | 1 | two | 3 |
# reset_index does the opposite of set_index: the index levels are moved
# back into ordinary columns
frame2.reset_index()
c | d | a | b | |
---|---|---|---|---|
0 | one | 0 | 0 | 7 |
1 | one | 1 | 1 | 6 |
2 | one | 2 | 2 | 5 |
3 | two | 0 | 3 | 4 |
4 | two | 1 | 4 | 3 |
5 | two | 2 | 5 | 2 |
6 | two | 3 | 6 | 1 |
# A float Series that keeps the default integer index
float_values = np.arange(3.)
ser = pd.Series(float_values)
ser
0 0.0 1 1.0 2 2.0 dtype: float64
# ser[-1] would raise an error here: with an integer index, -1 is looked up as a label, and no such label exists
# The same values, this time with a non-integer (string) index
letter_labels = ['a', 'b', 'c']
ser2 = pd.Series(np.arange(3.), index=letter_labels)
ser2
a 0.0 b 1.0 c 2.0 dtype: float64
# Positional access must be explicit: treating an integer key as a position
# on a non-integer index is deprecated in recent pandas, so use .iloc
ser2.iloc[-1]
2.0
# To keep things consistent, if you have an axis index containing integers, data selection
# with integers will always be label-oriented. This includes slicing with loc, too
# (.loc replaces the removed .ix indexer; the slice end label is included).
ser.loc[:1]
0 0.0 1 1.0 dtype: float64
# For strictly position-based access use .iloc / .iat; the older iget_value
# (Series) and irow / icol (DataFrame) methods are deprecated
ser3 = pd.Series(range(3), index=[-5, 1, 3])
ser3
-5 0 1 1 3 2 dtype: int32
# ser has an integer index, so 2 is looked up as a label
ser[2]
2.0
# iget_value(i) is deprecated; .iloc[i] is the position-based lookup
ser3.iloc[2]
2
# .iat[i] is the position-based accessor for a single scalar
ser3.iat[2]
2
# A 3x2 frame whose integer row labels are deliberately out of order
grid = np.arange(6).reshape(3, 2)
frame = pd.DataFrame(grid, index=[2, 0, 1])
frame
0 | 1 | |
---|---|---|
2 | 0 | 1 |
0 | 2 | 3 |
1 | 4 | 5 |
# irow(i) is deprecated; .iloc[i] selects the row at position i
frame.iloc[0]
0 0 1 1 Name: 2, dtype: int32
# .iloc[0] returns the first row by position; here that is the row labelled 2
frame.iloc[0]
0 0 1 1 Name: 2, dtype: int32