In [1]:
# import
import pandas as pd
import numpy as np

Hierarchical Indexing

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form.

In [2]:
data = pd.Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data
Out[2]:
a  1   -1.318755
   2    0.856719
   3    0.507809
b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
d  2    0.587829
   3    0.712596
dtype: float64
In [3]:
# getting the index
data.index
Out[3]:
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
In [4]:
data['b']
Out[4]:
1    0.705543
2   -2.011981
3    1.714807
dtype: float64
In [5]:
data['b'][1]
Out[5]:
0.70554263020800034
In [6]:
data['b', 1]
Out[6]:
0.70554263020800034
In [7]:
data['b':'c']
Out[7]:
b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
dtype: float64
In [8]:
data.ix['b':'d']
Out[8]:
b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
d  2    0.587829
   3    0.712596
dtype: float64
In [9]:
data[:, 2]
Out[9]:
a    0.856719
b   -2.011981
c   -0.940776
d    0.587829
dtype: float64
In [10]:
# Hierarchical indexing plays a critical role in reshaping data and 
# group-based operations like forming a pivot table.
data.unstack()
Out[10]:
1 2 3
a -1.318755 0.856719 0.507809
b 0.705543 -2.011981 1.714807
c -0.177709 -0.940776 NaN
d NaN 0.587829 0.712596
In [11]:
data.unstack().stack()
Out[11]:
a  1   -1.318755
   2    0.856719
   3    0.507809
b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
d  2    0.587829
   3    0.712596
dtype: float64
In [12]:
# With a DataFrame, either axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame
Out[12]:
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [13]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame
Out[13]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [14]:
frame['Ohio']
Out[14]:
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
In [15]:
frame.index
Out[15]:
MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['key1', 'key2'])

Reordering and Sorting Levels

In [16]:
# The swaplevel takes two level numbers or names and
# returns a new object with the levels interchanged
frame.swaplevel('key1', 'key2')
Out[16]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
In [17]:
# sortlevel, on the other hand, sorts the data (stably) using only 
# the values in a single level.
frame.sortlevel(1)
Out[17]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
In [18]:
frame.sortlevel(0)
Out[18]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [19]:
frame.swaplevel(0, 1).sortlevel(0)
Out[19]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11

Summary Statistics by Level

In [20]:
frame.sum(level='key2')
Out[20]:
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16
In [21]:
frame.sum(level='color', axis=1)
Out[21]:
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10
In [22]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                       'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                       'd': [0, 1, 2, 0, 1, 2, 3]})
frame
Out[22]:
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
In [23]:
frame2 = frame.set_index(['c', 'd'])
frame2
Out[23]:
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
In [24]:
# By default the columns are removed from the DataFrame, though we can have the as well
frame.set_index(['c', 'd'], drop=False)
Out[24]:
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
In [25]:
frame2.reset_index()
Out[25]:
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1

Integer Indexing

In [26]:
ser = pd.Series(np.arange(3.))
ser
Out[26]:
0    0.0
1    1.0
2    2.0
dtype: float64
In [27]:
 # ser[-1]  this will generate an error
In [28]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2
Out[28]:
a    0.0
b    1.0
c    2.0
dtype: float64
In [29]:
ser2[-1]
Out[29]:
2.0
In [30]:
# To keep things consistent, if you have an axis index containing indexers, data selection
# with integers will always be label-oriented. This includes slicing with ix, too
ser.ix[:1]
Out[30]:
0    0.0
1    1.0
dtype: float64
In [31]:
# use the iget_value method from Series and irow and icol methods from DataFrame
ser3 = pd.Series(range(3), index=[-5, 1, 3])
ser3
Out[31]:
-5    0
 1    1
 3    2
dtype: int32
In [32]:
ser[2]
Out[32]:
2.0
In [33]:
ser3.iget_value(2)
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: iget_value(i) is deprecated. Please use .iloc[i] or .iat[i]
  if __name__ == '__main__':
Out[33]:
2
In [34]:
ser3.iat[2]
Out[34]:
2
In [35]:
frame = pd.DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
frame
Out[35]:
0 1
2 0 1
0 2 3
1 4 5
In [36]:
frame.irow(0)
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: irow(i) is deprecated. Please use .iloc[i]
  if __name__ == '__main__':
Out[36]:
0    0
1    1
Name: 2, dtype: int32
In [37]:
frame.iloc[0]
Out[37]:
0    0
1    1
Name: 2, dtype: int32