In [1]:

# import
import pandas as pd
import numpy as np

Hierarchical Indexing¶

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form.

In [2]:

data = pd.Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

Out[2]:

a  1   -1.318755
   2    0.856719
   3    0.507809
b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
d  2    0.587829
   3    0.712596
dtype: float64

In [3]:

# getting the index
data.index

Out[3]:

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [4]:

data['b']

Out[4]:

1    0.705543
2   -2.011981
3    1.714807
dtype: float64

In [5]:

data['b'][1]

Out[5]:

0.70554263020800034

In [6]:

data['b', 1]

Out[6]:

0.70554263020800034

In [7]:

data['b':'c']

Out[7]:

b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
dtype: float64

In [8]:

data.ix['b':'d']

Out[8]:

b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
d  2    0.587829
   3    0.712596
dtype: float64

In [9]:

data[:, 2]

Out[9]:

a    0.856719
b   -2.011981
c   -0.940776
d    0.587829
dtype: float64

In [10]:

# Hierarchical indexing plays a critical role in reshaping data and 
# group-based operations like forming a pivot table.
data.unstack()

Out[10]:

	1	2	3
a	-1.318755	0.856719	0.507809
b	0.705543	-2.011981	1.714807
c	-0.177709	-0.940776	NaN
d	NaN	0.587829	0.712596

In [11]:

data.unstack().stack()

Out[11]:

a  1   -1.318755
   2    0.856719
   3    0.507809
b  1    0.705543
   2   -2.011981
   3    1.714807
c  1   -0.177709
   2   -0.940776
d  2    0.587829
   3    0.712596
dtype: float64

In [12]:

# With a DataFrame, either axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Out[12]:

		Ohio		Colorado
		Green	Red	Green
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

In [13]:

frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Out[13]:

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

In [14]:

frame['Ohio']

Out[14]:

	color	Green	Red
key1	key2
a	1	0	1
a	2	3	4
b	1	6	7
b	2	9	10

In [15]:

frame.index

Out[15]:

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['key1', 'key2'])

Reordering and Sorting Levels¶

In [16]:

# The swaplevel takes two level numbers or names and
# returns a new object with the levels interchanged
frame.swaplevel('key1', 'key2')

Out[16]:

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
2	a	3	4	5
1	b	6	7	8
2	b	9	10	11

In [17]:

# sortlevel, on the other hand, sorts the data (stably) using only 
# the values in a single level.
frame.sortlevel(1)

Out[17]:

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
b	1	6	7	8
a	2	3	4	5
b	2	9	10	11

In [18]:

frame.sortlevel(0)

Out[18]:

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

In [19]:

frame.swaplevel(0, 1).sortlevel(0)

Out[19]:

	state	Ohio		Colorado
	color	Green	Red	Green
key2	key1
1	a	0	1	2
1	b	6	7	8
2	a	3	4	5
2	b	9	10	11

Summary Statistics by Level¶

In [20]:

frame.sum(level='key2')

Out[20]:

state	Ohio		Colorado
color	Green	Red	Green
key2
1	6	8	10
2	12	14	16

In [21]:

frame.sum(level='color', axis=1)

Out[21]:

	color	Green	Red
key1	key2
a	1	2	1
a	2	8	4
b	1	14	7
b	2	20	10

In [22]:

frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                       'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                       'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Out[22]:

	a	b	c	d
0	0	7	one	0
1	1	6	one	1
2	2	5	one	2
3	3	4	two	0
4	4	3	two	1
5	5	2	two	2
6	6	1	two	3

In [23]:

frame2 = frame.set_index(['c', 'd'])
frame2

Out[23]:

		a	b
c	d
one	0	0	7
	1	1	6
	2	2	5
two	0	3	4
	1	4	3
	2	5	2
	3	6	1

In [24]:

# By default the columns are removed from the DataFrame, though we can have the as well
frame.set_index(['c', 'd'], drop=False)

Out[24]:

		a	b	c	d
c	d
one	0	0	7	one	0
	1	1	6	one	1
	2	2	5	one	2
two	0	3	4	two	0
	1	4	3	two	1
	2	5	2	two	2
	3	6	1	two	3

In [25]:

frame2.reset_index()

Out[25]:

	c	d	a	b
0	one	0	0	7
1	one	1	1	6
2	one	2	2	5
3	two	0	3	4
4	two	1	4	3
5	two	2	5	2
6	two	3	6	1

Integer Indexing¶

In [26]:

ser = pd.Series(np.arange(3.))
ser

Out[26]:

0    0.0
1    1.0
2    2.0
dtype: float64

In [27]:

 # ser[-1]  this will generate an error

In [28]:

ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2

Out[28]:

a    0.0
b    1.0
c    2.0
dtype: float64

In [29]:

ser2[-1]

Out[29]:

2.0

In [30]:

# To keep things consistent, if you have an axis index containing indexers, data selection
# with integers will always be label-oriented. This includes slicing with ix, too
ser.ix[:1]

Out[30]:

0    0.0
1    1.0
dtype: float64

In [31]:

# use the iget_value method from Series and irow and icol methods from DataFrame
ser3 = pd.Series(range(3), index=[-5, 1, 3])
ser3

Out[31]:

-5    0
 1    1
 3    2
dtype: int32

In [32]:

ser[2]

Out[32]:

2.0

In [33]:

ser3.iget_value(2)

C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: iget_value(i) is deprecated. Please use .iloc[i] or .iat[i]
  if __name__ == '__main__':

Out[33]:

In [34]:

ser3.iat[2]

Out[34]:

In [35]:

frame = pd.DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
frame

Out[35]:

	0	1
2	0	1
0	2	3
1	4	5

In [36]:

frame.irow(0)

C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: irow(i) is deprecated. Please use .iloc[i]
  if __name__ == '__main__':

Out[36]:

0    0
1    1
Name: 2, dtype: int32

In [37]:

frame.iloc[0]

Out[37]:

0    0
1    1
Name: 2, dtype: int32

In [ ]: