In [1]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [3]:
from pandas import Series, DataFrame
In [2]:
import pandas as pd

Introduction to pandas Data Structures

Series

In [4]:
obj = Series([4, 7, -5, 3])
In [5]:
obj
Out[5]:
0    4
1    7
2   -5
3    3
dtype: int64
In [7]:
obj.values
Out[7]:
array([ 4,  7, -5,  3])
In [8]:
obj.index
Out[8]:
Int64Index([0, 1, 2, 3], dtype='int64')
In [9]:
obj2 = Series([4,7,-5,3], index = ['d','b','a','c'])
In [10]:
obj2
Out[10]:
d    4
b    7
a   -5
c    3
dtype: int64
In [11]:
obj2.values
Out[11]:
array([ 4,  7, -5,  3])
In [12]:
obj2.index
Out[12]:
Index([u'd', u'b', u'a', u'c'], dtype='object')
In [13]:
obj2['a']
Out[13]:
-5
In [14]:
obj2[['c','a','d']]
Out[14]:
c    3
a   -5
d    4
dtype: int64
In [15]:
obj2[obj2>0]
Out[15]:
d    4
b    7
c    3
dtype: int64
In [16]:
obj2*2
Out[16]:
d     8
b    14
a   -10
c     6
dtype: int64
In [17]:
np.exp(obj2)
Out[17]:
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
In [18]:
'b' in obj2
Out[18]:
True
In [19]:
'e' in obj2
Out[19]:
False
In [20]:
sdata = {'Ohio':3500,'Texas':71000,'Oregon':16000, 'Utah':5000}
In [21]:
obj3 = Series(sdata)
In [22]:
obj3
Out[22]:
Ohio       3500
Oregon    16000
Texas     71000
Utah       5000
dtype: int64
In [23]:
states = ['California','Ohio','Oregon','Texas']
In [24]:
obj4 = Series(sdata, index=states)
In [25]:
obj4
Out[25]:
California      NaN
Ohio           3500
Oregon        16000
Texas         71000
dtype: float64
In [26]:
pd.isnull(obj4)
Out[26]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
In [27]:
pd.notnull(obj4)
Out[27]:
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
Series also has instance methods.
In [28]:
obj4.isnull()
Out[28]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
Automatically aligns differently indexed data in arithmetic operations.
In [29]:
obj3
Out[29]:
Ohio       3500
Oregon    16000
Texas     71000
Utah       5000
dtype: int64
In [30]:
obj4
Out[30]:
California      NaN
Ohio           3500
Oregon        16000
Texas         71000
dtype: float64
In [31]:
obj3 + obj4
Out[31]:
California       NaN
Ohio            7000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64
name attribute
In [32]:
obj4.name = 'population'
In [33]:
obj4.index.name='state'
In [34]:
obj4
Out[34]:
state
California      NaN
Ohio           3500
Oregon        16000
Texas         71000
Name: population, dtype: float64
index altered in place
In [35]:
obj.index = ['Bob','Steve','Jeff','Ryan']
In [36]:
obj
Out[36]:
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame

In [37]:
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002],
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
In [38]:
frame
Out[38]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002
column that is not contained in the data
In [39]:
frame2 = DataFrame(data, columns = ['year','state','pop','debt'],
                   index=['one','two','three','four','five'])
In [40]:
frame2
Out[40]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
In [41]:
frame2.columns
Out[41]:
Index([u'year', u'state', u'pop', u'debt'], dtype='object')
column retrieved
In [42]:
frame2['state']
Out[42]:
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
In [44]:
frame2.year
Out[44]:
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
rows retrieved
In [45]:
frame2.ix['three']
Out[45]:
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
In [47]:
frame2.ix[2]
Out[47]:
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
columns modified by assignment
In [48]:
frame2['debt']=16.5
In [49]:
frame2
Out[49]:
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
In [50]:
frame2['debt']=np.arange(5.)
In [51]:
frame2
Out[51]:
year state pop debt
one 2000 Ohio 1.5 0
two 2001 Ohio 1.7 1
three 2002 Ohio 3.6 2
four 2001 Nevada 2.4 3
five 2002 Nevada 2.9 4
assign series to a column
In [52]:
val = Series([1.2, 1.5, -1.7], index = ['two', 'four','five'])
In [53]:
frame2['debt']=val
In [54]:
frame2
Out[54]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 1.5
five 2002 Nevada 2.9 -1.7
assigning a column that does not exist
In [55]:
frame2['eastern'] = frame2.state=='Ohio'
In [56]:
frame2
Out[56]:
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 1.5 False
five 2002 Nevada 2.9 -1.7 False
delete columns
In [57]:
del frame2['eastern']
In [58]:
frame2.columns
Out[58]:
Index([u'year', u'state', u'pop', u'debt'], dtype='object')
nested dict of dicts
In [59]:
pop={'Nevada':{2001:2.4, 2002:2.9},
     'Ohio': {2000:1.5, 2001:1.7,2002:3.6} }
In [60]:
frame3 = DataFrame(pop)
In [61]:
frame3
Out[61]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
transpose
In [62]:
frame3.T
Out[62]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6

Note that the keys in the inner dicts are unioned and sorted to form the index in the resul. This is not true if an explicit index is specified.

In [63]:
DataFrame(pop, index=[2001, 2002, 2003])
Out[63]:
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN
Dicts of series
In [64]:
pdata = {'Ohio':frame3['Ohio'][:-1],
         'Nevada':frame3['Nevada'][:2]}
In [65]:
DataFrame(pdata)
Out[65]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
In [66]:
frame3.index.name='year'
In [67]:
frame3.columns.name = 'state'
In [68]:
frame3
Out[68]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
In [69]:
frame3.values
Out[69]:
array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])
different types of columns
In [70]:
frame2.values
Out[70]:
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, 1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, 1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

Index Objects

In [71]:
obj = Series(range(3), index=['a','b','c'])
In [73]:
index = obj.index
In [74]:
index[1:]
Out[74]:
Index([u'b', u'c'], dtype='object')
immutable
In [75]:
index[1]='d'
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-75-d3f90986bdb1> in <module>()
----> 1 index[1]='d'

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/base.pyc in _disabled(self, *args, **kwargs)
    180         """This method will not function because object is immutable."""
    181         raise TypeError("'%s' does not support mutable operations." %
--> 182                         self.__class__)
    183 
    184     __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled

TypeError: '<class 'pandas.core.index.Index'>' does not support mutable operations.
In [76]:
index = pd.Index(np.arange(3))
In [77]:
obj2 = Series([1.5,-2.5,0], index=index)
In [78]:
obj2.index is index
Out[78]:
True
Index objects are sets
In [79]:
frame3
Out[79]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
In [80]:
'Ohio' in frame3
Out[80]:
True
In [81]:
2003 in frame3.index
Out[81]:
False
In [82]:
frame3.index-frame2.index
Out[82]:
Int64Index([2000, 2001, 2002], dtype='int64')
In [85]:
frame3.index.diff(frame2.index)
Out[85]:
Int64Index([2000, 2001, 2002], dtype='int64')

Index methods: append, diff, intersection, union, isin, delete, drop, insert, is_monotonic, is_unique, uniuq

Essential Functionality

Reindexing

In [86]:
obj = Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
In [87]:
obj
Out[87]:
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
In [88]:
obj2 = obj.reindex(['a','b','c','d','e'])
In [89]:
obj2
Out[89]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
In [90]:
obj.reindex(['a','b','c','d','e'], fill_value=0)
Out[90]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
In [91]:
obj
Out[91]:
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
filling of values
In [92]:
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
In [93]:
obj3.reindex(range(6))
Out[93]:
0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object
In [95]:
obj3.reindex(range(6),method='ffill') # forward fills the values
Out[95]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

available method options: ffill or pad, bfill or backfill

reindexing columns
In [96]:
frame = DataFrame(np.arange(9).reshape(3,3), index = ['a','c','d'],
                  columns=['Ohio','Texas','California'])
In [97]:
frame
Out[97]:
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
In [98]:
frame2 = frame.reindex(['a','b','c','d'])
In [99]:
frame2
Out[99]:
Ohio Texas California
a 0 1 2
b NaN NaN NaN
c 3 4 5
d 6 7 8
In [100]:
states = ['Texas','Utah','California']
In [101]:
frame.reindex(columns=states)
Out[101]:
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
Both columns and rows can be reindexed
In [102]:
frame.reindex(index=['a','b','c','d'], method='ffill',
              columns=states)
Out[102]:
Texas Utah California
a 1 NaN 2
b 1 NaN 2
c 4 NaN 5
d 7 NaN 8
In [103]:
frame
Out[103]:
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
reindexing with ix
In [105]:
frame.ix[['a','b','c','d'], states]
Out[105]:
Texas Utah California
a 1 NaN 2
b NaN NaN NaN
c 4 NaN 5
d 7 NaN 8
Dropping entries from an axis
In [106]:
obj = Series(np.arange(5.), index = ['a','b','c','d','e'])
In [107]:
obj
Out[107]:
a    0
b    1
c    2
d    3
e    4
dtype: float64
In [109]:
new_obj = obj.drop('c')
In [110]:
new_obj
Out[110]:
a    0
b    1
d    3
e    4
dtype: float64
In [111]:
obj.drop(['d','c'])
Out[111]:
a    0
b    1
e    4
dtype: float64
Index values can be deleted from either axis
In [112]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index = ['Ohio','Colorado','Utah','New York'],
                 columns = ['one','two','three','four'])
In [113]:
data
Out[113]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [114]:
data.drop(['Colorado','Ohio'])
Out[114]:
one two three four
Utah 8 9 10 11
New York 12 13 14 15
In [115]:
data.drop('two', axis=1)
Out[115]:
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
In [116]:
data.drop(['two','four'], axis=1)
Out[116]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14

Indexing, selection and filtering

In [117]:
obj = Series(np.arange(4.), index=['a','b','c','d'])
In [118]:
obj['b']
Out[118]:
1.0
In [119]:
obj[2:4]
Out[119]:
c    2
d    3
dtype: float64
In [120]:
obj[['b','a','d']]
Out[120]:
b    1
a    0
d    3
dtype: float64
In [121]:
obj[[1,3]]
Out[121]:
b    1
d    3
dtype: float64
In [122]:
obj[obj<2]
Out[122]:
a    0
b    1
dtype: float64
In [123]:
obj['b':'c']
Out[123]:
b    1
c    2
dtype: float64
In [124]:
obj['b':'c']=5
In [125]:
obj
Out[125]:
a    0
b    5
c    5
d    3
dtype: float64
In [126]:
data=DataFrame(np.arange(16).reshape((4,4)),
               index=['Ohio','Colorado','Utah','New York'],
               columns=['one','two','three','four'])
In [127]:
data
Out[127]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [128]:
data['two']
Out[128]:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
In [129]:
data[['three','one']]
Out[129]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
In [130]:
data[:2]
Out[130]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
In [131]:
data[data['three']>5]
Out[131]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
indexing with boolean DataFrame
In [132]:
data<5
Out[132]:
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False
In [133]:
data[data<5]=0
In [134]:
data
Out[134]:
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
label-indexing on the rows (this is a less verbose way to do reindexing)
In [135]:
data.ix['Colorado',['two','three']]
Out[135]:
two      5
three    6
Name: Colorado, dtype: int64
In [136]:
data.ix[['Colorado','Utah'],[3,0,1]]
Out[136]:
four one two
Colorado 7 0 5
Utah 11 8 9
In [137]:
data.ix[2]
Out[137]:
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64
In [138]:
data.ix[:'Utah','two']
Out[138]:
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
In [140]:
data.ix[data.three>5,:3]
Out[140]:
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14

Arithmetic and data alignment

In [141]:
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','c','d','e'])
In [142]:
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])
In [143]:
s1
Out[143]:
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
In [144]:
s2
Out[144]:
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
adding these together yields
In [145]:
s1+s2
Out[145]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
aligment is performed on both the rows and the columns
In [146]:
df1 = DataFrame(np.arange(9.).reshape((3,3)),
                columns = list('bcd'),
                index=['Ohio','Texas','Colorado'])
In [150]:
df2 = DataFrame(np.arange(12.).reshape((4,3)),
               columns = list('bde'),
               index=['Utah','Ohio','Texas','Oregon'])
In [148]:
df1
Out[148]:
b c d
Ohio 0 1 2
Texas 3 4 5
Colorado 6 7 8
In [151]:
df2
Out[151]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
In [152]:
df1 + df2
Out[152]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3 NaN 6 NaN
Oregon NaN NaN NaN NaN
Texas 9 NaN 12 NaN
Utah NaN NaN NaN NaN
Arithmetci operations with fill values
In [153]:
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns = list('abcd'))
In [154]:
df2 = DataFrame(np.arange(20.).reshape((4,5)), columns = list('abcde'))
In [155]:
df1
Out[155]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
In [156]:
df2
Out[156]:
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
In [157]:
df1 + df2
Out[157]:
a b c d e
0 0 2 4 6 NaN
1 9 11 13 15 NaN
2 18 20 22 24 NaN
3 NaN NaN NaN NaN NaN
In [158]:
df1.add(df2, fill_value=0)
Out[158]:
a b c d e
0 0 2 4 6 4
1 9 11 13 15 9
2 18 20 22 24 14
3 15 16 17 18 19
In [159]:
df1.reindex(columns=df2.columns, fill_value=0)
Out[159]:
a b c d e
0 0 1 2 3 0
1 4 5 6 7 0
2 8 9 10 11 0

aritmetic methods: add, sub, div, mul

Operations between DataFrame and Series
broadcasting
In [160]:
arr = np.arange(12.).reshape((3,4))
In [161]:
arr
Out[161]:
array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])
In [162]:
arr[0]
Out[162]:
array([ 0.,  1.,  2.,  3.])
In [163]:
arr-arr[0]
Out[163]:
array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])
In [164]:
frame = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])
In [165]:
frame
Out[165]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
In [166]:
series =frame.ix[0]
In [167]:
series
Out[167]:
b    0
d    1
e    2
Name: Utah, dtype: float64
In [168]:
frame-series
Out[168]:
b d e
Utah 0 0 0
Ohio 3 3 3
Texas 6 6 6
Oregon 9 9 9
if an index value is not found
In [169]:
series2 = Series(range(3), index=['b','e','f'])
In [170]:
series2
Out[170]:
b    0
e    1
f    2
dtype: int64
In [172]:
frame + series2
Out[172]:
b d e f
Utah 0 NaN 3 NaN
Ohio 3 NaN 6 NaN
Texas 6 NaN 9 NaN
Oregon 9 NaN 12 NaN
broadcast over the columns
In [174]:
series3 = frame['d']
In [175]:
frame
Out[175]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
In [176]:
series3
Out[176]:
Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: float64
In [179]:
frame.sub(series3, axis=0)
Out[179]:
b d e
Utah -1 0 1
Ohio -1 0 1
Texas -1 0 1
Oregon -1 0 1

Function Application and Mapping

In [5]:
import numpy as np
In [6]:
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])
In [7]:
frame
Out[7]:
b d e
Utah -0.267451 -1.829889 2.544224
Ohio 1.116375 -0.270886 1.086111
Texas 0.390590 -0.339125 0.341343
Oregon 1.145794 -1.268138 -0.925395
In [8]:
np.abs(frame)
Out[8]:
b d e
Utah 0.267451 1.829889 2.544224
Ohio 1.116375 0.270886 1.086111
Texas 0.390590 0.339125 0.341343
Oregon 1.145794 1.268138 0.925395
apply method
In [10]:
f = lambda x: x.max()-x.min()
In [13]:
frame.apply(f,axis=1)
Out[13]:
Utah      4.374113
Ohio      1.387261
Texas     0.729715
Oregon    2.413932
dtype: float64
function return a series with multiple values
In [14]:
def f(x):
    return Series([x.min(),x.max()], index=['min','max'])
In [17]:
frame.apply(f, axis=0)
Out[17]:
b d e
min -0.267451 -1.829889 -0.925395
max 1.145794 -0.270886 2.544224
element-wise python functions
In [18]:
format = lambda x: '%.2f' %x
In [19]:
frame.applymap(format)
Out[19]:
b d e
Utah -0.27 -1.83 2.54
Ohio 1.12 -0.27 1.09
Texas 0.39 -0.34 0.34
Oregon 1.15 -1.27 -0.93
map method
In [20]:
frame['e'].map(format)
Out[20]:
Utah       2.54
Ohio       1.09
Texas      0.34
Oregon    -0.93
Name: e, dtype: object

Sorting and ranking

In [21]:
obj = Series(range(4), index=['d','a','b','c'])
In [22]:
obj.sort_index()
Out[22]:
a    1
b    2
c    3
d    0
dtype: int64
sort by index on either axis
In [23]:
frame = DataFrame(np.arange(8).reshape((2,4)), index = ['three','one'],
                  columns = ['d','a','b','c'])
In [24]:
frame.sort_index()
Out[24]:
d a b c
one 4 5 6 7
three 0 1 2 3
In [25]:
frame.sort_index(axis=1)
Out[25]:
a b c d
three 1 2 3 0
one 5 6 7 4
descending order
In [26]:
frame.sort_index(axis=1, ascending = False)
Out[26]:
d c b a
three 0 3 2 1
one 4 7 6 5
sort a series by values
In [27]:
obj = Series([4,7,-3,2])
In [28]:
obj.order()
Out[28]:
2   -3
3    2
0    4
1    7
dtype: int64
missing values
In [29]:
obj = Series([4,np.nan,7,np.nan,-3,2])
In [30]:
obj.order()
Out[30]:
4    -3
5     2
0     4
2     7
1   NaN
3   NaN
dtype: float64
sort values in one or more columns in data frame
In [31]:
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
In [32]:
frame
Out[32]:
a b
0 0 4
1 1 7
2 0 -3
3 1 2
In [33]:
frame.sort_index(by='b')
Out[33]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7
In [34]:
frame.sort_index(by = ['a','b'])
Out[34]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7
Ranking
In [35]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
In [36]:
obj.rank()
Out[36]:
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
ranking according to the order they're observed in the data
In [37]:
obj.rank(method='first')
Out[37]:
0    6
1    1
2    7
3    4
4    3
5    2
6    5
dtype: float64
In [38]:
obj.rank(ascending=False, method='max')
Out[38]:
0    2
1    7
2    2
3    4
4    5
5    6
6    4
dtype: float64
ranks over the rows or columns
In [39]:
frame = DataFrame({'b':[4.3,7,-3,2], 'a':[0,1,0,1],
                   'c':[-2,5,8,-2.5]})
In [40]:
frame
Out[40]:
a b c
0 0 4.3 -2.0
1 1 7.0 5.0
2 0 -3.0 8.0
3 1 2.0 -2.5
In [41]:
frame.rank(axis=1)
Out[41]:
a b c
0 2 3 1
1 1 3 2
2 2 1 3
3 2 3 1
Tie-breaking methods with rank

average (default), min, max, first

Axis indexes with duplicate values

In [42]:
obj = Series(range(5),index=['a','a','b','b','c'])
In [43]:
obj
Out[43]:
a    0
a    1
b    2
b    3
c    4
dtype: int64
In [44]:
obj.index.is_unique
Out[44]:
False
In [45]:
obj['a']
Out[45]:
a    0
a    1
dtype: int64
In [46]:
obj['c']
Out[46]:
4
In [47]:
df = DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
In [48]:
df
Out[48]:
0 1 2
a 0.654546 -0.963798 0.228551
a -1.451196 -1.528684 1.567436
b -1.213714 -1.556472 -0.573577
b -1.838320 -2.250823 0.440242
In [49]:
df.ix['b']
Out[49]:
0 1 2
b -1.213714 -1.556472 -0.573577
b -1.838320 -2.250823 0.440242

Summarizing and Computing Descriptive Statistics

In [50]:
df = DataFrame([[1.4, np.nan], [7.1,-4.5],
                [np.nan, np.nan],[0.75,-1.3]],
               index=['a','b','c','d'],
               columns=['one','two'])
In [51]:
df
Out[51]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
column sums
In [52]:
df.sum()
Out[52]:
one    9.25
two   -5.80
dtype: float64
row sums
In [53]:
df.sum(axis=1) 
Out[53]:
a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

NA values are excluded unless the entire slice is NA. This can be disables using skipna.

In [54]:
df.mean(axis=1, skipna=False)
Out[54]:
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
indirect statistics
In [55]:
df.idxmax()
Out[55]:
one    b
two    d
dtype: object
accumulations
In [57]:
df.cumsum()
Out[57]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
multiple summary statistics
In [59]:
df.describe()
Out[59]:
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
non-numeric data
In [60]:
obj = Series(['a','a','b','c']*4)
In [61]:
obj.describe()
Out[61]:
count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance

In [62]:
import pandas.io.data as web
In [63]:
all_data = {}
In [72]:
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,'04/11/2010','04/14/2014')
In [73]:
price =  DataFrame({tic: data['Adj Close']
                    for tic, data in all_data.iteritems()})
In [74]:
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})
In [75]:
returns = price.pct_change()
In [77]:
returns.tail()
Out[77]:
AAPL GOOG IBM MSFT
Date
2014-04-08 -0.000136 0.031125 -0.006328 0.000516
2014-04-09 0.013219 0.016652 0.017355 0.016508
2014-04-10 -0.012912 -0.041107 -0.004904 -0.027404
2014-04-11 -0.007358 -0.019133 -0.002516 -0.003913
2014-04-14 0.003981 0.003619 0.013244 -0.000786
In [78]:
returns.MSFT.corr(returns.IBM)
Out[78]:
0.50623670894935635
In [79]:
returns.MSFT.cov(returns.IBM)
Out[79]:
8.8008494273331047e-05
full correlation or covariance
In [80]:
returns.corr()
Out[80]:
AAPL GOOG IBM MSFT
AAPL 1.000000 0.737439 0.384410 0.336002
GOOG 0.737439 1.000000 0.329676 0.717087
IBM 0.384410 0.329676 1.000000 0.506237
MSFT 0.336002 0.717087 0.506237 1.000000
In [81]:
returns.cov()
Out[81]:
AAPL GOOG IBM MSFT
AAPL 0.000301 0.000152 0.000080 0.000085
GOOG 0.000152 0.000528 0.000069 0.000265
IBM 0.000080 0.000069 0.000143 0.000088
MSFT 0.000085 0.000265 0.000088 0.000211
In [82]:
returns.corrwith(returns.IBM)
Out[82]:
AAPL    0.384410
GOOG    0.329676
IBM     1.000000
MSFT    0.506237
dtype: float64
correlations of matching column names
In [84]:
returns.corrwith(volume)
Out[84]:
AAPL   -0.128694
GOOG   -0.478992
IBM    -0.152372
MSFT   -0.102546
dtype: float64

Unique values, value count, and membership

In [85]:
obj = Series(['c','a','d','a','a','b','b','c','c'])
In [86]:
uniques = obj.unique()
In [87]:
uniques
Out[87]:
array(['c', 'a', 'd', 'b'], dtype=object)
In [89]:
obj.value_counts()
Out[89]:
c    3
a    3
b    2
d    1
dtype: int64
In [91]:
pd.value_counts(obj.values, sort=False)
Out[91]:
a    3
c    3
b    2
d    1
dtype: int64
In [93]:
mask = obj.isin(['b','c'])
In [95]:
mask
Out[95]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
In [96]:
obj[mask]
Out[96]:
0    c
5    b
6    b
7    c
8    c
dtype: object
compute a histogram
In [97]:
data = DataFrame({'Quo1':[1,3,4,3,4],
                  'Quo2': [2,3,1,2,3],
                  'Quo3':[1,5,2,4,4]})
In [98]:
data
Out[98]:
Quo1 Quo2 Quo3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4
In [100]:
result = data.apply(pd.value_counts).fillna(0)
In [101]:
result
Out[101]:
Quo1 Quo2 Quo3
1 1 1 1
2 0 2 1
3 2 2 0
4 2 0 2
5 0 0 1

Handling Missing Data

In [103]:
string_data = Series(['aardvark','artichoke',np.nan, 'avocado'])
In [104]:
string_data
Out[104]:
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
In [105]:
string_data.isnull()
Out[105]:
0    False
1    False
2     True
3    False
dtype: bool
built-in Python's none
In [106]:
string_data[0] = None
In [107]:
string_data.isnull()
Out[107]:
0     True
1    False
2     True
3    False
dtype: bool

NA handling methods: dropna, fillna, isnull, notnull

FIltering out missing data

In [109]:
from numpy import nan as NA
In [110]:
data = Series([1, NA, 3.5, NA, 7])
In [111]:
data.dropna()
Out[111]:
0    1.0
2    3.5
4    7.0
dtype: float64
using boolean indexing
In [112]:
data[data.notnull()]
Out[112]:
0    1.0
2    3.5
4    7.0
dtype: float64
dropna with dataframa
In [113]:
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],
                  [NA,NA,NA],[NA,6.5,3.]])
In [114]:
data
Out[114]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3
In [115]:
cleaned = data.dropna()
In [116]:
cleaned
Out[116]:
0 1 2
0 1 6.5 3
drop rows that are all NA
In [117]:
data.dropna(how='all')
Out[117]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
3 NaN 6.5 3
drop columns that are all NA
In [118]:
data[4]=NA
In [119]:
data
Out[119]:
0 1 2 4
0 1 6.5 3 NaN
1 1 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3 NaN
In [120]:
data.dropna(axis=1,how='all')
Out[120]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3
keep only rows contaiing a certain no of obs
In [121]:
df = DataFrame(np.random.randn(7,3))
In [122]:
df.ix[:4,1]=NA; df.ix[:2,2]=NA
In [123]:
df
Out[123]:
0 1 2
0 -0.676323 NaN NaN
1 -1.088497 NaN NaN
2 0.122962 NaN NaN
3 2.087705 NaN 1.538338
4 -1.267822 NaN -0.924860
5 -1.596591 -1.001367 -0.705282
6 -0.026753 -0.694895 -0.386125
In [124]:
df.dropna(thresh=2)
Out[124]:
0 1 2
3 2.087705 NaN 1.538338
4 -1.267822 NaN -0.924860
5 -1.596591 -1.001367 -0.705282
6 -0.026753 -0.694895 -0.386125

Filling in Missing Data

In [125]:
df.fillna(0)
Out[125]:
0 1 2
0 -0.676323 0.000000 0.000000
1 -1.088497 0.000000 0.000000
2 0.122962 0.000000 0.000000
3 2.087705 0.000000 1.538338
4 -1.267822 0.000000 -0.924860
5 -1.596591 -1.001367 -0.705282
6 -0.026753 -0.694895 -0.386125
different fill value for each column
In [126]:
df.fillna({1:0.5,3:-1})
Out[126]:
0 1 2
0 -0.676323 0.500000 NaN
1 -1.088497 0.500000 NaN
2 0.122962 0.500000 NaN
3 2.087705 0.500000 1.538338
4 -1.267822 0.500000 -0.924860
5 -1.596591 -1.001367 -0.705282
6 -0.026753 -0.694895 -0.386125
modify existing project in place
In [127]:
_ = df.fillna(0, inplace=True)
In [128]:
df
Out[128]:
0 1 2
0 -0.676323 0.000000 0.000000
1 -1.088497 0.000000 0.000000
2 0.122962 0.000000 0.000000
3 2.087705 0.000000 1.538338
4 -1.267822 0.000000 -0.924860
5 -1.596591 -1.001367 -0.705282
6 -0.026753 -0.694895 -0.386125
interpolation methods
In [129]:
df = DataFrame(np.random.randn(6,3))
In [130]:
df.ix[2:,1]=NA; df.ix[4:,2]=NA
In [131]:
df
Out[131]:
0 1 2
0 -0.570090 -0.513487 -0.707382
1 1.453435 0.440059 -0.292707
2 -0.045053 NaN 0.951857
3 1.277519 NaN -0.463472
4 -2.012467 NaN NaN
5 -0.854318 NaN NaN
In [132]:
df.fillna(method = 'ffill')
Out[132]:
0 1 2
0 -0.570090 -0.513487 -0.707382
1 1.453435 0.440059 -0.292707
2 -0.045053 0.440059 0.951857
3 1.277519 0.440059 -0.463472
4 -2.012467 0.440059 -0.463472
5 -0.854318 0.440059 -0.463472
In [133]:
df.fillna(method='ffill',limit=2)
Out[133]:
0 1 2
0 -0.570090 -0.513487 -0.707382
1 1.453435 0.440059 -0.292707
2 -0.045053 0.440059 0.951857
3 1.277519 0.440059 -0.463472
4 -2.012467 NaN -0.463472
5 -0.854318 NaN -0.463472
pass the mean or median value of a Series
In [134]:
data=Series([1.,NA,3.5,NA,7])
In [135]:
data.fillna(data.mean())
Out[135]:
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

fillna function arguments : value, method, axis, inplace, limit

Hierarchical Indexing

In [136]:
data = Series(np.random.randn(10),
              index = [['a','a','a','b','b','b','c','c','d','d'],
                       [1,2,3,1,2,3,1,2,2,3]])
In [137]:
data
Out[137]:
a  1   -0.734465
   2   -0.775564
   3   -2.134157
b  1    1.169687
   2    0.655510
   3    0.229640
c  1    0.607370
   2   -0.227422
d  2   -0.319022
   3   -1.861461
dtype: float64
In [138]:
data.index
Out[138]:
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
partial indexing
In [139]:
data['b']
Out[139]:
1    1.169687
2    0.655510
3    0.229640
dtype: float64
In [140]:
data['b':'c']
Out[140]:
b  1    1.169687
   2    0.655510
   3    0.229640
c  1    0.607370
   2   -0.227422
dtype: float64
In [141]:
data.ix[['b','d']]
Out[141]:
b  1    1.169687
   2    0.655510
   3    0.229640
d  2   -0.319022
   3   -1.861461
dtype: float64
selection from an inner level
In [142]:
data[:,2]
Out[142]:
a   -0.775564
b    0.655510
c   -0.227422
d   -0.319022
dtype: float64
rearranged into a DataFrame
In [143]:
data.unstack()
Out[143]:
1 2 3
a -0.734465 -0.775564 -2.134157
b 1.169687 0.655510 0.229640
c 0.607370 -0.227422 NaN
d NaN -0.319022 -1.861461
inverse operation of unstack
In [144]:
data.unstack().stack()
Out[144]:
a  1   -0.734465
   2   -0.775564
   3   -2.134157
b  1    1.169687
   2    0.655510
   3    0.229640
c  1    0.607370
   2   -0.227422
d  2   -0.319022
   3   -1.861461
dtype: float64
either axis can have a hierarchical index:
In [145]:
frame = DataFrame(np.arange(12).reshape((4,3)),
                  index = [['a','a','b','b'],[1,2,1,2]],
                  columns = [['Ohio','Ohio','Colorado'],
                             ['Green','Red','Green']])
In [146]:
frame
Out[146]:
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
hierarchical levels can have names
In [147]:
frame.index.names = ['key1', 'key2']
In [149]:
frame.columns.names = ['state', 'color']
In [150]:
frame
Out[150]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
partial column indexing
In [151]:
frame['Ohio']
Out[151]:
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10

Reordering and Sorting Levels

In [152]:
frame.swaplevel('key1','key2')
Out[152]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
In [153]:
frame.sortlevel(1)
Out[153]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
In [154]:
frame.sortlevel(0)
Out[154]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [155]:
frame.swaplevel(0,1).sortlevel(0)
Out[155]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11

Summary Statistics by level

In [156]:
frame
Out[156]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [157]:
frame.sum(level='key2')
Out[157]:
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16
In [158]:
frame.sum(level='color',axis=1)
Out[158]:
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10

Using DataFrame's COlumns

In [161]:
frame = DataFrame({'a':range(7), 'b':range(7,0,-1),
          'c':['one','one','one','two','two','two','two'],
          'd':[0,1,2,0,1,2,3]})
In [162]:
frame
Out[162]:
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
new data frame
In [163]:
frame2 = frame.set_index(['c','d'])
In [164]:
frame2
Out[164]:
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
you can leave the columns in
In [165]:
frame.set_index(['c','d'], drop=False)
Out[165]:
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
In [166]:
frame2.reset_index()
Out[166]:
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1

Other Pandas Topics

Integer indexing

In [167]:
ser = Series(np.arange(3.))
In [168]:
ser[-1]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-168-3cbe0b873a9e> in <module>()
----> 1 ser[-1]

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    482     def __getitem__(self, key):
    483         try:
--> 484             result = self.index.get_value(self, key)
    485 
    486             if not np.isscalar(result):

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
   1194 
   1195         try:
-> 1196             return self._engine.get_value(s, k)
   1197         except KeyError as e1:
   1198             if len(self) > 0 and self.inferred_type in ['integer','boolean']:

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2993)()

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2808)()

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3534)()

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/hashtable.so in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7035)()

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/hashtable.so in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6976)()

KeyError: -1
In [169]:
ser
Out[169]:
0    0
1    1
2    2
dtype: float64

non-integer index works

In [170]:
ser2 = Series(np.arange(3.), index = ['a','b','c'])
In [171]:
ser2[-1]
Out[171]:
2.0
To keep things consistent, use .ix
In [172]:
ser.ix[:1]
Out[172]:
0    0
1    1
dtype: float64
reliable position indexing
In [173]:
ser3 = Series(range(3), index = [-5, 1,3])
In [174]:
ser3.iget_value(2)
Out[174]:
2
In [175]:
frame = DataFrame(np.arange(6).reshape(3,2), index=[2,0,1])
In [176]:
frame
Out[176]:
0 1
2 0 1
0 2 3
1 4 5
In [177]:
frame.irow(0)
Out[177]:
0    0
1    1
Name: 2, dtype: int64
In [179]:
frame.irow(2)
Out[179]:
0    4
1    5
Name: 1, dtype: int64