In [1]:

%pylab inline

Populating the interactive namespace from numpy and matplotlib

In [3]:

from pandas import Series, DataFrame

In [2]:

import pandas as pd

Introduction to pandas Data Structures¶

Series¶

In [4]:

obj = Series([4, 7, -5, 3])

In [5]:

obj

Out[5]:

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:

obj.values

Out[7]:

array([ 4,  7, -5,  3])

In [8]:

obj.index

Out[8]:

Int64Index([0, 1, 2, 3], dtype='int64')

In [9]:

obj2 = Series([4,7,-5,3], index = ['d','b','a','c'])

In [10]:

obj2

Out[10]:

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:

obj2.values

Out[11]:

array([ 4,  7, -5,  3])

In [12]:

obj2.index

Out[12]:

Index([u'd', u'b', u'a', u'c'], dtype='object')

In [13]:

obj2['a']

Out[13]:

-5

In [14]:

obj2[['c','a','d']]

Out[14]:

c    3
a   -5
d    4
dtype: int64

In [15]:

obj2[obj2>0]

Out[15]:

d    4
b    7
c    3
dtype: int64

In [16]:

obj2*2

Out[16]:

d     8
b    14
a   -10
c     6
dtype: int64

In [17]:

np.exp(obj2)

Out[17]:

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [18]:

'b' in obj2

Out[18]:

True

In [19]:

'e' in obj2

Out[19]:

False

In [20]:

sdata = {'Ohio':3500,'Texas':71000,'Oregon':16000, 'Utah':5000}

In [21]:

obj3 = Series(sdata)

In [22]:

obj3

Out[22]:

Ohio       3500
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [23]:

states = ['California','Ohio','Oregon','Texas']

In [24]:

obj4 = Series(sdata, index=states)

In [25]:

obj4

Out[25]:

California      NaN
Ohio           3500
Oregon        16000
Texas         71000
dtype: float64

In [26]:

pd.isnull(obj4)

Out[26]:

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [27]:

pd.notnull(obj4)

Out[27]:

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

Series also has instance methods.¶

In [28]:

obj4.isnull()

Out[28]:

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

Automatically aligns differently indexed data in arithmetic operations.¶

In [29]:

obj3

Out[29]:

Ohio       3500
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [30]:

obj4

Out[30]:

California      NaN
Ohio           3500
Oregon        16000
Texas         71000
dtype: float64

In [31]:

obj3 + obj4

Out[31]:

California       NaN
Ohio            7000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

name attribute¶

In [32]:

obj4.name = 'population'

In [33]:

obj4.index.name='state'

In [34]:

obj4

Out[34]:

state
California      NaN
Ohio           3500
Oregon        16000
Texas         71000
Name: population, dtype: float64

index altered in place¶

In [35]:

obj.index = ['Bob','Steve','Jeff','Ryan']

In [36]:

obj

Out[36]:

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame¶

In [37]:

data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002],
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [38]:

frame

Out[38]:

	pop	state	year
0	1.5	Ohio	2000
1	1.7	Ohio	2001
2	3.6	Ohio	2002
3	2.4	Nevada	2001
4	2.9	Nevada	2002

column that is not contained in the data¶

In [39]:

frame2 = DataFrame(data, columns = ['year','state','pop','debt'],
                   index=['one','two','three','four','five'])

In [40]:

frame2

Out[40]:

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	NaN
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	NaN

In [41]:

frame2.columns

Out[41]:

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

column retrieved¶

In [42]:

frame2['state']

Out[42]:

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [44]:

frame2.year

Out[44]:

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

rows retrieved¶

In [45]:

frame2.ix['three']

Out[45]:

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [47]:

frame2.ix[2]

Out[47]:

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

columns modified by assignment¶

In [48]:

frame2['debt']=16.5

In [49]:

frame2

Out[49]:

	year	state	pop	debt
one	2000	Ohio	1.5	16.5
two	2001	Ohio	1.7	16.5
three	2002	Ohio	3.6	16.5
four	2001	Nevada	2.4	16.5
five	2002	Nevada	2.9	16.5

In [50]:

frame2['debt']=np.arange(5.)

In [51]:

frame2

Out[51]:

	year	state	pop	debt
one	2000	Ohio	1.5	0
two	2001	Ohio	1.7	1
three	2002	Ohio	3.6	2
four	2001	Nevada	2.4	3
five	2002	Nevada	2.9	4

assign series to a column¶

In [52]:

val = Series([1.2, 1.5, -1.7], index = ['two', 'four','five'])

In [53]:

frame2['debt']=val

In [54]:

frame2

Out[54]:

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	1.5
five	2002	Nevada	2.9	-1.7

assigning a column that does not exist¶

In [55]:

frame2['eastern'] = frame2.state=='Ohio'

In [56]:

frame2

Out[56]:

	year	state	pop	debt	eastern
one	2000	Ohio	1.5	NaN	True
two	2001	Ohio	1.7	1.2	True
three	2002	Ohio	3.6	NaN	True
four	2001	Nevada	2.4	1.5	False
five	2002	Nevada	2.9	-1.7	False

delete columns¶

In [57]:

del frame2['eastern']

In [58]:

frame2.columns

Out[58]:

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

nested dict of dicts¶

In [59]:

pop={'Nevada':{2001:2.4, 2002:2.9},
     'Ohio': {2000:1.5, 2001:1.7,2002:3.6} }

In [60]:

frame3 = DataFrame(pop)

In [61]:

frame3

Out[61]:

	Nevada	Ohio
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

transpose¶

In [62]:

frame3.T

Out[62]:

	2000	2001	2002
Nevada	NaN	2.4	2.9
Ohio	1.5	1.7	3.6

Note that the keys in the inner dicts are unioned and sorted to form the index in the resul. This is not true if an explicit index is specified.

In [63]:

DataFrame(pop, index=[2001, 2002, 2003])

Out[63]:

	Nevada	Ohio
2001	2.4	1.7
2002	2.9	3.6
2003	NaN	NaN

Dicts of series¶

In [64]:

pdata = {'Ohio':frame3['Ohio'][:-1],
         'Nevada':frame3['Nevada'][:2]}

In [65]:

DataFrame(pdata)

Out[65]:

	Nevada	Ohio
2000	NaN	1.5
2001	2.4	1.7

In [66]:

frame3.index.name='year'

In [67]:

frame3.columns.name = 'state'

In [68]:

frame3

Out[68]:

state	Nevada	Ohio
year
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

In [69]:

frame3.values

Out[69]:

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

different types of columns¶

In [70]:

frame2.values

Out[70]:

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, 1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, 1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

Index Objects¶

In [71]:

obj = Series(range(3), index=['a','b','c'])

In [73]:

index = obj.index

In [74]:

index[1:]

Out[74]:

Index([u'b', u'c'], dtype='object')

immutable¶

In [75]:

index[1]='d'

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-75-d3f90986bdb1> in <module>()
----> 1 index[1]='d'

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/base.pyc in _disabled(self, *args, **kwargs)
    180         """This method will not function because object is immutable."""
    181         raise TypeError("'%s' does not support mutable operations." %
--> 182                         self.__class__)
    183 
    184     __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled

TypeError: '<class 'pandas.core.index.Index'>' does not support mutable operations.

In [76]:

index = pd.Index(np.arange(3))

In [77]:

obj2 = Series([1.5,-2.5,0], index=index)

In [78]:

obj2.index is index

Out[78]:

True

Index objects are sets¶

In [79]:

frame3

Out[79]:

state	Nevada	Ohio
year
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

In [80]:

'Ohio' in frame3

Out[80]:

True

In [81]:

2003 in frame3.index

Out[81]:

False

In [82]:

frame3.index-frame2.index

Out[82]:

Int64Index([2000, 2001, 2002], dtype='int64')

In [85]:

frame3.index.diff(frame2.index)

Out[85]:

Int64Index([2000, 2001, 2002], dtype='int64')

Index methods: append, diff, intersection, union, isin, delete, drop, insert, is_monotonic, is_unique, uniuq

Essential Functionality¶

Reindexing¶

In [86]:

obj = Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])

In [87]:

obj

Out[87]:

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [88]:

obj2 = obj.reindex(['a','b','c','d','e'])

In [89]:

obj2

Out[89]:

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [90]:

obj.reindex(['a','b','c','d','e'], fill_value=0)

Out[90]:

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [91]:

obj

Out[91]:

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

filling of values¶

In [92]:

obj3 = Series(['blue','purple','yellow'], index=[0,2,4])

In [93]:

obj3.reindex(range(6))

Out[93]:

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [95]:

obj3.reindex(range(6),method='ffill') # forward fills the values

Out[95]:

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

available method options: ffill or pad, bfill or backfill

reindexing columns¶

In [96]:

frame = DataFrame(np.arange(9).reshape(3,3), index = ['a','c','d'],
                  columns=['Ohio','Texas','California'])

In [97]:

frame

Out[97]:

	Ohio	Texas	California
a	0	1	2
c	3	4	5
d	6	7	8

In [98]:

frame2 = frame.reindex(['a','b','c','d'])

In [99]:

frame2

Out[99]:

	Ohio	Texas	California
a	0	1	2
b	NaN	NaN	NaN
c	3	4	5
d	6	7	8

In [100]:

states = ['Texas','Utah','California']

In [101]:

frame.reindex(columns=states)

Out[101]:

	Texas	Utah	California
a	1	NaN	2
c	4	NaN	5
d	7	NaN	8

Both columns and rows can be reindexed¶

In [102]:

frame.reindex(index=['a','b','c','d'], method='ffill',
              columns=states)

Out[102]:

	Texas	Utah	California
a	1	NaN	2
b	1	NaN	2
c	4	NaN	5
d	7	NaN	8

In [103]:

frame

Out[103]:

	Ohio	Texas	California
a	0	1	2
c	3	4	5
d	6	7	8

reindexing with ix¶

In [105]:

frame.ix[['a','b','c','d'], states]

Out[105]:

	Texas	Utah	California
a	1	NaN	2
b	NaN	NaN	NaN
c	4	NaN	5
d	7	NaN	8

Dropping entries from an axis¶

In [106]:

obj = Series(np.arange(5.), index = ['a','b','c','d','e'])

In [107]:

obj

Out[107]:

a    0
b    1
c    2
d    3
e    4
dtype: float64

In [109]:

new_obj = obj.drop('c')

In [110]:

new_obj

Out[110]:

a    0
b    1
d    3
e    4
dtype: float64

In [111]:

obj.drop(['d','c'])

Out[111]:

a    0
b    1
e    4
dtype: float64

Index values can be deleted from either axis¶

In [112]:

data = DataFrame(np.arange(16).reshape((4,4)),
                 index = ['Ohio','Colorado','Utah','New York'],
                 columns = ['one','two','three','four'])

In [113]:

data

Out[113]:

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

In [114]:

data.drop(['Colorado','Ohio'])

Out[114]:

	one	two	three	four
Utah	8	9	10	11
New York	12	13	14	15

In [115]:

data.drop('two', axis=1)

Out[115]:

	one	three	four
Ohio	0	2	3
Colorado	4	6	7
Utah	8	10	11
New York	12	14	15

In [116]:

data.drop(['two','four'], axis=1)

Out[116]:

	one	three
Ohio	0	2
Colorado	4	6
Utah	8	10
New York	12	14

Indexing, selection and filtering¶

In [117]:

obj = Series(np.arange(4.), index=['a','b','c','d'])

In [118]:

obj['b']

Out[118]:

1.0

In [119]:

obj[2:4]

Out[119]:

c    2
d    3
dtype: float64

In [120]:

obj[['b','a','d']]

Out[120]:

b    1
a    0
d    3
dtype: float64

In [121]:

obj[[1,3]]

Out[121]:

b    1
d    3
dtype: float64

In [122]:

obj[obj<2]

Out[122]:

a    0
b    1
dtype: float64

In [123]:

obj['b':'c']

Out[123]:

b    1
c    2
dtype: float64

In [124]:

obj['b':'c']=5

In [125]:

obj

Out[125]:

a    0
b    5
c    5
d    3
dtype: float64

In [126]:

data=DataFrame(np.arange(16).reshape((4,4)),
               index=['Ohio','Colorado','Utah','New York'],
               columns=['one','two','three','four'])

In [127]:

data

Out[127]:

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

In [128]:

data['two']

Out[128]:

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [129]:

data[['three','one']]

Out[129]:

	three	one
Ohio	2	0
Colorado	6	4
Utah	10	8
New York	14	12

In [130]:

data[:2]

Out[130]:

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7

In [131]:

data[data['three']>5]

Out[131]:

	one	two	three	four
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

indexing with boolean DataFrame¶

In [132]:

data<5

Out[132]:

	one	two	three	four
Ohio	True	True	True	True
Colorado	True	False	False	False
Utah	False	False	False	False
New York	False	False	False	False

In [133]:

data[data<5]=0

In [134]:

data

Out[134]:

	one	two	three	four
Ohio	0	0	0	0
Colorado	0	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

label-indexing on the rows (this is a less verbose way to do reindexing)¶

In [135]:

data.ix['Colorado',['two','three']]

Out[135]:

two      5
three    6
Name: Colorado, dtype: int64

In [136]:

data.ix[['Colorado','Utah'],[3,0,1]]

Out[136]:

	four	one	two
Colorado	7	0	5
Utah	11	8	9

In [137]:

data.ix[2]

Out[137]:

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [138]:

data.ix[:'Utah','two']

Out[138]:

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [140]:

data.ix[data.three>5,:3]

Out[140]:

	one	two	three
Colorado	0	5	6
Utah	8	9	10
New York	12	13	14

Arithmetic and data alignment¶

In [141]:

s1 = Series([7.3,-2.5,3.4,1.5], index=['a','c','d','e'])

In [142]:

s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])

In [143]:

s1

Out[143]:

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [144]:

s2

Out[144]:

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

adding these together yields¶

In [145]:

s1+s2

Out[145]:

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

aligment is performed on both the rows and the columns¶

In [146]:

df1 = DataFrame(np.arange(9.).reshape((3,3)),
                columns = list('bcd'),
                index=['Ohio','Texas','Colorado'])

In [150]:

df2 = DataFrame(np.arange(12.).reshape((4,3)),
               columns = list('bde'),
               index=['Utah','Ohio','Texas','Oregon'])

In [148]:

df1

Out[148]:

	b	c	d
Ohio	0	1	2
Texas	3	4	5
Colorado	6	7	8

In [151]:

df2

Out[151]:

	b	d	e
Utah	0	1	2
Ohio	3	4	5
Texas	6	7	8
Oregon	9	10	11

In [152]:

df1 + df2

Out[152]:

	b	c	d	e
Colorado	NaN	NaN	NaN	NaN
Ohio	3	NaN	6	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9	NaN	12	NaN
Utah	NaN	NaN	NaN	NaN

Arithmetci operations with fill values¶

In [153]:

df1 = DataFrame(np.arange(12.).reshape((3,4)), columns = list('abcd'))

In [154]:

df2 = DataFrame(np.arange(20.).reshape((4,5)), columns = list('abcde'))

In [155]:

df1

Out[155]:

	a	b	c	d
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

In [156]:

df2

Out[156]:

	a	b	c	d	e
0	0	1	2	3	4
1	5	6	7	8	9
2	10	11	12	13	14
3	15	16	17	18	19

In [157]:

df1 + df2

Out[157]:

	a	b	c	d	e
0	0	2	4	6	NaN
1	9	11	13	15	NaN
2	18	20	22	24	NaN
3	NaN	NaN	NaN	NaN	NaN

In [158]:

df1.add(df2, fill_value=0)

Out[158]:

	a	b	c	d	e
0	0	2	4	6	4
1	9	11	13	15	9
2	18	20	22	24	14
3	15	16	17	18	19

In [159]:

df1.reindex(columns=df2.columns, fill_value=0)

Out[159]:

	a	b	c	d
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

aritmetic methods: add, sub, div, mul

Operations between DataFrame and Series¶

broadcasting¶

In [160]:

arr = np.arange(12.).reshape((3,4))

In [161]:

arr

Out[161]:

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [162]:

arr[0]

Out[162]:

array([ 0.,  1.,  2.,  3.])

In [163]:

arr-arr[0]

Out[163]:

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [164]:

frame = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])

In [165]:

frame

Out[165]:

	b	d	e
Utah	0	1	2
Ohio	3	4	5
Texas	6	7	8
Oregon	9	10	11

In [166]:

series =frame.ix[0]

In [167]:

series

Out[167]:

b    0
d    1
e    2
Name: Utah, dtype: float64

In [168]:

frame-series

Out[168]:

	b	d	e
Utah	0	0	0
Ohio	3	3	3
Texas	6	6	6
Oregon	9	9	9

if an index value is not found¶

In [169]:

series2 = Series(range(3), index=['b','e','f'])

In [170]:

series2

Out[170]:

b    0
e    1
f    2
dtype: int64

In [172]:

frame + series2

Out[172]:

	b	d	e	f
Utah	0	NaN	3	NaN
Ohio	3	NaN	6	NaN
Texas	6	NaN	9	NaN
Oregon	9	NaN	12	NaN

broadcast over the columns¶

In [174]:

series3 = frame['d']

In [175]:

frame

Out[175]:

	b	d	e
Utah	0	1	2
Ohio	3	4	5
Texas	6	7	8
Oregon	9	10	11

In [176]:

series3

Out[176]:

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: float64

In [179]:

frame.sub(series3, axis=0)

Out[179]:

	b	e
Utah	-1	1
Ohio	-1	1
Texas	-1	1
Oregon	-1	1

Function Application and Mapping¶

In [5]:

import numpy as np

In [6]:

frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])

In [7]:

frame

Out[7]:

	b	d	e
Utah	-0.267451	-1.829889	2.544224
Ohio	1.116375	-0.270886	1.086111
Texas	0.390590	-0.339125	0.341343
Oregon	1.145794	-1.268138	-0.925395

In [8]:

np.abs(frame)

Out[8]:

	b	d	e
Utah	0.267451	1.829889	2.544224
Ohio	1.116375	0.270886	1.086111
Texas	0.390590	0.339125	0.341343
Oregon	1.145794	1.268138	0.925395

apply method¶

In [10]:

f = lambda x: x.max()-x.min()

In [13]:

frame.apply(f,axis=1)

Out[13]:

Utah      4.374113
Ohio      1.387261
Texas     0.729715
Oregon    2.413932
dtype: float64

function return a series with multiple values¶

In [14]:

def f(x):
    return Series([x.min(),x.max()], index=['min','max'])

In [17]:

frame.apply(f, axis=0)

Out[17]:

	b	d	e
min	-0.267451	-1.829889	-0.925395
max	1.145794	-0.270886	2.544224

element-wise python functions¶

In [18]:

format = lambda x: '%.2f' %x

In [19]:

frame.applymap(format)

Out[19]:

	b	d	e
Utah	-0.27	-1.83	2.54
Ohio	1.12	-0.27	1.09
Texas	0.39	-0.34	0.34
Oregon	1.15	-1.27	-0.93

map method¶

In [20]:

frame['e'].map(format)

Out[20]:

Utah       2.54
Ohio       1.09
Texas      0.34
Oregon    -0.93
Name: e, dtype: object

Sorting and ranking¶

In [21]:

obj = Series(range(4), index=['d','a','b','c'])

In [22]:

obj.sort_index()

Out[22]:

a    1
b    2
c    3
d    0
dtype: int64

sort by index on either axis¶

In [23]:

frame = DataFrame(np.arange(8).reshape((2,4)), index = ['three','one'],
                  columns = ['d','a','b','c'])

In [24]:

frame.sort_index()

Out[24]:

	d	a	b	c
one	4	5	6	7
three	0	1	2	3

In [25]:

frame.sort_index(axis=1)

Out[25]:

	a	b	c	d
three	1	2	3	0
one	5	6	7	4

descending order¶

In [26]:

frame.sort_index(axis=1, ascending = False)

Out[26]:

	d	c	b	a
three	0	3	2	1
one	4	7	6	5

sort a series by values¶

In [27]:

obj = Series([4,7,-3,2])

In [28]:

obj.order()

Out[28]:

2   -3
3    2
0    4
1    7
dtype: int64

missing values¶

In [29]:

obj = Series([4,np.nan,7,np.nan,-3,2])

In [30]:

obj.order()

Out[30]:

4    -3
5     2
0     4
2     7
1   NaN
3   NaN
dtype: float64

sort values in one or more columns in data frame¶

In [31]:

frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})

In [32]:

frame

Out[32]:

	a	b
0	0	4
1	1	7
2	0	-3
3	1	2

In [33]:

frame.sort_index(by='b')

Out[33]:

	a	b
2	0	-3
3	1	2
0	0	4
1	1	7

In [34]:

frame.sort_index(by = ['a','b'])

Out[34]:

	a	b
2	0	-3
0	0	4
3	1	2
1	1	7

Ranking¶

In [35]:

obj = Series([7, -5, 7, 4, 2, 0, 4])

In [36]:

obj.rank()

Out[36]:

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

ranking according to the order they're observed in the data¶

In [37]:

obj.rank(method='first')

Out[37]:

0    6
1    1
2    7
3    4
4    3
5    2
6    5
dtype: float64

In [38]:

obj.rank(ascending=False, method='max')

Out[38]:

0    2
1    7
2    2
3    4
4    5
5    6
6    4
dtype: float64

ranks over the rows or columns¶

In [39]:

frame = DataFrame({'b':[4.3,7,-3,2], 'a':[0,1,0,1],
                   'c':[-2,5,8,-2.5]})

In [40]:

frame

Out[40]:

	a	b	c
0	0	4.3	-2.0
1	1	7.0	5.0
2	0	-3.0	8.0
3	1	2.0	-2.5

In [41]:

frame.rank(axis=1)

Out[41]:

	a	b	c
0	2	3	1
1	1	3	2
2	2	1	3
3	2	3	1

Tie-breaking methods with rank¶

average (default), min, max, first

Axis indexes with duplicate values¶

In [42]:

obj = Series(range(5),index=['a','a','b','b','c'])

In [43]:

obj

Out[43]:

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [44]:

obj.index.is_unique

Out[44]:

False

In [45]:

obj['a']

Out[45]:

a    0
a    1
dtype: int64

In [46]:

obj['c']

Out[46]:

In [47]:

df = DataFrame(np.random.randn(4,3), index=['a','a','b','b'])

In [48]:

df

Out[48]:

	0	1	2
a	0.654546	-0.963798	0.228551
a	-1.451196	-1.528684	1.567436
b	-1.213714	-1.556472	-0.573577
b	-1.838320	-2.250823	0.440242

In [49]:

df.ix['b']

Out[49]:

	0	1	2
b	-1.213714	-1.556472	-0.573577
b	-1.838320	-2.250823	0.440242

Summarizing and Computing Descriptive Statistics¶

In [50]:

df = DataFrame([[1.4, np.nan], [7.1,-4.5],
                [np.nan, np.nan],[0.75,-1.3]],
               index=['a','b','c','d'],
               columns=['one','two'])

In [51]:

df

Out[51]:

	one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

column sums¶

In [52]:

df.sum()

Out[52]:

one    9.25
two   -5.80
dtype: float64

row sums¶

In [53]:

df.sum(axis=1) 

Out[53]:

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

NA values are excluded unless the entire slice is NA. This can be disables using skipna.

In [54]:

df.mean(axis=1, skipna=False)

Out[54]:

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

indirect statistics¶

In [55]:

df.idxmax()

Out[55]:

one    b
two    d
dtype: object

accumulations¶

In [57]:

df.cumsum()

Out[57]:

	one	two
a	1.40	NaN
b	8.50	-4.5
c	NaN	NaN
d	9.25	-5.8

multiple summary statistics¶

In [59]:

df.describe()

Out[59]:

	one	two
count	3.000000	2.000000
mean	3.083333	-2.900000
std	3.493685	2.262742
min	0.750000	-4.500000
25%	1.075000	-3.700000
50%	1.400000	-2.900000
75%	4.250000	-2.100000
max	7.100000	-1.300000

non-numeric data¶

In [60]:

obj = Series(['a','a','b','c']*4)

In [61]:

obj.describe()

Out[61]:

count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance¶

In [62]:

import pandas.io.data as web

In [63]:

all_data = {}

In [72]:

for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,'04/11/2010','04/14/2014')

In [73]:

price =  DataFrame({tic: data['Adj Close']
                    for tic, data in all_data.iteritems()})

In [74]:

volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})

In [75]:

returns = price.pct_change()

In [77]:

returns.tail()

Out[77]:

	AAPL	GOOG	IBM	MSFT
Date
2014-04-08	-0.000136	0.031125	-0.006328	0.000516
2014-04-09	0.013219	0.016652	0.017355	0.016508
2014-04-10	-0.012912	-0.041107	-0.004904	-0.027404
2014-04-11	-0.007358	-0.019133	-0.002516	-0.003913
2014-04-14	0.003981	0.003619	0.013244	-0.000786

In [78]:

returns.MSFT.corr(returns.IBM)

Out[78]:

0.50623670894935635

In [79]:

returns.MSFT.cov(returns.IBM)

Out[79]:

8.8008494273331047e-05

full correlation or covariance¶

In [80]:

returns.corr()

Out[80]:

	AAPL	GOOG	IBM	MSFT
AAPL	1.000000	0.737439	0.384410	0.336002
GOOG	0.737439	1.000000	0.329676	0.717087
IBM	0.384410	0.329676	1.000000	0.506237
MSFT	0.336002	0.717087	0.506237	1.000000

In [81]:

returns.cov()

Out[81]:

	AAPL	GOOG	IBM	MSFT
AAPL	0.000301	0.000152	0.000080	0.000085
GOOG	0.000152	0.000528	0.000069	0.000265
IBM	0.000080	0.000069	0.000143	0.000088
MSFT	0.000085	0.000265	0.000088	0.000211

In [82]:

returns.corrwith(returns.IBM)

Out[82]:

AAPL    0.384410
GOOG    0.329676
IBM     1.000000
MSFT    0.506237
dtype: float64

correlations of matching column names¶

In [84]:

returns.corrwith(volume)

Out[84]:

AAPL   -0.128694
GOOG   -0.478992
IBM    -0.152372
MSFT   -0.102546
dtype: float64

Unique values, value count, and membership¶

In [85]:

obj = Series(['c','a','d','a','a','b','b','c','c'])

In [86]:

uniques = obj.unique()

In [87]:

uniques

Out[87]:

array(['c', 'a', 'd', 'b'], dtype=object)

In [89]:

obj.value_counts()

Out[89]:

c    3
a    3
b    2
d    1
dtype: int64

In [91]:

pd.value_counts(obj.values, sort=False)

Out[91]:

a    3
c    3
b    2
d    1
dtype: int64

In [93]:

mask = obj.isin(['b','c'])

In [95]:

mask

Out[95]:

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [96]:

obj[mask]

Out[96]:

0    c
5    b
6    b
7    c
8    c
dtype: object

compute a histogram¶

In [97]:

data = DataFrame({'Quo1':[1,3,4,3,4],
                  'Quo2': [2,3,1,2,3],
                  'Quo3':[1,5,2,4,4]})

In [98]:

data

Out[98]:

	Quo1	Quo2	Quo3
0	1	2	1
1	3	3	5
2	4	1	2
3	3	2	4
4	4	3	4

In [100]:

result = data.apply(pd.value_counts).fillna(0)

In [101]:

result

Out[101]:

	Quo1	Quo2	Quo3
1	1	1	1
2	0	2	1
3	2	2	0
4	2	0	2
5	0	0	1

Handling Missing Data¶

In [103]:

string_data = Series(['aardvark','artichoke',np.nan, 'avocado'])

In [104]:

string_data

Out[104]:

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [105]:

string_data.isnull()

Out[105]:

0    False
1    False
2     True
3    False
dtype: bool

built-in Python's none¶

In [106]:

string_data[0] = None

In [107]:

string_data.isnull()

Out[107]:

0     True
1    False
2     True
3    False
dtype: bool

NA handling methods: dropna, fillna, isnull, notnull

FIltering out missing data¶

In [109]:

from numpy import nan as NA

In [110]:

data = Series([1, NA, 3.5, NA, 7])

In [111]:

data.dropna()

Out[111]:

0    1.0
2    3.5
4    7.0
dtype: float64

using boolean indexing¶

In [112]:

data[data.notnull()]

Out[112]:

0    1.0
2    3.5
4    7.0
dtype: float64

dropna with dataframa¶

In [113]:

data = DataFrame([[1.,6.5,3.],[1.,NA,NA],
                  [NA,NA,NA],[NA,6.5,3.]])

In [114]:

data

Out[114]:

	0	1	2
0	1	6.5	3
1	1	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3

In [115]:

cleaned = data.dropna()

In [116]:

cleaned

Out[116]:

	0	1	2
0	1	6.5	3

drop rows that are all NA¶

In [117]:

data.dropna(how='all')

Out[117]:

	0	1	2
0	1	6.5	3
1	1	NaN	NaN
3	NaN	6.5	3

drop columns that are all NA¶

In [118]:

data[4]=NA

In [119]:

data

Out[119]:

	0	1	2	4
0	1	6.5	3	NaN
1	1	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN
3	NaN	6.5	3	NaN

In [120]:

data.dropna(axis=1,how='all')

Out[120]:

	0	1	2
0	1	6.5	3
1	1	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3

keep only rows contaiing a certain no of obs¶

In [121]:

df = DataFrame(np.random.randn(7,3))

In [122]:

df.ix[:4,1]=NA; df.ix[:2,2]=NA

In [123]:

df

Out[123]:

	0	1	2
0	-0.676323	NaN	NaN
1	-1.088497	NaN	NaN
2	0.122962	NaN	NaN
3	2.087705	NaN	1.538338
4	-1.267822	NaN	-0.924860
5	-1.596591	-1.001367	-0.705282
6	-0.026753	-0.694895	-0.386125

In [124]:

df.dropna(thresh=2)

Out[124]:

	0	1	2
3	2.087705	NaN	1.538338
4	-1.267822	NaN	-0.924860
5	-1.596591	-1.001367	-0.705282
6	-0.026753	-0.694895	-0.386125

Filling in Missing Data¶

In [125]:

df.fillna(0)

Out[125]:

	0	1	2
0	-0.676323	0.000000	0.000000
1	-1.088497	0.000000	0.000000
2	0.122962	0.000000	0.000000
3	2.087705	0.000000	1.538338
4	-1.267822	0.000000	-0.924860
5	-1.596591	-1.001367	-0.705282
6	-0.026753	-0.694895	-0.386125

different fill value for each column¶

In [126]:

df.fillna({1:0.5,3:-1})

Out[126]:

	0	1	2
0	-0.676323	0.500000	NaN
1	-1.088497	0.500000	NaN
2	0.122962	0.500000	NaN
3	2.087705	0.500000	1.538338
4	-1.267822	0.500000	-0.924860
5	-1.596591	-1.001367	-0.705282
6	-0.026753	-0.694895	-0.386125

modify existing project in place¶

In [127]:

_ = df.fillna(0, inplace=True)

In [128]:

df

Out[128]:

	0	1	2
0	-0.676323	0.000000	0.000000
1	-1.088497	0.000000	0.000000
2	0.122962	0.000000	0.000000
3	2.087705	0.000000	1.538338
4	-1.267822	0.000000	-0.924860
5	-1.596591	-1.001367	-0.705282
6	-0.026753	-0.694895	-0.386125

interpolation methods¶

In [129]:

df = DataFrame(np.random.randn(6,3))

In [130]:

df.ix[2:,1]=NA; df.ix[4:,2]=NA

In [131]:

df

Out[131]:

	0	1	2
0	-0.570090	-0.513487	-0.707382
1	1.453435	0.440059	-0.292707
2	-0.045053	NaN	0.951857
3	1.277519	NaN	-0.463472
4	-2.012467	NaN	NaN
5	-0.854318	NaN	NaN

In [132]:

df.fillna(method = 'ffill')

Out[132]:

	0	1	2
0	-0.570090	-0.513487	-0.707382
1	1.453435	0.440059	-0.292707
2	-0.045053	0.440059	0.951857
3	1.277519	0.440059	-0.463472
4	-2.012467	0.440059	-0.463472
5	-0.854318	0.440059	-0.463472

In [133]:

df.fillna(method='ffill',limit=2)

Out[133]:

	0	1	2
0	-0.570090	-0.513487	-0.707382
1	1.453435	0.440059	-0.292707
2	-0.045053	0.440059	0.951857
3	1.277519	0.440059	-0.463472
4	-2.012467	NaN	-0.463472
5	-0.854318	NaN	-0.463472

pass the mean or median value of a Series¶

In [134]:

data=Series([1.,NA,3.5,NA,7])

In [135]:

data.fillna(data.mean())

Out[135]:

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

fillna function arguments : value, method, axis, inplace, limit

Hierarchical Indexing¶

In [136]:

data = Series(np.random.randn(10),
              index = [['a','a','a','b','b','b','c','c','d','d'],
                       [1,2,3,1,2,3,1,2,2,3]])

In [137]:

data

Out[137]:

a  1   -0.734465
   2   -0.775564
   3   -2.134157
b  1    1.169687
   2    0.655510
   3    0.229640
c  1    0.607370
   2   -0.227422
d  2   -0.319022
   3   -1.861461
dtype: float64

In [138]:

data.index

Out[138]:

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

partial indexing¶

In [139]:

data['b']

Out[139]:

1    1.169687
2    0.655510
3    0.229640
dtype: float64

In [140]:

data['b':'c']

Out[140]:

b  1    1.169687
   2    0.655510
   3    0.229640
c  1    0.607370
   2   -0.227422
dtype: float64

In [141]:

data.ix[['b','d']]

Out[141]:

b  1    1.169687
   2    0.655510
   3    0.229640
d  2   -0.319022
   3   -1.861461
dtype: float64

selection from an inner level¶

In [142]:

data[:,2]

Out[142]:

a   -0.775564
b    0.655510
c   -0.227422
d   -0.319022
dtype: float64

rearranged into a DataFrame¶

In [143]:

data.unstack()

Out[143]:

	1	2	3
a	-0.734465	-0.775564	-2.134157
b	1.169687	0.655510	0.229640
c	0.607370	-0.227422	NaN
d	NaN	-0.319022	-1.861461

inverse operation of unstack¶

In [144]:

data.unstack().stack()

Out[144]:

a  1   -0.734465
   2   -0.775564
   3   -2.134157
b  1    1.169687
   2    0.655510
   3    0.229640
c  1    0.607370
   2   -0.227422
d  2   -0.319022
   3   -1.861461
dtype: float64

either axis can have a hierarchical index:¶

In [145]:

frame = DataFrame(np.arange(12).reshape((4,3)),
                  index = [['a','a','b','b'],[1,2,1,2]],
                  columns = [['Ohio','Ohio','Colorado'],
                             ['Green','Red','Green']])

In [146]:

frame

Out[146]:

		Ohio		Colorado
		Green	Red	Green
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

hierarchical levels can have names¶

In [147]:

frame.index.names = ['key1', 'key2']

In [149]:

frame.columns.names = ['state', 'color']

In [150]:

frame

Out[150]:

	state	Ohio		Colorado
	color	Green	Red	Green
key1	key2
a	1	0	1	2
a	2	3	4	5
b	1	6	7	8
b	2	9	10	11

partial column indexing¶