%pylab inline
Populating the interactive namespace from numpy and matplotlib
from pandas import Series, DataFrame
import pandas as pd
obj = Series([4, 7, -5, 3])
obj
0 4 1 7 2 -5 3 3 dtype: int64
obj.values
array([ 4, 7, -5, 3])
obj.index
Int64Index([0, 1, 2, 3], dtype='int64')
obj2 = Series([4,7,-5,3], index = ['d','b','a','c'])
obj2
d 4 b 7 a -5 c 3 dtype: int64
obj2.values
array([ 4, 7, -5, 3])
obj2.index
Index([u'd', u'b', u'a', u'c'], dtype='object')
obj2['a']
-5
obj2[['c','a','d']]
c 3 a -5 d 4 dtype: int64
obj2[obj2>0]
d 4 b 7 c 3 dtype: int64
obj2*2
d 8 b 14 a -10 c 6 dtype: int64
np.exp(obj2)
d 54.598150 b 1096.633158 a 0.006738 c 20.085537 dtype: float64
'b' in obj2
True
'e' in obj2
False
sdata = {'Ohio':3500,'Texas':71000,'Oregon':16000, 'Utah':5000}
obj3 = Series(sdata)
obj3
Ohio 3500 Oregon 16000 Texas 71000 Utah 5000 dtype: int64
states = ['California','Ohio','Oregon','Texas']
obj4 = Series(sdata, index=states)
obj4
California NaN Ohio 3500 Oregon 16000 Texas 71000 dtype: float64
pd.isnull(obj4)
California True Ohio False Oregon False Texas False dtype: bool
pd.notnull(obj4)
California False Ohio True Oregon True Texas True dtype: bool
obj4.isnull()
California True Ohio False Oregon False Texas False dtype: bool
obj3
Ohio 3500 Oregon 16000 Texas 71000 Utah 5000 dtype: int64
obj4
California NaN Ohio 3500 Oregon 16000 Texas 71000 dtype: float64
obj3 + obj4
California NaN Ohio 7000 Oregon 32000 Texas 142000 Utah NaN dtype: float64
obj4.name = 'population'
obj4.index.name='state'
obj4
state California NaN Ohio 3500 Oregon 16000 Texas 71000 Name: population, dtype: float64
obj.index = ['Bob','Steve','Jeff','Ryan']
obj
Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year':[2000, 2001, 2002, 2001, 2002],
'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame
pop | state | year | |
---|---|---|---|
0 | 1.5 | Ohio | 2000 |
1 | 1.7 | Ohio | 2001 |
2 | 3.6 | Ohio | 2002 |
3 | 2.4 | Nevada | 2001 |
4 | 2.9 | Nevada | 2002 |
frame2 = DataFrame(data, columns = ['year','state','pop','debt'],
index=['one','two','three','four','five'])
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
frame2.columns
Index([u'year', u'state', u'pop', u'debt'], dtype='object')
frame2['state']
one Ohio two Ohio three Ohio four Nevada five Nevada Name: state, dtype: object
frame2.year
one 2000 two 2001 three 2002 four 2001 five 2002 Name: year, dtype: int64
frame2.ix['three']
year 2002 state Ohio pop 3.6 debt NaN Name: three, dtype: object
frame2.ix[2]
year 2002 state Ohio pop 3.6 debt NaN Name: three, dtype: object
frame2['debt']=16.5
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 16.5 |
two | 2001 | Ohio | 1.7 | 16.5 |
three | 2002 | Ohio | 3.6 | 16.5 |
four | 2001 | Nevada | 2.4 | 16.5 |
five | 2002 | Nevada | 2.9 | 16.5 |
frame2['debt']=np.arange(5.)
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 0 |
two | 2001 | Ohio | 1.7 | 1 |
three | 2002 | Ohio | 3.6 | 2 |
four | 2001 | Nevada | 2.4 | 3 |
five | 2002 | Nevada | 2.9 | 4 |
val = Series([1.2, 1.5, -1.7], index = ['two', 'four','five'])
frame2['debt']=val
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | 1.2 |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | 1.5 |
five | 2002 | Nevada | 2.9 | -1.7 |
frame2['eastern'] = frame2.state=='Ohio'
frame2
year | state | pop | debt | eastern | |
---|---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN | True |
two | 2001 | Ohio | 1.7 | 1.2 | True |
three | 2002 | Ohio | 3.6 | NaN | True |
four | 2001 | Nevada | 2.4 | 1.5 | False |
five | 2002 | Nevada | 2.9 | -1.7 | False |
del frame2['eastern']
frame2.columns
Index([u'year', u'state', u'pop', u'debt'], dtype='object')
pop={'Nevada':{2001:2.4, 2002:2.9},
'Ohio': {2000:1.5, 2001:1.7,2002:3.6} }
frame3 = DataFrame(pop)
frame3
Nevada | Ohio | |
---|---|---|
2000 | NaN | 1.5 |
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
frame3.T
2000 | 2001 | 2002 | |
---|---|---|---|
Nevada | NaN | 2.4 | 2.9 |
Ohio | 1.5 | 1.7 | 3.6 |
Note that the keys in the inner dicts are unioned and sorted to form the index in the resul. This is not true if an explicit index is specified.
DataFrame(pop, index=[2001, 2002, 2003])
Nevada | Ohio | |
---|---|---|
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
2003 | NaN | NaN |
pdata = {'Ohio':frame3['Ohio'][:-1],
'Nevada':frame3['Nevada'][:2]}
DataFrame(pdata)
Nevada | Ohio | |
---|---|---|
2000 | NaN | 1.5 |
2001 | 2.4 | 1.7 |
frame3.index.name='year'
frame3.columns.name = 'state'
frame3
state | Nevada | Ohio |
---|---|---|
year | ||
2000 | NaN | 1.5 |
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
frame3.values
array([[ nan, 1.5], [ 2.4, 1.7], [ 2.9, 3.6]])
frame2.values
array([[2000, 'Ohio', 1.5, nan], [2001, 'Ohio', 1.7, 1.2], [2002, 'Ohio', 3.6, nan], [2001, 'Nevada', 2.4, 1.5], [2002, 'Nevada', 2.9, -1.7]], dtype=object)
obj = Series(range(3), index=['a','b','c'])
index = obj.index
index[1:]
Index([u'b', u'c'], dtype='object')
index[1]='d'
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-75-d3f90986bdb1> in <module>() ----> 1 index[1]='d' /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/base.pyc in _disabled(self, *args, **kwargs) 180 """This method will not function because object is immutable.""" 181 raise TypeError("'%s' does not support mutable operations." % --> 182 self.__class__) 183 184 __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled TypeError: '<class 'pandas.core.index.Index'>' does not support mutable operations.
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0], index=index)
obj2.index is index
True
frame3
state | Nevada | Ohio |
---|---|---|
year | ||
2000 | NaN | 1.5 |
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
'Ohio' in frame3
True
2003 in frame3.index
False
frame3.index-frame2.index
Int64Index([2000, 2001, 2002], dtype='int64')
frame3.index.diff(frame2.index)
Int64Index([2000, 2001, 2002], dtype='int64')
Index methods: append, diff, intersection, union, isin, delete, drop, insert, is_monotonic, is_unique, uniuq
obj = Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
obj
d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64
obj.reindex(['a','b','c','d','e'], fill_value=0)
a -5.3 b 7.2 c 3.6 d 4.5 e 0.0 dtype: float64
obj
d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3.reindex(range(6))
0 blue 1 NaN 2 purple 3 NaN 4 yellow 5 NaN dtype: object
obj3.reindex(range(6),method='ffill') # forward fills the values
0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
available method options: ffill or pad, bfill or backfill
frame = DataFrame(np.arange(9).reshape(3,3), index = ['a','c','d'],
columns=['Ohio','Texas','California'])
frame
Ohio | Texas | California | |
---|---|---|---|
a | 0 | 1 | 2 |
c | 3 | 4 | 5 |
d | 6 | 7 | 8 |
frame2 = frame.reindex(['a','b','c','d'])
frame2
Ohio | Texas | California | |
---|---|---|---|
a | 0 | 1 | 2 |
b | NaN | NaN | NaN |
c | 3 | 4 | 5 |
d | 6 | 7 | 8 |
states = ['Texas','Utah','California']
frame.reindex(columns=states)
Texas | Utah | California | |
---|---|---|---|
a | 1 | NaN | 2 |
c | 4 | NaN | 5 |
d | 7 | NaN | 8 |
frame.reindex(index=['a','b','c','d'], method='ffill',
columns=states)
Texas | Utah | California | |
---|---|---|---|
a | 1 | NaN | 2 |
b | 1 | NaN | 2 |
c | 4 | NaN | 5 |
d | 7 | NaN | 8 |
frame
Ohio | Texas | California | |
---|---|---|---|
a | 0 | 1 | 2 |
c | 3 | 4 | 5 |
d | 6 | 7 | 8 |
frame.ix[['a','b','c','d'], states]
Texas | Utah | California | |
---|---|---|---|
a | 1 | NaN | 2 |
b | NaN | NaN | NaN |
c | 4 | NaN | 5 |
d | 7 | NaN | 8 |
obj = Series(np.arange(5.), index = ['a','b','c','d','e'])
obj
a 0 b 1 c 2 d 3 e 4 dtype: float64
new_obj = obj.drop('c')
new_obj
a 0 b 1 d 3 e 4 dtype: float64
obj.drop(['d','c'])
a 0 b 1 e 4 dtype: float64
data = DataFrame(np.arange(16).reshape((4,4)),
index = ['Ohio','Colorado','Utah','New York'],
columns = ['one','two','three','four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data.drop(['Colorado','Ohio'])
one | two | three | four | |
---|---|---|---|---|
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data.drop('two', axis=1)
one | three | four | |
---|---|---|---|
Ohio | 0 | 2 | 3 |
Colorado | 4 | 6 | 7 |
Utah | 8 | 10 | 11 |
New York | 12 | 14 | 15 |
data.drop(['two','four'], axis=1)
one | three | |
---|---|---|
Ohio | 0 | 2 |
Colorado | 4 | 6 |
Utah | 8 | 10 |
New York | 12 | 14 |
obj = Series(np.arange(4.), index=['a','b','c','d'])
obj['b']
1.0
obj[2:4]
c 2 d 3 dtype: float64
obj[['b','a','d']]
b 1 a 0 d 3 dtype: float64
obj[[1,3]]
b 1 d 3 dtype: float64
obj[obj<2]
a 0 b 1 dtype: float64
obj['b':'c']
b 1 c 2 dtype: float64
obj['b':'c']=5
obj
a 0 b 5 c 5 d 3 dtype: float64
data=DataFrame(np.arange(16).reshape((4,4)),
index=['Ohio','Colorado','Utah','New York'],
columns=['one','two','three','four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data['two']
Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64
data[['three','one']]
three | one | |
---|---|---|
Ohio | 2 | 0 |
Colorado | 6 | 4 |
Utah | 10 | 8 |
New York | 14 | 12 |
data[:2]
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
data[data['three']>5]
one | two | three | four | |
---|---|---|---|---|
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data<5
one | two | three | four | |
---|---|---|---|---|
Ohio | True | True | True | True |
Colorado | True | False | False | False |
Utah | False | False | False | False |
New York | False | False | False | False |
data[data<5]=0
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data.ix['Colorado',['two','three']]
two 5 three 6 Name: Colorado, dtype: int64
data.ix[['Colorado','Utah'],[3,0,1]]
four | one | two | |
---|---|---|---|
Colorado | 7 | 0 | 5 |
Utah | 11 | 8 | 9 |
data.ix[2]
one 8 two 9 three 10 four 11 Name: Utah, dtype: int64
data.ix[:'Utah','two']
Ohio 0 Colorado 5 Utah 9 Name: two, dtype: int64
data.ix[data.three>5,:3]
one | two | three | |
---|---|---|---|
Colorado | 0 | 5 | 6 |
Utah | 8 | 9 | 10 |
New York | 12 | 13 | 14 |
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])
s1
a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64
s2
a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64
s1+s2
a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
df1 = DataFrame(np.arange(9.).reshape((3,3)),
columns = list('bcd'),
index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)),
columns = list('bde'),
index=['Utah','Ohio','Texas','Oregon'])
df1
b | c | d | |
---|---|---|---|
Ohio | 0 | 1 | 2 |
Texas | 3 | 4 | 5 |
Colorado | 6 | 7 | 8 |
df2
b | d | e | |
---|---|---|---|
Utah | 0 | 1 | 2 |
Ohio | 3 | 4 | 5 |
Texas | 6 | 7 | 8 |
Oregon | 9 | 10 | 11 |
df1 + df2
b | c | d | e | |
---|---|---|---|---|
Colorado | NaN | NaN | NaN | NaN |
Ohio | 3 | NaN | 6 | NaN |
Oregon | NaN | NaN | NaN | NaN |
Texas | 9 | NaN | 12 | NaN |
Utah | NaN | NaN | NaN | NaN |
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)), columns = list('abcde'))
df1
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0 | 1 | 2 | 3 | 4 |
1 | 5 | 6 | 7 | 8 | 9 |
2 | 10 | 11 | 12 | 13 | 14 |
3 | 15 | 16 | 17 | 18 | 19 |
df1 + df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0 | 2 | 4 | 6 | NaN |
1 | 9 | 11 | 13 | 15 | NaN |
2 | 18 | 20 | 22 | 24 | NaN |
3 | NaN | NaN | NaN | NaN | NaN |
df1.add(df2, fill_value=0)
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0 | 2 | 4 | 6 | 4 |
1 | 9 | 11 | 13 | 15 | 9 |
2 | 18 | 20 | 22 | 24 | 14 |
3 | 15 | 16 | 17 | 18 | 19 |
df1.reindex(columns=df2.columns, fill_value=0)
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0 | 1 | 2 | 3 | 0 |
1 | 4 | 5 | 6 | 7 | 0 |
2 | 8 | 9 | 10 | 11 | 0 |
aritmetic methods: add, sub, div, mul
arr = np.arange(12.).reshape((3,4))
arr
array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]])
arr[0]
array([ 0., 1., 2., 3.])
arr-arr[0]
array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]])
frame = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'),
index=['Utah','Ohio','Texas','Oregon'])
frame
b | d | e | |
---|---|---|---|
Utah | 0 | 1 | 2 |
Ohio | 3 | 4 | 5 |
Texas | 6 | 7 | 8 |
Oregon | 9 | 10 | 11 |
series =frame.ix[0]
series
b 0 d 1 e 2 Name: Utah, dtype: float64
frame-series
b | d | e | |
---|---|---|---|
Utah | 0 | 0 | 0 |
Ohio | 3 | 3 | 3 |
Texas | 6 | 6 | 6 |
Oregon | 9 | 9 | 9 |
series2 = Series(range(3), index=['b','e','f'])
series2
b 0 e 1 f 2 dtype: int64
frame + series2
b | d | e | f | |
---|---|---|---|---|
Utah | 0 | NaN | 3 | NaN |
Ohio | 3 | NaN | 6 | NaN |
Texas | 6 | NaN | 9 | NaN |
Oregon | 9 | NaN | 12 | NaN |
series3 = frame['d']
frame
b | d | e | |
---|---|---|---|
Utah | 0 | 1 | 2 |
Ohio | 3 | 4 | 5 |
Texas | 6 | 7 | 8 |
Oregon | 9 | 10 | 11 |
series3
Utah 1 Ohio 4 Texas 7 Oregon 10 Name: d, dtype: float64
frame.sub(series3, axis=0)
b | d | e | |
---|---|---|---|
Utah | -1 | 0 | 1 |
Ohio | -1 | 0 | 1 |
Texas | -1 | 0 | 1 |
Oregon | -1 | 0 | 1 |
import numpy as np
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
index=['Utah','Ohio','Texas','Oregon'])
frame
b | d | e | |
---|---|---|---|
Utah | -0.267451 | -1.829889 | 2.544224 |
Ohio | 1.116375 | -0.270886 | 1.086111 |
Texas | 0.390590 | -0.339125 | 0.341343 |
Oregon | 1.145794 | -1.268138 | -0.925395 |
np.abs(frame)
b | d | e | |
---|---|---|---|
Utah | 0.267451 | 1.829889 | 2.544224 |
Ohio | 1.116375 | 0.270886 | 1.086111 |
Texas | 0.390590 | 0.339125 | 0.341343 |
Oregon | 1.145794 | 1.268138 | 0.925395 |
f = lambda x: x.max()-x.min()
frame.apply(f,axis=1)
Utah 4.374113 Ohio 1.387261 Texas 0.729715 Oregon 2.413932 dtype: float64
def f(x):
return Series([x.min(),x.max()], index=['min','max'])
frame.apply(f, axis=0)
b | d | e | |
---|---|---|---|
min | -0.267451 | -1.829889 | -0.925395 |
max | 1.145794 | -0.270886 | 2.544224 |
format = lambda x: '%.2f' %x
frame.applymap(format)
b | d | e | |
---|---|---|---|
Utah | -0.27 | -1.83 | 2.54 |
Ohio | 1.12 | -0.27 | 1.09 |
Texas | 0.39 | -0.34 | 0.34 |
Oregon | 1.15 | -1.27 | -0.93 |
frame['e'].map(format)
Utah 2.54 Ohio 1.09 Texas 0.34 Oregon -0.93 Name: e, dtype: object
obj = Series(range(4), index=['d','a','b','c'])
obj.sort_index()
a 1 b 2 c 3 d 0 dtype: int64
frame = DataFrame(np.arange(8).reshape((2,4)), index = ['three','one'],
columns = ['d','a','b','c'])
frame.sort_index()
d | a | b | c | |
---|---|---|---|---|
one | 4 | 5 | 6 | 7 |
three | 0 | 1 | 2 | 3 |
frame.sort_index(axis=1)
a | b | c | d | |
---|---|---|---|---|
three | 1 | 2 | 3 | 0 |
one | 5 | 6 | 7 | 4 |
frame.sort_index(axis=1, ascending = False)
d | c | b | a | |
---|---|---|---|---|
three | 0 | 3 | 2 | 1 |
one | 4 | 7 | 6 | 5 |
obj = Series([4,7,-3,2])
obj.order()
2 -3 3 2 0 4 1 7 dtype: int64
obj = Series([4,np.nan,7,np.nan,-3,2])
obj.order()
4 -3 5 2 0 4 2 7 1 NaN 3 NaN dtype: float64
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame
a | b | |
---|---|---|
0 | 0 | 4 |
1 | 1 | 7 |
2 | 0 | -3 |
3 | 1 | 2 |
frame.sort_index(by='b')
a | b | |
---|---|---|
2 | 0 | -3 |
3 | 1 | 2 |
0 | 0 | 4 |
1 | 1 | 7 |
frame.sort_index(by = ['a','b'])
a | b | |
---|---|---|
2 | 0 | -3 |
0 | 0 | 4 |
3 | 1 | 2 |
1 | 1 | 7 |
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
obj.rank(method='first')
0 6 1 1 2 7 3 4 4 3 5 2 6 5 dtype: float64
obj.rank(ascending=False, method='max')
0 2 1 7 2 2 3 4 4 5 5 6 6 4 dtype: float64
frame = DataFrame({'b':[4.3,7,-3,2], 'a':[0,1,0,1],
'c':[-2,5,8,-2.5]})
frame
a | b | c | |
---|---|---|---|
0 | 0 | 4.3 | -2.0 |
1 | 1 | 7.0 | 5.0 |
2 | 0 | -3.0 | 8.0 |
3 | 1 | 2.0 | -2.5 |
frame.rank(axis=1)
a | b | c | |
---|---|---|---|
0 | 2 | 3 | 1 |
1 | 1 | 3 | 2 |
2 | 2 | 1 | 3 |
3 | 2 | 3 | 1 |
average (default), min, max, first
obj = Series(range(5),index=['a','a','b','b','c'])
obj
a 0 a 1 b 2 b 3 c 4 dtype: int64
obj.index.is_unique
False
obj['a']
a 0 a 1 dtype: int64
obj['c']
4
df = DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
df
0 | 1 | 2 | |
---|---|---|---|
a | 0.654546 | -0.963798 | 0.228551 |
a | -1.451196 | -1.528684 | 1.567436 |
b | -1.213714 | -1.556472 | -0.573577 |
b | -1.838320 | -2.250823 | 0.440242 |
df.ix['b']
0 | 1 | 2 | |
---|---|---|---|
b | -1.213714 | -1.556472 | -0.573577 |
b | -1.838320 | -2.250823 | 0.440242 |
df = DataFrame([[1.4, np.nan], [7.1,-4.5],
[np.nan, np.nan],[0.75,-1.3]],
index=['a','b','c','d'],
columns=['one','two'])
df
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.5 |
c | NaN | NaN |
d | 0.75 | -1.3 |
df.sum()
one 9.25 two -5.80 dtype: float64
df.sum(axis=1)
a 1.40 b 2.60 c NaN d -0.55 dtype: float64
NA values are excluded unless the entire slice is NA. This can be disables using skipna.
df.mean(axis=1, skipna=False)
a NaN b 1.300 c NaN d -0.275 dtype: float64
df.idxmax()
one b two d dtype: object
df.cumsum()
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 8.50 | -4.5 |
c | NaN | NaN |
d | 9.25 | -5.8 |
df.describe()
one | two | |
---|---|---|
count | 3.000000 | 2.000000 |
mean | 3.083333 | -2.900000 |
std | 3.493685 | 2.262742 |
min | 0.750000 | -4.500000 |
25% | 1.075000 | -3.700000 |
50% | 1.400000 | -2.900000 |
75% | 4.250000 | -2.100000 |
max | 7.100000 | -1.300000 |
obj = Series(['a','a','b','c']*4)
obj.describe()
count 16 unique 3 top a freq 8 dtype: object
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
all_data[ticker] = web.get_data_yahoo(ticker,'04/11/2010','04/14/2014')
price = DataFrame({tic: data['Adj Close']
for tic, data in all_data.iteritems()})
volume = DataFrame({tic: data['Volume']
for tic, data in all_data.iteritems()})
returns = price.pct_change()
returns.tail()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
Date | ||||
2014-04-08 | -0.000136 | 0.031125 | -0.006328 | 0.000516 |
2014-04-09 | 0.013219 | 0.016652 | 0.017355 | 0.016508 |
2014-04-10 | -0.012912 | -0.041107 | -0.004904 | -0.027404 |
2014-04-11 | -0.007358 | -0.019133 | -0.002516 | -0.003913 |
2014-04-14 | 0.003981 | 0.003619 | 0.013244 | -0.000786 |
returns.MSFT.corr(returns.IBM)
0.50623670894935635
returns.MSFT.cov(returns.IBM)
8.8008494273331047e-05
returns.corr()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
AAPL | 1.000000 | 0.737439 | 0.384410 | 0.336002 |
GOOG | 0.737439 | 1.000000 | 0.329676 | 0.717087 |
IBM | 0.384410 | 0.329676 | 1.000000 | 0.506237 |
MSFT | 0.336002 | 0.717087 | 0.506237 | 1.000000 |
returns.cov()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
AAPL | 0.000301 | 0.000152 | 0.000080 | 0.000085 |
GOOG | 0.000152 | 0.000528 | 0.000069 | 0.000265 |
IBM | 0.000080 | 0.000069 | 0.000143 | 0.000088 |
MSFT | 0.000085 | 0.000265 | 0.000088 | 0.000211 |
returns.corrwith(returns.IBM)
AAPL 0.384410 GOOG 0.329676 IBM 1.000000 MSFT 0.506237 dtype: float64
returns.corrwith(volume)
AAPL -0.128694 GOOG -0.478992 IBM -0.152372 MSFT -0.102546 dtype: float64
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()
c 3 a 3 b 2 d 1 dtype: int64
pd.value_counts(obj.values, sort=False)
a 3 c 3 b 2 d 1 dtype: int64
mask = obj.isin(['b','c'])
mask
0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
obj[mask]
0 c 5 b 6 b 7 c 8 c dtype: object
data = DataFrame({'Quo1':[1,3,4,3,4],
'Quo2': [2,3,1,2,3],
'Quo3':[1,5,2,4,4]})
data
Quo1 | Quo2 | Quo3 | |
---|---|---|---|
0 | 1 | 2 | 1 |
1 | 3 | 3 | 5 |
2 | 4 | 1 | 2 |
3 | 3 | 2 | 4 |
4 | 4 | 3 | 4 |
result = data.apply(pd.value_counts).fillna(0)
result
Quo1 | Quo2 | Quo3 | |
---|---|---|---|
1 | 1 | 1 | 1 |
2 | 0 | 2 | 1 |
3 | 2 | 2 | 0 |
4 | 2 | 0 | 2 |
5 | 0 | 0 | 1 |
string_data = Series(['aardvark','artichoke',np.nan, 'avocado'])
string_data
0 aardvark 1 artichoke 2 NaN 3 avocado dtype: object
string_data.isnull()
0 False 1 False 2 True 3 False dtype: bool
string_data[0] = None
string_data.isnull()
0 True 1 False 2 True 3 False dtype: bool
NA handling methods: dropna, fillna, isnull, notnull
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
0 1.0 2 3.5 4 7.0 dtype: float64
data[data.notnull()]
0 1.0 2 3.5 4 7.0 dtype: float64
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],
[NA,NA,NA],[NA,6.5,3.]])
data
0 | 1 | 2 | |
---|---|---|---|
0 | 1 | 6.5 | 3 |
1 | 1 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3 |
cleaned = data.dropna()
cleaned
0 | 1 | 2 | |
---|---|---|---|
0 | 1 | 6.5 | 3 |
data.dropna(how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1 | 6.5 | 3 |
1 | 1 | NaN | NaN |
3 | NaN | 6.5 | 3 |
data[4]=NA
data
0 | 1 | 2 | 4 | |
---|---|---|---|---|
0 | 1 | 6.5 | 3 | NaN |
1 | 1 | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN |
3 | NaN | 6.5 | 3 | NaN |
data.dropna(axis=1,how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1 | 6.5 | 3 |
1 | 1 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3 |
df = DataFrame(np.random.randn(7,3))
df.ix[:4,1]=NA; df.ix[:2,2]=NA
df
0 | 1 | 2 | |
---|---|---|---|
0 | -0.676323 | NaN | NaN |
1 | -1.088497 | NaN | NaN |
2 | 0.122962 | NaN | NaN |
3 | 2.087705 | NaN | 1.538338 |
4 | -1.267822 | NaN | -0.924860 |
5 | -1.596591 | -1.001367 | -0.705282 |
6 | -0.026753 | -0.694895 | -0.386125 |
df.dropna(thresh=2)
0 | 1 | 2 | |
---|---|---|---|
3 | 2.087705 | NaN | 1.538338 |
4 | -1.267822 | NaN | -0.924860 |
5 | -1.596591 | -1.001367 | -0.705282 |
6 | -0.026753 | -0.694895 | -0.386125 |
df.fillna(0)
0 | 1 | 2 | |
---|---|---|---|
0 | -0.676323 | 0.000000 | 0.000000 |
1 | -1.088497 | 0.000000 | 0.000000 |
2 | 0.122962 | 0.000000 | 0.000000 |
3 | 2.087705 | 0.000000 | 1.538338 |
4 | -1.267822 | 0.000000 | -0.924860 |
5 | -1.596591 | -1.001367 | -0.705282 |
6 | -0.026753 | -0.694895 | -0.386125 |
df.fillna({1:0.5,3:-1})
0 | 1 | 2 | |
---|---|---|---|
0 | -0.676323 | 0.500000 | NaN |
1 | -1.088497 | 0.500000 | NaN |
2 | 0.122962 | 0.500000 | NaN |
3 | 2.087705 | 0.500000 | 1.538338 |
4 | -1.267822 | 0.500000 | -0.924860 |
5 | -1.596591 | -1.001367 | -0.705282 |
6 | -0.026753 | -0.694895 | -0.386125 |
_ = df.fillna(0, inplace=True)
df
0 | 1 | 2 | |
---|---|---|---|
0 | -0.676323 | 0.000000 | 0.000000 |
1 | -1.088497 | 0.000000 | 0.000000 |
2 | 0.122962 | 0.000000 | 0.000000 |
3 | 2.087705 | 0.000000 | 1.538338 |
4 | -1.267822 | 0.000000 | -0.924860 |
5 | -1.596591 | -1.001367 | -0.705282 |
6 | -0.026753 | -0.694895 | -0.386125 |
df = DataFrame(np.random.randn(6,3))
df.ix[2:,1]=NA; df.ix[4:,2]=NA
df
0 | 1 | 2 | |
---|---|---|---|
0 | -0.570090 | -0.513487 | -0.707382 |
1 | 1.453435 | 0.440059 | -0.292707 |
2 | -0.045053 | NaN | 0.951857 |
3 | 1.277519 | NaN | -0.463472 |
4 | -2.012467 | NaN | NaN |
5 | -0.854318 | NaN | NaN |
df.fillna(method = 'ffill')
0 | 1 | 2 | |
---|---|---|---|
0 | -0.570090 | -0.513487 | -0.707382 |
1 | 1.453435 | 0.440059 | -0.292707 |
2 | -0.045053 | 0.440059 | 0.951857 |
3 | 1.277519 | 0.440059 | -0.463472 |
4 | -2.012467 | 0.440059 | -0.463472 |
5 | -0.854318 | 0.440059 | -0.463472 |
df.fillna(method='ffill',limit=2)
0 | 1 | 2 | |
---|---|---|---|
0 | -0.570090 | -0.513487 | -0.707382 |
1 | 1.453435 | 0.440059 | -0.292707 |
2 | -0.045053 | 0.440059 | 0.951857 |
3 | 1.277519 | 0.440059 | -0.463472 |
4 | -2.012467 | NaN | -0.463472 |
5 | -0.854318 | NaN | -0.463472 |
data=Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())
0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64
fillna function arguments : value, method, axis, inplace, limit
data = Series(np.random.randn(10),
index = [['a','a','a','b','b','b','c','c','d','d'],
[1,2,3,1,2,3,1,2,2,3]])
data
a 1 -0.734465 2 -0.775564 3 -2.134157 b 1 1.169687 2 0.655510 3 0.229640 c 1 0.607370 2 -0.227422 d 2 -0.319022 3 -1.861461 dtype: float64
data.index
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
data['b']
1 1.169687 2 0.655510 3 0.229640 dtype: float64
data['b':'c']
b 1 1.169687 2 0.655510 3 0.229640 c 1 0.607370 2 -0.227422 dtype: float64
data.ix[['b','d']]
b 1 1.169687 2 0.655510 3 0.229640 d 2 -0.319022 3 -1.861461 dtype: float64
data[:,2]
a -0.775564 b 0.655510 c -0.227422 d -0.319022 dtype: float64
data.unstack()
1 | 2 | 3 | |
---|---|---|---|
a | -0.734465 | -0.775564 | -2.134157 |
b | 1.169687 | 0.655510 | 0.229640 |
c | 0.607370 | -0.227422 | NaN |
d | NaN | -0.319022 | -1.861461 |
data.unstack().stack()
a 1 -0.734465 2 -0.775564 3 -2.134157 b 1 1.169687 2 0.655510 3 0.229640 c 1 0.607370 2 -0.227422 d 2 -0.319022 3 -1.861461 dtype: float64
frame = DataFrame(np.arange(12).reshape((4,3)),
index = [['a','a','b','b'],[1,2,1,2]],
columns = [['Ohio','Ohio','Colorado'],
['Green','Red','Green']])
frame
Ohio | Colorado | |||
---|---|---|---|---|
Green | Red | Green | ||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
frame['Ohio']
color | Green | Red | |
---|---|---|---|
key1 | key2 | ||
a | 1 | 0 | 1 |
2 | 3 | 4 | |
b | 1 | 6 | 7 |
2 | 9 | 10 |
frame.swaplevel('key1','key2')
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
2 | a | 3 | 4 | 5 |
1 | b | 6 | 7 | 8 |
2 | b | 9 | 10 | 11 |
frame.sortlevel(1)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
b | 1 | 6 | 7 | 8 |
a | 2 | 3 | 4 | 5 |
b | 2 | 9 | 10 | 11 |
frame.sortlevel(0)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
frame.swaplevel(0,1).sortlevel(0)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
b | 6 | 7 | 8 | |
2 | a | 3 | 4 | 5 |
b | 9 | 10 | 11 |
frame
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
frame.sum(level='key2')
state | Ohio | Colorado | |
---|---|---|---|
color | Green | Red | Green |
key2 | |||
1 | 6 | 8 | 10 |
2 | 12 | 14 | 16 |
frame.sum(level='color',axis=1)
color | Green | Red | |
---|---|---|---|
key1 | key2 | ||
a | 1 | 2 | 1 |
2 | 8 | 4 | |
b | 1 | 14 | 7 |
2 | 20 | 10 |
frame = DataFrame({'a':range(7), 'b':range(7,0,-1),
'c':['one','one','one','two','two','two','two'],
'd':[0,1,2,0,1,2,3]})
frame
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 |
2 | 2 | 5 | one | 2 |
3 | 3 | 4 | two | 0 |
4 | 4 | 3 | two | 1 |
5 | 5 | 2 | two | 2 |
6 | 6 | 1 | two | 3 |
frame2 = frame.set_index(['c','d'])
frame2
a | b | ||
---|---|---|---|
c | d | ||
one | 0 | 0 | 7 |
1 | 1 | 6 | |
2 | 2 | 5 | |
two | 0 | 3 | 4 |
1 | 4 | 3 | |
2 | 5 | 2 | |
3 | 6 | 1 |
frame.set_index(['c','d'], drop=False)
a | b | c | d | ||
---|---|---|---|---|---|
c | d | ||||
one | 0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 | |
2 | 2 | 5 | one | 2 | |
two | 0 | 3 | 4 | two | 0 |
1 | 4 | 3 | two | 1 | |
2 | 5 | 2 | two | 2 | |
3 | 6 | 1 | two | 3 |
frame2.reset_index()
c | d | a | b | |
---|---|---|---|---|
0 | one | 0 | 0 | 7 |
1 | one | 1 | 1 | 6 |
2 | one | 2 | 2 | 5 |
3 | two | 0 | 3 | 4 |
4 | two | 1 | 4 | 3 |
5 | two | 2 | 5 | 2 |
6 | two | 3 | 6 | 1 |
ser = Series(np.arange(3.))
ser[-1]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-168-3cbe0b873a9e> in <module>() ----> 1 ser[-1] /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key) 482 def __getitem__(self, key): 483 try: --> 484 result = self.index.get_value(self, key) 485 486 if not np.isscalar(result): /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key) 1194 1195 try: -> 1196 return self._engine.get_value(s, k) 1197 except KeyError as e1: 1198 if len(self) > 0 and self.inferred_type in ['integer','boolean']: /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2993)() /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2808)() /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3534)() /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/hashtable.so in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7035)() /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/hashtable.so in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6976)() KeyError: -1
ser
0 0 1 1 2 2 dtype: float64
non-integer index works
ser2 = Series(np.arange(3.), index = ['a','b','c'])
ser2[-1]
2.0
ser.ix[:1]
0 0 1 1 dtype: float64
ser3 = Series(range(3), index = [-5, 1,3])
ser3.iget_value(2)
2
frame = DataFrame(np.arange(6).reshape(3,2), index=[2,0,1])
frame
0 | 1 | |
---|---|---|
2 | 0 | 1 |
0 | 2 | 3 |
1 | 4 | 5 |
frame.irow(0)
0 0 1 1 Name: 2, dtype: int64
frame.irow(2)
0 4 1 5 Name: 1, dtype: int64