In [1]:
# import
import pandas as pd
import numpy as np

Possible data inputs to DataFrame constructor

2D ndarray                             A matrix of data, passing optional row and column labels
dict of arrays, lists, or tuples       Each sequence becomes a column in the DataFrame. All sequences must be the same length.
NumPy structured/record array          Treated as the “dict of arrays” case
dict of Series                         Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.
dict of dicts                          Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.
list of dicts or Series                Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels
List of lists or tuples                Treated as the “2D ndarray” case
Another DataFrame                      The DataFrame’s indexes are used unless different ones are passed
NumPy MaskedArray                      Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result
In [2]:
state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada']
year = [2000, 2001, 2002, 2001, 2002]
pop = [1.5, 1.7, 3.6, 2.4, 2.9]

print(type(state), type(year), type(pop))
<class 'list'> <class 'list'> <class 'list'>
In [3]:
# creating dataframe
df = pd.DataFrame({'state':state, 'year':year, 'pop':pop})
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
pop      5 non-null float64
state    5 non-null object
year     5 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 200.0+ bytes
None
In [4]:
print(df)
   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002
In [5]:
sdata = {'state':state, 'year':year, 'pop':pop}
print(sdata,"\n",type(sdata))
{'year': [2000, 2001, 2002, 2001, 2002], 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]} 
 <class 'dict'>
In [6]:
df = pd.DataFrame(sdata, columns=['pop1', 'state1', 'year1'])   # we can not rename columns like this, but create column names 
                                                                # if doesn't exists
print(df)
Empty DataFrame
Columns: [pop1, state1, year1]
Index: []
In [7]:
df = pd.DataFrame(sdata, columns=['pop1', 'state', 'year'])  # this will pick those columns from sdata which matched
print(df)
  pop1   state  year
0  NaN    Ohio  2000
1  NaN    Ohio  2001
2  NaN    Ohio  2002
3  NaN  Nevada  2001
4  NaN  Nevada  2002
In [8]:
df = pd.DataFrame(sdata)
print(df.columns)
Index(['pop', 'state', 'year'], dtype='object')
In [9]:
# renaming columns and index
df.columns = ['pop1', 'state1', 'year1']
df.index = ['one', 'two', 'three', 'four', 'five']
print(df)
       pop1  state1  year1
one     1.5    Ohio   2000
two     1.7    Ohio   2001
three   3.6    Ohio   2002
four    2.4  Nevada   2001
five    2.9  Nevada   2002
In [10]:
# stats about dataframe
print(df.index, "\n", df.shape, "\n", df.columns)
Index(['one', 'two', 'three', 'four', 'five'], dtype='object') 
 (5, 3) 
 Index(['pop1', 'state1', 'year1'], dtype='object')
In [11]:
df['pop1'] = 1.5
print(df)
       pop1  state1  year1
one     1.5    Ohio   2000
two     1.5    Ohio   2001
three   1.5    Ohio   2002
four    1.5  Nevada   2001
five    1.5  Nevada   2002
In [12]:
df['pop1'] = range(5)
print(df)
       pop1  state1  year1
one       0    Ohio   2000
two       1    Ohio   2001
three     2    Ohio   2002
four      3  Nevada   2001
five      4  Nevada   2002
In [13]:
# can access the data as 
print(df['state1'])
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state1, dtype: object
In [14]:
print(df.state1)
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state1, dtype: object
In [15]:
# for deleting any columns
del df['pop1']
print(df)
       state1  year1
one      Ohio   2000
two      Ohio   2001
three    Ohio   2002
four   Nevada   2001
five   Nevada   2002
In [16]:
# transpose the dataframe
dft = df.T
print(dft)
         one   two three    four    five
state1  Ohio  Ohio  Ohio  Nevada  Nevada
year1   2000  2001  2002    2001    2002
In [17]:
# using columns as an index
df.index = df['year1']
del df['year1']
print(df)
       state1
year1        
2000     Ohio
2001     Ohio
2002     Ohio
2001   Nevada
2002   Nevada
In [18]:
df.columns.name, df.index.name
Out[18]:
(None, 'year1')
In [19]:
df.columns
Out[19]:
Index(['state1'], dtype='object')
In [20]:
# printing values
df.values
Out[20]:
array([['Ohio'],
       ['Ohio'],
       ['Ohio'],
       ['Nevada'],
       ['Nevada']], dtype=object)

Index methods and properties

append               Concatenate with additional Index objects, producing a new Index
diff                 Compute set difference as an Index
intersection         Compute set intersection
union                Compute set union
isin                 Compute boolean array indicating whether each value is contained in the passed collection
delete               Compute new Index with element at index i deleted
drop                 Compute new index by deleting passed values
insert               Compute new Index by inserting element at index i
is_monotonic         Returns True if each element is greater than or equal to the previous element
is_unique            Returns True if the Index has no duplicate values
unique               Compute the array of unique values in the Index
In [21]:
# Series and DataFrames index are mutable
df.index
Out[21]:
Int64Index([2000, 2001, 2002, 2001, 2002], dtype='int64', name='year1')
In [22]:
#df.index[2]=2009 # this will throw a error

Reindex Series or DataFrme

index            New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying
method           Interpolation (fill) method, see Table 5-4 for options.
fill_value       Substitute value to use when introducing missing data by reindexing
limit            When forward- or backfilling, maximum size gap to fill
level            Match simple Index on level of MultiIndex, otherwise select subset of
copy             Do not copy underlying data if new index is equivalent to old index. True by default (i.e. always copy data).
In [23]:
print(df)
       state1
year1        
2000     Ohio
2001     Ohio
2002     Ohio
2001   Nevada
2002   Nevada
In [24]:
df.index
Out[24]:
Int64Index([2000, 2001, 2002, 2001, 2002], dtype='int64', name='year1')
In [25]:
# df2 = df.reindex([2000, 2001, 2002, 2001, 2002, 2009]) 
# this will throw an value error, as index should be unique
In [26]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])
print(frame)
   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
In [27]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
In [28]:
# likewise let's revert the df
df['year'] = df.index
df.index = [0,1,2,3,4]
print(df)
   state1  year
0    Ohio  2000
1    Ohio  2001
2    Ohio  2002
3  Nevada  2001
4  Nevada  2002
In [29]:
# now we can reindex this df
df2 = df.reindex([1,2,3,4,5,6,7])  # again, reindex will first look into the df and then create the new
print(df2)  # as here, it will keep 1,2,3,4 and drop 0 and create new 5,6,7 index
   state1    year
1    Ohio  2001.0
2    Ohio  2002.0
3  Nevada  2001.0
4  Nevada  2002.0
5     NaN     NaN
6     NaN     NaN
7     NaN     NaN
In [30]:
# better and faster way to do that is - 
df3=df2.ix[[1,2,3,4,6]]
print(df3)
   state1    year
1    Ohio  2001.0
2    Ohio  2002.0
3  Nevada  2001.0
4  Nevada  2002.0
6     NaN     NaN
In [31]:
# CAN ALter the columns as well
new_columns = ['state1', 'year', 'population']
df4 = df3.ix[[1,2,3,4,6], new_columns]
print(df4)
   state1    year  population
1    Ohio  2001.0         NaN
2    Ohio  2002.0         NaN
3  Nevada  2001.0         NaN
4  Nevada  2002.0         NaN
6     NaN     NaN         NaN
In [32]:
df4.columns
Out[32]:
Index(['state1', 'year', 'population'], dtype='object')
In [33]:
# renaming columns
df4.columns = ['state', 'year', 'pop']
print(df4)
    state    year  pop
1    Ohio  2001.0  NaN
2    Ohio  2002.0  NaN
3  Nevada  2001.0  NaN
4  Nevada  2002.0  NaN
6     NaN     NaN  NaN
In [34]:
# dropping index or columns
df5=df4.drop([3])
print(df5)
    state    year  pop
1    Ohio  2001.0  NaN
2    Ohio  2002.0  NaN
4  Nevada  2002.0  NaN
6     NaN     NaN  NaN
In [35]:
df5 = df5.drop(['pop'], axis=1)
print(df5)
    state    year
1    Ohio  2001.0
2    Ohio  2002.0
4  Nevada  2002.0
6     NaN     NaN

Indexing, selection, and filtering

In [36]:
df4
Out[36]:
state year pop
1 Ohio 2001.0 NaN
2 Ohio 2002.0 NaN
3 Nevada 2001.0 NaN
4 Nevada 2002.0 NaN
6 NaN NaN NaN
In [37]:
df4[df4['state']=='Ohio']
Out[37]:
state year pop
1 Ohio 2001.0 NaN
2 Ohio 2002.0 NaN
In [38]:
df4[['state', 'year']]
Out[38]:
state year
1 Ohio 2001.0
2 Ohio 2002.0
3 Nevada 2001.0
4 Nevada 2002.0
6 NaN NaN
In [39]:
df4['year'][df4['state']=='Ohio']=2004
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
In [40]:
df4
Out[40]:
state year pop
1 Ohio 2004.0 NaN
2 Ohio 2004.0 NaN
3 Nevada 2001.0 NaN
4 Nevada 2002.0 NaN
6 NaN NaN NaN
In [41]:
# ix enables you to select a subset of the rows and columns from a DataFrame with NumPy like notation plus axis labels
df4.ix[[1,2],['state']]
Out[41]:
state
1 Ohio
2 Ohio
In [42]:
df4.ix[[3,6],[0,2]]
Out[42]:
state pop
3 Nevada NaN
6 NaN NaN
In [43]:
df4.ix[df4['year']<2003,[0,2]]
Out[43]:
state pop
3 Nevada NaN
4 Nevada NaN

Indexing options with DataFrame

obj[val]              Select single column or sequence of columns from the DataFrame. Special case con-veniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion).
obj.ix[val]           Selects single row of subset of rows from the DataFrame.
obj.ix[:, val]        Selects single column of subset of columns.
obj.ix[val1, val2]    Select both rows and columns. 
reindex method        Conform one or more axes to new indexes. 
xs method             Select single row or column as a Series by label.
icol, irow methods    Select single column or row, respectively, as a Series by integer location.
get_value, set_value methods             Select single value by row and column label.

Arithmetic and data alignment

In [44]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
In [45]:
s1 + s2  #assigned NaN for those index which is not found in another series
Out[45]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
In [46]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
In [47]:
df1 + df2
Out[47]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN

Arithmetic methods with fill values

In [48]:
df1.add(df2, fill_value=0)
Out[48]:
b c d e
Colorado 6.0 7.0 8.0 NaN
Ohio 3.0 1.0 6.0 5.0
Oregon 9.0 NaN 10.0 11.0
Texas 9.0 4.0 12.0 8.0
Utah 0.0 NaN 1.0 2.0
In [49]:
# when reindexing a Series or DataFrame, you can also specify a different fill value
df1.reindex(columns=df2.columns, fill_value=0)
Out[49]:
b d e
Ohio 0.0 2.0 0
Texas 3.0 5.0 0
Colorado 6.0 8.0 0

Flexible arithmetic methods

add   Method for addition (+)
sub   Method for subtraction (-)
div   Method for division (/)
mul   Method for multiplication (*)

Operations between DataFrame and Series

In [50]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
Out[50]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
In [51]:
series = frame.ix[0]  # pickng first row
series
Out[51]:
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
In [52]:
frame * series
Out[52]:
b d e
Utah 0.0 1.0 4.0
Ohio 0.0 4.0 10.0
Texas 0.0 7.0 16.0
Oregon 0.0 10.0 22.0
In [53]:
# By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns, 
# broadcasting down the rows:
frame - series
Out[53]:
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
In [54]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame * series2
Out[54]:
b d e f
Utah 0.0 NaN 2.0 NaN
Ohio 0.0 NaN 5.0 NaN
Texas 0.0 NaN 8.0 NaN
Oregon 0.0 NaN 11.0 NaN

Function application and mapping

In [55]:
f = lambda x : x.max() - x.min()
In [56]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
               b         d         e
Utah    0.572069  1.094135  0.809313
Ohio   -0.205726  0.284662  0.193174
Texas  -1.313522  0.970605 -1.067873
Oregon  0.052951  0.225374 -0.274396
In [57]:
frame.apply(f)
Out[57]:
b    1.885591
d    0.868761
e    1.877187
dtype: float64
In [58]:
frame.apply(f, axis=1)
Out[58]:
Utah      0.522065
Ohio      0.490388
Texas     2.284127
Oregon    0.499770
dtype: float64
In [59]:
# defining a func
def f(x):
    return pd.Series([x.max(), x.min()], index=['max', 'min'])
In [60]:
frame.apply(f)
Out[60]:
b d e
max 0.572069 1.094135 0.809313
min -1.313522 0.225374 -1.067873
In [61]:
frame.apply(f, axis=1)
Out[61]:
max min
Utah 1.094135 0.572069
Ohio 0.284662 -0.205726
Texas 0.970605 -1.313522
Oregon 0.225374 -0.274396
In [62]:
format = lambda x: '%.2f' % x
frame.applymap(format)
Out[62]:
b d e
Utah 0.57 1.09 0.81
Ohio -0.21 0.28 0.19
Texas -1.31 0.97 -1.07
Oregon 0.05 0.23 -0.27

Sorting and ranking

In [63]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj
Out[63]:
d    0
a    1
b    2
c    3
dtype: int32
In [64]:
# sorting on index
obj.sort_index()
Out[64]:
a    1
b    2
c    3
d    0
dtype: int32
In [65]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame
Out[65]:
d a b c
three 0 1 2 3
one 4 5 6 7
In [66]:
frame.sort_index()
Out[66]:
d a b c
one 4 5 6 7
three 0 1 2 3
In [67]:
frame.sort_index(axis=1)
Out[67]:
a b c d
three 1 2 3 0
one 5 6 7 4
In [68]:
frame.sort_index(axis=1).sort_index()
Out[68]:
a b c d
one 5 6 7 4
three 1 2 3 0
In [69]:
frame.sort_index(axis=1, ascending=False)
Out[69]:
d c b a
three 0 3 2 1
one 4 7 6 5
In [70]:
# To sort a Series by its values, use its order method
sr = pd.Series(['2', np.nan, '-3', '5'])
sr
Out[70]:
0      2
1    NaN
2     -3
3      5
dtype: object
In [71]:
# sorting by value
sr.sort_values()
Out[71]:
2     -3
0      2
3      5
1    NaN
dtype: object
In [72]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
Out[72]:
a b
0 0 4
1 1 7
2 0 -3
3 1 2
In [73]:
frame.sort_values(by='b')
Out[73]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7
In [74]:
frame.sort_values(by=['a', 'b'])
Out[74]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7
In [75]:
# ranking   # Explore more
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj
Out[75]:
0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
In [76]:
obj.rank()
Out[76]:
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Axis indexes with duplicate values

In [77]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
Out[77]:
a    0
a    1
b    2
b    3
c    4
dtype: int32
In [78]:
obj.index.unique()  # get unique index
Out[78]:
array(['a', 'b', 'c'], dtype=object)
In [79]:
obj.index.is_unique  # check if index are unique
Out[79]:
False
In [80]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
Out[80]:
0 1 2
a -1.077273 0.747123 0.948973
a 1.097137 0.753723 0.810827
b -1.413813 -0.161147 0.414814
b -0.633463 0.705439 0.175328
In [81]:
df.index.is_unique
Out[81]:
False
In [82]:
df.ix['a']   # ix is used to select rows by index
Out[82]:
0 1 2
a -1.077273 0.747123 0.948973
a 1.097137 0.753723 0.810827
In [83]:
df.ix[0]
Out[83]:
0   -1.077273
1    0.747123
2    0.948973
Name: a, dtype: float64