Finally a better indexing interface in pandas 0.12
%pylab inline
import pandas as pd
import numpy as np
from IPython.core import display
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
## Build a Series from a plain Python list; the NaN forces dtype float64.
s = pd.Series(data = [1, 3, 5, np.nan, 6, 8])
s
0 1 1 3 2 5 3 NaN 4 6 5 8 dtype: float64
## DatetimeIndex of 6 consecutive days starting 2013-01-01.
dates = pd.date_range("20130101", periods=6) # periods in days
## 6x4 frame of standard-normal draws, rows labelled by the dates above.
df = pd.DataFrame(data = np.random.randn(6, 4), index = dates, columns=list("ABCD"))
df
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | -0.030813 | -0.865584 | -0.433447 | 0.507878 |
2013-01-02 | 0.170206 | -1.967499 | -0.362710 | -1.417809 |
2013-01-03 | -0.964603 | 0.761011 | -0.286228 | -0.162099 |
2013-01-04 | 1.204170 | 0.711299 | 0.451529 | 0.546794 |
2013-01-05 | -1.193963 | 0.739754 | -0.405187 | -0.366887 |
2013-01-06 | 1.015976 | -0.962043 | 0.178670 | 0.669697 |
## ways of creating series/df with duplicate values
## Scalar values ("A", "B", "E") are broadcast to the length of the
## longest column (the 4-element Series "C"); each column keeps its own dtype.
df2 = pd.DataFrame({
    "A": 1.,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index = range(4), dtype="float32"),
    "D": np.array([3] * 4, dtype = 'int32'),
    "E": 'foo'
})
print df2
print df2.dtypes
A B C D E 0 1 2013-01-02 00:00:00 1 3 foo 1 1 2013-01-02 00:00:00 1 3 foo 2 1 2013-01-02 00:00:00 1 3 foo 3 1 2013-01-02 00:00:00 1 3 foo A float64 B datetime64[ns] C float32 D int32 E object dtype: object
## Basic elements of a dataframe
## .values: the underlying 2-D ndarray; mixed column dtypes collapse
## to a single common dtype (object here).
print df2.values
print
print df2.values.dtype
print
## row labels
print df2.index
print
## column labels
print df2.columns
print
## per-column dtypes (unlike .values.dtype, these are kept per column)
print df2.dtypes
[[1.0 datetime.datetime(2013, 1, 2, 0, 0) 1.0 3 'foo'] [1.0 datetime.datetime(2013, 1, 2, 0, 0) 1.0 3 'foo'] [1.0 datetime.datetime(2013, 1, 2, 0, 0) 1.0 3 'foo'] [1.0 datetime.datetime(2013, 1, 2, 0, 0) 1.0 3 'foo']] object Int64Index([0, 1, 2, 3], dtype=int64) Index([u'A', u'B', u'C', u'D', u'E'], dtype=object) A float64 B datetime64[ns] C float32 D int32 E object dtype: object
## rows, columns
## first 3 rows
print df2.head(n=3)
print
## last 5 rows (the frame only has 4, so all of them come back)
print df2.tail(n=5)
print
## summary statistics of numeric columns only
print df2.describe() # B, E are missing
print
## transpose: rows become columns and vice versa
print df2.T
A B C D E 0 1 2013-01-02 00:00:00 1 3 foo 1 1 2013-01-02 00:00:00 1 3 foo 2 1 2013-01-02 00:00:00 1 3 foo A B C D E 0 1 2013-01-02 00:00:00 1 3 foo 1 1 2013-01-02 00:00:00 1 3 foo 2 1 2013-01-02 00:00:00 1 3 foo 3 1 2013-01-02 00:00:00 1 3 foo A C D count 4 4 4 mean 1 1 3 std 0 0 0 min 1 1 3 25% 1 1 3 50% 1 1 3 75% 1 1 3 max 1 1 3 0 1 2 3 A 1 1 1 1 B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 C 1 1 1 1 D 3 3 3 3 E foo foo foo foo
## MOST COMMON CASE - sort by columns, by specifying columns and axis must be 0
## return new values
## Sort rows by the values of B (descending) then C (ascending);
## returns a new frame, df itself is left untouched.
sorted_df = df.sort(columns = ["B", "C"], ascending=[0, 1])
print df
print
print sorted_df
print
## LESS COMMON CASE - sort by rows by specifying axis = 1
## normal row order
## NOTE: axis = 1 sorts the COLUMN LABELS (A..D), not the data values.
print df.sort(axis = 1, ascending=1)
## reverse row order
print
print df.sort(axis = 1, ascending=0)
A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-02 -2.017179 0.583955 1.025420 -1.153183 2013-01-03 0.307859 1.068456 0.147881 -0.631105 2013-01-04 -1.549112 -0.255675 0.135599 -0.935568 2013-01-05 -0.046233 -0.512905 -1.658730 0.473678 2013-01-06 -1.121827 -1.050659 2.499783 1.136633 A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-03 0.307859 1.068456 0.147881 -0.631105 2013-01-02 -2.017179 0.583955 1.025420 -1.153183 2013-01-04 -1.549112 -0.255675 0.135599 -0.935568 2013-01-05 -0.046233 -0.512905 -1.658730 0.473678 2013-01-06 -1.121827 -1.050659 2.499783 1.136633 A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-02 -2.017179 0.583955 1.025420 -1.153183 2013-01-03 0.307859 1.068456 0.147881 -0.631105 2013-01-04 -1.549112 -0.255675 0.135599 -0.935568 2013-01-05 -0.046233 -0.512905 -1.658730 0.473678 2013-01-06 -1.121827 -1.050659 2.499783 1.136633 D C B A 2013-01-01 1.351646 1.368229 1.435856 1.803674 2013-01-02 -1.153183 1.025420 0.583955 -2.017179 2013-01-03 -0.631105 0.147881 1.068456 0.307859 2013-01-04 -0.935568 0.135599 -0.255675 -1.549112 2013-01-05 0.473678 -1.658730 -0.512905 -0.046233 2013-01-06 1.136633 2.499783 -1.050659 -1.121827
## selection by index/row
print df
print '>> selecting one element by scalar index'
print df.loc[pd.Timestamp("2013-01-01", ), "A"].shape # key error if the index is NOT right
print '>> selecting by slicing (inclusive at both ends)'
print df.loc[pd.date_range("2013-01-01","2013-01-05")].shape # does not work with at
print '>> selecting by explict listing'
print df.loc[[pd.Timestamp("20130101"), pd.Timestamp("20130104")], ["A", "D"]].shape
print ">> selecting columns"
print df.loc[:, "A"].shape
A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-02 -2.017179 0.583955 1.025420 -1.153183 2013-01-03 0.307859 1.068456 0.147881 -0.631105 2013-01-04 -1.549112 -0.255675 0.135599 -0.935568 2013-01-05 -0.046233 -0.512905 -1.658730 0.473678 2013-01-06 -1.121827 -1.050659 2.499783 1.136633 >> selecting one element by scalar index () >> selecting by slicing (inclusive at both ends) (5, 4) >> selecting by explict listing (2, 2) >> selecting columns (6,)
## selection by position (from 0 ~ shape[]-1)
print df
print '>> selecting one element by scalar index'
## integer row and column positions -> scalar value
print df.iloc[1, 1]
print '>> selecting by slicing (excluding due to slicing)'
## positional slices EXCLUDE the right endpoint, like plain Python slicing
print df.iloc[1:3, :3]
print '>> selecting by explicit listing'
print df.iloc[[0, 3], [0, 3]]
print '>> selecting a single column'
## a single integer for one axis returns a Series, hence shape (6,)
print df.iloc[:, 1].shape
A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-02 -2.017179 0.583955 1.025420 -1.153183 2013-01-03 0.307859 1.068456 0.147881 -0.631105 2013-01-04 -1.549112 -0.255675 0.135599 -0.935568 2013-01-05 -0.046233 -0.512905 -1.658730 0.473678 2013-01-06 -1.121827 -1.050659 2.499783 1.136633 >> selecting one element by scalar index 0.583954623432 >> selecting by slicing (excluding due to slicing) A B C 2013-01-02 -2.017179 0.583955 1.025420 2013-01-03 0.307859 1.068456 0.147881 >> selecting by explicit listing A D 2013-01-01 1.803674 1.351646 2013-01-04 -1.549112 -0.935568 >> selecting a single column (6,)
## Boolean / Conditional Selection - ON WHOLE MATRIX
## MOST COMMON CASE - use boolean selection to do summarization or assignment
print df
## element-wise comparison -> boolean frame of the same shape
print df > 0
## boolean-frame indexing keeps matching cells and fills the rest with NaN
print df[df > 0] ## ZIG-ZAG matrix
## boolean-frame assignment writes only to the matching cells
df[df > 0] = -100
print df
print '>> reset'
df = pd.DataFrame(data = np.random.randn(6, 4), index = df.index, columns = df.columns)
print df
A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-02 -2.017179 0.583955 1.025420 -1.153183 2013-01-03 0.307859 1.068456 0.147881 -0.631105 2013-01-04 -1.549112 -0.255675 0.135599 -0.935568 2013-01-05 -0.046233 -0.512905 -1.658730 0.473678 2013-01-06 -1.121827 -1.050659 2.499783 1.136633 A B C D 2013-01-01 True True True True 2013-01-02 False True True False 2013-01-03 True True True False 2013-01-04 False False True False 2013-01-05 False False False True 2013-01-06 False False True True A B C D 2013-01-01 1.803674 1.435856 1.368229 1.351646 2013-01-02 NaN 0.583955 1.025420 NaN 2013-01-03 0.307859 1.068456 0.147881 NaN 2013-01-04 NaN NaN 0.135599 NaN 2013-01-05 NaN NaN NaN 0.473678 2013-01-06 NaN NaN 2.499783 1.136633 A B C D 2013-01-01 -100.000000 -100.000000 -100.00000 -100.000000 2013-01-02 -2.017179 -100.000000 -100.00000 -1.153183 2013-01-03 -100.000000 -100.000000 -100.00000 -0.631105 2013-01-04 -1.549112 -0.255675 -100.00000 -0.935568 2013-01-05 -0.046233 -0.512905 -1.65873 -100.000000 2013-01-06 -1.121827 -1.050659 -100.00000 -100.000000 >> reset A B C D 2013-01-01 0.307393 -0.478925 0.888284 0.250375 2013-01-02 0.689614 0.949111 -0.325068 -1.772610 2013-01-03 0.449991 -0.698569 -0.660214 -1.574039 2013-01-04 0.696428 -1.042774 0.482444 -2.030131 2013-01-05 0.647665 -0.095332 0.178925 -1.374787 2013-01-06 1.993792 2.377870 0.161765 -1.319822
## Boolean / Conditional Selection - ON CERTAIN COLUMN
print df
## boolean Series over the row index
print df.A > 0
print '\nselecting df elements of df.A > 0'
print df[df.A > 0] # selecting rows
print '\nselecting df elements of df.A > 0 and df.D < 0'
print df[np.logical_and(df.A > 0, df.D < 0)]
## SAME AS ABOVE - NOTE THE PARENTHESES (& binds tighter than the comparisons)
print df[(df.A > 0) & (df.D < 0)]
df[df.A > 0] = 100 ## assigning to a whole bunch of rows at once
print df
## reset
df = pd.DataFrame(data = np.random.randn(6, 4), index = df.index, columns = df.columns)
A B C D 2013-01-01 -0.030813 -0.865584 -0.433447 0.507878 2013-01-02 0.170206 -1.967499 -0.362710 -1.417809 2013-01-03 -0.964603 0.761011 -0.286228 -0.162099 2013-01-04 1.204170 0.711299 0.451529 0.546794 2013-01-05 -1.193963 0.739754 -0.405187 -0.366887 2013-01-06 1.015976 -0.962043 0.178670 0.669697 2013-01-01 False 2013-01-02 True 2013-01-03 False 2013-01-04 True 2013-01-05 False 2013-01-06 True Freq: D, Name: A, dtype: bool selecting df elements of df.A > 0 A B C D 2013-01-02 0.170206 -1.967499 -0.362710 -1.417809 2013-01-04 1.204170 0.711299 0.451529 0.546794 2013-01-06 1.015976 -0.962043 0.178670 0.669697 selecting df elements of df.A > 0 and df.D < 0 A B C D 2013-01-02 0.170206 -1.967499 -0.36271 -1.417809 A B C D 2013-01-02 0.170206 -1.967499 -0.36271 -1.417809 A B C D 2013-01-01 -0.030813 -0.865584 -0.433447 0.507878 2013-01-02 100.000000 100.000000 100.000000 100.000000 2013-01-03 -0.964603 0.761011 -0.286228 -0.162099 2013-01-04 100.000000 100.000000 100.000000 100.000000 2013-01-05 -1.193963 0.739754 -0.405187 -0.366887 2013-01-06 100.000000 100.000000 100.000000 100.000000
## WHERE operation with setting
## COPY OF DATAFRAME - either DEEP (default) or SHALLOW
dff = df.copy(deep = True)
print dff
print ">> reverse negative values"
## negate only the negative cells, i.e. element-wise absolute value
dff[dff < 0] = - dff
print dff
## row-wise mask: negate EVERY value in rows where column A is positive
## (after the step above all A values are positive, so all rows flip sign)
dff[dff.A > 0] = - dff
print dff
A B C D 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 2013-01-04 -1.209282 0.977806 0.909265 0.710577 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 >> reverse negative values A B C D 2013-01-01 0.440529 0.320633 0.594432 1.285903 2013-01-02 0.353733 1.351794 0.759201 0.474171 2013-01-03 0.018955 0.341444 0.555280 0.162339 2013-01-04 1.209282 0.977806 0.909265 0.710577 2013-01-05 0.005757 1.061972 0.782961 1.449468 2013-01-06 1.170326 0.011626 1.814532 0.245602 A B C D 2013-01-01 -0.440529 -0.320633 -0.594432 -1.285903 2013-01-02 -0.353733 -1.351794 -0.759201 -0.474171 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 2013-01-04 -1.209282 -0.977806 -0.909265 -0.710577 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 2013-01-06 -1.170326 -0.011626 -1.814532 -0.245602
## CREATE missing values
dff = df.copy(deep = True)
dff['E'] = [1] * 3 + [np.nan] * 3
dff.D.iloc[1] = np.nan
dff['F'] = np.nan
print dff
A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 NaN 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 NaN NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 NaN NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 NaN NaN
## FINDING of missing values - BOOLEAN MATRIX
dfff = dff.copy(deep = True)
## True wherever a cell is NaN
print pd.isnull(dfff)
## replace every NaN with 100 via boolean-mask assignment
dfff[pd.isnull(dfff)] = 100
print dfff
A B C D E F 2013-01-01 False False False False False True 2013-01-02 False False False True False True 2013-01-03 False False False False False True 2013-01-04 False False False False True True 2013-01-05 False False False False True True 2013-01-06 False False False False True True A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 100 2013-01-02 0.353733 1.351794 -0.759201 100.000000 1 100 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 100 2013-01-04 -1.209282 0.977806 0.909265 0.710577 100 100 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 100 100 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 100 100
## REMOVE missing data
print dff
print '\n>> drop ANY na-rows axis = 0'
## drop a row if it contains at least one NaN
print dff.dropna(axis = 0, how = 'any')
print '\n>> drop ALL na-rows axis = 0'
## drop a row only if EVERY cell in it is NaN
print dff.dropna(axis = 0, how = 'all')
print '\n>> drop ANY na-cols axis = 1'
print dff.dropna(axis = 1, how = 'any')
print '\n>> drop ALL na-cols axis = 1'
print dff.dropna(axis = 1, how = 'all')
print '\n>> drop na-rows based on NON-NA threshold - how param is DEACTIVATED'
## keep only rows having at least 5 non-NA values
print dff.dropna(axis = 0, thresh=5)
print '\n>> drop na-cols based on NON-NA threshold - how param is DEACTIVATED'
## keep only columns having at least 4 non-NA values
print dff.dropna(axis = 1, thresh=4)
A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 NaN 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 NaN NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 NaN NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 NaN NaN >> drop ANY na-rows axis = 0 Empty DataFrame Columns: [A, B, C, D, E, F] Index: [] >> drop ALL na-rows axis = 0 A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 NaN 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 NaN NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 NaN NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 NaN NaN >> drop ANY na-cols axis = 1 A B C 2013-01-01 -0.440529 0.320633 0.594432 2013-01-02 0.353733 1.351794 -0.759201 2013-01-03 -0.018955 -0.341444 -0.555280 2013-01-04 -1.209282 0.977806 0.909265 2013-01-05 -0.005757 -1.061972 -0.782961 2013-01-06 1.170326 0.011626 -1.814532 >> drop ALL na-cols axis = 1 A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 NaN 1 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 NaN >> drop na-rows based on NON-NA threshold - how param is DEACTIVATED A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN >> drop na-cols based on NON-NA threshold - how param is DEACTIVATED A B C D 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 2013-01-02 0.353733 1.351794 -0.759201 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 2013-01-04 -1.209282 0.977806 0.909265 0.710577 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 2013-01-06 1.170326 0.011626 -1.814532 -0.245602
## FILL UP missing values - IMPUTATION
## Parameters
## ----------
## method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
## Method to use for filling holes in reindexed Series
## pad / ffill: propagate last valid observation forward to next valid
## backfill / bfill: use NEXT valid observation to fill gap
## None: fill by value
## value : scalar or dict
## Value to use to fill holes (e.g. 0), alternately a dict of values
## specifying which value to use for each column (columns not in the
## dict will not be filled). This value cannot be a list.
## axis : {0, 1}, default 0
## 0: fill column-by-column
## 1: fill row-by-row
## inplace : boolean, default False
## If True, fill the DataFrame in place. Note: this will modify any
## other views on this DataFrame, like if you took a no-copy slice of
## an existing DataFrame, for example a column in a DataFrame. Returns
## a reference to the filled object, which is self if inplace=True
## limit : int, default None
## Maximum size gap to forward or backward fill
## downcast : dict, default is None, a dict of item->dtype of what to
## downcast if possible
print dff
## bfill: pull the NEXT valid value upward in each column
print dff.fillna(method='backfill')
## ffill: carry the LAST valid value forward down each column
print dff.fillna(method="ffill")
## per-column fill values via a dict; unlisted columns (F) stay NaN
print dff.fillna(method = None, value = {"D": -100, "E": 0})
A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 NaN 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 NaN NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 NaN NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 NaN NaN A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 -0.162339 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 NaN NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 NaN NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 NaN NaN A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 -1.285903 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 1 NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 1 NaN A B C D E F 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 NaN 2013-01-02 0.353733 1.351794 -0.759201 -100.000000 1 NaN 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 NaN 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 NaN 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 0 NaN 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 NaN
print df
print ">> apply mean on columns"
## axis = 0: the function receives one column at a time
print df.apply(np.mean, raw = True, axis = 0)
print ">> apply sum on rows"
## axis = 1: the function receives one row at a time
print df.apply(np.sum, raw = True, axis = 1)
## applymap operates element-wise
print df.applymap(lambda x: x **2) # equivalent to df ** 2
A B C D 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 2013-01-04 -1.209282 0.977806 0.909265 0.710577 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 >> apply mean on columns A -0.025077 B 0.209740 C -0.401380 D -0.484485 dtype: float64 >> apply sum on rows 2013-01-01 -0.811367 2013-01-02 0.472155 2013-01-03 -1.078019 2013-01-04 1.388366 2013-01-05 -3.300159 2013-01-06 -0.878183 Freq: D, dtype: float64 A B C D 2013-01-01 0.194066 0.102805 0.353350 1.653547 2013-01-02 0.125127 1.827346 0.576386 0.224838 2013-01-03 0.000359 0.116584 0.308336 0.026354 2013-01-04 1.462363 0.956105 0.826763 0.504920 2013-01-05 0.000033 1.127785 0.613029 2.100959 2013-01-06 1.369662 0.000135 3.292527 0.060320
## random string labels '0'..'3', one per row
df['E'] = pd.Series(map(str, np.random.randint(0, 4, size = 6)), index = df.index)
print df
## HISTOGRAMMING by Value-Counting
## - RETURNS ANOTHER SERIES (essentially convertible to a dict)
df.E.value_counts().to_dict()
A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0
{'0': 2, '1': 3, '3': 1}
## Discretization and Quantiling -- for numeric data ONLY
## very similiar to R factor now
print df
A_factors = pd.cut(df.A, 3)
print type(A_factors)
print A_factors
print 'FACTOR LABELS >> ', A_factors.labels
print 'FACTOR LEVELS >>', A_factors.levels
print 'quatiling >>'
A_quantiles = pd.qcut(df.A, [0, 0.25, 0.75, 1])
print 'QFACTOR LABELS >> ', A_quantiles.labels
print 'QFACTOR LEVELS >> ', A_quantiles.levels
A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 <class 'pandas.core.categorical.Categorical'> Categorical: A [(-1.212, -0.416], (-0.416, 0.377], (-0.416, 0.377], (-1.212, -0.416], (-0.416, 0.377], (0.377, 1.17]] Levels (3): Index(['(-1.212, -0.416]', '(-0.416, 0.377]', '(0.377, 1.17]'], dtype=object) FACTOR LABELS >> [0 1 1 0 1 2] FACTOR LEVELS >> Index([u'(-1.212, -0.416]', u'(-0.416, 0.377]', u'(0.377, 1.17]'], dtype=object) quatiling >> QFACTOR LABELS >> [0 2 1 0 1 2] QFACTOR LEVELS >> Index([u'[-1.209, -0.335]', u'(-0.335, 0.264]', u'(0.264, 1.17]'], dtype=object)
## REPLACEMENT
print df
## nested dict: {column: {old_value: new_value}}; returns a new frame
print df.replace({'E': {'3': '2'}}) ## COMBINE 3 and 2
A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 2 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0
## Horizontal concat
print "HORIZONTAL SPLITTING AND CONCATING"
print df
## split by rows: first 3 rows / remaining rows
hdf1 = df.iloc[:3]
hdf2 = df.iloc[3:]
print hdf1
print hdf2
## axis = 0 stacks frames vertically (by rows); order follows the list
print pd.concat([hdf2, hdf1], axis = 0)
## Vertical concat
print '\nVERTICAL SPLITTING AND CONCATING'
## split by columns: first 3 columns / remaining columns
vdf1 = df.iloc[:, :3]
vdf2 = df.iloc[:, 3:]
print vdf1
print vdf2
## axis = 1 glues frames side by side (by columns), aligned on the index
print pd.concat([vdf2, vdf1], axis = 1)
HORIZONTAL SPLITTING AND CONCATING A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 A B C D E 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 A B C D E 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 VERTICAL SPLITTING AND CONCATING A B C 2013-01-01 -0.440529 0.320633 0.594432 2013-01-02 0.353733 1.351794 -0.759201 2013-01-03 -0.018955 -0.341444 -0.555280 2013-01-04 -1.209282 0.977806 0.909265 2013-01-05 -0.005757 -1.061972 -0.782961 2013-01-06 1.170326 0.011626 -1.814532 D E 2013-01-01 -1.285903 1 2013-01-02 -0.474171 3 2013-01-03 -0.162339 1 2013-01-04 0.710577 0 2013-01-05 -1.449468 1 2013-01-06 -0.245602 0 D E A B C 2013-01-01 -1.285903 1 -0.440529 0.320633 0.594432 2013-01-02 -0.474171 3 0.353733 1.351794 -0.759201 2013-01-03 -0.162339 1 -0.018955 -0.341444 -0.555280 2013-01-04 0.710577 0 -1.209282 0.977806 0.909265 2013-01-05 -1.449468 1 -0.005757 -1.061972 -0.782961 2013-01-06 -0.245602 0 1.170326 0.011626 -1.814532
## JOINING / MERGING
## pd.merge(left, right, how='inner', on=None,
## left_on=None, right_on=None,
## left_index=False, right_index=False, sort=False,
## suffixes=('_x', '_y'), copy=True)
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [6, 7]})
print left
print right
## duplicate keys on both sides -> cartesian product: 2 x 2 = 4 result rows
print pd.merge(left, right, on = 'key')
key lval 0 foo 1 1 foo 2 key rval 0 foo 6 1 foo 7 key lval rval 0 foo 1 6 1 foo 1 7 2 foo 2 6 3 foo 2 7
## APPENDING ROWS
## NOT REALLY VERY USEFUL
print df
## a single row comes back as a Series whose .name is its index label
print type(df.iloc[3])
print df.iloc[3].name
## ignore_index drops the DatetimeIndex and renumbers rows 0..n;
## the appended Series aligns to df's columns via its index labels
df.append(pd.Series(np.random.randn(5), index = df.columns), ignore_index=True, )
A B C D E 2013-01-01 -0.440529 0.320633 0.594432 -1.285903 1 2013-01-02 0.353733 1.351794 -0.759201 -0.474171 3 2013-01-03 -0.018955 -0.341444 -0.555280 -0.162339 1 2013-01-04 -1.209282 0.977806 0.909265 0.710577 0 2013-01-05 -0.005757 -1.061972 -0.782961 -1.449468 1 2013-01-06 1.170326 0.011626 -1.814532 -0.245602 0 <class 'pandas.core.series.Series'> 2013-01-04 00:00:00
A | B | C | D | E | |
---|---|---|---|---|---|
0 | -0.440529 | 0.320633 | 0.594432 | -1.285903 | 1 |
1 | 0.353733 | 1.351794 | -0.759201 | -0.474171 | 3 |
2 | -0.018955 | -0.341444 | -0.555280 | -0.162339 | 1 |
3 | -1.209282 | 0.977806 | 0.909265 | 0.710577 | 0 |
4 | -0.005757 | -1.061972 | -0.782961 | -1.449468 | 1 |
5 | 1.170326 | 0.011626 | -1.814532 | -0.245602 | 0 |
6 | 0.268328 | 1.146425 | 0.306675 | 2.480935 | -2.537934 |
From Pandas:
By “group by” we are referring to a process involving one or more of the following steps: splitting the data into groups based on some criteria, applying a function to each group independently, and combining the results into a data structure.
The most common case is to group a matrix by columns (features).
## sample frame: two categorical columns (A, B) and two numeric ones (C, D)
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
    , 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']
    , 'C': np.random.randn(8)
    , 'D': np.random.randn(8)
})
print df
A B C D 0 foo one 0.684732 -1.999816 1 bar one 0.502373 0.779502 2 foo two -0.187921 2.815911 3 bar three 1.588625 1.610015 4 foo two -1.367470 0.239797 5 bar two -0.287122 -0.418336 6 foo one 0.122107 -1.787529 7 foo three -0.676687 1.060654
## group rows by the values of column A, then sum the numeric columns
print df.groupby(by = 'A', axis = 0).sum()
print "\nIterating Group Structure (WHAT GROUPBY IS REALLY ABOUT - A DICT) >>"
## grouping by two keys yields (B-value, A-value) tuples as group names
for grp_name, sub_df in df.groupby(by = ['B', 'A']):
    print grp_name
    print sub_df
C D A bar 1.803875 1.971181 foo -1.425239 0.329017 Iterating Group Structure (WHAT GROUPBY IS REALLY ABOUT - A DICT) >> ('one', 'bar') A B C D 1 bar one 0.502373 0.779502 ('one', 'foo') A B C D 0 foo one 0.684732 -1.999816 6 foo one 0.122107 -1.787529 ('three', 'bar') A B C D 3 bar three 1.588625 1.610015 ('three', 'foo') A B C D 7 foo three -0.676687 1.060654 ('two', 'bar') A B C D 5 bar two -0.287122 -0.418336 ('two', 'foo') A B C D 2 foo two -0.187921 2.815911 4 foo two -1.367470 0.239797
## frame with three categorical columns (A, B, C) and two numeric ones (D, E)
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
print df
A B C D E 0 one A foo -0.257251 0.941467 1 one B foo -2.041284 -1.070327 2 two C foo -2.375388 0.539690 3 three A bar 0.074301 1.018966 4 one B bar -0.756021 0.669371 5 one C bar 1.300727 0.330807 6 two A foo 1.113186 0.263582 7 three B foo 0.222825 -1.196940 8 one C foo 0.749618 0.331946 9 one A bar 0.876066 -2.755204 10 two B bar 0.254959 -1.748141 11 three C bar 2.029449 0.109468
## PIVOT TABLE
## pd.pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
## fill_value=None, margins=False, dropna=True)
## Create a spreadsheet-style pivot table as a DataFrame. The levels in the
## pivot table will be stored in MultiIndex objects (hierarchical indexes) on
## the index and columns of the result DataFrame
## IMPORTANT Parameters
## data : DataFrame
## values : column to aggregate, optional
## rows : list of column names or arrays to group on
## Keys to group on the x-axis of the pivot table
## cols : list of column names or arrays to group on
## Keys to group on the y-axis of the pivot table
## aggfunc : function, default numpy.mean, or list of functions
## If list of functions passed, the resulting pivot table will have
## hierarchical columns whose top level are the function names (inferred
## from the function objects themselves)
## rows, cols should be CATEGORICAL variables that can be grouped by
## values are usually numeric variables to be calculated on (by applying aggfunc)
## values are the remaining variables by default
## IN THIS SENSE, PIVOT TABLE is more like a shortcut for groupby(), as group_rows,
## group_cols, values() and aggregation statistics on them are usually of the most interest
## AGGFUNC IS ALWAYS THERE - DEFAULT IS NP.MEAN
print df
print
## rows -> MultiIndex (A, B); cols -> levels of C; cells averaged with np.mean
print pd.pivot_table(df, values = ['D', 'E'], rows = ['A', 'B'], cols = 'C')
print
print pd.pivot_table(df, values = ['D'], rows = ['A', 'B'], cols = 'C')
print
## multiple aggfuncs give hierarchical columns keyed by function name
print pd.pivot_table(df, rows = ['A'], cols = ['C'], values = ['D'],
                     aggfunc=[np.mean, np.min, np.max])
A B C D E 0 one A foo -0.257251 0.941467 1 one B foo -2.041284 -1.070327 2 two C foo -2.375388 0.539690 3 three A bar 0.074301 1.018966 4 one B bar -0.756021 0.669371 5 one C bar 1.300727 0.330807 6 two A foo 1.113186 0.263582 7 three B foo 0.222825 -1.196940 8 one C foo 0.749618 0.331946 9 one A bar 0.876066 -2.755204 10 two B bar 0.254959 -1.748141 11 three C bar 2.029449 0.109468 D E C bar foo bar foo A B one A 0.876066 -0.257251 -2.755204 0.941467 B -0.756021 -2.041284 0.669371 -1.070327 C 1.300727 0.749618 0.330807 0.331946 three A 0.074301 NaN 1.018966 NaN B NaN 0.222825 NaN -1.196940 C 2.029449 NaN 0.109468 NaN two A NaN 1.113186 NaN 0.263582 B 0.254959 NaN -1.748141 NaN C NaN -2.375388 NaN 0.539690 D C bar foo A B one A 0.876066 -0.257251 B -0.756021 -2.041284 C 1.300727 0.749618 three A 0.074301 NaN B NaN 0.222825 C 2.029449 NaN two A NaN 1.113186 B 0.254959 NaN C NaN -2.375388 mean amin amax D D D C bar foo bar foo bar foo A one 0.473591 -0.516305 -0.756021 -2.041284 1.300727 0.749618 three 1.051875 0.222825 0.074301 0.222825 2.029449 0.222825 two 0.254959 -0.631101 0.254959 -2.375388 0.254959 1.113186
In summary
rng = pd.date_range('1/1/2012', periods=500, freq='S', ) ## periods and freq
ts = pd.Series(randint(0, 500, len(rng)), index = rng)
ts.plot()
print '\n>> time series with frequence = sencond >>'
print ts
tts = ts.resample('1Min', how='sum') ## default how = 'mean', which is downsampling
figure()
tts.plot()
print '\n>> time series with frequence resampled at 1 min >>'
print tts
>> time series with frequence = sencond >> 2012-01-01 00:00:00 183 2012-01-01 00:00:01 373 2012-01-01 00:00:02 390 2012-01-01 00:00:03 126 2012-01-01 00:00:04 140 2012-01-01 00:00:05 3 2012-01-01 00:00:06 477 2012-01-01 00:00:07 450 2012-01-01 00:00:08 170 2012-01-01 00:00:09 71 2012-01-01 00:00:10 65 2012-01-01 00:00:11 107 2012-01-01 00:00:12 419 2012-01-01 00:00:13 306 2012-01-01 00:00:14 1 ... 2012-01-01 00:08:05 111 2012-01-01 00:08:06 148 2012-01-01 00:08:07 45 2012-01-01 00:08:08 132 2012-01-01 00:08:09 197 2012-01-01 00:08:10 348 2012-01-01 00:08:11 102 2012-01-01 00:08:12 124 2012-01-01 00:08:13 363 2012-01-01 00:08:14 141 2012-01-01 00:08:15 34 2012-01-01 00:08:16 20 2012-01-01 00:08:17 25 2012-01-01 00:08:18 305 2012-01-01 00:08:19 165 Freq: S, Length: 500, dtype: int64 >> time series with frequence resampled at 1 min >> 2012-01-01 00:00:00 13107 2012-01-01 00:01:00 14907 2012-01-01 00:02:00 13006 2012-01-01 00:03:00 13201 2012-01-01 00:04:00 14889 2012-01-01 00:05:00 14300 2012-01-01 00:06:00 15604 2012-01-01 00:07:00 13959 2012-01-01 00:08:00 3772 Freq: T, dtype: int64
## Time zone representation - tz_localize()
rng = pd.date_range('3/6/2012 00:00', periods = 5, freq = 'D')
ts = pd.Series(randn(len(rng)), rng)
## attach a time zone to a naive index (labels the times, no clock shift)
ts_utc = ts.tz_localize('UTC', )
print '>> Time zone UTC >>'
print ts_utc
ts_utc.plot()
## Convert to another time zone - tz_convert (shifts the wall-clock times)
ts_us = ts_utc.tz_convert('US/Eastern')
print '>> Time zone US >>'
print ts_us
figure()
ts_us.plot()
>> Time zone UTC >> 2012-03-06 00:00:00+00:00 0.146046 2012-03-07 00:00:00+00:00 -1.695428 2012-03-08 00:00:00+00:00 -0.956475 2012-03-09 00:00:00+00:00 1.338961 2012-03-10 00:00:00+00:00 -0.296398 Freq: D, dtype: float64 >> Time zone US >> 2012-03-05 19:00:00-05:00 0.146046 2012-03-06 19:00:00-05:00 -1.695428 2012-03-07 19:00:00-05:00 -0.956475 2012-03-08 19:00:00-05:00 1.338961 2012-03-09 19:00:00-05:00 -0.296398 Freq: D, dtype: float64
<matplotlib.axes.AxesSubplot at 0x3fe1a90>
## Converting between time span representations
rng = pd.date_range('1/1/2012', periods=5, freq = 'M') # frequency = 1 month for a period of 5
ts = pd.Series(randn(len(rng)), index = rng)
print '>> Time series (Datetime Index) >>'
print ts
## month-end timestamps -> monthly Periods
ps = ts.to_period()
print '>> Period series (Period Index) >>'
print ps
print ' >> Time stamp series (Datetime Index again)>>'
## Periods back to timestamps, anchored at the START of each month (freq MS)
tm_s = ps.to_timestamp()
print tm_s
>> Time series (Datetime Index) >> 2012-01-31 -0.369075 2012-02-29 -0.244696 2012-03-31 -1.621117 2012-04-30 -1.972054 2012-05-31 1.699680 Freq: M, dtype: float64 >> Period series (Period Index) >> 2012-01 -0.369075 2012-02 -0.244696 2012-03 -1.621117 2012-04 -1.972054 2012-05 1.699680 Freq: M, dtype: float64 >> Time stamp series (Datetime Index again)>> 2012-01-01 -0.369075 2012-02-01 -0.244696 2012-03-01 -1.621117 2012-04-01 -1.972054 2012-05-01 1.699680 Freq: MS, dtype: float64
## PLOTTING TIME SERIES
ts = pd.Series(randn(1000), index = pd.date_range('1/1/2000', periods=1000, freq='D'))
ts.plot()
cts = ts.cumsum()
figure()
cts.plot()
print ">> time series >>"
print ts.head(5)
print ">> cummulative sum of time series >>"
print cts.head(5)
>> time series >> 2000-01-01 -0.070830 2000-01-02 -1.180476 2000-01-03 -0.036886 2000-01-04 0.018825 2000-01-05 0.280117 Freq: D, dtype: float64 >> cummulative sum of time series >> 2000-01-01 -0.070830 2000-01-02 -1.251305 2000-01-03 -1.288192 2000-01-04 -1.269367 2000-01-05 -0.989250 Freq: D, dtype: float64
## PLOT OF DATAFRAME
## reuse the 1000-day index from the series above (figure/legend from %pylab)
df = pd.DataFrame(randn(1000, 4), index = ts.index, columns=list("ABCD"))
df.plot()
legend(loc='best')
figure()
## per-column running totals (axis = 0 accumulates down the rows)
cdf = df.cumsum(axis = 0)
cdf.plot()
legend(loc='best')
<matplotlib.legend.Legend at 0x7247150>