In [1]:

# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

Object Creation - Series and DataFrame¶

In [2]:

# creating Series (array)

s1 = pd.Series([[1,3,5,np.nan,6,8]])
s1

Out[2]:

0    [1, 3, 5, nan, 6, 8]
dtype: object

In [3]:

# creating DataFrames
dates = pd.date_range('20130101', periods=6)
dates

Out[3]:

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Out[4]:

	A	B	C	D
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640
2013-01-04	2.156849	-0.201232	0.165786	-0.162465
2013-01-05	-2.019862	-2.055205	1.369486	1.182199
2013-01-06	0.815260	0.766377	0.160439	0.353855

In [5]:

# Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2 = pd.DataFrame({ 'A' : 1.,
                         'B' : pd.Timestamp('20130102'),
                         'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                         'D' : np.array([3] * 4,dtype='int32'),
                         'E' : pd.Categorical(["test","train","test","train"]),
                         'F' : 'foo' })
df2

Out[5]:

	A	B	C	D	E	F
0	1.0	2013-01-02	1.0	3	test	foo
1	1.0	2013-01-02	1.0	3	train	foo
2	1.0	2013-01-02	1.0	3	test	foo
3	1.0	2013-01-02	1.0	3	train	foo

In [6]:

# checking Data Types
df2.dtypes

Out[6]:

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:

# Use df2.<TAB> for column name completion as well as attributes which can work on dataframe

Viewing Data¶

In [8]:

# for first 5 records
df.head()

Out[8]:

	A	B	C	D
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640
2013-01-04	2.156849	-0.201232	0.165786	-0.162465
2013-01-05	-2.019862	-2.055205	1.369486	1.182199

In [9]:

# for last 3
df.tail(3)

Out[9]:

	A	B	C	D
2013-01-04	2.156849	-0.201232	0.165786	-0.162465
2013-01-05	-2.019862	-2.055205	1.369486	1.182199
2013-01-06	0.815260	0.766377	0.160439	0.353855

In [10]:

# checking df index
df.index

Out[10]:

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:

# column names
df.columns

Out[11]:

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:

# df values
df.values

Out[12]:

array([[-0.00557444, -0.38565665, -0.19326961,  0.26490064],
       [-0.20867983, -0.29648787, -0.79000981, -1.39103244],
       [-1.6166742 , -0.04217266, -1.35645484, -0.71763967],
       [ 2.15684898, -0.20123222,  0.16578647, -0.16246515],
       [-2.01986196, -2.05520516,  1.36948649,  1.18219902],
       [ 0.81525976,  0.76637688,  0.16043882,  0.35385518]])

Summary of Data¶

In [13]:

# information abt df
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes

In [14]:

# describing stastistic summay
df.describe()

Out[14]:

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	-0.146447	-0.369063	-0.107337	-0.078364
std	1.544346	0.924431	0.933920	0.898054
min	-2.019862	-2.055205	-1.356455	-1.391032
25%	-1.264676	-0.363364	-0.640825	-0.578846
50%	-0.107127	-0.248860	-0.016415	0.051218
75%	0.610051	-0.081938	0.164450	0.331617
max	2.156849	0.766377	1.369486	1.182199

Transposing and Sorting the data¶

In [15]:

df.T

Out[15]:

	2013-01-01 00:00:00	2013-01-02 00:00:00	2013-01-03 00:00:00	2013-01-04 00:00:00	2013-01-05 00:00:00	2013-01-06 00:00:00
A	-0.005574	-0.208680	-1.616674	2.156849	-2.019862	0.815260
B	-0.385657	-0.296488	-0.042173	-0.201232	-2.055205	0.766377
C	-0.193270	-0.790010	-1.356455	0.165786	1.369486	0.160439
D	0.264901	-1.391032	-0.717640	-0.162465	1.182199	0.353855

In [16]:

# Sorting by index
df.sort_index(axis=1, ascending=False)

Out[16]:

	D	C	B	A
2013-01-01	0.264901	-0.193270	-0.385657	-0.005574
2013-01-02	-1.391032	-0.790010	-0.296488	-0.208680
2013-01-03	-0.717640	-1.356455	-0.042173	-1.616674
2013-01-04	-0.162465	0.165786	-0.201232	2.156849
2013-01-05	1.182199	1.369486	-2.055205	-2.019862
2013-01-06	0.353855	0.160439	0.766377	0.815260

In [17]:

# sorting by values
df.sort_values(by="B")

Out[17]:

	A	B	C	D
2013-01-05	-2.019862	-2.055205	1.369486	1.182199
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-04	2.156849	-0.201232	0.165786	-0.162465
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640
2013-01-06	0.815260	0.766377	0.160439	0.353855

In [18]:

df.sort_values(by=["B", "A"])

Out[18]:

	A	B	C	D
2013-01-05	-2.019862	-2.055205	1.369486	1.182199
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-04	2.156849	-0.201232	0.165786	-0.162465
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640
2013-01-06	0.815260	0.766377	0.160439	0.353855

Selection of Data¶

In pandas, data can be accessed with these methods .at, .iat, .loc, .iloc and .ix

In [19]:

# selecting a column A
df['A']

Out[19]:

2013-01-01   -0.005574
2013-01-02   -0.208680
2013-01-03   -1.616674
2013-01-04    2.156849
2013-01-05   -2.019862
2013-01-06    0.815260
Freq: D, Name: A, dtype: float64

In [20]:

# slicing rows
df[0:3]

Out[20]:

	A	B	C	D
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640

In [21]:

df['2013-01-01':'2013-01-03']

Out[21]:

	A	B	C	D
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640

Selection by lable¶

In [22]:

# cross selection using a lable
df.loc[dates[0]]

Out[22]:

A   -0.005574
B   -0.385657
C   -0.193270
D    0.264901
Name: 2013-01-01 00:00:00, dtype: float64

In [23]:

df.loc[:, ['A', 'B']]   # [row, column]

Out[23]:

	A	B
2013-01-01	-0.005574	-0.385657
2013-01-02	-0.208680	-0.296488
2013-01-03	-1.616674	-0.042173
2013-01-04	2.156849	-0.201232
2013-01-05	-2.019862	-2.055205
2013-01-06	0.815260	0.766377

In [24]:

# Showing label slicing, both endpoints are included
df.loc['20130102':'20130104',['A','B']]

Out[24]:

	A	B
2013-01-02	-0.208680	-0.296488
2013-01-03	-1.616674	-0.042173
2013-01-04	2.156849	-0.201232

In [25]:

df.loc['20130102',['A','B']]

Out[25]:

A   -0.208680
B   -0.296488
Name: 2013-01-02 00:00:00, dtype: float64

In [26]:

# For getting a scalar value
df.loc['20130102',['A']]

Out[26]:

A   -0.20868
Name: 2013-01-02 00:00:00, dtype: float64

In [27]:

df.loc['20130102','B']

Out[27]:

-0.29648786843717295

In [28]:

# for faster access
#df.at['20130102', 'A']

Selection by Position¶

In [29]:

# Select via the position of the passed integers
df.iloc[3]

Out[29]:

A    2.156849
B   -0.201232
C    0.165786
D   -0.162465
Name: 2013-01-04 00:00:00, dtype: float64

In [30]:

# By integer slices, acting similar to numpy/python
df.iloc[3:5, 2:4]   # row - 3 n 4 , col = 2, 3

Out[30]:

	C	D
2013-01-04	0.165786	-0.162465
2013-01-05	1.369486	1.182199

In [31]:

# By lists of integer position locations, similar to the numpy/python style
df.iloc[[1,3,4], 2:4]

Out[31]:

	C	D
2013-01-02	-0.790010	-1.391032
2013-01-04	0.165786	-0.162465
2013-01-05	1.369486	1.182199

In [32]:

# For slicing rows explicitly
df.iloc[1:3,:]

Out[32]:

	A	B	C	D
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640

In [33]:

# For slicing columns explicitly
df.iloc[:,1:3]

Out[33]:

	B	C
2013-01-01	-0.385657	-0.193270
2013-01-02	-0.296488	-0.790010
2013-01-03	-0.042173	-1.356455
2013-01-04	-0.201232	0.165786
2013-01-05	-2.055205	1.369486
2013-01-06	0.766377	0.160439

In [34]:

# For getting a value explicitly
df.iloc[1,1]

Out[34]:

-0.29648786843717295

In [35]:

df.loc['2013-01-02', 'B']

Out[35]:

-0.29648786843717295

In [36]:

# For getting fast access to a scalar (equiv to the prior method)
df.iat[1,1]

Out[36]:

-0.29648786843717295

Boolean Indexing¶

In [37]:

df.B > 0

Out[37]:

2013-01-01    False
2013-01-02    False
2013-01-03    False
2013-01-04    False
2013-01-05    False
2013-01-06     True
Freq: D, Name: B, dtype: bool

In [38]:

df[df.B > 0]

Out[38]:

	A	B	C	D
2013-01-06	0.81526	0.766377	0.160439	0.353855

In [39]:

df[df>0]

Out[39]:

	A	B	C	D
2013-01-01	NaN	NaN	NaN	0.264901
2013-01-02	NaN	NaN	NaN	NaN
2013-01-03	NaN	NaN	NaN	NaN
2013-01-04	2.156849	NaN	0.165786	NaN
2013-01-05	NaN	NaN	1.369486	1.182199
2013-01-06	0.815260	0.766377	0.160439	0.353855

In [40]:

# Using the isin() method for filtering:
df3 = df.copy()

df3['E'] = ['one', 'one','two','three','four','three']

df3

Out[40]:

	A	B	C	D	E
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901	one
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032	one
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640	two
2013-01-04	2.156849	-0.201232	0.165786	-0.162465	three
2013-01-05	-2.019862	-2.055205	1.369486	1.182199	four
2013-01-06	0.815260	0.766377	0.160439	0.353855	three

In [41]:

df3[df3['E'].isin(['two','four'])]

Out[41]:

	A	B	C	D	E
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640	two
2013-01-05	-2.019862	-2.055205	1.369486	1.182199	four

Setting a new column¶

In [42]:

s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

Out[42]:

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [43]:

df['F'] = s1
df

Out[43]:

	A	B	C	D	F
2013-01-01	-0.005574	-0.385657	-0.193270	0.264901	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640	2.0
2013-01-04	2.156849	-0.201232	0.165786	-0.162465	3.0
2013-01-05	-2.019862	-2.055205	1.369486	1.182199	4.0
2013-01-06	0.815260	0.766377	0.160439	0.353855	5.0

In [44]:

# Setting values by label
df.at[dates[0],'A'] = 0
df

Out[44]:

	A	B	C	D	F
2013-01-01	0.000000	-0.385657	-0.193270	0.264901	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640	2.0
2013-01-04	2.156849	-0.201232	0.165786	-0.162465	3.0
2013-01-05	-2.019862	-2.055205	1.369486	1.182199	4.0
2013-01-06	0.815260	0.766377	0.160439	0.353855	5.0

In [45]:

# Setting values by position
df.iat[0,1] = 0
df

Out[45]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.193270	0.264901	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	-1.391032	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	-0.717640	2.0
2013-01-04	2.156849	-0.201232	0.165786	-0.162465	3.0
2013-01-05	-2.019862	-2.055205	1.369486	1.182199	4.0
2013-01-06	0.815260	0.766377	0.160439	0.353855	5.0

In [46]:

# Setting by assigning with a numpy array
df.loc[:,'D'] = np.array([5] * len(df))
df

Out[46]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.193270	5	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0
2013-01-04	2.156849	-0.201232	0.165786	5	3.0
2013-01-05	-2.019862	-2.055205	1.369486	5	4.0
2013-01-06	0.815260	0.766377	0.160439	5	5.0

In [47]:

df2 = df.copy()
df2

Out[47]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.193270	5	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0
2013-01-04	2.156849	-0.201232	0.165786	5	3.0
2013-01-05	-2.019862	-2.055205	1.369486	5	4.0
2013-01-06	0.815260	0.766377	0.160439	5	5.0

In [48]:

# to replace all the positive value from its negative

df2[df2 > 0] = -df2
df2

Out[48]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.193270	-5	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	-5	-1.0
2013-01-03	-1.616674	-0.042173	-1.356455	-5	-2.0
2013-01-04	-2.156849	-0.201232	-0.165786	-5	-3.0
2013-01-05	-2.019862	-2.055205	-1.369486	-5	-4.0
2013-01-06	-0.815260	-0.766377	-0.160439	-5	-5.0

In [49]:

df2[df2 < 0] = -df2
df2

Out[49]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	0.193270	5	NaN
2013-01-02	0.208680	0.296488	0.790010	5	1.0
2013-01-03	1.616674	0.042173	1.356455	5	2.0
2013-01-04	2.156849	0.201232	0.165786	5	3.0
2013-01-05	2.019862	2.055205	1.369486	5	4.0
2013-01-06	0.815260	0.766377	0.160439	5	5.0

Missing Data¶

In [50]:

# Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Out[50]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	-0.193270	5	NaN	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0	NaN
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0	NaN
2013-01-04	2.156849	-0.201232	0.165786	5	3.0	NaN

In [51]:

df1.loc[dates[1],'E'] = 1
df1

Out[51]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	-0.193270	5	NaN	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0	NaN
2013-01-04	2.156849	-0.201232	0.165786	5	3.0	NaN

In [52]:

# to drop any rows that have missing data.
df1.dropna(how='any')   # if any columns have NULL or NaN

Out[52]:

	A	B	C	D	F	E
2013-01-02	-0.20868	-0.296488	-0.79001	5	1.0	1.0

In [53]:

df1.dropna()

Out[53]:

	A	B	C	D	F	E
2013-01-02	-0.20868	-0.296488	-0.79001	5	1.0	1.0

In [54]:

df1.dropna(how='all')   # if ALL columns have NULL or NaN

Out[54]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	-0.193270	5	NaN	NaN
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0	NaN
2013-01-04	2.156849	-0.201232	0.165786	5	3.0	NaN

Filling missing data¶

In [55]:

df1.fillna(3)

Out[55]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	-0.193270	5	3.0	3.0
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0	3.0
2013-01-04	2.156849	-0.201232	0.165786	5	3.0	3.0

In [56]:

df1.fillna(value=4)

Out[56]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	-0.193270	5	4.0	4.0
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0	4.0
2013-01-04	2.156849	-0.201232	0.165786	5	3.0	4.0

In [57]:

df1.fillna({'F':3, 'E':2.9})   # Fill F column with 3 and E column wih 2.9

Out[57]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	-0.193270	5	3.0	2.9
2013-01-02	-0.208680	-0.296488	-0.790010	5	1.0	1.0
2013-01-03	-1.616674	-0.042173	-1.356455	5	2.0	2.9
2013-01-04	2.156849	-0.201232	0.165786	5	3.0	2.9

In [58]:

pd.isnull(df1)

Out[58]:

	A	B	C	D	F	E
2013-01-01	False	False	False	False	True	True
2013-01-02	False	False	False	False	False	False
2013-01-03	False	False	False	False	False	True
2013-01-04	False	False	False	False	False	True

Stats¶

In [59]:

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 5 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null int32
F    5 non-null float64
dtypes: float64(4), int32(1)
memory usage: 264.0 bytes

In [60]:

df.describe()

Out[60]:

	A	B	C	D	F
count	6.000000	6.000000	6.000000	6.0	5.000000
mean	-0.145518	-0.304787	-0.107337	5.0	3.000000
std	1.544449	0.936377	0.933920	0.0	1.581139
min	-2.019862	-2.055205	-1.356455	5.0	1.000000
25%	-1.264676	-0.272674	-0.640825	5.0	2.000000
50%	-0.104340	-0.121702	-0.016415	5.0	3.000000
75%	0.611445	-0.010543	0.164450	5.0	4.000000
max	2.156849	0.766377	1.369486	5.0	5.000000

In [61]:

df.count()

Out[61]:

A    6
B    6
C    6
D    6
F    5
dtype: int64

In [62]:

df.mean()  # column wise

Out[62]:

A   -0.145518
B   -0.304787
C   -0.107337
D    5.000000
F    3.000000
dtype: float64

In [63]:

df.mean(1)  # row wise

Out[63]:

2013-01-01    1.201683
2013-01-02    0.940964
2013-01-03    0.796940
2013-01-04    2.024281
2013-01-05    1.258884
2013-01-06    2.348415
Freq: D, dtype: float64

In [64]:

df.std()

Out[64]:

A    1.544449
B    0.936377
C    0.933920
D    0.000000
F    1.581139
dtype: float64

In [65]:

df.std(1)

Out[65]:

2013-01-01    2.533850
2013-01-02    2.362652
2013-01-03    2.751355
2013-01-04    2.134913
2013-01-05    3.288437
2013-01-06    2.434258
Freq: D, dtype: float64

In [66]:

s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

Out[66]:

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [67]:

s = s.shift(2)  # shifting the content by 2 index
s

Out[67]:

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [68]:

 df.sub(s, axis='index')

Out[68]:

	A	B	C	D	F
2013-01-01	NaN	NaN	NaN	NaN	NaN
2013-01-02	NaN	NaN	NaN	NaN	NaN
2013-01-03	-2.616674	-1.042173	-2.356455	4.0	1.0
2013-01-04	-0.843151	-3.201232	-2.834214	2.0	0.0
2013-01-05	-7.019862	-7.055205	-3.630514	0.0	-1.0
2013-01-06	NaN	NaN	NaN	NaN	NaN

Apply¶

In [69]:

df.apply(np.cumsum)

Out[69]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.193270	5	NaN
2013-01-02	-0.208680	-0.296488	-0.983279	10	1.0
2013-01-03	-1.825354	-0.338661	-2.339734	15	3.0
2013-01-04	0.331495	-0.539893	-2.173948	20	6.0
2013-01-05	-1.688367	-2.595098	-0.804461	25	10.0
2013-01-06	-0.873107	-1.828721	-0.644022	30	15.0

In [70]:

df.apply(lambda x: x.max() - x.min())

Out[70]:

A    4.176711
B    2.821582
C    2.725941
D    0.000000
F    4.000000
dtype: float64

Histogramming¶

In [71]:

s = pd.Series(np.random.randint(0, 7, size=10))
s

Out[71]:

0    0
1    1
2    5
3    0
4    0
5    1
6    4
7    4
8    3
9    0
dtype: int32

In [72]:

s.value_counts()  # checking unique value counts

Out[72]:

0    4
4    2
1    2
5    1
3    1
dtype: int64

String Methods¶

In [73]:

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

Out[73]:

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [74]:

s.str.lower()

Out[74]:

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

Merge¶

Concat

In [75]:

df = pd.DataFrame(np.random.randn(10, 4))
df

Out[75]:

	0	1	2	3
0	0.862392	-0.487448	1.767044	1.619584
1	-0.192533	0.452153	0.160491	0.163961
2	0.646025	-0.710062	-0.764748	2.061172
3	-1.356019	-0.077627	-0.947133	0.952742
4	0.883336	1.352857	-1.960570	1.267786
5	1.811561	-2.001506	-0.595290	-0.820276
6	-2.535877	-0.415728	-0.000989	-1.231573
7	-0.021901	1.382943	-0.957129	-0.228767
8	0.813857	-0.864718	1.981206	1.672659
9	-0.443268	0.921847	-0.548603	0.190629

In [76]:

pieces = [df[:3], df[3:7], df[7:]]
pieces

Out[76]:

[          0         1         2         3
 0  0.862392 -0.487448  1.767044  1.619584
 1 -0.192533  0.452153  0.160491  0.163961
 2  0.646025 -0.710062 -0.764748  2.061172,
           0         1         2         3
 3 -1.356019 -0.077627 -0.947133  0.952742
 4  0.883336  1.352857 -1.960570  1.267786
 5  1.811561 -2.001506 -0.595290 -0.820276
 6 -2.535877 -0.415728 -0.000989 -1.231573,
           0         1         2         3
 7 -0.021901  1.382943 -0.957129 -0.228767
 8  0.813857 -0.864718  1.981206  1.672659
 9 -0.443268  0.921847 -0.548603  0.190629]

In [77]:

pd.concat(pieces)  # concat rowwise

Out[77]:

	0	1	2	3
0	0.862392	-0.487448	1.767044	1.619584
1	-0.192533	0.452153	0.160491	0.163961
2	0.646025	-0.710062	-0.764748	2.061172
3	-1.356019	-0.077627	-0.947133	0.952742
4	0.883336	1.352857	-1.960570	1.267786
5	1.811561	-2.001506	-0.595290	-0.820276
6	-2.535877	-0.415728	-0.000989	-1.231573
7	-0.021901	1.382943	-0.957129	-0.228767
8	0.813857	-0.864718	1.981206	1.672659
9	-0.443268	0.921847	-0.548603	0.190629

In [78]:

pd.concat(pieces, axis=1)  # concat column wise

Out[78]:

	0	1	2	3	0	1	2	3	0	1	2	3
0	0.862392	-0.487448	1.767044	1.619584	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	-0.192533	0.452153	0.160491	0.163961	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	0.646025	-0.710062	-0.764748	2.061172	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	NaN	NaN	NaN	-1.356019	-0.077627	-0.947133	0.952742	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	0.883336	1.352857	-1.960570	1.267786	NaN	NaN	NaN	NaN
5	NaN	NaN	NaN	NaN	1.811561	-2.001506	-0.595290	-0.820276	NaN	NaN	NaN	NaN
6	NaN	NaN	NaN	NaN	-2.535877	-0.415728	-0.000989	-1.231573	NaN	NaN	NaN	NaN
7	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	-0.021901	1.382943	-0.957129	-0.228767
8	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.813857	-0.864718	1.981206	1.672659
9	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	-0.443268	0.921847	-0.548603	0.190629

Join¶

pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)

In [79]:

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

left

Out[79]:

	key	lval
0	foo	1
1	foo	2

In [80]:

right

Out[80]:

	key	rval
0	foo	4
1	foo	5

In [81]:

pd.merge(left, right, on='key')

Out[81]:

	key	lval	rval
0	foo	1	4
1	foo	1	5
2	foo	2	4
3	foo	2	5

In [82]:

left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

left

Out[82]:

	key	lval
0	foo	1
1	bar	2

In [83]:

pd.merge(left, right, on='key')

Out[83]:

	key	lval	rval
0	foo	1	4
1	bar	2	5

Append¶

In [84]:

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Out[84]:

	A	B	C	D
0	-0.770577	-0.170541	-1.076229	-0.451980
1	1.916760	0.572713	1.164140	0.632860
2	-1.005490	-1.477879	-0.110988	0.870474
3	-0.316136	0.618400	-1.937116	-0.129132
4	0.175017	-0.135414	-0.088902	1.275741
5	0.385744	-0.969253	-0.307694	0.746990
6	1.229319	-1.976747	-0.243775	0.166781
7	-0.940357	-0.894759	0.202037	0.233910

In [85]:

s = df.iloc[3]
s

Out[85]:

A   -0.316136
B    0.618400
C   -1.937116
D   -0.129132
Name: 3, dtype: float64

In [86]:

df.append(s, ignore_index=True)
df

Out[86]:

	A	B	C	D
0	-0.770577	-0.170541	-1.076229	-0.451980
1	1.916760	0.572713	1.164140	0.632860
2	-1.005490	-1.477879	-0.110988	0.870474
3	-0.316136	0.618400	-1.937116	-0.129132
4	0.175017	-0.135414	-0.088902	1.275741
5	0.385744	-0.969253	-0.307694	0.746990
6	1.229319	-1.976747	-0.243775	0.166781
7	-0.940357	-0.894759	0.202037	0.233910

Grouping¶

By “group by” we are referring to a process involving one or more of the following steps

    Splitting the data into groups based on some criteria
    Applying a function to each group independently
    Combining the results into a data structure

In [87]:

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                       'B' : ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                       'C' : np.random.randn(8),
                       'D' : np.random.randn(8)})
df

Out[87]:

	A	B	C	D
0	foo	one	0.206509	-0.437572
1	bar	one	-1.189741	-0.096997
2	foo	two	1.289879	-0.130252
3	bar	three	0.103044	0.251160
4	foo	two	1.495027	-1.839108
5	bar	two	1.084158	-0.955764
6	foo	one	0.064089	0.721713
7	foo	three	-1.174013	1.183452

In [88]:

df.groupby('A').sum()

Out[88]:

	C	D
A
bar	-0.002539	-0.801601
foo	1.881491	-0.501767

In [89]:

df.groupby(['A','B']).sum()

Out[89]:

		C	D
A	B
bar	one	-1.189741	-0.096997
	three	0.103044	0.251160
	two	1.084158	-0.955764
foo	one	0.270598	0.284141
	three	-1.174013	1.183452
	two	2.784906	-1.969361

Reshaping¶

In [90]:

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                        ['one', 'two', 'one', 'two',
                         'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

Out[90]:

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [91]:

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

Out[91]:

		A	B
first	second
bar	one	0.077103	0.071665
bar	two	0.243469	0.789529
baz	one	-0.452906	0.148716
baz	two	-0.829278	-1.180111
foo	one	-0.251384	-0.004602
foo	two	-0.857481	0.512469
qux	one	1.660127	-1.580966
qux	two	0.521759	-1.693887

The stack() method “compresses” a level in the DataFrame’s columns

In [92]:

df.stack()

Out[92]:

first  second   
bar    one     A    0.077103
               B    0.071665
       two     A    0.243469
               B    0.789529
baz    one     A   -0.452906
               B    0.148716
       two     A   -0.829278
               B   -1.180111
foo    one     A   -0.251384
               B   -0.004602
       two     A   -0.857481
               B    0.512469
qux    one     A    1.660127
               B   -1.580966
       two     A    0.521759
               B   -1.693887
dtype: float64

With a “stacked” DataFrame or Series (having a MultiIndex as the index), the inverse operation of stack() is unstack(), which by default unstacks the last level:

In [93]:

df.unstack()  # this will unstack the inner index (last level)

Out[93]:

	A		B
second	one	two	one	two
first
bar	0.077103	0.243469	0.071665	0.789529
baz	-0.452906	-0.829278	0.148716	-1.180111
foo	-0.251384	-0.857481	-0.004602	0.512469
qux	1.660127	0.521759	-1.580966	-1.693887

In [94]:

df.unstack('first')  # unstacking by lable

Out[94]:

	A				B
first	bar	baz	foo	qux	bar	baz	foo	qux
second
one	0.077103	-0.452906	-0.251384	1.660127	0.071665	0.148716	-0.004602	-1.580966
two	0.243469	-0.829278	-0.857481	0.521759	0.789529	-1.180111	0.512469	-1.693887

In [95]:

df.unstack('second')  

Out[95]:

	A		B
second	one	two	one	two
first
bar	0.077103	0.243469	0.071665	0.789529
baz	-0.452906	-0.829278	0.148716	-1.180111
foo	-0.251384	-0.857481	-0.004602	0.512469
qux	1.660127	0.521759	-1.580966	-1.693887

In [96]:

df.unstack(0)    # unstacking by index

Out[96]:

	A				B
first	bar	baz	foo	qux	bar	baz	foo	qux
second
one	0.077103	-0.452906	-0.251384	1.660127	0.071665	0.148716	-0.004602	-1.580966
two	0.243469	-0.829278	-0.857481	0.521759	0.789529	-1.180111	0.512469	-1.693887

Pivot Tables¶

In [97]:

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                       'B' : ['A', 'B', 'C'] * 4,
                       'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                       'D' : np.random.randn(12),
                       'E' : np.random.randn(12)})
df

Out[97]:

	A	B	C	D	E
0	one	A	foo	0.869958	-2.824236
1	one	B	foo	-1.263218	0.101729
2	two	C	foo	-0.743723	1.409101
3	three	A	bar	0.292353	0.282451
4	one	B	bar	-0.091658	-0.752110
5	one	C	bar	1.295819	-0.448113
6	two	A	foo	1.573841	0.196957
7	three	B	foo	-0.645814	1.409150
8	one	C	foo	0.635127	-1.389018
9	one	A	bar	-1.174726	-0.077247
10	two	B	bar	1.264011	-1.072017
11	three	C	bar	1.441732	0.543449

In [98]:

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Out[98]:

	C	bar	foo
A	B
one	A	-1.174726	0.869958
	B	-0.091658	-1.263218
	C	1.295819	0.635127
three	A	0.292353	NaN
	B	NaN	-0.645814
	C	1.441732	NaN
two	A	NaN	1.573841
	B	1.264011	NaN
	C	NaN	-0.743723

Time Series¶

In [99]:

rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng[:5]

Out[99]:

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04'],
              dtype='datetime64[ns]', freq='S')

In [100]:

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts[:8]

Out[100]:

2012-01-01 00:00:00      0
2012-01-01 00:00:01    175
2012-01-01 00:00:02      9
2012-01-01 00:00:03    173
2012-01-01 00:00:04    285
2012-01-01 00:00:05    262
2012-01-01 00:00:06    148
2012-01-01 00:00:07    255
Freq: S, dtype: int32

In [101]:

ts.sum()

Out[101]:

In [102]:

ts.resample('5Min').sum()

Out[102]:

2012-01-01    26318
Freq: 5T, dtype: int32

In [103]:

ts.resample('5Min')

C:\Anaconda3\lib\site-packages\IPython\lib\pretty.py:108: FutureWarning: .resample() is now a deferred operation
use .resample(...).mean() instead of .resample(...)
  return getattr(obj, attr, default)

Out[103]:

DatetimeIndexResampler [freq=<5 * Minutes>, axis=0, closed=left, label=left, convention=start, base=0]

Time zone representation

In [104]:

rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng

Out[104]:

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [105]:

ts = pd.Series(np.random.randn(len(rng)), rng)
ts

Out[105]:

2012-03-06    1.166836
2012-03-07    0.309653
2012-03-08    0.953754
2012-03-09    0.606327
2012-03-10   -0.804818
Freq: D, dtype: float64

In [106]:

ts_utc = ts.tz_localize('UTC')
ts_utc

Out[106]:

2012-03-06 00:00:00+00:00    1.166836
2012-03-07 00:00:00+00:00    0.309653
2012-03-08 00:00:00+00:00    0.953754
2012-03-09 00:00:00+00:00    0.606327
2012-03-10 00:00:00+00:00   -0.804818
Freq: D, dtype: float64

Convert to another time zone

In [107]:

ts_utc.tz_convert('Asia/Calcutta')

Out[107]:

2012-03-06 05:30:00+05:30    1.166836
2012-03-07 05:30:00+05:30    0.309653
2012-03-08 05:30:00+05:30    0.953754
2012-03-09 05:30:00+05:30    0.606327
2012-03-10 05:30:00+05:30   -0.804818
Freq: D, dtype: float64

Converting between time span representations

In [108]:

rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts

Out[108]:

2012-01-31   -1.609627
2012-02-29   -1.381380
2012-03-31    1.065309
2012-04-30   -0.192158
2012-05-31   -0.742545
Freq: M, dtype: float64

In [109]:

ps = ts.to_period()
ps

Out[109]:

2012-01   -1.609627
2012-02   -1.381380
2012-03    1.065309
2012-04   -0.192158
2012-05   -0.742545
Freq: M, dtype: float64

In [110]:

ps.to_timestamp()

Out[110]:

2012-01-01   -1.609627
2012-02-01   -1.381380
2012-03-01    1.065309
2012-04-01   -0.192158
2012-05-01   -0.742545
Freq: MS, dtype: float64

Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly frequency with year ending in November to 9am of the end of the month following the quarter end:

In [111]:

prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)

ts.head()

Out[111]:

1990Q1   -0.583649
1990Q2    0.855731
1990Q3   -0.625833
1990Q4    1.185059
1991Q1   -0.317823
Freq: Q-NOV, dtype: float64

In [112]:

ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9

ts.head()

Out[112]:

1990-03-01 09:00   -0.583649
1990-06-01 09:00    0.855731
1990-09-01 09:00   -0.625833
1990-12-01 09:00    1.185059
1991-03-01 09:00   -0.317823
Freq: H, dtype: float64

Categoricals¶

In [113]:

df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

df

Out[113]:

	id	raw_grade
0	1	a
1	2	b
2	3	b
3	4	a
4	5	a
5	6	e

In [114]:

df.dtypes

Out[114]:

id            int64
raw_grade    object
dtype: object

In [115]:

df["grade"] = df["raw_grade"].astype("category")

df["grade"]

Out[115]:

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

Rename the categories to more meaningful names (assigning to Series.cat.categories is inplace!)

In [116]:

df["grade"].cat.categories = ["very good", "good", "very bad"]

df

Out[116]:

	id	raw_grade	grade
0	1	a	very good
1	2	b	good
2	3	b	good
3	4	a	very good
4	5	a	very good
5	6	e	very bad

In [117]:

# Reorder the categories and simultaneously add the
# missing categories (methods under Series .cat return a new Series per default).

df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

df

Out[117]:

	id	raw_grade	grade
0	1	a	very good
1	2	b	good
2	3	b	good
3	4	a	very good
4	5	a	very good
5	6	e	very bad

In [118]:

# Sorting is per order in the categories, not lexical order.
df.sort_values(by="grade")

Out[118]:

	id	raw_grade	grade
5	6	e	very bad
1	2	b	good
2	3	b	good
0	1	a	very good
3	4	a	very good
4	5	a	very good

In [119]:

# Grouping by a categorical column shows also empty categories.
df.groupby("grade").size()

Out[119]:

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

Plotting¶

In [120]:

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

ts = ts.cumsum()

ts.plot()

Out[120]:

<matplotlib.axes._subplots.AxesSubplot at 0x8d607b8>

In [121]:

# On DataFrame, plot() is a convenience to plot all of the columns with labels:

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

df.plot(grid=True)

Out[121]:

<matplotlib.axes._subplots.AxesSubplot at 0x8f4ecf8>

Data Read and Write¶

In [122]:

df[:4]

Out[122]:

	A	B	C	D
2000-01-01	0.383624	0.429101	0.743808	1.577623
2000-01-02	-0.609680	0.453151	1.483189	0.865432
2000-01-03	-0.652463	0.116940	2.300037	-0.406062
2000-01-04	-1.804128	0.703522	1.802338	-0.694557

to_csv and read_csv

In [123]:

# writing it to csv
df.to_csv("dataset/df_as_csv.csv")

In [124]:

# reading it 
df2 = pd.read_csv("dataset/df_as_csv.csv")
df2.head()

Out[124]:

	Unnamed: 0	A	B	C	D
0	2000-01-01	0.383624	0.429101	0.743808	1.577623
1	2000-01-02	-0.609680	0.453151	1.483189	0.865432
2	2000-01-03	-0.652463	0.116940	2.300037	-0.406062
3	2000-01-04	-1.804128	0.703522	1.802338	-0.694557
4	2000-01-05	-3.547775	0.947640	1.720123	-0.952502

for hdfs, to_hdf and read_hdf

In [125]:

df.to_hdf('foo.h5','df')

In [126]:

pd.read_hdf('foo.h5','df').head()

Out[126]:

	A	B	C	D
2000-01-01	0.383624	0.429101	0.743808	1.577623
2000-01-02	-0.609680	0.453151	1.483189	0.865432
2000-01-03	-0.652463	0.116940	2.300037	-0.406062
2000-01-04	-1.804128	0.703522	1.802338	-0.694557
2000-01-05	-3.547775	0.947640	1.720123	-0.952502

For Excel, to_excel and read_excel

In [127]:

 df.to_excel('foo.xlsx', sheet_name='Sheet1')

In [129]:

pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()

Out[129]:

	A	B	C	D
2000-01-01	0.383624	0.429101	0.743808	1.577623
2000-01-02	-0.609680	0.453151	1.483189	0.865432
2000-01-03	-0.652463	0.116940	2.300037	-0.406062
2000-01-04	-1.804128	0.703522	1.802338	-0.694557
2000-01-05	-3.547775	0.947640	1.720123	-0.952502

In [ ]: