# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# creating a Series
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
s1
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
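A Series can also carry an explicit index instead of the default 0..n-1 integers; a minimal sketch (the labels here are illustrative):
s2 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2['b'] # label-based lookup returns 20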
# creating DataFrames
dates = pd.date_range('20130101', periods=6)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'], dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 |
# Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
df2
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
# checking Data Types
df2.dtypes
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
# Use df2.<TAB> for tab completion of column names and DataFrame attributes
# first 5 rows
df.head()
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 |
# last 3 rows
df.tail(3)
A | B | C | D | |
---|---|---|---|---|
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 |
# checking df index
df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'], dtype='datetime64[ns]', freq='D')
# column names
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# df values
df.values
array([[-0.00557444, -0.38565665, -0.19326961,  0.26490064],
       [-0.20867983, -0.29648787, -0.79000981, -1.39103244],
       [-1.6166742 , -0.04217266, -1.35645484, -0.71763967],
       [ 2.15684898, -0.20123222,  0.16578647, -0.16246515],
       [-2.01986196, -2.05520516,  1.36948649,  1.18219902],
       [ 0.81525976,  0.76637688,  0.16043882,  0.35385518]])
# information about the DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes
# descriptive statistics summary
df.describe()
A | B | C | D | |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | -0.146447 | -0.369063 | -0.107337 | -0.078364 |
std | 1.544346 | 0.924431 | 0.933920 | 0.898054 |
min | -2.019862 | -2.055205 | -1.356455 | -1.391032 |
25% | -1.264676 | -0.363364 | -0.640825 | -0.578846 |
50% | -0.107127 | -0.248860 | -0.016415 | 0.051218 |
75% | 0.610051 | -0.081938 | 0.164450 | 0.331617 |
max | 2.156849 | 0.766377 | 1.369486 | 1.182199 |
df.T
2013-01-01 00:00:00 | 2013-01-02 00:00:00 | 2013-01-03 00:00:00 | 2013-01-04 00:00:00 | 2013-01-05 00:00:00 | 2013-01-06 00:00:00 | |
---|---|---|---|---|---|---|
A | -0.005574 | -0.208680 | -1.616674 | 2.156849 | -2.019862 | 0.815260 |
B | -0.385657 | -0.296488 | -0.042173 | -0.201232 | -2.055205 | 0.766377 |
C | -0.193270 | -0.790010 | -1.356455 | 0.165786 | 1.369486 | 0.160439 |
D | 0.264901 | -1.391032 | -0.717640 | -0.162465 | 1.182199 | 0.353855 |
# sorting by axis labels (axis=1 sorts the column labels, here descending)
df.sort_index(axis=1, ascending=False)
D | C | B | A | |
---|---|---|---|---|
2013-01-01 | 0.264901 | -0.193270 | -0.385657 | -0.005574 |
2013-01-02 | -1.391032 | -0.790010 | -0.296488 | -0.208680 |
2013-01-03 | -0.717640 | -1.356455 | -0.042173 | -1.616674 |
2013-01-04 | -0.162465 | 0.165786 | -0.201232 | 2.156849 |
2013-01-05 | 1.182199 | 1.369486 | -2.055205 | -2.019862 |
2013-01-06 | 0.353855 | 0.160439 | 0.766377 | 0.815260 |
# sorting by values
df.sort_values(by="B")
A | B | C | D | |
---|---|---|---|---|
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 |
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 |
df.sort_values(by=["B", "A"])
A | B | C | D | |
---|---|---|---|---|
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 |
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 |
In pandas, data can be accessed with the methods .at, .iat, .loc and .iloc (the older .ix is deprecated and should be avoided).
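A quick sketch of how these accessors relate, using the df and dates built above; all four expressions return the same scalar:
df.loc[dates[0], 'A']   # by label
df.iloc[0, 0]           # by integer position
df.at[dates[0], 'A']    # fast scalar access by label
df.iat[0, 0]            # fast scalar access by position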
# selecting a column A
df['A']
2013-01-01   -0.005574
2013-01-02   -0.208680
2013-01-03   -1.616674
2013-01-04    2.156849
2013-01-05   -2.019862
2013-01-06    0.815260
Freq: D, Name: A, dtype: float64
# slicing rows
df[0:3]
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
df['2013-01-01':'2013-01-03']
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
# cross-section using a label
df.loc[dates[0]]
A   -0.005574
B   -0.385657
C   -0.193270
D    0.264901
Name: 2013-01-01 00:00:00, dtype: float64
df.loc[:, ['A', 'B']] # [row, column]
A | B | |
---|---|---|
2013-01-01 | -0.005574 | -0.385657 |
2013-01-02 | -0.208680 | -0.296488 |
2013-01-03 | -1.616674 | -0.042173 |
2013-01-04 | 2.156849 | -0.201232 |
2013-01-05 | -2.019862 | -2.055205 |
2013-01-06 | 0.815260 | 0.766377 |
# Showing label slicing, both endpoints are included
df.loc['20130102':'20130104',['A','B']]
A | B | |
---|---|---|
2013-01-02 | -0.208680 | -0.296488 |
2013-01-03 | -1.616674 | -0.042173 |
2013-01-04 | 2.156849 | -0.201232 |
df.loc['20130102',['A','B']]
A   -0.208680
B   -0.296488
Name: 2013-01-02 00:00:00, dtype: float64
# selecting with a list of columns returns a Series
df.loc['20130102',['A']]
A   -0.20868
Name: 2013-01-02 00:00:00, dtype: float64
# for getting a scalar value, pass a single column label
df.loc['20130102','B']
-0.29648786843717295
# for faster scalar access, use .at with the exact label, e.g.
# df.at[dates[0], 'A']
# Select via the position of the passed integers
df.iloc[3]
A    2.156849
B   -0.201232
C    0.165786
D   -0.162465
Name: 2013-01-04 00:00:00, dtype: float64
# By integer slices, acting similar to numpy/python
df.iloc[3:5, 2:4] # rows 3 and 4, columns 2 and 3
C | D | |
---|---|---|
2013-01-04 | 0.165786 | -0.162465 |
2013-01-05 | 1.369486 | 1.182199 |
# By lists of integer position locations, similar to the numpy/python style
df.iloc[[1,3,4], 2:4]
C | D | |
---|---|---|
2013-01-02 | -0.790010 | -1.391032 |
2013-01-04 | 0.165786 | -0.162465 |
2013-01-05 | 1.369486 | 1.182199 |
# For slicing rows explicitly
df.iloc[1:3,:]
A | B | C | D | |
---|---|---|---|---|
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 |
# For slicing columns explicitly
df.iloc[:,1:3]
B | C | |
---|---|---|
2013-01-01 | -0.385657 | -0.193270 |
2013-01-02 | -0.296488 | -0.790010 |
2013-01-03 | -0.042173 | -1.356455 |
2013-01-04 | -0.201232 | 0.165786 |
2013-01-05 | -2.055205 | 1.369486 |
2013-01-06 | 0.766377 | 0.160439 |
# For getting a value explicitly
df.iloc[1,1]
-0.29648786843717295
df.loc['2013-01-02', 'B']
-0.29648786843717295
# For getting fast access to a scalar (equiv to the prior method)
df.iat[1,1]
-0.29648786843717295
# boolean indexing: a condition on a single column gives a boolean Series
df.B > 0
2013-01-01    False
2013-01-02    False
2013-01-03    False
2013-01-04    False
2013-01-05    False
2013-01-06     True
Freq: D, Name: B, dtype: bool
df[df.B > 0] # rows where B is positive
A | B | C | D | |
---|---|---|---|---|
2013-01-06 | 0.81526 | 0.766377 | 0.160439 | 0.353855 |
df[df > 0] # element-wise: values failing the condition become NaN
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | NaN | NaN | NaN | 0.264901 |
2013-01-02 | NaN | NaN | NaN | NaN |
2013-01-03 | NaN | NaN | NaN | NaN |
2013-01-04 | 2.156849 | NaN | 0.165786 | NaN |
2013-01-05 | NaN | NaN | 1.369486 | 1.182199 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 |
# Using the isin() method for filtering:
df3 = df.copy()
df3['E'] = ['one', 'one','two','three','four','three']
df3
A | B | C | D | E | |
---|---|---|---|---|---|
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 | one |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 | one |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 | two |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 | three |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 | four |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 | three |
df3[df3['E'].isin(['two','four'])]
A | B | C | D | E | |
---|---|---|---|---|---|
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 | two |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 | four |
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
df['F'] = s1 # assignment aligns on the index: 2013-01-01 has no match, so it gets NaN
df
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | -0.005574 | -0.385657 | -0.193270 | 0.264901 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 | 2.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 | 3.0 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 | 4.0 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 | 5.0 |
# Setting values by label
df.at[dates[0],'A'] = 0
df
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | -0.385657 | -0.193270 | 0.264901 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 | 2.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 | 3.0 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 | 4.0 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 | 5.0 |
# Setting values by position
df.iat[0,1] = 0
df
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 0.264901 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -1.391032 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -0.717640 | 2.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | -0.162465 | 3.0 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 1.182199 | 4.0 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 0.353855 | 5.0 |
# Setting by assigning with a numpy array
df.loc[:,'D'] = np.array([5] * len(df))
df
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 5 | 4.0 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 5 | 5.0 |
df2 = df.copy()
df2
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 |
2013-01-05 | -2.019862 | -2.055205 | 1.369486 | 5 | 4.0 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 5 | 5.0 |
# replace every positive value with its negative
df2[df2 > 0] = -df2
df2
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | -5 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | -5 | -1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | -5 | -2.0 |
2013-01-04 | -2.156849 | -0.201232 | -0.165786 | -5 | -3.0 |
2013-01-05 | -2.019862 | -2.055205 | -1.369486 | -5 | -4.0 |
2013-01-06 | -0.815260 | -0.766377 | -0.160439 | -5 | -5.0 |
df2[df2 < 0] = -df2 # and flip the negatives back, making all values positive
df2
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | 0.193270 | 5 | NaN |
2013-01-02 | 0.208680 | 0.296488 | 0.790010 | 5 | 1.0 |
2013-01-03 | 1.616674 | 0.042173 | 1.356455 | 5 | 2.0 |
2013-01-04 | 2.156849 | 0.201232 | 0.165786 | 5 | 3.0 |
2013-01-05 | 2.019862 | 2.055205 | 1.369486 | 5 | 4.0 |
2013-01-06 | 0.815260 | 0.766377 | 0.160439 | 5 | 5.0 |
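Taken together, the two masked assignments amount to taking absolute values. Assuming the assignments preserved the original dtypes, this sketch should confirm it (.equals treats aligned NaNs as equal):
df2.equals(df.abs()) # expected: True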
# Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | NaN | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 | NaN |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 | NaN |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 | NaN |
df1.loc[dates[1],'E'] = 1
df1
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | NaN | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 | NaN |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 | NaN |
# to drop any rows that have missing data.
df1.dropna(how='any') # drop a row if ANY of its values is NaN
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-02 | -0.20868 | -0.296488 | -0.79001 | 5 | 1.0 | 1.0 |
df1.dropna()
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-02 | -0.20868 | -0.296488 | -0.79001 | 5 | 1.0 | 1.0 |
df1.dropna(how='all') # drop a row only if ALL of its values are NaN
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | NaN | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 | NaN |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 | NaN |
df1.fillna(3)
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | 3.0 | 3.0 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 | 3.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 | 3.0 |
df1.fillna(value=4)
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | 4.0 | 4.0 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 | 4.0 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 | 4.0 |
df1.fillna({'F':3, 'E':2.9}) # fill column F with 3 and column E with 2.9
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | 3.0 | 2.9 |
2013-01-02 | -0.208680 | -0.296488 | -0.790010 | 5 | 1.0 | 1.0 |
2013-01-03 | -1.616674 | -0.042173 | -1.356455 | 5 | 2.0 | 2.9 |
2013-01-04 | 2.156849 | -0.201232 | 0.165786 | 5 | 3.0 | 2.9 |
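Missing values can also be propagated from the previous row rather than replaced with a constant; a sketch (newer pandas spells this df1.ffill()):
df1.fillna(method='ffill') # carry the last valid value forward; leading NaNs remain NaN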
pd.isnull(df1)
A | B | C | D | F | E | |
---|---|---|---|---|---|---|
2013-01-01 | False | False | False | False | True | True |
2013-01-02 | False | False | False | False | False | False |
2013-01-03 | False | False | False | False | False | True |
2013-01-04 | False | False | False | False | False | True |
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 5 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null int32
F    5 non-null float64
dtypes: float64(4), int32(1)
memory usage: 264.0 bytes
df.describe()
A | B | C | D | F | |
---|---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.0 | 5.000000 |
mean | -0.145518 | -0.304787 | -0.107337 | 5.0 | 3.000000 |
std | 1.544449 | 0.936377 | 0.933920 | 0.0 | 1.581139 |
min | -2.019862 | -2.055205 | -1.356455 | 5.0 | 1.000000 |
25% | -1.264676 | -0.272674 | -0.640825 | 5.0 | 2.000000 |
50% | -0.104340 | -0.121702 | -0.016415 | 5.0 | 3.000000 |
75% | 0.611445 | -0.010543 | 0.164450 | 5.0 | 4.000000 |
max | 2.156849 | 0.766377 | 1.369486 | 5.0 | 5.000000 |
df.count()
A    6
B    6
C    6
D    6
F    5
dtype: int64
df.mean() # column wise
A   -0.145518
B   -0.304787
C   -0.107337
D    5.000000
F    3.000000
dtype: float64
df.mean(1) # row wise
2013-01-01    1.201683
2013-01-02    0.940964
2013-01-03    0.796940
2013-01-04    2.024281
2013-01-05    1.258884
2013-01-06    2.348415
Freq: D, dtype: float64
df.std()
A    1.544449
B    0.936377
C    0.933920
D    0.000000
F    1.581139
dtype: float64
df.std(1)
2013-01-01    2.533850
2013-01-02    2.362652
2013-01-03    2.751355
2013-01-04    2.134913
2013-01-05    3.288437
2013-01-06    2.434258
Freq: D, dtype: float64
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s
2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64
s = s.shift(2) # shift the values forward by 2 positions; vacated slots become NaN
s
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
df.sub(s, axis='index')
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | NaN | NaN | NaN | NaN | NaN |
2013-01-02 | NaN | NaN | NaN | NaN | NaN |
2013-01-03 | -2.616674 | -1.042173 | -2.356455 | 4.0 | 1.0 |
2013-01-04 | -0.843151 | -3.201232 | -2.834214 | 2.0 | 0.0 |
2013-01-05 | -7.019862 | -7.055205 | -3.630514 | 0.0 | -1.0 |
2013-01-06 | NaN | NaN | NaN | NaN | NaN |
df.apply(np.cumsum)
A | B | C | D | F | |
---|---|---|---|---|---|
2013-01-01 | 0.000000 | 0.000000 | -0.193270 | 5 | NaN |
2013-01-02 | -0.208680 | -0.296488 | -0.983279 | 10 | 1.0 |
2013-01-03 | -1.825354 | -0.338661 | -2.339734 | 15 | 3.0 |
2013-01-04 | 0.331495 | -0.539893 | -2.173948 | 20 | 6.0 |
2013-01-05 | -1.688367 | -2.595098 | -0.804461 | 25 | 10.0 |
2013-01-06 | -0.873107 | -1.828721 | -0.644022 | 30 | 15.0 |
df.apply(lambda x: x.max() - x.min())
A    4.176711
B    2.821582
C    2.725941
D    0.000000
F    4.000000
dtype: float64
s = pd.Series(np.random.randint(0, 7, size=10))
s
0    0
1    1
2    5
3    0
4    0
5    1
6    4
7    4
8    3
9    0
dtype: int32
s.value_counts() # checking unique value counts
0    4
4    2
1    2
5    1
3    1
dtype: int64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
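Other vectorized string methods work the same way; for instance, a boolean mask from a substring test (the na= value is a choice, not a requirement):
s.str.contains('a', na=False) # True where 'a' occurs; the NaN entry becomes False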
df = pd.DataFrame(np.random.randn(10, 4))
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.862392 | -0.487448 | 1.767044 | 1.619584 |
1 | -0.192533 | 0.452153 | 0.160491 | 0.163961 |
2 | 0.646025 | -0.710062 | -0.764748 | 2.061172 |
3 | -1.356019 | -0.077627 | -0.947133 | 0.952742 |
4 | 0.883336 | 1.352857 | -1.960570 | 1.267786 |
5 | 1.811561 | -2.001506 | -0.595290 | -0.820276 |
6 | -2.535877 | -0.415728 | -0.000989 | -1.231573 |
7 | -0.021901 | 1.382943 | -0.957129 | -0.228767 |
8 | 0.813857 | -0.864718 | 1.981206 | 1.672659 |
9 | -0.443268 | 0.921847 | -0.548603 | 0.190629 |
pieces = [df[:3], df[3:7], df[7:]]
pieces
[          0         1         2         3
 0  0.862392 -0.487448  1.767044  1.619584
 1 -0.192533  0.452153  0.160491  0.163961
 2  0.646025 -0.710062 -0.764748  2.061172,
           0         1         2         3
 3 -1.356019 -0.077627 -0.947133  0.952742
 4  0.883336  1.352857 -1.960570  1.267786
 5  1.811561 -2.001506 -0.595290 -0.820276
 6 -2.535877 -0.415728 -0.000989 -1.231573,
           0         1         2         3
 7 -0.021901  1.382943 -0.957129 -0.228767
 8  0.813857 -0.864718  1.981206  1.672659
 9 -0.443268  0.921847 -0.548603  0.190629]
pd.concat(pieces) # concat rowwise
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.862392 | -0.487448 | 1.767044 | 1.619584 |
1 | -0.192533 | 0.452153 | 0.160491 | 0.163961 |
2 | 0.646025 | -0.710062 | -0.764748 | 2.061172 |
3 | -1.356019 | -0.077627 | -0.947133 | 0.952742 |
4 | 0.883336 | 1.352857 | -1.960570 | 1.267786 |
5 | 1.811561 | -2.001506 | -0.595290 | -0.820276 |
6 | -2.535877 | -0.415728 | -0.000989 | -1.231573 |
7 | -0.021901 | 1.382943 | -0.957129 | -0.228767 |
8 | 0.813857 | -0.864718 | 1.981206 | 1.672659 |
9 | -0.443268 | 0.921847 | -0.548603 | 0.190629 |
pd.concat(pieces, axis=1) # concat column wise
0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.862392 | -0.487448 | 1.767044 | 1.619584 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | -0.192533 | 0.452153 | 0.160491 | 0.163961 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0.646025 | -0.710062 | -0.764748 | 2.061172 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | -1.356019 | -0.077627 | -0.947133 | 0.952742 | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | 0.883336 | 1.352857 | -1.960570 | 1.267786 | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | 1.811561 | -2.001506 | -0.595290 | -0.820276 | NaN | NaN | NaN | NaN |
6 | NaN | NaN | NaN | NaN | -2.535877 | -0.415728 | -0.000989 | -1.231573 | NaN | NaN | NaN | NaN |
7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.021901 | 1.382943 | -0.957129 | -0.228767 |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.813857 | -0.864718 | 1.981206 | 1.672659 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.443268 | 0.921847 | -0.548603 | 0.190629 |
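The NaN blocks appear because concat aligns on the index and the three pieces have disjoint row labels. If the goal is to place them side by side regardless of labels, one option (a sketch) is to drop the indexes first:
pd.concat([p.reset_index(drop=True) for p in pieces], axis=1)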
SQL-style merges are done with pd.merge; its signature:
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
key | lval | |
---|---|---|
0 | foo | 1 |
1 | foo | 2 |
right
key | rval | |
---|---|---|
0 | foo | 4 |
1 | foo | 5 |
pd.merge(left, right, on='key')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | foo | 1 | 5 |
2 | foo | 2 | 4 |
3 | foo | 2 | 5 |
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
key | lval | |
---|---|---|
0 | foo | 1 |
1 | bar | 2 |
pd.merge(left, right, on='key')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
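When the key sets differ, how= decides which keys survive; a small sketch with illustrative keys:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'baz'], 'rval': [4, 5]})
pd.merge(left, right, on='key', how='outer') # keeps foo, bar and baz; unmatched sides become NaN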
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
A | B | C | D | |
---|---|---|---|---|
0 | -0.770577 | -0.170541 | -1.076229 | -0.451980 |
1 | 1.916760 | 0.572713 | 1.164140 | 0.632860 |
2 | -1.005490 | -1.477879 | -0.110988 | 0.870474 |
3 | -0.316136 | 0.618400 | -1.937116 | -0.129132 |
4 | 0.175017 | -0.135414 | -0.088902 | 1.275741 |
5 | 0.385744 | -0.969253 | -0.307694 | 0.746990 |
6 | 1.229319 | -1.976747 | -0.243775 | 0.166781 |
7 | -0.940357 | -0.894759 | 0.202037 | 0.233910 |
s = df.iloc[3]
s
A   -0.316136
B    0.618400
C   -1.937116
D   -0.129132
Name: 3, dtype: float64
# append returns a new DataFrame, so assign the result
# (in newer pandas, DataFrame.append is removed; use pd.concat([df, s.to_frame().T], ignore_index=True))
df = df.append(s, ignore_index=True)
df
A | B | C | D | |
---|---|---|---|---|
0 | -0.770577 | -0.170541 | -1.076229 | -0.451980 |
1 | 1.916760 | 0.572713 | 1.164140 | 0.632860 |
2 | -1.005490 | -1.477879 | -0.110988 | 0.870474 |
3 | -0.316136 | 0.618400 | -1.937116 | -0.129132 |
4 | 0.175017 | -0.135414 | -0.088902 | 1.275741 |
5 | 0.385744 | -0.969253 | -0.307694 | 0.746990 |
6 | 1.229319 | -1.976747 | -0.243775 | 0.166781 |
7 | -0.940357 | -0.894759 | 0.202037 | 0.233910 |
8 | -0.316136 | 0.618400 | -1.937116 | -0.129132 |
By “group by” we are referring to a process involving one or more of the following steps:
Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | 0.206509 | -0.437572 |
1 | bar | one | -1.189741 | -0.096997 |
2 | foo | two | 1.289879 | -0.130252 |
3 | bar | three | 0.103044 | 0.251160 |
4 | foo | two | 1.495027 | -1.839108 |
5 | bar | two | 1.084158 | -0.955764 |
6 | foo | one | 0.064089 | 0.721713 |
7 | foo | three | -1.174013 | 1.183452 |
df.groupby('A').sum()
C | D | |
---|---|---|
A | ||
bar | -0.002539 | -0.801601 |
foo | 1.881491 | -0.501767 |
df.groupby(['A','B']).sum()
C | D | ||
---|---|---|---|
A | B | ||
bar | one | -1.189741 | -0.096997 |
three | 0.103044 | 0.251160 | |
two | 1.084158 | -0.955764 | |
foo | one | 0.270598 | 0.284141 |
three | -1.174013 | 1.183452 | |
two | 2.784906 | -1.969361 |
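groupby is not limited to sum(); .agg can apply a different reduction to each column. A sketch against the df above:
df.groupby('A').agg({'C': 'sum', 'D': 'mean'})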
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df
A | B | ||
---|---|---|---|
first | second | ||
bar | one | 0.077103 | 0.071665 |
two | 0.243469 | 0.789529 | |
baz | one | -0.452906 | 0.148716 |
two | -0.829278 | -1.180111 | |
foo | one | -0.251384 | -0.004602 |
two | -0.857481 | 0.512469 | |
qux | one | 1.660127 | -1.580966 |
two | 0.521759 | -1.693887 |
The stack() method “compresses” a level in the DataFrame’s columns
df.stack()
first  second
bar    one     A    0.077103
               B    0.071665
       two     A    0.243469
               B    0.789529
baz    one     A   -0.452906
               B    0.148716
       two     A   -0.829278
               B   -1.180111
foo    one     A   -0.251384
               B   -0.004602
       two     A   -0.857481
               B    0.512469
qux    one     A    1.660127
               B   -1.580966
       two     A    0.521759
               B   -1.693887
dtype: float64
With a “stacked” DataFrame or Series (having a MultiIndex as the index), the inverse operation of stack() is unstack(), which by default unstacks the last level:
df.unstack() # this will unstack the inner index (last level)
A | B | |||
---|---|---|---|---|
second | one | two | one | two |
first | ||||
bar | 0.077103 | 0.243469 | 0.071665 | 0.789529 |
baz | -0.452906 | -0.829278 | 0.148716 | -1.180111 |
foo | -0.251384 | -0.857481 | -0.004602 | 0.512469 |
qux | 1.660127 | 0.521759 | -1.580966 | -1.693887 |
df.unstack('first') # unstacking by label
A | B | |||||||
---|---|---|---|---|---|---|---|---|
first | bar | baz | foo | qux | bar | baz | foo | qux |
second | ||||||||
one | 0.077103 | -0.452906 | -0.251384 | 1.660127 | 0.071665 | 0.148716 | -0.004602 | -1.580966 |
two | 0.243469 | -0.829278 | -0.857481 | 0.521759 | 0.789529 | -1.180111 | 0.512469 | -1.693887 |
df.unstack('second')
A | B | |||
---|---|---|---|---|
second | one | two | one | two |
first | ||||
bar | 0.077103 | 0.243469 | 0.071665 | 0.789529 |
baz | -0.452906 | -0.829278 | 0.148716 | -1.180111 |
foo | -0.251384 | -0.857481 | -0.004602 | 0.512469 |
qux | 1.660127 | 0.521759 | -1.580966 | -1.693887 |
df.unstack(0) # unstacking level 0 by position (equivalent to 'first')
A | B | |||||||
---|---|---|---|---|---|---|---|---|
first | bar | baz | foo | qux | bar | baz | foo | qux |
second | ||||||||
one | 0.077103 | -0.452906 | -0.251384 | 1.660127 | 0.071665 | 0.148716 | -0.004602 | -1.580966 |
two | 0.243469 | -0.829278 | -0.857481 | 0.521759 | 0.789529 | -1.180111 | 0.512469 | -1.693887 |
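Since unstack() undoes the most recent stack(), a round trip should return the original frame; a quick sanity check (sketch):
df.stack().unstack().equals(df) # expected: True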
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | one | A | foo | 0.869958 | -2.824236 |
1 | one | B | foo | -1.263218 | 0.101729 |
2 | two | C | foo | -0.743723 | 1.409101 |
3 | three | A | bar | 0.292353 | 0.282451 |
4 | one | B | bar | -0.091658 | -0.752110 |
5 | one | C | bar | 1.295819 | -0.448113 |
6 | two | A | foo | 1.573841 | 0.196957 |
7 | three | B | foo | -0.645814 | 1.409150 |
8 | one | C | foo | 0.635127 | -1.389018 |
9 | one | A | bar | -1.174726 | -0.077247 |
10 | two | B | bar | 1.264011 | -1.072017 |
11 | three | C | bar | 1.441732 | 0.543449 |
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
C | bar | foo | |
---|---|---|---|
A | B | ||
one | A | -1.174726 | 0.869958 |
B | -0.091658 | -1.263218 | |
C | 1.295819 | 0.635127 | |
three | A | 0.292353 | NaN |
B | NaN | -0.645814 | |
C | 1.441732 | NaN | |
two | A | NaN | 1.573841 |
B | 1.264011 | NaN | |
C | NaN | -0.743723 |
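pivot_table here is effectively a groupby followed by an unstack (the default aggfunc is mean); this sketch should reproduce the table above:
df.groupby(['A', 'B', 'C'])['D'].mean().unstack('C')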
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng[:5]
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01', '2012-01-01 00:00:02', '2012-01-01 00:00:03', '2012-01-01 00:00:04'], dtype='datetime64[ns]', freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts[:8]
2012-01-01 00:00:00      0
2012-01-01 00:00:01    175
2012-01-01 00:00:02      9
2012-01-01 00:00:03    173
2012-01-01 00:00:04    285
2012-01-01 00:00:05    262
2012-01-01 00:00:06    148
2012-01-01 00:00:07    255
Freq: S, dtype: int32
ts.sum()
26318
ts.resample('5Min').sum()
2012-01-01    26318
Freq: 5T, dtype: int32
ts.resample('5Min')
C:\Anaconda3\lib\site-packages\IPython\lib\pretty.py:108: FutureWarning: .resample() is now a deferred operation
use .resample(...).mean() instead of .resample(...)
  return getattr(obj, attr, default)
DatetimeIndexResampler [freq=<5 * Minutes>, axis=0, closed=left, label=left, convention=start, base=0]
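Because the operation is deferred, choose the aggregation explicitly; a couple of sketches:
ts.resample('5Min').mean() # mean per 5-minute bin
ts.resample('1Min').sum()  # per-minute totals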
Time zone representation
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09', '2012-03-10'], dtype='datetime64[ns]', freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
2012-03-06    1.166836
2012-03-07    0.309653
2012-03-08    0.953754
2012-03-09    0.606327
2012-03-10   -0.804818
Freq: D, dtype: float64
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-03-06 00:00:00+00:00    1.166836
2012-03-07 00:00:00+00:00    0.309653
2012-03-08 00:00:00+00:00    0.953754
2012-03-09 00:00:00+00:00    0.606327
2012-03-10 00:00:00+00:00   -0.804818
Freq: D, dtype: float64
Convert to another time zone
ts_utc.tz_convert('Asia/Calcutta')
2012-03-06 05:30:00+05:30    1.166836
2012-03-07 05:30:00+05:30    0.309653
2012-03-08 05:30:00+05:30    0.953754
2012-03-09 05:30:00+05:30    0.606327
2012-03-10 05:30:00+05:30   -0.804818
Freq: D, dtype: float64
Converting between time span representations
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2012-01-31   -1.609627
2012-02-29   -1.381380
2012-03-31    1.065309
2012-04-30   -0.192158
2012-05-31   -0.742545
Freq: M, dtype: float64
ps = ts.to_period()
ps
2012-01   -1.609627
2012-02   -1.381380
2012-03    1.065309
2012-04   -0.192158
2012-05   -0.742545
Freq: M, dtype: float64
ps.to_timestamp()
2012-01-01   -1.609627
2012-02-01   -1.381380
2012-03-01    1.065309
2012-04-01   -0.192158
2012-05-01   -0.742545
Freq: MS, dtype: float64
Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly frequency with year ending in November to 9am of the end of the month following the quarter end:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.head()
1990Q1   -0.583649
1990Q2    0.855731
1990Q3   -0.625833
1990Q4    1.185059
1991Q1   -0.317823
Freq: Q-NOV, dtype: float64
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
1990-03-01 09:00   -0.583649
1990-06-01 09:00    0.855731
1990-09-01 09:00   -0.625833
1990-12-01 09:00    1.185059
1991-03-01 09:00   -0.317823
Freq: H, dtype: float64
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df
id | raw_grade | |
---|---|---|
0 | 1 | a |
1 | 2 | b |
2 | 3 | b |
3 | 4 | a |
4 | 5 | a |
5 | 6 | e |
df.dtypes
id            int64
raw_grade    object
dtype: object
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
Rename the categories to more meaningful names (assigning to Series.cat.categories is inplace!)
df["grade"].cat.categories = ["very good", "good", "very bad"]
df
id | raw_grade | grade | |
---|---|---|---|
0 | 1 | a | very good |
1 | 2 | b | good |
2 | 3 | b | good |
3 | 4 | a | very good |
4 | 5 | a | very good |
5 | 6 | e | very bad |
# Reorder the categories and simultaneously add the
# missing categories (methods under Series .cat return a new Series per default).
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df
id | raw_grade | grade | |
---|---|---|---|
0 | 1 | a | very good |
1 | 2 | b | good |
2 | 3 | b | good |
3 | 4 | a | very good |
4 | 5 | a | very good |
5 | 6 | e | very bad |
# Sorting is per order in the categories, not lexical order.
df.sort_values(by="grade")
id | raw_grade | grade | |
---|---|---|---|
5 | 6 | e | very bad |
1 | 2 | b | good |
2 | 3 | b | good |
0 | 1 | a | very good |
3 | 4 | a | very good |
4 | 5 | a | very good |
# Grouping by a categorical column also shows the empty categories.
df.groupby("grade").size()
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x8d607b8>
# On DataFrame, plot() is a convenience to plot all of the columns with labels:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
df.plot(grid=True)
<matplotlib.axes._subplots.AxesSubplot at 0x8f4ecf8>
df[:4]
A | B | C | D | |
---|---|---|---|---|
2000-01-01 | 0.383624 | 0.429101 | 0.743808 | 1.577623 |
2000-01-02 | -0.609680 | 0.453151 | 1.483189 | 0.865432 |
2000-01-03 | -0.652463 | 0.116940 | 2.300037 | -0.406062 |
2000-01-04 | -1.804128 | 0.703522 | 1.802338 | -0.694557 |
For CSV, to_csv and read_csv
# writing it to csv
df.to_csv("dataset/df_as_csv.csv")
# reading it
df2 = pd.read_csv("dataset/df_as_csv.csv")
df2.head()
Unnamed: 0 | A | B | C | D | |
---|---|---|---|---|---|
0 | 2000-01-01 | 0.383624 | 0.429101 | 0.743808 | 1.577623 |
1 | 2000-01-02 | -0.609680 | 0.453151 | 1.483189 | 0.865432 |
2 | 2000-01-03 | -0.652463 | 0.116940 | 2.300037 | -0.406062 |
3 | 2000-01-04 | -1.804128 | 0.703522 | 1.802338 | -0.694557 |
4 | 2000-01-05 | -3.547775 | 0.947640 | 1.720123 | -0.952502 |
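The extra "Unnamed: 0" column appears because the index was written out as an ordinary column. A sketch of reading it back as the original DatetimeIndex (index_col and parse_dates are standard read_csv options):
df2 = pd.read_csv("dataset/df_as_csv.csv", index_col=0, parse_dates=True)
df2.head()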
For HDF5, to_hdf and read_hdf
df.to_hdf('foo.h5','df')
pd.read_hdf('foo.h5','df').head()
A | B | C | D | |
---|---|---|---|---|
2000-01-01 | 0.383624 | 0.429101 | 0.743808 | 1.577623 |
2000-01-02 | -0.609680 | 0.453151 | 1.483189 | 0.865432 |
2000-01-03 | -0.652463 | 0.116940 | 2.300037 | -0.406062 |
2000-01-04 | -1.804128 | 0.703522 | 1.802338 | -0.694557 |
2000-01-05 | -3.547775 | 0.947640 | 1.720123 | -0.952502 |
For Excel, to_excel and read_excel
df.to_excel('foo.xlsx', sheet_name='Sheet1')
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()
A | B | C | D | |
---|---|---|---|---|
2000-01-01 | 0.383624 | 0.429101 | 0.743808 | 1.577623 |
2000-01-02 | -0.609680 | 0.453151 | 1.483189 | 0.865432 |
2000-01-03 | -0.652463 | 0.116940 | 2.300037 | -0.406062 |
2000-01-04 | -1.804128 | 0.703522 | 1.802338 | -0.694557 |
2000-01-05 | -3.547775 | 0.947640 | 1.720123 | -0.952502 |