In [1]:
# import
import pandas as pd
import numpy as np
In [2]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])
In [3]:
df
Out[3]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
In [4]:
# printning sum of df columns
df.sum()
Out[4]:
one    9.25
two   -5.80
dtype: float64
In [5]:
  df.mean()
Out[5]:
one    3.083333
two   -2.900000
dtype: float64
In [6]:
df.sum(axis=1)
Out[6]:
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
In [7]:
# NA values are excluded unless the entire slice is NA. This can be disabled using the skipna option
df.sum(axis=1, skipna=False)
Out[7]:
a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

Options for reduction method

axis       Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
skipna     Exclude missing values, True by default.
level      Reduce grouped by level if the axis is hierarchically-indexed (MultiIndex).
In [8]:
# idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained
df.idxmin()
Out[8]:
one    d
two    b
dtype: object
In [9]:
# for cumulative sum
df.cumsum()
Out[9]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
In [10]:
# describing 
df.describe()
C:\tools\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[10]:
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% NaN NaN
50% NaN NaN
75% NaN NaN
max 7.100000 -1.300000

Descriptive and summary statistics

count               Number of non-NA values
describe            Compute set of summary statistics for Series or each DataFrame column
min, max            Compute minimum and maximum values
argmin, argmax      Compute index locations (integers) at which minimum or maximum value obtained, respectively
idxmin, idxmax      Compute index values at which minimum or maximum value obtained, respectively
quantile            Compute sample quantile ranging from 0 to 1
sum                 Sum of values
mean                Mean of values
median              Arithmetic median (50% quantile) of values
mad                 Mean absolute deviation from mean value
var                 Sample variance of values
std                 Sample standard deviation of values
skew                Sample skewness (3rd moment) of values
kurt                Sample kurtosis (4th moment) of values
cumsum              Cumulative sum of values
cummin, cummax      Cumulative minimum or maximum of values, respectively
cumprod             Cumulative product of values
diff                Compute 1st arithmetic difference (useful for time series)
pct_change          Compute percent changes
In [11]:
df.quantile()
C:\tools\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[11]:
one   NaN
two   NaN
dtype: float64
In [12]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
print(obj)
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
In [13]:
obj.describe()
Out[13]:
count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance

In [14]:
import pandas.io.data as web

all_data = {}

for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    price = pd.DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
    
volume = pd.DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})
C:\tools\Anaconda3\lib\site-packages\pandas\io\data.py:35: FutureWarning: 
The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
  FutureWarning)
In [15]:
returns = price.pct_change()
In [16]:
returns.tail()
Out[16]:
AAPL GOOG IBM MSFT
Date
2009-12-24 0.034339 0.011117 0.004385 0.002587
2009-12-28 0.012294 0.007098 0.013326 0.005484
2009-12-29 -0.011861 -0.005571 -0.003477 0.007058
2009-12-30 0.012147 0.005376 0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
In [17]:
returns.MSFT.corr(returns.IBM)
Out[17]:
0.4959796261031541
In [18]:
 returns.MSFT.cov(returns.IBM)
Out[18]:
0.00021595760076743132
In [19]:
returns.corr()
Out[19]:
AAPL GOOG IBM MSFT
AAPL 1.000000 0.470676 0.410011 0.424305
GOOG 0.470676 1.000000 0.390689 0.443587
IBM 0.410011 0.390689 1.000000 0.495980
MSFT 0.424305 0.443587 0.495980 1.000000
In [20]:
 returns.cov()
Out[20]:
AAPL GOOG IBM MSFT
AAPL 0.001027 0.000303 0.000252 0.000309
GOOG 0.000303 0.000580 0.000142 0.000205
IBM 0.000252 0.000142 0.000367 0.000216
MSFT 0.000309 0.000205 0.000216 0.000516
In [21]:
returns.corrwith(returns.IBM)
Out[21]:
AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64
In [22]:
returns.corrwith(volume)
Out[22]:
AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

Unique Values, Value Counts, and Membership

In [23]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj)
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
In [24]:
obj.unique()
Out[24]:
array(['c', 'a', 'd', 'b'], dtype=object)
In [25]:
 obj.value_counts()
Out[25]:
a    3
c    3
b    2
d    1
dtype: int64
In [26]:
pd.value_counts(obj.values, sort=False)
Out[26]:
c    3
d    1
b    2
a    3
dtype: int64
In [27]:
mask = obj.isin(['b', 'c'])
print(mask)
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
In [28]:
print(obj[mask])
0    c
5    b
6    b
7    c
8    c
dtype: object

Unique, value counts, and binning method

isin          Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
unique        Compute array of unique values in a Series, returned in the order observed.
value_counts  Return a Series containing unique values as its index and frequencies as its values, ordered count in
descending order.
In [29]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
   Qu1  Qu2  Qu3
0    1    2    1
1    3    3    5
2    4    1    2
3    3    2    4
4    4    3    4
In [30]:
result = data.apply(pd.value_counts)
result
Out[30]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 NaN 2.0 1.0
3 2.0 2.0 NaN
4 2.0 NaN 2.0
5 NaN NaN 1.0
In [31]:
result.fillna(0)
Out[31]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

Handling Missing Data

In [32]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
In [33]:
string_data.isnull()
Out[33]:
0    False
1    False
2     True
3    False
dtype: bool
In [34]:
string_data[0] = None    # None is also treated as NaN
string_data.isnull()
Out[34]:
0     True
1    False
2     True
3    False
dtype: bool

NA handling methods

dropna          Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
fillna          Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
isnull          Return like-type object containing boolean values indicating which values are missing / NA.
notnull         Negation of isnull.

Filtering Out Missing Data

In [35]:
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
print(data)
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
In [36]:
data.dropna()  # dropping na records
Out[36]:
0    1.0
2    3.5
4    7.0
dtype: float64
In [37]:
# same can be achieved as binary filtering
data[data.notnull()]
Out[37]:
0    1.0
2    3.5
4    7.0
dtype: float64
In [38]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
print(data)
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
In [39]:
data.dropna()
Out[39]:
0 1 2
0 1.0 6.5 3.0
In [40]:
data.dropna(how='all')  # this will remove only those records which has NaN in all columns
Out[40]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
In [41]:
# can drop the columns same way with axis = 1
data[4] = NA
print(data)
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
In [42]:
data.dropna(how='all', axis=1)
Out[42]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
In [43]:
data.dropna(how='all', axis=1).dropna(how='all')
Out[43]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
In [44]:
df = pd.DataFrame(np.random.randn(7, 3))
print(df)
          0         1         2
0 -1.030615  0.602560 -0.387511
1  2.186335 -2.965361  0.432411
2 -0.392753 -1.305728  0.178472
3 -1.287533  0.233790  1.783785
4  0.012547 -0.827266  0.196979
5 -0.027398 -0.616937  0.645708
6  1.407688  0.178475  3.133043
In [45]:
df.ix[:4, 1] = NA; 
df.ix[:2, 2] = NA;
print(df)
          0         1         2
0 -1.030615       NaN       NaN
1  2.186335       NaN       NaN
2 -0.392753       NaN       NaN
3 -1.287533       NaN  1.783785
4  0.012547       NaN  0.196979
5 -0.027398 -0.616937  0.645708
6  1.407688  0.178475  3.133043
In [46]:
df.dropna(thresh=2)
Out[46]:
0 1 2
3 -1.287533 NaN 1.783785
4 0.012547 NaN 0.196979
5 -0.027398 -0.616937 0.645708
6 1.407688 0.178475 3.133043

Filling in Missing Data

In [47]:
data.fillna(999)  # replacing na with some value
Out[47]:
0 1 2 4
0 1.0 6.5 3.0 999.0
1 1.0 999.0 999.0 999.0
2 999.0 999.0 999.0 999.0
3 999.0 6.5 3.0 999.0
In [48]:
df.fillna(0)
Out[48]:
0 1 2
0 -1.030615 0.000000 0.000000
1 2.186335 0.000000 0.000000
2 -0.392753 0.000000 0.000000
3 -1.287533 0.000000 1.783785
4 0.012547 0.000000 0.196979
5 -0.027398 -0.616937 0.645708
6 1.407688 0.178475 3.133043
In [49]:
df.fillna({1: 0.5, 2: -1})
Out[49]:
0 1 2
0 -1.030615 0.500000 -1.000000
1 2.186335 0.500000 -1.000000
2 -0.392753 0.500000 -1.000000
3 -1.287533 0.500000 1.783785
4 0.012547 0.500000 0.196979
5 -0.027398 -0.616937 0.645708
6 1.407688 0.178475 3.133043
In [50]:
# fillna returns a new object, but you can modify the existing object in place
df
Out[50]:
0 1 2
0 -1.030615 NaN NaN
1 2.186335 NaN NaN
2 -0.392753 NaN NaN
3 -1.287533 NaN 1.783785
4 0.012547 NaN 0.196979
5 -0.027398 -0.616937 0.645708
6 1.407688 0.178475 3.133043
In [51]:
_ = df.fillna(0, inplace=True) # modify the existing object in place
df
Out[51]:
0 1 2
0 -1.030615 0.000000 0.000000
1 2.186335 0.000000 0.000000
2 -0.392753 0.000000 0.000000
3 -1.287533 0.000000 1.783785
4 0.012547 0.000000 0.196979
5 -0.027398 -0.616937 0.645708
6 1.407688 0.178475 3.133043
In [52]:
df = pd.DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA
df.ix[4:, 2] = NA
df
Out[52]:
0 1 2
0 1.250792 0.430961 1.018721
1 -0.681740 -0.679749 -0.765389
2 -2.656496 NaN -1.899419
3 0.879665 NaN -0.637943
4 0.698958 NaN NaN
5 -1.493191 NaN NaN
In [53]:
df.fillna(method='ffill')   # repeat the last value of columns into NaN field
Out[53]:
0 1 2
0 1.250792 0.430961 1.018721
1 -0.681740 -0.679749 -0.765389
2 -2.656496 -0.679749 -1.899419
3 0.879665 -0.679749 -0.637943
4 0.698958 -0.679749 -0.637943
5 -1.493191 -0.679749 -0.637943
In [54]:
df.fillna(method='ffill', limit=2)  # limit the fill
Out[54]:
0 1 2
0 1.250792 0.430961 1.018721
1 -0.681740 -0.679749 -0.765389
2 -2.656496 -0.679749 -1.899419
3 0.879665 -0.679749 -0.637943
4 0.698958 NaN -0.637943
5 -1.493191 NaN -0.637943
In [55]:
df.fillna(df.mean())
Out[55]:
0 1 2
0 1.250792 0.430961 1.018721
1 -0.681740 -0.679749 -0.765389
2 -2.656496 -0.124394 -1.899419
3 0.879665 -0.124394 -0.637943
4 0.698958 -0.124394 -0.571007
5 -1.493191 -0.124394 -0.571007

fillna function arguments

value             Scalar value or dict-like object to use to fill missing values
method            Interpolation, by default 'ffill' if function called with no other arguments
axis              Axis to fill on, default axis=0
inplace           Modify the calling object without producing a copy
limit             For forward and backward filling, maximum number of consecutive periods to fill