In [1]:

# import
import pandas as pd
import numpy as np

In [2]:

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])

In [3]:

df

Out[3]:

	one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

In [4]:

# printning sum of df columns
df.sum()

Out[4]:

one    9.25
two   -5.80
dtype: float64

In [5]:

  df.mean()

Out[5]:

one    3.083333
two   -2.900000
dtype: float64

In [6]:

df.sum(axis=1)

Out[6]:

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [7]:

# NA values are excluded unless the entire slice is NA. This can be disabled using the skipna option
df.sum(axis=1, skipna=False)

Out[7]:

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

Options for reduction method¶

axis       Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
skipna     Exclude missing values, True by default.
level      Reduce grouped by level if the axis is hierarchically-indexed (MultiIndex).

In [8]:

# idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained
df.idxmin()

Out[8]:

one    d
two    b
dtype: object

In [9]:

# for cumulative sum
df.cumsum()

Out[9]:

	one	two
a	1.40	NaN
b	8.50	-4.5
c	NaN	NaN
d	9.25	-5.8

In [10]:

# describing 
df.describe()

C:\tools\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)

Out[10]:

	one	two
count	3.000000	2.000000
mean	3.083333	-2.900000
std	3.493685	2.262742
min	0.750000	-4.500000
25%	NaN	NaN
50%	NaN	NaN
75%	NaN	NaN
max	7.100000	-1.300000

Descriptive and summary statistics¶

count               Number of non-NA values
describe            Compute set of summary statistics for Series or each DataFrame column
min, max            Compute minimum and maximum values
argmin, argmax      Compute index locations (integers) at which minimum or maximum value obtained, respectively
idxmin, idxmax      Compute index values at which minimum or maximum value obtained, respectively
quantile            Compute sample quantile ranging from 0 to 1
sum                 Sum of values
mean                Mean of values
median              Arithmetic median (50% quantile) of values
mad                 Mean absolute deviation from mean value
var                 Sample variance of values
std                 Sample standard deviation of values
skew                Sample skewness (3rd moment) of values
kurt                Sample kurtosis (4th moment) of values
cumsum              Cumulative sum of values
cummin, cummax      Cumulative minimum or maximum of values, respectively
cumprod             Cumulative product of values
diff                Compute 1st arithmetic difference (useful for time series)
pct_change          Compute percent changes

In [11]:

df.quantile()

C:\tools\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)

Out[11]:

one   NaN
two   NaN
dtype: float64

In [12]:

obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
print(obj)

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [13]:

obj.describe()

Out[13]:

count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance¶

In [14]:

import pandas.io.data as web

all_data = {}

for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    price = pd.DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
    
volume = pd.DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})

C:\tools\Anaconda3\lib\site-packages\pandas\io\data.py:35: FutureWarning: 
The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
  FutureWarning)

In [15]:

returns = price.pct_change()

In [16]:

returns.tail()

Out[16]:

	AAPL	GOOG	IBM	MSFT
Date
2009-12-24	0.034339	0.011117	0.004385	0.002587
2009-12-28	0.012294	0.007098	0.013326	0.005484
2009-12-29	-0.011861	-0.005571	-0.003477	0.007058
2009-12-30	0.012147	0.005376	0.005461	-0.013699
2009-12-31	-0.004300	-0.004416	-0.012597	-0.015504

In [17]:

returns.MSFT.corr(returns.IBM)

Out[17]:

0.4959796261031541

In [18]:

 returns.MSFT.cov(returns.IBM)

Out[18]:

0.00021595760076743132

In [19]:

returns.corr()

Out[19]:

	AAPL	GOOG	IBM	MSFT
AAPL	1.000000	0.470676	0.410011	0.424305
GOOG	0.470676	1.000000	0.390689	0.443587
IBM	0.410011	0.390689	1.000000	0.495980
MSFT	0.424305	0.443587	0.495980	1.000000

In [20]:

 returns.cov()

Out[20]:

	AAPL	GOOG	IBM	MSFT
AAPL	0.001027	0.000303	0.000252	0.000309
GOOG	0.000303	0.000580	0.000142	0.000205
IBM	0.000252	0.000142	0.000367	0.000216
MSFT	0.000309	0.000205	0.000216	0.000516

In [21]:

returns.corrwith(returns.IBM)

Out[21]:

AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64

In [22]:

returns.corrwith(volume)

Out[22]:

AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

Unique Values, Value Counts, and Membership¶

In [23]:

obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj)

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [24]:

obj.unique()

Out[24]:

array(['c', 'a', 'd', 'b'], dtype=object)

In [25]:

 obj.value_counts()

Out[25]:

a    3
c    3
b    2
d    1
dtype: int64

In [26]:

pd.value_counts(obj.values, sort=False)

Out[26]:

c    3
d    1
b    2
a    3
dtype: int64

In [27]:

mask = obj.isin(['b', 'c'])
print(mask)

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [28]:

print(obj[mask])

0    c
5    b
6    b
7    c
8    c
dtype: object

Unique, value counts, and binning method¶

isin          Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
unique        Compute array of unique values in a Series, returned in the order observed.
value_counts  Return a Series containing unique values as its index and frequencies as its values, ordered count in
descending order.

In [29]:

data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)

   Qu1  Qu2  Qu3
0    1    2    1
1    3    3    5
2    4    1    2
3    3    2    4
4    4    3    4

In [30]:

result = data.apply(pd.value_counts)
result

Out[30]:

	Qu1	Qu2	Qu3
1	1.0	1.0	1.0
2	NaN	2.0	1.0
3	2.0	2.0	NaN
4	2.0	NaN	2.0
5	NaN	NaN	1.0

In [31]:

result.fillna(0)

Out[31]:

	Qu1	Qu2	Qu3
1	1.0	1.0	1.0
2	0.0	2.0	1.0
3	2.0	2.0	0.0
4	2.0	0.0	2.0
5	0.0	0.0	1.0

Handling Missing Data¶

In [32]:

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [33]:

string_data.isnull()

Out[33]:

0    False
1    False
2     True
3    False
dtype: bool

In [34]:

string_data[0] = None    # None is also treated as NaN
string_data.isnull()

Out[34]:

0     True
1    False
2     True
3    False
dtype: bool

NA handling methods¶

dropna          Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
fillna          Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
isnull          Return like-type object containing boolean values indicating which values are missing / NA.
notnull         Negation of isnull.

Filtering Out Missing Data¶

In [35]:

from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
print(data)

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [36]:

data.dropna()  # dropping na records

Out[36]:

0    1.0
2    3.5
4    7.0
dtype: float64

In [37]:

# same can be achieved as binary filtering
data[data.notnull()]

Out[37]:

0    1.0
2    3.5
4    7.0
dtype: float64

In [38]:

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
print(data)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

In [39]:

data.dropna()

Out[39]:

	0	1	2
0	1.0	6.5	3.0

In [40]:

data.dropna(how='all')  # this will remove only those records which has NaN in all columns

Out[40]:

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
3	NaN	6.5	3.0

In [41]:

# can drop the columns same way with axis = 1
data[4] = NA
print(data)

     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN

In [42]:

data.dropna(how='all', axis=1)

Out[42]:

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

In [43]:

data.dropna(how='all', axis=1).dropna(how='all')

Out[43]:

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
3	NaN	6.5	3.0

In [44]:

df = pd.DataFrame(np.random.randn(7, 3))
print(df)

          0         1         2
0 -1.030615  0.602560 -0.387511
1  2.186335 -2.965361  0.432411
2 -0.392753 -1.305728  0.178472
3 -1.287533  0.233790  1.783785
4  0.012547 -0.827266  0.196979
5 -0.027398 -0.616937  0.645708
6  1.407688  0.178475  3.133043

In [45]:

df.ix[:4, 1] = NA; 
df.ix[:2, 2] = NA;
print(df)

          0         1         2
0 -1.030615       NaN       NaN
1  2.186335       NaN       NaN
2 -0.392753       NaN       NaN
3 -1.287533       NaN  1.783785
4  0.012547       NaN  0.196979
5 -0.027398 -0.616937  0.645708
6  1.407688  0.178475  3.133043

In [46]:

df.dropna(thresh=2)

Out[46]:

	0	1	2
3	-1.287533	NaN	1.783785
4	0.012547	NaN	0.196979
5	-0.027398	-0.616937	0.645708
6	1.407688	0.178475	3.133043

Filling in Missing Data¶

In [47]:

data.fillna(999)  # replacing na with some value

Out[47]:

	0	1	2	4
0	1.0	6.5	3.0	999.0
1	1.0	999.0	999.0	999.0
2	999.0	999.0	999.0	999.0
3	999.0	6.5	3.0	999.0

In [48]:

df.fillna(0)

Out[48]:

	0	1	2
0	-1.030615	0.000000	0.000000
1	2.186335	0.000000	0.000000
2	-0.392753	0.000000	0.000000
3	-1.287533	0.000000	1.783785
4	0.012547	0.000000	0.196979
5	-0.027398	-0.616937	0.645708
6	1.407688	0.178475	3.133043

In [49]:

df.fillna({1: 0.5, 2: -1})

Out[49]:

	0	1	2
0	-1.030615	0.500000	-1.000000
1	2.186335	0.500000	-1.000000
2	-0.392753	0.500000	-1.000000
3	-1.287533	0.500000	1.783785
4	0.012547	0.500000	0.196979
5	-0.027398	-0.616937	0.645708
6	1.407688	0.178475	3.133043

In [50]:

# fillna returns a new object, but you can modify the existing object in place
df

Out[50]:

	0	1	2
0	-1.030615	NaN	NaN
1	2.186335	NaN	NaN
2	-0.392753	NaN	NaN
3	-1.287533	NaN	1.783785
4	0.012547	NaN	0.196979
5	-0.027398	-0.616937	0.645708
6	1.407688	0.178475	3.133043

In [51]:

_ = df.fillna(0, inplace=True) # modify the existing object in place
df

Out[51]:

	0	1	2
0	-1.030615	0.000000	0.000000
1	2.186335	0.000000	0.000000
2	-0.392753	0.000000	0.000000
3	-1.287533	0.000000	1.783785
4	0.012547	0.000000	0.196979
5	-0.027398	-0.616937	0.645708
6	1.407688	0.178475	3.133043

In [52]:

df = pd.DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA
df.ix[4:, 2] = NA
df

Out[52]:

	0	1	2
0	1.250792	0.430961	1.018721
1	-0.681740	-0.679749	-0.765389
2	-2.656496	NaN	-1.899419
3	0.879665	NaN	-0.637943
4	0.698958	NaN	NaN
5	-1.493191	NaN	NaN

In [53]:

df.fillna(method='ffill')   # repeat the last value of columns into NaN field

Out[53]:

	0	1	2
0	1.250792	0.430961	1.018721
1	-0.681740	-0.679749	-0.765389
2	-2.656496	-0.679749	-1.899419
3	0.879665	-0.679749	-0.637943
4	0.698958	-0.679749	-0.637943
5	-1.493191	-0.679749	-0.637943

In [54]:

df.fillna(method='ffill', limit=2)  # limit the fill

Out[54]:

	0	1	2
0	1.250792	0.430961	1.018721
1	-0.681740	-0.679749	-0.765389
2	-2.656496	-0.679749	-1.899419
3	0.879665	-0.679749	-0.637943
4	0.698958	NaN	-0.637943
5	-1.493191	NaN	-0.637943

In [55]:

df.fillna(df.mean())

Out[55]:

	0	1	2
0	1.250792	0.430961	1.018721
1	-0.681740	-0.679749	-0.765389
2	-2.656496	-0.124394	-1.899419
3	0.879665	-0.124394	-0.637943
4	0.698958	-0.124394	-0.571007
5	-1.493191	-0.124394	-0.571007

fillna function arguments¶

value             Scalar value or dict-like object to use to fill missing values
method            Interpolation, by default 'ffill' if function called with no other arguments
axis              Axis to fill on, default axis=0
inplace           Modify the calling object without producing a copy
limit             For forward and backward filling, maximum number of consecutive periods to fill

In [ ]: