# library imports
import pandas as pd
import numpy as np
# Build the example frame column-by-column; NaN marks missing data.
one = [1.4, 7.1, np.nan, 0.75]
two = [np.nan, -4.5, np.nan, -1.3]
df = pd.DataFrame({'one': one, 'two': two}, index=['a', 'b', 'c', 'd'])
df
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.5 |
c | NaN | NaN |
d | 0.75 | -1.3 |
# printing the sum of each df column
df.sum()
one 9.25 two -5.80 dtype: float64
df.mean()
one 3.083333 two -2.900000 dtype: float64
df.sum(axis=1)
a 1.40 b 2.60 c 0.00 d -0.55 dtype: float64
# NA values are excluded unless the entire slice is NA. This can be disabled using the skipna option
df.sum(axis=1, skipna=False)
a NaN b 2.60 c NaN d -0.55 dtype: float64
axis Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
skipna Exclude missing values, True by default.
level Reduce grouped by level if the axis is hierarchically-indexed (MultiIndex).
# idxmin and idxmax return indirect statistics: the index values at which the minimum or maximum values are attained
df.idxmin()
one d two b dtype: object
# for cumulative sum
df.cumsum()
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 8.50 | -4.5 |
c | NaN | NaN |
d | 9.25 | -5.8 |
# describing
df.describe()
C:\tools\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile RuntimeWarning)
one | two | |
---|---|---|
count | 3.000000 | 2.000000 |
mean | 3.083333 | -2.900000 |
std | 3.493685 | 2.262742 |
min | 0.750000 | -4.500000 |
25% | NaN | NaN |
50% | NaN | NaN |
75% | NaN | NaN |
max | 7.100000 | -1.300000 |
count Number of non-NA values
describe Compute set of summary statistics for Series or each DataFrame column
min, max Compute minimum and maximum values
argmin, argmax Compute index locations (integers) at which minimum or maximum value obtained, respectively
idxmin, idxmax Compute index values at which minimum or maximum value obtained, respectively
quantile Compute sample quantile ranging from 0 to 1
sum Sum of values
mean Mean of values
median Arithmetic median (50% quantile) of values
mad Mean absolute deviation from mean value
var Sample variance of values
std Sample standard deviation of values
skew Sample skewness (3rd moment) of values
kurt Sample kurtosis (4th moment) of values
cumsum Cumulative sum of values
cummin, cummax Cumulative minimum or maximum of values, respectively
cumprod Cumulative product of values
diff Compute 1st arithmetic difference (useful for time series)
pct_change Compute percent changes
df.quantile()
C:\tools\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile RuntimeWarning)
one NaN two NaN dtype: float64
# A 16-element object Series: the pattern a, a, b, c repeated four times.
obj = pd.Series(list('aabc') * 4)
print(obj)
0 a 1 a 2 b 3 c 4 a 5 a 6 b 7 c 8 a 9 a 10 b 11 c 12 a 13 a 14 b 15 c dtype: object
obj.describe()
count 16 unique 3 top a freq 8 dtype: object
# Download a decade of daily quotes for four tickers from Yahoo Finance.
# NOTE(review): pandas.io.data is deprecated and has been removed from
# modern pandas — the replacement is the separate pandas-datareader
# package (`from pandas_datareader import data as web`); confirm which
# pandas version this transcript targets before migrating.
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    # get_data_yahoo hits the network; each value is a per-ticker frame.
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

# Assemble one frame per field: columns are tickers, rows are dates.
price = pd.DataFrame({tic: data['Adj Close']
                      for tic, data in all_data.items()})
volume = pd.DataFrame({tic: data['Volume']
                       for tic, data in all_data.items()})
C:\tools\Anaconda3\lib\site-packages\pandas\io\data.py:35: FutureWarning: The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version. After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``. FutureWarning)
returns = price.pct_change()
returns.tail()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
Date | ||||
2009-12-24 | 0.034339 | 0.011117 | 0.004385 | 0.002587 |
2009-12-28 | 0.012294 | 0.007098 | 0.013326 | 0.005484 |
2009-12-29 | -0.011861 | -0.005571 | -0.003477 | 0.007058 |
2009-12-30 | 0.012147 | 0.005376 | 0.005461 | -0.013699 |
2009-12-31 | -0.004300 | -0.004416 | -0.012597 | -0.015504 |
returns.MSFT.corr(returns.IBM)
0.4959796261031541
returns.MSFT.cov(returns.IBM)
0.00021595760076743132
returns.corr()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
AAPL | 1.000000 | 0.470676 | 0.410011 | 0.424305 |
GOOG | 0.470676 | 1.000000 | 0.390689 | 0.443587 |
IBM | 0.410011 | 0.390689 | 1.000000 | 0.495980 |
MSFT | 0.424305 | 0.443587 | 0.495980 | 1.000000 |
returns.cov()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
AAPL | 0.001027 | 0.000303 | 0.000252 | 0.000309 |
GOOG | 0.000303 | 0.000580 | 0.000142 | 0.000205 |
IBM | 0.000252 | 0.000142 | 0.000367 | 0.000216 |
MSFT | 0.000309 | 0.000205 | 0.000216 | 0.000516 |
returns.corrwith(returns.IBM)
AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64
returns.corrwith(volume)
AAPL -0.057549 GOOG 0.062647 IBM -0.007892 MSFT -0.014245 dtype: float64
# Sample Series with repeated categories for the unique/value_counts demos.
obj = pd.Series(list('cadaabbcc'))
print(obj)
0 c 1 a 2 d 3 a 4 a 5 b 6 b 7 c 8 c dtype: object
obj.unique()
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()
a 3 c 3 b 2 d 1 dtype: int64
pd.value_counts(obj.values, sort=False)
c 3 d 1 b 2 a 3 dtype: int64
# Boolean mask: True wherever the value is one of 'b' or 'c'.
mask = obj.isin({'b', 'c'})
print(mask)
0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
print(obj[mask])
0 c 5 b 6 b 7 c 8 c dtype: object
isin Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
unique Compute array of unique values in a Series, returned in the order observed.
value_counts Return a Series containing unique values as its index and frequencies as its values, ordered by count in
descending order.
# Survey-style answers: three question columns holding values 1-5.
answers = {
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4],
}
data = pd.DataFrame(answers)
print(data)
Qu1 Qu2 Qu3 0 1 2 1 1 3 3 5 2 4 1 2 3 3 2 4 4 4 3 4
result = data.apply(pd.value_counts)
result
Qu1 | Qu2 | Qu3 | |
---|---|---|---|
1 | 1.0 | 1.0 | 1.0 |
2 | NaN | 2.0 | 1.0 |
3 | 2.0 | 2.0 | NaN |
4 | 2.0 | NaN | 2.0 |
5 | NaN | NaN | 1.0 |
result.fillna(0)
Qu1 | Qu2 | Qu3 | |
---|---|---|---|
1 | 1.0 | 1.0 | 1.0 |
2 | 0.0 | 2.0 | 1.0 |
3 | 2.0 | 2.0 | 0.0 |
4 | 2.0 | 0.0 | 2.0 |
5 | 0.0 | 0.0 | 1.0 |
# Object Series with one missing entry for the isnull demo.
words = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(words)
print(string_data)
0 aardvark 1 artichoke 2 NaN 3 avocado dtype: object
string_data.isnull()
0 False 1 False 2 True 3 False dtype: bool
string_data[0] = None # None is also treated as NaN
string_data.isnull()
0 True 1 False 2 True 3 False dtype: bool
dropna Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
fillna Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
isnull Return like-type object containing boolean values indicating which values are missing / NA.
notnull Negation of isnull.
from numpy import nan as NA

# Float Series with two missing entries for the dropna examples.
values = [1, NA, 3.5, NA, 7]
data = pd.Series(values)
print(data)
0 1.0 1 NaN 2 3.5 3 NaN 4 7.0 dtype: float64
data.dropna() # dropping na records
0 1.0 2 3.5 4 7.0 dtype: float64
# same can be achieved as binary filtering
data[data.notnull()]
0 1.0 2 3.5 4 7.0 dtype: float64
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
print(data)
0 1 2 0 1.0 6.5 3.0 1 1.0 NaN NaN 2 NaN NaN NaN 3 NaN 6.5 3.0
data.dropna()
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
data.dropna(how='all') # this will remove only those records which has NaN in all columns
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
# can drop the columns same way with axis = 1
data[4] = NA
print(data)
0 1 2 4 0 1.0 6.5 3.0 NaN 1 1.0 NaN NaN NaN 2 NaN NaN NaN NaN 3 NaN 6.5 3.0 NaN
data.dropna(how='all', axis=1)
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
data.dropna(how='all', axis=1).dropna(how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
# 7x3 frame of standard-normal draws (values differ run to run).
shape = (7, 3)
df = pd.DataFrame(np.random.randn(*shape))
print(df)
0 1 2 0 -1.030615 0.602560 -0.387511 1 2.186335 -2.965361 0.432411 2 -0.392753 -1.305728 0.178472 3 -1.287533 0.233790 1.783785 4 0.012547 -0.827266 0.196979 5 -0.027398 -0.616937 0.645708 6 1.407688 0.178475 3.133043
# .ix is deprecated (removed in pandas 1.0); .loc gives the same
# label-based, end-inclusive slicing on the default integer index,
# so :4 selects rows 0-4 and :2 selects rows 0-2.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
print(df)
0 1 2 0 -1.030615 NaN NaN 1 2.186335 NaN NaN 2 -0.392753 NaN NaN 3 -1.287533 NaN 1.783785 4 0.012547 NaN 0.196979 5 -0.027398 -0.616937 0.645708 6 1.407688 0.178475 3.133043
df.dropna(thresh=2)
0 | 1 | 2 | |
---|---|---|---|
3 | -1.287533 | NaN | 1.783785 |
4 | 0.012547 | NaN | 0.196979 |
5 | -0.027398 | -0.616937 | 0.645708 |
6 | 1.407688 | 0.178475 | 3.133043 |
data.fillna(999) # replacing na with some value
0 | 1 | 2 | 4 | |
---|---|---|---|---|
0 | 1.0 | 6.5 | 3.0 | 999.0 |
1 | 1.0 | 999.0 | 999.0 | 999.0 |
2 | 999.0 | 999.0 | 999.0 | 999.0 |
3 | 999.0 | 6.5 | 3.0 | 999.0 |
df.fillna(0)
0 | 1 | 2 | |
---|---|---|---|
0 | -1.030615 | 0.000000 | 0.000000 |
1 | 2.186335 | 0.000000 | 0.000000 |
2 | -0.392753 | 0.000000 | 0.000000 |
3 | -1.287533 | 0.000000 | 1.783785 |
4 | 0.012547 | 0.000000 | 0.196979 |
5 | -0.027398 | -0.616937 | 0.645708 |
6 | 1.407688 | 0.178475 | 3.133043 |
df.fillna({1: 0.5, 2: -1})
0 | 1 | 2 | |
---|---|---|---|
0 | -1.030615 | 0.500000 | -1.000000 |
1 | 2.186335 | 0.500000 | -1.000000 |
2 | -0.392753 | 0.500000 | -1.000000 |
3 | -1.287533 | 0.500000 | 1.783785 |
4 | 0.012547 | 0.500000 | 0.196979 |
5 | -0.027398 | -0.616937 | 0.645708 |
6 | 1.407688 | 0.178475 | 3.133043 |
# fillna returns a new object, but you can modify the existing object in place
df
0 | 1 | 2 | |
---|---|---|---|
0 | -1.030615 | NaN | NaN |
1 | 2.186335 | NaN | NaN |
2 | -0.392753 | NaN | NaN |
3 | -1.287533 | NaN | 1.783785 |
4 | 0.012547 | NaN | 0.196979 |
5 | -0.027398 | -0.616937 | 0.645708 |
6 | 1.407688 | 0.178475 | 3.133043 |
_ = df.fillna(0, inplace=True) # modify the existing object in place
df
0 | 1 | 2 | |
---|---|---|---|
0 | -1.030615 | 0.000000 | 0.000000 |
1 | 2.186335 | 0.000000 | 0.000000 |
2 | -0.392753 | 0.000000 | 0.000000 |
3 | -1.287533 | 0.000000 | 1.783785 |
4 | 0.012547 | 0.000000 | 0.196979 |
5 | -0.027398 | -0.616937 | 0.645708 |
6 | 1.407688 | 0.178475 | 3.133043 |
# Fresh 6x3 random frame; poke NaN holes for the fill examples.
# .ix is deprecated (removed in pandas 1.0); .loc gives the same
# label-based slicing on the default integer index (2: -> rows 2..5,
# 4: -> rows 4..5).
df = pd.DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
df
0 | 1 | 2 | |
---|---|---|---|
0 | 1.250792 | 0.430961 | 1.018721 |
1 | -0.681740 | -0.679749 | -0.765389 |
2 | -2.656496 | NaN | -1.899419 |
3 | 0.879665 | NaN | -0.637943 |
4 | 0.698958 | NaN | NaN |
5 | -1.493191 | NaN | NaN |
# Forward-fill: propagate each column's last valid value down into the
# NaN gaps below it. fillna(method='ffill') is deprecated in modern
# pandas; df.ffill() is the equivalent, long-supported spelling.
df.ffill()
0 | 1 | 2 | |
---|---|---|---|
0 | 1.250792 | 0.430961 | 1.018721 |
1 | -0.681740 | -0.679749 | -0.765389 |
2 | -2.656496 | -0.679749 | -1.899419 |
3 | 0.879665 | -0.679749 | -0.637943 |
4 | 0.698958 | -0.679749 | -0.637943 |
5 | -1.493191 | -0.679749 | -0.637943 |
# Same forward fill, but fill at most 2 consecutive NaNs per gap
# (fillna(method=...) is deprecated; ffill takes the same limit).
df.ffill(limit=2)
0 | 1 | 2 | |
---|---|---|---|
0 | 1.250792 | 0.430961 | 1.018721 |
1 | -0.681740 | -0.679749 | -0.765389 |
2 | -2.656496 | -0.679749 | -1.899419 |
3 | 0.879665 | -0.679749 | -0.637943 |
4 | 0.698958 | NaN | -0.637943 |
5 | -1.493191 | NaN | -0.637943 |
df.fillna(df.mean())
0 | 1 | 2 | |
---|---|---|---|
0 | 1.250792 | 0.430961 | 1.018721 |
1 | -0.681740 | -0.679749 | -0.765389 |
2 | -2.656496 | -0.124394 | -1.899419 |
3 | 0.879665 | -0.124394 | -0.637943 |
4 | 0.698958 | -0.124394 | -0.571007 |
5 | -1.493191 | -0.124394 | -0.571007 |
value Scalar value or dict-like object to use to fill missing values
method Interpolation, by default 'ffill' if function called with no other arguments
axis Axis to fill on, default axis=0
inplace Modify the calling object without producing a copy
limit For forward and backward filling, maximum number of consecutive periods to fill