In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
In [24]:
# Build a sample DataFrame that looks like what you will get back from a history() call.

df = pd.DataFrame(np.array([np.arange(10), np.arange(10)]).transpose(),
                  index=pd.date_range(start=pd.Timestamp('2014-06-17'), periods=10),
                  # In the actual history dataframe, columns are Security objects.
                  columns=['sid_1', 'sid_2'])
df
Out[24]:
sid_1 sid_2
2014-06-17 0 0
2014-06-18 1 1
2014-06-19 2 2
2014-06-20 3 3
2014-06-21 4 4
2014-06-22 5 5
2014-06-23 6 6
2014-06-24 7 7
2014-06-25 8 8
2014-06-26 9 9
In [25]:
# Get the mean of all prices in this window.
means = df.mean()
means
Out[25]:
sid_1    4.5
sid_2    4.5
dtype: float64
In [45]:
# Get a rolling means window in which each cell contains 
# the mean of the current day and the previous three days.
# Note that the first three windows have NaN (not a number), 
# because they don't have a full 3 days yet.
rolling_means = pd.rolling_mean(df, window=4)
rolling_means
Out[45]:
sid_1 sid_2
2014-06-17 NaN NaN
2014-06-18 NaN NaN
2014-06-19 NaN NaN
2014-06-20 1.5 1.5
2014-06-21 2.5 2.5
2014-06-22 3.5 3.5
2014-06-23 4.5 4.5
2014-06-24 5.5 5.5
2014-06-25 6.5 6.5
2014-06-26 7.5 7.5
In [47]:
# We can customize the minumum number of required observations by specifying min_periods.
rolling_means_less_nans = pd.rolling_mean(df, window=4, min_periods=2)
rolling_means_less_nans
Out[47]:
sid_1 sid_2
2014-06-17 NaN NaN
2014-06-18 0.5 0.5
2014-06-19 1.0 1.0
2014-06-20 1.5 1.5
2014-06-21 2.5 2.5
2014-06-22 3.5 3.5
2014-06-23 4.5 4.5
2014-06-24 5.5 5.5
2014-06-25 6.5 6.5
2014-06-26 7.5 7.5
In [68]:
# Pandas has similar rolling functions for sum, stddev, 
# and a variety of other statistical operations.
rolling_sums = pd.rolling_sum(df, window=2)
rolling_stddevs = pd.rolling_std(df, window=3)
# Combine both DataFrames by joining on the date index.
combined = rolling_sums.merge(rolling_stddevs, left_index=True, right_index=True, suffixes=("_sum", "_stddev"))
combined
Out[68]:
sid_1_sum sid_2_sum sid_1_stddev sid_2_stddev
2014-06-17 NaN NaN NaN NaN
2014-06-18 1 1 NaN NaN
2014-06-19 3 3 1 1
2014-06-20 5 5 1 1
2014-06-21 7 7 1 1
2014-06-22 9 9 1 1
2014-06-23 11 11 1 1
2014-06-24 13 13 1 1
2014-06-25 15 15 1 1
2014-06-26 17 17 1 1
In [71]:
# Note that these outputs are still DataFrames, so we can do nice things like this:

# Get the first 5 elements of the frame.
combined.iloc[:5]
Out[71]:
sid_1_sum sid_2_sum sid_1_stddev sid_2_stddev
2014-06-17 NaN NaN NaN NaN
2014-06-18 1 1 NaN NaN
2014-06-19 3 3 1 1
2014-06-20 5 5 1 1
2014-06-21 7 7 1 1
In [81]:
# Or this:
# Get all but the last element of the frame:
combined.iloc[:-1]
Out[81]:
sid_1_sum sid_2_sum sid_1_stddev sid_2_stddev
2014-06-17 NaN NaN NaN NaN
2014-06-18 1 1 NaN NaN
2014-06-19 3 3 1 1
2014-06-20 5 5 1 1
2014-06-21 7 7 1 1
2014-06-22 9 9 1 1
2014-06-23 11 11 1 1
2014-06-24 13 13 1 1
2014-06-25 15 15 1 1
In [85]:
# Or this:
# Get every other entry in the DataFrame, in reverse.
combined.iloc[::-2]
Out[85]:
sid_1_sum sid_2_sum sid_1_stddev sid_2_stddev
2014-06-26 17 17 1 1
2014-06-24 13 13 1 1
2014-06-22 9 9 1 1
2014-06-20 5 5 1 1
2014-06-18 1 1 NaN NaN
In [77]:
# Or this:
# Get the entries for June 20th and June 21st.
combined.loc[[pd.Timestamp('2014-06-20'), pd.Timestamp('2014-06-21')]]
Out[77]:
sid_1_sum sid_2_sum sid_1_stddev sid_2_stddev
2014-06-20 5 5 1 1
2014-06-21 7 7 1 1
In [86]:
# Or this:
# Get a specific column (this returns a Series)
combined['sid_2_sum']
Out[86]:
2014-06-17   NaN
2014-06-18     1
2014-06-19     3
2014-06-20     5
2014-06-21     7
2014-06-22     9
2014-06-23    11
2014-06-24    13
2014-06-25    15
2014-06-26    17
Freq: D, Name: sid_2_sum, dtype: float64
In [87]:
# Or this:
# Get multiple specific columns (This returns a DataFrame)
combined[['sid_2_sum', 'sid_1_stddev']]
Out[87]:
sid_2_sum sid_1_stddev
2014-06-17 NaN NaN
2014-06-18 1 NaN
2014-06-19 3 1
2014-06-20 5 1
2014-06-21 7 1
2014-06-22 9 1
2014-06-23 11 1
2014-06-24 13 1
2014-06-25 15 1
2014-06-26 17 1