from datetime import datetime, date, time import sys sys.version import pandas as pd from pandas import Series, DataFrame, Panel pd.__version__ np.__version__ import matplotlib.pyplot as plt import matplotlib as mpl mpl.rc('figure', figsize=(10, 8)) mpl.__version__ with open('data.csv', 'r') as fh: print fh.readline() # headers print fh.readline() # first row data = pd.read_csv('data.csv', parse_dates={'Timestamp': ['Date', 'Time']}, index_col='Timestamp') data ticks = data.ix[:, ['Price', 'Volume']] ticks.head() bars = ticks.Price.resample('1min', how='ohlc') bars minute_range = bars.high - bars.low minute_range.describe() minute_return = bars.close / bars.open - 1 minute_return.describe() volume = ticks.Volume.resample('1min', how='sum') value = ticks.prod(axis=1).resample('1min', how='sum') vwap = value / volume vwap.ix['2011-11-01 09:27':'2011-11-01 09:32'] bars.open.at_time('9:30') bars.close.at_time('16:00') filtered = vwap.between_time('10:00', '16:00') filtered.head(20) vol = volume.between_time('10:00', '16:00') vol.head(20) filtered.ix['2011-11-03':'2011-11-04'].head(20) filled = filtered.fillna(method='pad', limit=1) filled.ix['2011-11-03':'2011-11-04'].head(20) vol = vol.fillna(0.) vol.head(20) filled.ix['2011-11-03':'2011-11-04'].plot() plt.ylim(103.5, 104.5) vwap.ix['2011-11-03':'2011-11-04'].plot() plt.ylim(103.5, 104.5) vol.ix['2011-11-03':'2011-11-04'].plot(secondary_y=True, style='r') ticks.head() ticks.shift(1).head() ticks.shift(-1).head() ticks.tshift(1, 'min').head() minute_return.head() mr = minute_return.between_time('9:30', '16:00') mr.head() lagged = mr.shift(1) lagged.head() lagged.at_time('9:30') mr.at_time('16:00') lagged = minute_return.tshift(1, 'min').between_time('9:30', '16:00') lagged.at_time('9:30') pd.ols(y=mr, x=lagged) mr = vwap / bars.open - 1 mr = mr.between_time('9:30', '16:00') lagged = mr.tshift(1, 'min').between_time('9:30', '16:00') pd.ols(y=mr, x=lagged) inter = mr * vol inter = inter.between_time('9:30', '16:00') lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00') pd.ols(y=mr, x=lagged_inter) vol = vol.groupby(vol.index.day).transform(lambda x: x/x.sum()) vol.head() vol.resample('D', how='sum') inter = mr * vol inter = inter.between_time('9:30', '16:00') lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00') pd.ols(y=mr, x=lagged_inter) hour = vol.index.hour hourly_volume = vol.groupby(hour).mean() hourly_volume.plot(kind='bar') hourly = vol.resample('H') def calc_mean(hr): hr = time(hour=hr) data = hourly.at_time(hr) return pd.expanding_mean(data) df = pd.concat([calc_mean(hr) for hr in range(10, 16)]) df = df.sort_index() df clean_vol = vol.between_time('10:00', '15:59') dev = clean_vol - df.reindex(clean_vol.index, method='pad') # be careful over day boundaries dev inter = mr * dev inter = inter.between_time('10:00', '15:59') pd.ols(y=mr, x=inter.tshift(1, 'min')) rng = pd.date_range('2005', '2012', freq='M') rng pd.date_range('2005', periods=7*12, freq='M') pd.date_range(end='2012', periods=7*12, freq='M') pd.date_range('2005', periods=4, freq='Q') pd.date_range('2005', periods=4, freq='Q-NOV') wkrng = pd.date_range('2012-10-25', periods=3, freq='W') wkrng wkrng[0].dayofweek pd.date_range('2005', periods=3, freq='A-JUN') isinstance(rng, pd.Index) rng[2:4] s = Series(randn(len(rng)), rng) s.head() df = DataFrame(randn(len(rng), 3), rng, ['X', 'Y', 'Z']) df.head() s[datetime(2005, 1, 31) : datetime(2006, 12, 31)] #slice end inclusive df['2005-1-31':'2006-12-31'] s['2005':'2006'] df[:2] # slice end exclusive elm = rng[0] elm isinstance(elm, datetime) elm.nanosecond val = rng.values type(val) val.dtype val[0] rng.asobject.values[0] rng.asobject rng.to_pydatetime() rng.to_pydatetime()[0] type(rng.asi8) rng.asi8.dtype rng.asi8[0] s.index.freqstr s.resample('30D').head(10) s.resample('30D', fill_method='ffill').head(10) s.ix[:3].resample('W') s.ix[:3].resample('W', fill_method='ffill') s.asfreq('Q').head() s.resample('Q', 'last').head() s.resample('Q').head() s.ix[3:6].mean() s.resample('Q', closed='left').head() s.ix[2:5].mean() s.resample('Q').head() s.resample('Q', label='left').head() s.resample('Q', label='left', loffset='-1D').head() rng.tz d = rng[0] d d.tz localized = rng.tz_localize('US/Eastern') localized[0] localized.asi8[0] rng.asi8[0] d_utc = d.tz_localize('UTC') d_utc d_utc.tz_localize('US/Eastern') localized.tz_convert('UTC') d_ny = d_utc.tz_convert('US/Eastern') d_ny rng.tz_convert('US/Eastern') p = pd.Period('2005', 'A') p pd.Period('2006Q1', 'Q-MAR') pd.Period('2007-1-1', 'B') pd.Period('2005', 'AS') pd.period_range('2005', '2012', freq='A') prng = pd.period_range('2005', periods=7, freq='A') prng p p.to_timestamp() p.to_timestamp('M', 's') p.to_timestamp('M', 'e') prng.to_timestamp(how='e') prng.to_timestamp('M', 'e') rng rng.to_period() rng.to_period('D') p p.end_time datetime(2005, 12, 31, 10, 0, 0) < p.end_time # WAT?!