import numpy as np
import pandas as pd
from datetime import time

pd.set_option('html', False)

from IPython.core.display import Image
Image('http://akamaicovers.oreilly.com/images/0636920023784/lrg.jpg')

# AAPL minute bars: load, time the parse, and index by the 'dt' timestamps
temp = '/Users/wesm/Downloads/minutebars/%s.csv'
path = temp % 'AAPL'
!wc -l $path

aapl_bars = pd.read_csv(temp % 'AAPL')
aapl_bars

%time _ = pd.read_csv(path)

aapl_bars.dt

aapl_bars.index = pd.to_datetime(aapl_bars.pop('dt'))
aapl_bars.head()

def load_bars(ticker):
    bars = pd.read_csv(temp % ticker)
    bars.index = pd.to_datetime(bars.pop('dt'))
    return bars

# Time-of-day selection, resampling, and returns
aapl_bars.at_time(time(15, 0)).head(10)

aapl_bars.close_price['2009-10-15']

aapl_bars.close_price

mth_mean = aapl_bars.close_price.resample('M', how=['mean', 'median', 'std'])
mth_mean

mth_mean.plot()

close = aapl_bars.close_price
close / close.shift(1) - 1

minute_returns = aapl_bars.close_price.pct_change()

std_10day = pd.rolling_std(minute_returns, 390 * 10)

std_10day.resample('B').plot()

# Automatic alignment of differently-indexed objects
ts1 = pd.Series(np.random.randn(10),
                index=pd.date_range('1/1/2000', periods=10))
ts1

ts2 = ts1[[0, 2, 4, 5, 6, 7, 8]]
ts2

ts1 + ts2

df = pd.DataFrame({'A': ts1, 'B': ts2})
df

ibm_bars = load_bars('IBM')

def subsample(frame, pct=0.9):
    # Keep a random (sorted) fraction of the rows
    N = len(frame)
    indexer = np.sort(np.random.permutation(N)[:int(pct * N)])
    return frame.take(indexer)

f1 = subsample(ibm_bars)
f2 = subsample(aapl_bars)
f1

both = pd.concat([f1, f2], axis=1, keys=['IBM', 'AAPL'])
both.head(20)

# Missing data handling
df

df.count()

both.count()

df.sum()

df.mean(1)

df.dropna()

df.fillna(0)

df.fillna(method='ffill')

df.asfreq('4h')

df.asfreq('4h').ffill(limit=3)

# Random tickers, industries, and currencies for the groupby examples
import random, string
import matplotlib as mpl

def rands(n):
    choices = string.ascii_letters
    return ''.join([random.choice(choices) for _ in xrange(n)])

mpl.rc('figure', figsize=(12, 8))

ind_names = np.array(['ENERGY', 'FINANCIAL', 'TECH', 'CONSDUR',
                      'SERVICES', 'UTILITIES'], dtype='O')
ccys = np.array(['USD', 'EUR'], dtype='O')

Nfull = 2000
tickers = np.array(sorted(rands(5).upper() for _ in xrange(Nfull)), dtype='O')
tickers = np.unique(tickers)

industries = pd.Series(ind_names.take(np.random.randint(0, 6, Nfull)),
                       index=tickers, name='industry')
ccy = pd.Series(ccys.take(np.random.randint(0, len(ccys), Nfull)),
                index=tickers, name='ccy')
ccy

df = pd.DataFrame({'Momentum': np.random.randn(1000) / 200 + 0.03,
                   'Value': np.random.randn(1000) / 200 + 0.08,
                   'ShortInterest': np.random.randn(1000) / 200 - 0.02},
                  index=tickers.take(np.random.permutation(Nfull)[:1000]))
df.head()

means = df.groupby(industries).mean()
means

means.plot(kind='barh')

means = df.groupby([industries, ccy]).mean()
means

# Group-wise normalization: z-score within each (industry, ccy) group
keys = [industries, ccy]
zscore = lambda x: (x - x.mean()) / x.std()
normed = df.groupby(keys).apply(zscore)

normed.groupby(keys).agg(['mean', 'std'])

means

means['Momentum']

means.ix['TECH']

means.stack()

means.stack().unstack('industry')

# MovieLens 1M: users, ratings, and movies ('::'-delimited .dat files)
base = '/Users/wesm/Dropbox/book/svn/book_scripts/movielens/ml-1m'
get_path = lambda x: '%s/%s.dat' % (base, x)

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(get_path('users'), sep='::', header=None, names=unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(get_path('ratings'), sep='::', header=None, names=rnames)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(get_path('movies'), sep='::', header=None, names=mnames)

movies.head()

ratings.head()

users.head()

data = pd.merge(pd.merge(ratings, users), movies)
data
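# The chained merge above joins on the columns the frames share: ratings and
# users both carry user_id, and their merge shares movie_id with movies. An
# explicit, equivalent spelling (a sketch only; it produces the same frame):
data = pd.merge(ratings, users, on='user_id')
data = pd.merge(data, movies, on='movie_id')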
rating_counts = data.groupby('title').size()
freq_titles = rating_counts.index[rating_counts > 1000]
freq_titles

# Top-rated among the frequently-rated titles, then a title/gender breakdown
highest_rated = data.groupby('title').rating.mean()[freq_titles].order()[-20:]
highest_rated

filtered = data[data.title.isin(highest_rated.index)]
filtered.title = filtered.title.str[:25]    # truncate long titles for display
filtered.groupby(['title', 'gender']).rating.count().unstack()

mean_ratings = data.pivot_table('rating', rows='title', cols='gender',
                                aggfunc='mean')
mean_ratings.tail(20)

data.title.value_counts()

data.rating.describe()

by_gender = data.groupby('gender').rating.describe()
by_gender

by_gender.unstack(0)
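# Several calls in this session use pre-1.0 APIs that were later renamed or
# removed (it is a Python 2 / pandas 0.x-era session; xrange becomes range
# under Python 3). A rough sketch of current-pandas equivalents, assuming the
# same objects are in scope; these lines are not part of the original session:
mth_mean = aapl_bars.close_price.resample('M').agg(['mean', 'median', 'std'])
std_10day = minute_returns.rolling(390 * 10).std()       # pd.rolling_std -> .rolling().std()
df.ffill()                                                # fillna(method='ffill') -> ffill()
means.loc['TECH']                                         # .ix -> .loc
highest_rated = (data.groupby('title').rating.mean()
                 [freq_titles].sort_values().tail(20))    # Series.order -> sort_values
mean_ratings = data.pivot_table('rating', index='title', columns='gender',
                                aggfunc='mean')           # rows/cols -> index/columns
users = pd.read_csv(get_path('users'), sep='::', header=None,
                    names=unames, engine='python')        # read_table -> read_csv; '::' needs the python engine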