import numpy as np
import pandas as pd
from datetime import time
pd.set_option('html', False)
from IPython.core.display import Image
Image('http://akamaicovers.oreilly.com/images/0636920023784/lrg.jpg')
import pandas as pd
import numpy as np
temp = '/Users/wesm/Downloads/minutebars/%s.csv'
path = temp % 'AAPL'
!wc -l $path
489598 /Users/wesm/Downloads/minutebars/AAPL.csv
aapl_bars = pd.read_csv(temp % 'AAPL')
aapl_bars
<class 'pandas.core.frame.DataFrame'> Int64Index: 489597 entries, 0 to 489596 Data columns: volume 489597 non-null values high 489597 non-null values low 489597 non-null values close_price 489597 non-null values dt 489597 non-null values open_price 489597 non-null values dtypes: float64(4), int64(1), object(1)
%time _ = pd.read_csv(path)
CPU times: user 0.72 s, sys: 0.18 s, total: 0.90 s Wall time: 0.90 s
aapl_bars.dt
0 2008-01-07 14:31:00+00:00 1 2008-01-07 14:32:00+00:00 2 2008-01-07 14:33:00+00:00 3 2008-01-07 14:34:00+00:00 4 2008-01-07 14:35:00+00:00 5 2008-01-07 14:36:00+00:00 6 2008-01-07 14:37:00+00:00 7 2008-01-07 14:38:00+00:00 8 2008-01-07 14:39:00+00:00 9 2008-01-07 14:40:00+00:00 10 2008-01-07 14:41:00+00:00 11 2008-01-07 14:42:00+00:00 12 2008-01-07 14:43:00+00:00 13 2008-01-07 14:44:00+00:00 14 2008-01-07 14:45:00+00:00 ... 489582 2013-01-07 20:46:00+00:00 489583 2013-01-07 20:47:00+00:00 489584 2013-01-07 20:48:00+00:00 489585 2013-01-07 20:49:00+00:00 489586 2013-01-07 20:50:00+00:00 489587 2013-01-07 20:51:00+00:00 489588 2013-01-07 20:52:00+00:00 489589 2013-01-07 20:53:00+00:00 489590 2013-01-07 20:54:00+00:00 489591 2013-01-07 20:55:00+00:00 489592 2013-01-07 20:56:00+00:00 489593 2013-01-07 20:57:00+00:00 489594 2013-01-07 20:58:00+00:00 489595 2013-01-07 20:59:00+00:00 489596 2013-01-07 21:00:00+00:00 Name: dt, Length: 489597
aapl_bars.index = pd.to_datetime(aapl_bars.pop('dt'))
aapl_bars.head()
volume high low close_price open_price dt 2008-01-07 14:31:00 593143 182.07 181.00 182.03 181.25 2008-01-07 14:32:00 344139 182.56 181.92 182.54 182.04 2008-01-07 14:33:00 257436 182.75 182.23 182.50 182.53 2008-01-07 14:34:00 235358 182.62 182.15 182.30 182.51 2008-01-07 14:35:00 281763 182.50 181.71 181.97 182.33
def load_bars(ticker):
bars = pd.read_csv(temp % ticker)
bars.index = pd.to_datetime(bars.pop('dt'))
return bars
aapl_bars.at_time(time(15, 0)).head(10)
volume high low close_price open_price dt 2008-01-07 15:00:00 148512 183.10 182.60 182.64 182.860 2008-01-08 15:00:00 136202 179.44 178.87 179.40 179.020 2008-01-09 15:00:00 179490 173.49 172.71 172.80 173.431 2008-01-10 15:00:00 253765 177.08 176.08 176.19 176.540 2008-01-11 15:00:00 96055 176.70 176.17 176.43 176.520 2008-01-14 15:00:00 181223 176.70 176.00 176.53 176.600 2008-01-15 15:00:00 166167 177.78 177.19 177.72 177.200 2008-01-16 15:00:00 292435 165.91 164.70 164.76 165.402 2008-01-17 15:00:00 152760 164.24 163.29 163.29 163.600 2008-01-18 15:00:00 217712 163.97 163.50 163.75 163.500
aapl_bars.close_price['2009-10-15']
dt 2009-10-15 14:31:00 189.870 2009-10-15 14:32:00 189.800 2009-10-15 14:33:00 189.890 2009-10-15 14:34:00 189.930 2009-10-15 14:35:00 189.920 2009-10-15 14:36:00 189.976 2009-10-15 14:37:00 190.030 2009-10-15 14:38:00 189.940 2009-10-15 14:39:00 189.910 2009-10-15 14:40:00 190.069 2009-10-15 14:41:00 190.020 2009-10-15 14:42:00 189.990 2009-10-15 14:43:00 189.920 2009-10-15 14:44:00 189.990 2009-10-15 14:45:00 189.830 ... 2009-10-15 20:46:00 189.990 2009-10-15 20:47:00 189.940 2009-10-15 20:48:00 189.910 2009-10-15 20:49:00 189.821 2009-10-15 20:50:00 189.860 2009-10-15 20:51:00 189.940 2009-10-15 20:52:00 189.940 2009-10-15 20:53:00 189.980 2009-10-15 20:54:00 190.010 2009-10-15 20:55:00 190.090 2009-10-15 20:56:00 190.139 2009-10-15 20:57:00 190.170 2009-10-15 20:58:00 190.210 2009-10-15 20:59:00 190.200 2009-10-15 21:00:00 190.560 Name: close_price, Length: 390
aapl_bars.close_price
dt 2008-01-07 14:31:00 182.03 2008-01-07 14:32:00 182.54 2008-01-07 14:33:00 182.50 2008-01-07 14:34:00 182.30 2008-01-07 14:35:00 181.97 2008-01-07 14:36:00 181.48 2008-01-07 14:37:00 181.04 2008-01-07 14:38:00 180.68 2008-01-07 14:39:00 180.80 2008-01-07 14:40:00 180.86 2008-01-07 14:41:00 181.03 2008-01-07 14:42:00 180.88 2008-01-07 14:43:00 181.11 2008-01-07 14:44:00 181.24 2008-01-07 14:45:00 181.84 ... 2013-01-07 20:46:00 523.480 2013-01-07 20:47:00 522.970 2013-01-07 20:48:00 522.954 2013-01-07 20:49:00 523.850 2013-01-07 20:50:00 523.750 2013-01-07 20:51:00 524.270 2013-01-07 20:52:00 524.230 2013-01-07 20:53:00 524.940 2013-01-07 20:54:00 524.580 2013-01-07 20:55:00 524.490 2013-01-07 20:56:00 524.143 2013-01-07 20:57:00 523.750 2013-01-07 20:58:00 524.040 2013-01-07 20:59:00 523.780 2013-01-07 21:00:00 523.970 Name: close_price, Length: 489597
mth_mean = aapl_bars.close_price.resample('M', how=['mean', 'median', 'std'])
mth_mean
mean median std dt 2008-01-31 155.579126 161.1000 19.396180 2008-02-29 125.582646 125.7500 4.720866 2008-03-31 130.494650 127.3600 8.403366 2008-04-30 158.145335 154.7350 9.087710 2008-05-31 184.774497 185.0800 3.618693 2008-06-30 178.654374 178.9300 6.429312 2008-07-31 167.784954 169.9300 7.347889 2008-08-31 171.233317 174.0500 7.244400 2008-09-30 141.942077 140.9100 16.775448 2008-10-31 99.343456 97.6300 6.335940 2008-11-30 93.985805 92.2400 7.549505 2008-12-31 91.581060 90.2600 4.812012 2009-01-31 88.675004 89.8200 4.586041 2009-02-28 94.207834 94.3400 4.092791 2009-03-31 97.352191 96.9470 7.816247 2009-04-30 119.858558 120.3500 4.900853 2009-05-31 127.983364 128.2600 4.061250 2009-06-30 139.360858 139.3000 2.799540 2009-07-31 149.103911 148.3800 9.518534 2009-08-31 166.170257 165.9500 2.354870 2009-09-30 177.893266 181.7000 7.385285 2009-10-31 193.195646 190.5500 6.595516 2009-11-30 200.248734 202.0900 5.619620 2009-12-31 199.102938 197.2200 6.731990 2010-01-31 208.156303 209.4800 5.058381 2010-02-28 198.559427 198.3900 3.454915 2010-03-31 223.336387 224.3300 7.781226 2010-04-30 250.766901 245.9600 11.538351 2010-05-31 251.805447 252.0230 7.846905 2010-06-30 261.842737 262.5325 8.737081 2010-07-31 254.440632 256.2700 5.695913 2010-08-31 251.561496 250.5900 7.495469 2010-09-30 273.745057 272.1700 13.610222 2010-10-31 300.674357 302.7455 10.336923 2010-11-30 311.558581 311.1500 5.419399 2010-12-31 321.606753 321.5000 2.633648 2011-01-31 338.037888 338.5950 5.570475 2011-02-28 351.196638 352.7500 7.126330 2011-03-31 347.578442 349.0200 7.478078 2011-04-30 340.500490 339.4450 8.392912 2011-05-31 341.849013 342.9230 5.946428 2011-06-30 331.625547 331.5605 8.191484 2011-07-31 371.942253 365.4800 19.378892 2011-08-31 377.103717 377.3100 10.823605 2011-09-30 393.123445 391.6450 13.226161 2011-10-31 397.190649 400.4600 15.248496 2011-11-30 385.337475 383.8800 11.555804 2011-12-31 393.056772 392.8600 8.102446 2012-01-31 428.571704 424.3000 12.856495 2012-02-29 496.902626 502.0500 25.815138 2012-03-31 576.550595 587.1600 30.231559 2012-04-30 607.127307 609.0175 21.316798 2012-05-31 565.350624 567.3405 13.847976 2012-06-30 574.423152 573.5770 7.848743 2012-07-31 600.786165 604.4400 11.827663 2012-08-31 642.572344 635.9400 23.148544 2012-09-30 682.100657 679.4900 13.630957 2012-10-31 635.947102 634.1975 20.621637 2012-11-30 564.845334 563.0600 21.851129 2012-12-31 532.420270 528.8600 21.166298 2013-01-31 536.401272 536.9800 10.776982
mth_mean.plot()
<matplotlib.axes.AxesSubplot at 0x10db20e50>
close = aapl_bars.close_price
close / close.shift(1) - 1
dt 2008-01-07 14:31:00 NaN 2008-01-07 14:32:00 0.002802 2008-01-07 14:33:00 -0.000219 2008-01-07 14:34:00 -0.001096 2008-01-07 14:35:00 -0.001810 2008-01-07 14:36:00 -0.002693 2008-01-07 14:37:00 -0.002425 2008-01-07 14:38:00 -0.001989 2008-01-07 14:39:00 0.000664 2008-01-07 14:40:00 0.000332 2008-01-07 14:41:00 0.000940 2008-01-07 14:42:00 -0.000829 2008-01-07 14:43:00 0.001272 2008-01-07 14:44:00 0.000718 2008-01-07 14:45:00 0.003311 ... 2013-01-07 20:46:00 -0.000363 2013-01-07 20:47:00 -0.000974 2013-01-07 20:48:00 -0.000031 2013-01-07 20:49:00 0.001713 2013-01-07 20:50:00 -0.000191 2013-01-07 20:51:00 0.000993 2013-01-07 20:52:00 -0.000076 2013-01-07 20:53:00 0.001354 2013-01-07 20:54:00 -0.000686 2013-01-07 20:55:00 -0.000172 2013-01-07 20:56:00 -0.000662 2013-01-07 20:57:00 -0.000750 2013-01-07 20:58:00 0.000554 2013-01-07 20:59:00 -0.000496 2013-01-07 21:00:00 0.000363 Name: close_price, Length: 489597
minute_returns = aapl_bars.close_price.pct_change()
std_10day = pd.rolling_std(minute_returns, 390 * 10)
std_10day.resample('B').plot()
<matplotlib.axes.AxesSubplot at 0x10d94ca90>
ts1 = pd.Series(np.random.randn(10),
index=pd.date_range('1/1/2000', periods=10))
ts1
2000-01-01 0.583378 2000-01-02 -1.542445 2000-01-03 -0.215126 2000-01-04 -1.231664 2000-01-05 0.334308 2000-01-06 1.234491 2000-01-07 -0.678003 2000-01-08 -0.496512 2000-01-09 -2.023730 2000-01-10 -0.464823 Freq: D
ts2 = ts1[[0, 2, 4, 5, 6, 7, 8]]
ts2
2000-01-01 0.583378 2000-01-03 -0.215126 2000-01-05 0.334308 2000-01-06 1.234491 2000-01-07 -0.678003 2000-01-08 -0.496512 2000-01-09 -2.023730
ts1 + ts2
2000-01-01 1.166756 2000-01-02 NaN 2000-01-03 -0.430252 2000-01-04 NaN 2000-01-05 0.668617 2000-01-06 2.468981 2000-01-07 -1.356005 2000-01-08 -0.993023 2000-01-09 -4.047460 2000-01-10 NaN
df = pd.DataFrame({'A': ts1, 'B': ts2})
df
A B 2000-01-01 0.583378 0.583378 2000-01-02 -1.542445 NaN 2000-01-03 -0.215126 -0.215126 2000-01-04 -1.231664 NaN 2000-01-05 0.334308 0.334308 2000-01-06 1.234491 1.234491 2000-01-07 -0.678003 -0.678003 2000-01-08 -0.496512 -0.496512 2000-01-09 -2.023730 -2.023730 2000-01-10 -0.464823 NaN
ibm_bars = load_bars('IBM')
def subsample(frame, pct=0.9):
N = len(frame)
indexer = np.sort(np.random.permutation(N)[:pct*N])
return frame.take(indexer)
f1 = subsample(ibm_bars)
f2 = subsample(aapl_bars)
f1
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 440607 entries, 2008-01-07 14:31:00 to 2013-01-07 21:00:00 Data columns: volume 440607 non-null values high 440607 non-null values low 440607 non-null values close_price 440607 non-null values open_price 440607 non-null values dtypes: float64(4), int64(1)
both = pd.concat([f1, f2], axis=1, keys=['IBM', 'AAPL'])
both.head(20)
IBM AAPL \ volume high low close_price open_price volume high dt 2008-01-07 14:31:00 173600 100.47 100.02 100.100 100.250 593143 182.070 2008-01-07 14:32:00 93937 100.13 99.77 99.900 100.110 344139 182.560 2008-01-07 14:33:00 75100 99.93 99.75 99.890 99.900 257436 182.750 2008-01-07 14:34:00 60300 99.93 99.55 99.560 99.900 235358 182.620 2008-01-07 14:35:00 66300 99.84 99.51 99.659 99.550 281763 182.500 2008-01-07 14:36:00 115600 99.70 99.56 99.610 99.694 184191 182.000 2008-01-07 14:37:00 68400 99.65 99.50 99.600 99.620 NaN NaN 2008-01-07 14:38:00 94800 99.68 99.55 99.590 99.620 324597 181.100 2008-01-07 14:39:00 71200 99.60 99.25 99.320 99.590 274712 181.160 2008-01-07 14:40:00 61900 99.56 99.27 99.550 99.300 403421 181.040 2008-01-07 14:41:00 50400 99.75 99.54 99.630 99.550 342292 181.172 2008-01-07 14:42:00 64300 99.87 99.56 99.775 99.660 229040 181.430 2008-01-07 14:43:00 41500 99.79 99.69 99.760 99.770 203759 181.120 2008-01-07 14:44:00 60900 99.85 99.71 99.720 99.780 163075 181.240 2008-01-07 14:45:00 46400 100.00 99.72 100.000 99.720 263733 181.870 2008-01-07 14:46:00 64800 100.42 99.98 100.330 100.000 239652 182.500 2008-01-07 14:47:00 43200 100.39 100.17 100.180 100.340 217670 182.680 2008-01-07 14:48:00 66400 100.49 100.14 100.300 100.230 235761 182.420 2008-01-07 14:49:00 80954 100.59 100.26 100.550 100.300 159339 182.380 2008-01-07 14:50:00 81100 100.59 100.41 100.490 100.550 NaN NaN low close_price open_price dt 2008-01-07 14:31:00 181.00 182.03 181.250 2008-01-07 14:32:00 181.92 182.54 182.040 2008-01-07 14:33:00 182.23 182.50 182.530 2008-01-07 14:34:00 182.15 182.30 182.510 2008-01-07 14:35:00 181.71 181.97 182.330 2008-01-07 14:36:00 181.41 181.48 181.904 2008-01-07 14:37:00 NaN NaN NaN 2008-01-07 14:38:00 180.65 180.68 181.050 2008-01-07 14:39:00 180.62 180.80 180.680 2008-01-07 14:40:00 180.21 180.86 180.800 2008-01-07 14:41:00 180.67 181.03 180.860 2008-01-07 14:42:00 180.83 180.88 181.025 2008-01-07 14:43:00 180.80 181.11 180.890 2008-01-07 14:44:00 180.99 181.24 181.120 2008-01-07 14:45:00 181.19 181.84 181.240 2008-01-07 14:46:00 181.84 182.50 181.880 2008-01-07 14:47:00 182.20 182.35 182.500 2008-01-07 14:48:00 181.62 181.63 182.240 2008-01-07 14:49:00 181.67 181.99 181.720 2008-01-07 14:50:00 NaN NaN NaN
df
A B 2000-01-01 0.583378 0.583378 2000-01-02 -1.542445 NaN 2000-01-03 -0.215126 -0.215126 2000-01-04 -1.231664 NaN 2000-01-05 0.334308 0.334308 2000-01-06 1.234491 1.234491 2000-01-07 -0.678003 -0.678003 2000-01-08 -0.496512 -0.496512 2000-01-09 -2.023730 -2.023730 2000-01-10 -0.464823 NaN
df.count()
A 10 B 7
both.count()
IBM volume 440607 high 440607 low 440607 close_price 440607 open_price 440607 AAPL volume 440637 high 440637 low 440637 close_price 440637 open_price 440637
df.sum()
A -4.500125 B -1.261193
df.mean(1)
2000-01-01 0.583378 2000-01-02 -1.542445 2000-01-03 -0.215126 2000-01-04 -1.231664 2000-01-05 0.334308 2000-01-06 1.234491 2000-01-07 -0.678003 2000-01-08 -0.496512 2000-01-09 -2.023730 2000-01-10 -0.464823 Freq: D
df.dropna()
A B 2000-01-01 0.583378 0.583378 2000-01-03 -0.215126 -0.215126 2000-01-05 0.334308 0.334308 2000-01-06 1.234491 1.234491 2000-01-07 -0.678003 -0.678003 2000-01-08 -0.496512 -0.496512 2000-01-09 -2.023730 -2.023730
df.fillna(0)
A B 2000-01-01 0.583378 0.583378 2000-01-02 -1.542445 0.000000 2000-01-03 -0.215126 -0.215126 2000-01-04 -1.231664 0.000000 2000-01-05 0.334308 0.334308 2000-01-06 1.234491 1.234491 2000-01-07 -0.678003 -0.678003 2000-01-08 -0.496512 -0.496512 2000-01-09 -2.023730 -2.023730 2000-01-10 -0.464823 0.000000
df.fillna(method='ffill')
A B 2000-01-01 0.583378 0.583378 2000-01-02 -1.542445 0.583378 2000-01-03 -0.215126 -0.215126 2000-01-04 -1.231664 -0.215126 2000-01-05 0.334308 0.334308 2000-01-06 1.234491 1.234491 2000-01-07 -0.678003 -0.678003 2000-01-08 -0.496512 -0.496512 2000-01-09 -2.023730 -2.023730 2000-01-10 -0.464823 -2.023730
df.asfreq('4h')
A B 2000-01-01 00:00:00 0.583378 0.583378 2000-01-01 04:00:00 NaN NaN 2000-01-01 08:00:00 NaN NaN 2000-01-01 12:00:00 NaN NaN 2000-01-01 16:00:00 NaN NaN 2000-01-01 20:00:00 NaN NaN 2000-01-02 00:00:00 -1.542445 NaN 2000-01-02 04:00:00 NaN NaN 2000-01-02 08:00:00 NaN NaN 2000-01-02 12:00:00 NaN NaN 2000-01-02 16:00:00 NaN NaN 2000-01-02 20:00:00 NaN NaN 2000-01-03 00:00:00 -0.215126 -0.215126 2000-01-03 04:00:00 NaN NaN 2000-01-03 08:00:00 NaN NaN 2000-01-03 12:00:00 NaN NaN 2000-01-03 16:00:00 NaN NaN 2000-01-03 20:00:00 NaN NaN 2000-01-04 00:00:00 -1.231664 NaN 2000-01-04 04:00:00 NaN NaN 2000-01-04 08:00:00 NaN NaN 2000-01-04 12:00:00 NaN NaN 2000-01-04 16:00:00 NaN NaN 2000-01-04 20:00:00 NaN NaN 2000-01-05 00:00:00 0.334308 0.334308 2000-01-05 04:00:00 NaN NaN 2000-01-05 08:00:00 NaN NaN 2000-01-05 12:00:00 NaN NaN 2000-01-05 16:00:00 NaN NaN 2000-01-05 20:00:00 NaN NaN 2000-01-06 00:00:00 1.234491 1.234491 2000-01-06 04:00:00 NaN NaN 2000-01-06 08:00:00 NaN NaN 2000-01-06 12:00:00 NaN NaN 2000-01-06 16:00:00 NaN NaN 2000-01-06 20:00:00 NaN NaN 2000-01-07 00:00:00 -0.678003 -0.678003 2000-01-07 04:00:00 NaN NaN 2000-01-07 08:00:00 NaN NaN 2000-01-07 12:00:00 NaN NaN 2000-01-07 16:00:00 NaN NaN 2000-01-07 20:00:00 NaN NaN 2000-01-08 00:00:00 -0.496512 -0.496512 2000-01-08 04:00:00 NaN NaN 2000-01-08 08:00:00 NaN NaN 2000-01-08 12:00:00 NaN NaN 2000-01-08 16:00:00 NaN NaN 2000-01-08 20:00:00 NaN NaN 2000-01-09 00:00:00 -2.023730 -2.023730 2000-01-09 04:00:00 NaN NaN 2000-01-09 08:00:00 NaN NaN 2000-01-09 12:00:00 NaN NaN 2000-01-09 16:00:00 NaN NaN 2000-01-09 20:00:00 NaN NaN 2000-01-10 00:00:00 -0.464823 NaN
df.asfreq('4h').ffill(limit=3)
A B 2000-01-01 00:00:00 0.583378 0.583378 2000-01-01 04:00:00 0.583378 0.583378 2000-01-01 08:00:00 0.583378 0.583378 2000-01-01 12:00:00 0.583378 0.583378 2000-01-01 16:00:00 NaN NaN 2000-01-01 20:00:00 NaN NaN 2000-01-02 00:00:00 -1.542445 NaN 2000-01-02 04:00:00 -1.542445 NaN 2000-01-02 08:00:00 -1.542445 NaN 2000-01-02 12:00:00 -1.542445 NaN 2000-01-02 16:00:00 NaN NaN 2000-01-02 20:00:00 NaN NaN 2000-01-03 00:00:00 -0.215126 -0.215126 2000-01-03 04:00:00 -0.215126 -0.215126 2000-01-03 08:00:00 -0.215126 -0.215126 2000-01-03 12:00:00 -0.215126 -0.215126 2000-01-03 16:00:00 NaN NaN 2000-01-03 20:00:00 NaN NaN 2000-01-04 00:00:00 -1.231664 NaN 2000-01-04 04:00:00 -1.231664 NaN 2000-01-04 08:00:00 -1.231664 NaN 2000-01-04 12:00:00 -1.231664 NaN 2000-01-04 16:00:00 NaN NaN 2000-01-04 20:00:00 NaN NaN 2000-01-05 00:00:00 0.334308 0.334308 2000-01-05 04:00:00 0.334308 0.334308 2000-01-05 08:00:00 0.334308 0.334308 2000-01-05 12:00:00 0.334308 0.334308 2000-01-05 16:00:00 NaN NaN 2000-01-05 20:00:00 NaN NaN 2000-01-06 00:00:00 1.234491 1.234491 2000-01-06 04:00:00 1.234491 1.234491 2000-01-06 08:00:00 1.234491 1.234491 2000-01-06 12:00:00 1.234491 1.234491 2000-01-06 16:00:00 NaN NaN 2000-01-06 20:00:00 NaN NaN 2000-01-07 00:00:00 -0.678003 -0.678003 2000-01-07 04:00:00 -0.678003 -0.678003 2000-01-07 08:00:00 -0.678003 -0.678003 2000-01-07 12:00:00 -0.678003 -0.678003 2000-01-07 16:00:00 NaN NaN 2000-01-07 20:00:00 NaN NaN 2000-01-08 00:00:00 -0.496512 -0.496512 2000-01-08 04:00:00 -0.496512 -0.496512 2000-01-08 08:00:00 -0.496512 -0.496512 2000-01-08 12:00:00 -0.496512 -0.496512 2000-01-08 16:00:00 NaN NaN 2000-01-08 20:00:00 NaN NaN 2000-01-09 00:00:00 -2.023730 -2.023730 2000-01-09 04:00:00 -2.023730 -2.023730 2000-01-09 08:00:00 -2.023730 -2.023730 2000-01-09 12:00:00 -2.023730 -2.023730 2000-01-09 16:00:00 NaN NaN 2000-01-09 20:00:00 NaN NaN 2000-01-10 00:00:00 -0.464823 NaN
import random, string
import matplotlib as mpl
def rands(n):
choices = string.ascii_letters
return ''.join([random.choice(choices) for _ in xrange(n)])
mpl.rc('figure', figsize=(12, 8))
ind_names = np.array(['ENERGY', 'FINANCIAL', 'TECH',
'CONSDUR', 'SERVICES', 'UTILITIES'], dtype='O')
ccys = np.array(['USD', 'EUR'], dtype='O')
Nfull = 2000
tickers = np.array(sorted(rands(5).upper() for _ in xrange(Nfull)), dtype='O')
tickers = np.unique(tickers)
industries = pd.Series(ind_names.take(np.random.randint(0, 6, Nfull)),
index=tickers, name='industry')
ccy = pd.Series(ccys.take(np.random.randint(0, len(ccys), Nfull)),
index=tickers, name='ccy')
ccy
AAEUC EUR AAKGK EUR AATPS EUR AAUGH EUR AAXBK USD ABQLB USD ACPJP EUR ACSGL EUR ADCYV EUR ADFEZ EUR ADNMK USD ADUCW EUR AENYT EUR AEQKA EUR AETPI USD ... ZVHFY EUR ZVHKA USD ZVVJW EUR ZWCVS EUR ZWMOZ EUR ZXBDD USD ZXDQC EUR ZXNPV EUR ZYLEW EUR ZYTMF EUR ZYYHV USD ZZBYE USD ZZENJ EUR ZZJOX EUR ZZZCZ USD Name: ccy, Length: 2000
df = pd.DataFrame({'Momentum' : np.random.randn(1000) / 200 + 0.03,
'Value' : np.random.randn(1000) / 200 + 0.08,
'ShortInterest' : np.random.randn(1000) / 200 - 0.02},
index=tickers.take(np.random.permutation(Nfull)[:1000]))
df.head()
Momentum ShortInterest Value WPZGV 0.030320 -0.022210 0.067899 KTCLL 0.031851 -0.016965 0.079431 RMXIV 0.031954 -0.018996 0.077051 SUYKU 0.025723 -0.022544 0.076665 EEYEA 0.022493 -0.026726 0.079324
means = df.groupby(industries).mean()
means
Momentum ShortInterest Value industry CONSDUR 0.029303 -0.020244 0.080010 ENERGY 0.029526 -0.019828 0.079985 FINANCIAL 0.029805 -0.019937 0.079605 SERVICES 0.030393 -0.020437 0.079563 TECH 0.029396 -0.019980 0.080333 UTILITIES 0.029690 -0.020159 0.080312
means.plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x13025aa50>
means = df.groupby([industries, ccy]).mean()
means
Momentum ShortInterest Value industry ccy CONSDUR EUR 0.029077 -0.020063 0.080763 USD 0.029529 -0.020424 0.079257 ENERGY EUR 0.029349 -0.019963 0.079538 USD 0.029728 -0.019673 0.080498 FINANCIAL EUR 0.029460 -0.020284 0.079718 USD 0.030215 -0.019524 0.079471 SERVICES EUR 0.030222 -0.020150 0.079125 USD 0.030539 -0.020683 0.079939 TECH EUR 0.029018 -0.019371 0.080187 USD 0.029731 -0.020521 0.080461 UTILITIES EUR 0.029562 -0.020285 0.080697 USD 0.029847 -0.020006 0.079843
keys = [industries, ccy]
zscore = lambda x: (x - x.mean()) / x.std()
normed = df.groupby(keys).apply(zscore)
normed.groupby(keys).agg(['mean', 'std'])
Momentum ShortInterest Value mean std mean std mean std industry ccy CONSDUR EUR -1.717145e-16 1 1.281938e-15 1 2.158274e-15 1 USD -1.576517e-15 1 -1.245300e-15 1 -3.043491e-15 1 ENERGY EUR -4.680352e-16 1 -2.024524e-15 1 1.094985e-15 1 USD 6.536594e-16 1 9.056426e-16 1 -1.187564e-15 1 FINANCIAL EUR -1.144085e-15 1 1.731948e-16 1 -7.722989e-16 1 USD -4.130294e-17 1 4.378112e-16 1 2.775558e-15 1 SERVICES EUR -3.221210e-16 1 -1.845159e-16 1 -5.222739e-15 1 USD -2.193694e-16 1 -1.553643e-15 1 1.317888e-15 1 TECH EUR -1.467181e-15 1 8.108844e-16 1 -2.259093e-15 1 USD 1.066563e-15 1 -1.777604e-15 1 2.604658e-15 1 UTILITIES EUR -2.628850e-15 1 1.698377e-15 1 -3.649858e-15 1 USD -6.454177e-16 1 -6.082091e-16 1 -1.024141e-15 1
means
Momentum ShortInterest Value industry ccy CONSDUR EUR 0.029077 -0.020063 0.080763 USD 0.029529 -0.020424 0.079257 ENERGY EUR 0.029349 -0.019963 0.079538 USD 0.029728 -0.019673 0.080498 FINANCIAL EUR 0.029460 -0.020284 0.079718 USD 0.030215 -0.019524 0.079471 SERVICES EUR 0.030222 -0.020150 0.079125 USD 0.030539 -0.020683 0.079939 TECH EUR 0.029018 -0.019371 0.080187 USD 0.029731 -0.020521 0.080461 UTILITIES EUR 0.029562 -0.020285 0.080697 USD 0.029847 -0.020006 0.079843
means['Momentum']
industry ccy CONSDUR EUR 0.029077 USD 0.029529 ENERGY EUR 0.029349 USD 0.029728 FINANCIAL EUR 0.029460 USD 0.030215 SERVICES EUR 0.030222 USD 0.030539 TECH EUR 0.029018 USD 0.029731 UTILITIES EUR 0.029562 USD 0.029847 Name: Momentum
means.ix['TECH']
Momentum ShortInterest Value ccy EUR 0.029018 -0.019371 0.080187 USD 0.029731 -0.020521 0.080461
means.stack()
industry ccy CONSDUR EUR Momentum 0.029077 ShortInterest -0.020063 Value 0.080763 USD Momentum 0.029529 ShortInterest -0.020424 Value 0.079257 ENERGY EUR Momentum 0.029349 ShortInterest -0.019963 Value 0.079538 USD Momentum 0.029728 ShortInterest -0.019673 Value 0.080498 FINANCIAL EUR Momentum 0.029460 ShortInterest -0.020284 Value 0.079718 USD Momentum 0.030215 ShortInterest -0.019524 Value 0.079471 SERVICES EUR Momentum 0.030222 ShortInterest -0.020150 Value 0.079125 USD Momentum 0.030539 ShortInterest -0.020683 Value 0.079939 TECH EUR Momentum 0.029018 ShortInterest -0.019371 Value 0.080187 USD Momentum 0.029731 ShortInterest -0.020521 Value 0.080461 UTILITIES EUR Momentum 0.029562 ShortInterest -0.020285 Value 0.080697 USD Momentum 0.029847 ShortInterest -0.020006 Value 0.079843
means.stack().unstack('industry')
industry CONSDUR ENERGY FINANCIAL SERVICES TECH UTILITIES ccy EUR Momentum 0.029077 0.029349 0.029460 0.030222 0.029018 0.029562 ShortInterest -0.020063 -0.019963 -0.020284 -0.020150 -0.019371 -0.020285 Value 0.080763 0.079538 0.079718 0.079125 0.080187 0.080697 USD Momentum 0.029529 0.029728 0.030215 0.030539 0.029731 0.029847 ShortInterest -0.020424 -0.019673 -0.019524 -0.020683 -0.020521 -0.020006 Value 0.079257 0.080498 0.079471 0.079939 0.080461 0.079843
base = '/Users/wesm/Dropbox/book/svn/book_scripts/movielens/ml-1m'
get_path = lambda x: '%s/%s.dat' % (base, x)
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(get_path('users'), sep='::', header=None, names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(get_path('ratings'), sep='::', header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(get_path('movies'), sep='::', header=None, names=mnames)
movies.head()
movie_id title genres 0 1 Toy Story (1995) Animation|Children's|Comedy 1 2 Jumanji (1995) Adventure|Children's|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance 3 4 Waiting to Exhale (1995) Comedy|Drama 4 5 Father of the Bride Part II (1995) Comedy
ratings.head()
user_id movie_id rating timestamp 0 1 1193 5 978300760 1 1 661 3 978302109 2 1 914 3 978301968 3 1 3408 4 978300275 4 1 2355 5 978824291
users.head()
user_id gender age occupation zip 0 1 F 1 10 48067 1 2 M 56 16 70072 2 3 M 25 15 55117 3 4 M 45 7 02460 4 5 M 25 20 55455
data = pd.merge(pd.merge(ratings, users), movies)
data
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000209 entries, 0 to 1000208 Data columns: user_id 1000209 non-null values movie_id 1000209 non-null values rating 1000209 non-null values timestamp 1000209 non-null values gender 1000209 non-null values age 1000209 non-null values occupation 1000209 non-null values zip 1000209 non-null values title 1000209 non-null values genres 1000209 non-null values dtypes: int64(6), object(4)
rating_counts = data.groupby('title').size()
freq_titles = rating_counts.index[rating_counts > 1000]
freq_titles
Index([2001: A Space Odyssey (1968), Abyss, The (1989), African Queen, The (1951), Air Force One (1997), Airplane! (1980), Aladdin (1992), Alien (1979), Aliens (1986), Amadeus (1984), American Beauty (1999), American Pie (1999), American President, The (1995), Animal House (1978), Annie Hall (1977), Apocalypse Now (1979), Apollo 13 (1995), Arachnophobia (1990), Armageddon (1998), As Good As It Gets (1997), Austin Powers: International Man of Mystery (1997), Austin Powers: The Spy Who Shagged Me (1999), Babe (1995), Back to the Future (1985), Back to the Future Part II (1989), Back to the Future Part III (1990), Batman (1989), Batman Returns (1992), Beauty and the Beast (1991), Beetlejuice (1988), Being John Malkovich (1999), Big (1988), Big Lebowski, The (1998), Blade Runner (1982), Blair Witch Project, The (1999), Blazing Saddles (1974), Blues Brothers, The (1980), Boat, The (Das Boot) (1981), Boogie Nights (1997), Braveheart (1995), Breakfast Club, The (1985), Bug's Life, A (1998), Bull Durham (1988), Butch Cassidy and the Sundance Kid (1969), Casablanca (1942), Chicken Run (2000), Chinatown (1974), Christmas Story, A (1983), Citizen Kane (1941), Clear and Present Danger (1994), Clerks (1994), Clockwork Orange, A (1971), Close Encounters of the Third Kind (1977), Clueless (1995), Cocoon (1985), Contact (1997), Crying Game, The (1992), Dances with Wolves (1990), Die Hard (1988), Die Hard 2 (1990), Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963), E.T. the Extra-Terrestrial (1982), Edward Scissorhands (1990), Election (1999), Erin Brockovich (2000), Face/Off (1997), Fargo (1996), Ferris Bueller's Day Off (1986), Few Good Men, A (1992), Fifth Element, The (1997), Fight Club (1999), Fish Called Wanda, A (1988), Fly, The (1986), Forrest Gump (1994), Four Weddings and a Funeral (1994), Fugitive, The (1993), Full Metal Jacket (1987), Full Monty, The (1997), Galaxy Quest (1999), Gattaca (1997), Get Shorty (1995), Ghost (1990), Ghostbusters (1984), Gladiator (2000), Glory (1989), Godfather, The (1972), Godfather: Part II, The (1974), Gone with the Wind (1939), Good Morning, Vietnam (1987), Good Will Hunting (1997), GoodFellas (1990), Graduate, The (1967), Green Mile, The (1999), Grosse Pointe Blank (1997), Groundhog Day (1993), High Fidelity (2000), Honey, I Shrunk the Kids (1989), Hunt for Red October, The (1990), Independence Day (ID4) (1996), Indiana Jones and the Last Crusade (1989), Indiana Jones and the Temple of Doom (1984), Jaws (1975), Jerry Maguire (1996), Jurassic Park (1993), L.A. Confidential (1997), League of Their Own, A (1992), Lethal Weapon (1987), Lethal Weapon 2 (1989), Life Is Beautiful (La Vita � bella) (1997), Lion King, The (1994), Little Mermaid, The (1989), Lost World: Jurassic Park, The (1997), M*A*S*H (1970), Mad Max (1979), Mad Max 2 (a.k.a. The Road Warrior) (1981), Magnolia (1999), Maltese Falcon, The (1941), Mars Attacks! (1996), Mary Poppins (1964), Mask, The (1994), Matrix, The (1999), Men in Black (1997), Mission: Impossible (1996), Mission: Impossible 2 (2000), Monty Python and the Holy Grail (1974), Mummy, The (1999), My Cousin Vinny (1992), North by Northwest (1959), One Flew Over the Cuckoo's Nest (1975), Patriot Games (1992), Patriot, The (2000), Perfect Storm, The (2000), Planet of the Apes (1968), Platoon (1986), Pleasantville (1998), Predator (1987), Pretty Woman (1990), Princess Bride, The (1987), Psycho (1960), Pulp Fiction (1994), Raiders of the Lost Ark (1981), Rain Man (1988), Raising Arizona (1987), Rear Window (1954), Reservoir Dogs (1992), Robocop (1987), Rock, The (1996), Rocky (1976), Rocky Horror Picture Show, The (1975), Romancing the Stone (1984), Run Lola Run (Lola rennt) (1998), Rushmore (1998), Saving Private Ryan (1998), Schindler's List (1993), Seven (Se7en) (1995), Shakespeare in Love (1998), Shawshank Redemption, The (1994), Shining, The (1980), Silence of the Lambs, The (1991), Sixth Sense, The (1999), Sleepy Hollow (1999), Sling Blade (1996), Sneakers (1992), South Park: Bigger, Longer and Uncut (1999), Speed (1994), Splash (1984), Stand by Me (1986), Star Trek IV: The Voyage Home (1986), Star Trek VI: The Undiscovered Country (1991), Star Trek: First Contact (1996), Star Trek: The Wrath of Khan (1982), Star Wars: Episode I - The Phantom Menace (1999), Star Wars: Episode IV - A New Hope (1977), Star Wars: Episode V - The Empire Strikes Back (1980), Star Wars: Episode VI - Return of the Jedi (1983), Stargate (1994), Starship Troopers (1997), Sting, The (1973), Superman (1978), Talented Mr. Ripley, The (1999), Taxi Driver (1976), Terminator 2: Judgment Day (1991), Terminator, The (1984), Thelma & Louise (1991), There's Something About Mary (1998), This Is Spinal Tap (1984), Thomas Crown Affair, The (1999), Three Kings (1999), Time Bandits (1981), Titanic (1997), Top Gun (1986), Total Recall (1990), Toy Story (1995), Toy Story 2 (1999), True Lies (1994), Truman Show, The (1998), Twelve Monkeys (1995), Twister (1996), Untouchables, The (1987), Usual Suspects, The (1995), Wayne's World (1992), When Harry Met Sally... (1989), Who Framed Roger Rabbit? (1988), Willy Wonka and the Chocolate Factory (1971), Witness (1985), Wizard of Oz, The (1939), X-Men (2000), Young Frankenstein (1974)], dtype=object)
highest_rated = data.groupby('title').rating.mean()[freq_titles].order()[-20:]
highest_rated
title Life Is Beautiful (La Vita � bella) (1997) 4.329861 Monty Python and the Holy Grail (1974) 4.335210 Saving Private Ryan (1998) 4.337354 Chinatown (1974) 4.339241 Silence of the Lambs, The (1991) 4.351823 Godfather: Part II, The (1974) 4.357565 North by Northwest (1959) 4.384030 Citizen Kane (1941) 4.388889 One Flew Over the Cuckoo's Nest (1975) 4.390725 Maltese Falcon, The (1941) 4.395973 Sixth Sense, The (1999) 4.406263 Casablanca (1942) 4.412822 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) 4.449890 Star Wars: Episode IV - A New Hope (1977) 4.453694 Rear Window (1954) 4.476190 Raiders of the Lost Ark (1981) 4.477725 Schindler's List (1993) 4.510417 Usual Suspects, The (1995) 4.517106 Godfather, The (1972) 4.524966 Shawshank Redemption, The (1994) 4.554558 Name: rating
filtered = data[data.title.isin(highest_rated.index)]
filtered.title = filtered.title.str[:25]
filtered.groupby(['title', 'gender']).rating.count().unstack()
gender F M title Casablanca (1942) 505 1164 Chinatown (1974) 255 930 Citizen Kane (1941) 280 836 Dr. Strangelove or: How I 231 1136 Godfather, The (1972) 483 1740 Godfather: Part II, The ( 342 1350 Life Is Beautiful (La Vit 367 785 Maltese Falcon, The (1941 235 808 Monty Python and the Holy 352 1247 North by Northwest (1959) 332 983 One Flew Over the Cuckoo' 444 1281 Raiders of the Lost Ark ( 572 1942 Rear Window (1954) 291 759 Saving Private Ryan (1998 575 2078 Schindler's List (1993) 615 1689 Shawshank Redemption, The 627 1600 Silence of the Lambs, The 706 1872 Sixth Sense, The (1999) 664 1795 Star Wars: Episode IV - A 647 2344 Usual Suspects, The (1995 413 1370
mean_ratings = data.pivot_table('rating', rows='title',
cols='gender', aggfunc='mean')
mean_ratings.tail(20)
gender F M title Year of the Horse (1997) NaN 3.250000 Yellow Submarine (1968) 3.714286 3.689286 Yojimbo (1961) 4.423077 4.402116 You Can't Take It With You (1938) 4.192308 3.921569 You So Crazy (1994) 3.666667 2.300000 You've Got Mail (1998) 3.542424 3.275591 Young Doctors in Love (1982) 1.923077 2.742424 Young Frankenstein (1974) 4.289963 4.239177 Young Guns (1988) 3.371795 3.425620 Young Guns II (1990) 2.934783 2.904025 Young Poisoner's Handbook, The (1995) 4.000000 3.532258 Young Sherlock Holmes (1985) 3.514706 3.363344 Young and Innocent (1937) 2.500000 3.500000 Your Friends and Neighbors (1998) 2.888889 3.536585 Zachariah (1971) NaN 3.500000 Zed & Two Noughts, A (1985) 3.500000 3.380952 Zero Effect (1998) 3.864407 3.723140 Zero Kelvin (Kj�rlighetens kj�tere) (1995) NaN 3.500000 Zeus and Roxanne (1997) 2.777778 2.357143 eXistenZ (1999) 3.098592 3.289086
summary, value_counts, etc.
data.title.value_counts()
American Beauty (1999) 3428 Star Wars: Episode IV - A New Hope (1977) 2991 Star Wars: Episode V - The Empire Strikes Back (1980) 2990 Star Wars: Episode VI - Return of the Jedi (1983) 2883 Jurassic Park (1993) 2672 Saving Private Ryan (1998) 2653 Terminator 2: Judgment Day (1991) 2649 Matrix, The (1999) 2590 Back to the Future (1985) 2583 Silence of the Lambs, The (1991) 2578 Men in Black (1997) 2538 Raiders of the Lost Ark (1981) 2514 Fargo (1996) 2513 Sixth Sense, The (1999) 2459 Braveheart (1995) 2443 ... Beauty (1998) 1 Legal Deceit (1997) 1 Silence of the Palace, The (Saimt el Qusur) (1994) 1 Relative Fear (1994) 1 For Ever Mozart (1996) 1 White Boys (1999) 1 Terror in a Texas Town (1958) 1 Schlafes Bruder (Brother of Sleep) (1995) 1 Follow the Bitch (1998) 1 Even Dwarfs Started Small (Auch Zwerge haben klein angefangen) (1971) 1 Low Life, The (1994) 1 Wooden Man's Bride, The (Wu Kui) (1994) 1 Back Stage (2000) 1 One Man's Hero (1999) 1 Beloved/Friend (Amigo/Amado) (1999) 1 Length: 3706
data.rating.describe()
count 1000209.000000 mean 3.581564 std 1.117102 min 1.000000 25% 3.000000 50% 4.000000 75% 4.000000 max 5.000000
by_gender = data.groupby('gender').rating.describe()
by_gender
gender F count 246440.000000 mean 3.620366 std 1.111228 min 1.000000 25% 3.000000 50% 4.000000 75% 4.000000 max 5.000000 M count 753769.000000 mean 3.568879 std 1.118724 min 1.000000 25% 3.000000 50% 4.000000 75% 4.000000 max 5.000000
by_gender.unstack(0)
gender F M count 246440.000000 753769.000000 mean 3.620366 3.568879 std 1.111228 1.118724 min 1.000000 1.000000 25% 3.000000 3.000000 50% 4.000000 4.000000 75% 4.000000 4.000000 max 5.000000 5.000000