import pandas as pd
import pandas_datareader.data as web
import datetime
import numpy as np
from talib.abstract import *
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode()
start = datetime.datetime(1998, 1, 1)
end = datetime.datetime(2016, 6, 30)
top_500 = ['AAPL', 'MSFT', 'XOM', 'JNJ', 'GE', 'BRK-B', 'T', 'VZ']
f = web.DataReader(top_500, 'yahoo',start,end)
cleanData = f['Adj Close']  # select the adjusted closes (.ix is gone from modern pandas)
stock_data = pd.DataFrame(cleanData)
stock_data.iplot(dimensions=(950,400), yTitle='Daily Price ($)')
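The Yahoo endpoint behind `pandas_datareader` has been flaky since this notebook was written. If `web.DataReader(..., 'yahoo', ...)` fails, the download step can be swapped for the `yfinance` package; this is only a hedged fallback sketch (it assumes `yfinance` is installed, and the `raw` variable is my own name), the rest of the notebook is unchanged:

# Hypothetical fallback: fetch the same tickers with yfinance if the Yahoo reader fails.
import yfinance as yf

raw = yf.download(top_500, start=start, end=end, auto_adjust=False)
stock_data = raw['Adj Close']  # one column of adjusted closes per ticker
stock_data.iplot(dimensions=(950, 400), yTitle='Daily Price ($)')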
stocks = {}
for i in top_500:
    stocks[i] = web.DataReader(i, 'yahoo', start, end)

# lowercase the column names and cast volume to float so TA-Lib's abstract
# functions can find the 'open'/'high'/'low'/'close'/'volume' inputs they expect
for i in stocks:
    stocks[i].columns = [s.lower() for s in stocks[i].columns]
    stocks[i].volume = stocks[i].volume.astype(float)
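The renaming matters because TA-Lib's abstract API looks up lowercase OHLCV columns by name in whatever dict-like input it receives. A quick sanity check, assuming the loop above has run (the indicator functions come from the `from talib.abstract import *` at the top):

# Abstract-API functions accept the whole lowercase OHLCV DataFrame directly.
SMA(stocks['AAPL'], timeperiod=5).tail()                           # 5-day simple moving average
STOCHF(stocks['AAPL'], fastk_period=14, fastd_period=3).tail()     # returns fastk/fastd columns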
def get_indicators(stocks, period):
    """Build a table of TA-Lib indicators for each stock and label every row
    '1'/'0' by whether the price rises over the following `period` days."""
    stocks_indicators = {}
    for i in stocks:
        features = pd.DataFrame(SMA(stocks[i], timeperiod=5))
        features.columns = ['sma_5']
        features['sma_10'] = pd.DataFrame(SMA(stocks[i], timeperiod=10))
        features['mom_10'] = pd.DataFrame(MOM(stocks[i], 10))
        features['wma_10'] = pd.DataFrame(WMA(stocks[i], 10))
        features['wma_5'] = pd.DataFrame(WMA(stocks[i], 5))
        # STOCHF returns two columns at once (fastk and fastd)
        features = pd.concat([features, STOCHF(stocks[i],
                                               fastk_period=14,
                                               fastd_period=3)],
                             axis=1)
        features['macd'] = pd.DataFrame(MACD(stocks[i], fastperiod=12, slowperiod=26)['macd'])
        features['rsi'] = pd.DataFrame(RSI(stocks[i], timeperiod=14))
        features['willr'] = pd.DataFrame(WILLR(stocks[i], timeperiod=14))
        features['cci'] = pd.DataFrame(CCI(stocks[i], timeperiod=14))
        features['adosc'] = pd.DataFrame(ADOSC(stocks[i], fastperiod=3, slowperiod=10))
        # Target: % change over the next `period` days, shifted back onto the row
        # it is predicted from, then binarised ('1' = up, '0' = down or flat)
        features['pct_change'] = ROC(stocks[i], timeperiod=period)
        features['pct_change'] = features['pct_change'].shift(-period)
        features['pct_change'] = features['pct_change'].apply(
            lambda x: '1' if x > 0 else '0' if x <= 0 else np.nan)
        features = features.dropna()
        # Trim every stock to the same date range
        features = features.iloc[np.where(features.index == '1998-5-5')[0][0]:
                                 np.where(features.index == '2015-5-5')[0][0]]
        stocks_indicators[i] = features
    return stocks_indicators
stocks_indicators = get_indicators(stocks, 1)
stocks_indicators['AAPL'].head()
| Date | sma_5 | sma_10 | mom_10 | wma_10 | wma_5 | fastk | fastd | macd | rsi | willr | cci | adosc | pct_change |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1998-05-05 | 28.2250 | 27.89375 | 0.687499 | 28.153409 | 28.695833 | 94.827559 | 76.670924 | 0.681958 | 65.507369 | -5.172441 | 149.900758 | 12066108.525685 | 1 |
| 1998-05-06 | 28.8875 | 28.17500 | 2.812500 | 28.593182 | 29.391667 | 97.014949 | 91.725290 | 0.810651 | 68.116574 | -2.985051 | 159.423513 | 71289716.765831 | 0 |
| 1998-05-07 | 29.4500 | 28.42500 | 2.500000 | 28.959091 | 29.825000 | 90.000000 | 93.947502 | 0.892269 | 67.024556 | -10.000000 | 137.254904 | 81706570.127784 | 1 |
| 1998-05-08 | 29.9375 | 28.67500 | 2.499999 | 29.325000 | 30.154166 | 95.714263 | 94.243070 | 0.965989 | 68.125177 | -4.285737 | 116.666674 | 95295085.325839 | 1 |
| 1998-05-11 | 30.3125 | 28.99375 | 3.187501 | 29.736364 | 30.487500 | 87.209302 | 90.974522 | 1.052625 | 70.262954 | -12.790698 | 134.908507 | 61962542.877240 | 0 |
len(stocks_indicators['AAPL'])
4277
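The pct_change column in the table above is the target, not a feature: ROC computes the percentage change over `period` days, the shift(-period) aligns that future return with the row it is predicted from, and the final apply turns it into a '1'/'0' label. The same steps on a made-up price series (values are purely illustrative):

# Toy illustration of the labelling logic in get_indicators (hypothetical prices).
prices = pd.Series([10.0, 10.5, 10.2, 10.8, 10.6], name='close')
period = 1
future_return = prices.pct_change(periods=period).shift(-period) * 100   # same idea as ROC + shift
label = future_return.apply(lambda x: '1' if x > 0 else '0' if x <= 0 else np.nan)
print(pd.concat([prices, future_return.rename('next_day_roc'), label.rename('label')], axis=1))
# The last row has no future price, so its label is NaN and is removed by dropna().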
def weights_table(stocks, period):
    """Percentage of '1' (price-increase) labels per stock for every horizon
    from 1 to `period` days ahead."""
    table = pd.DataFrame()
    for j in stocks:
        weights = []
        for i in range(1, period + 1):
            stocks_indicators = get_indicators(stocks, i)
            weights.append((len(stocks_indicators[j][stocks_indicators[j]['pct_change'] == '1']) /
                            float(len(stocks_indicators[j]))) * 100)
        table = pd.concat([table, pd.DataFrame(weights)], axis=1)
    table.index = range(1, period + 1)
    table.columns = list(stocks.keys())
    return table
table = weights_table(stocks, 20)
table.iplot(kind='bar', subplots=True, dimensions=(950,500), title='Percentage of the Increase Data Points')
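These percentages double as a baseline: always predicting the majority class would already score the larger of the two, so the random forest is only interesting where it beats that number. A quick way to read the baseline off one row of the table (using `table` from weights_table above; the 20-day row is just an example):

# Majority-class baseline per stock at the 20-day horizon: the accuracy a
# constant "always up" / "always down" guess would achieve.
increase_pct = table.loc[20]                                   # % of '1' labels per stock
baseline = pd.concat([increase_pct, 100 - increase_pct], axis=1).max(axis=1)
print(baseline.sort_values(ascending=False))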
def avg_score(x_train, y_train, x_test, y_test, trees):
    """Average accuracy and F1 over five random-forest fits, to smooth out the
    randomness of bootstrapping and feature sub-sampling."""
    accuracy = []
    f1 = []
    rf_model = RandomForestClassifier(n_estimators=trees)
    for i in range(5):
        rf_model.fit(x_train, y_train)
        accuracy.append(rf_model.score(x_test, y_test))
        f1.append(f1_score(y_test, rf_model.predict(x_test), pos_label='1'))
    avg_accuracy = sum(accuracy) / len(accuracy)
    avg_f1 = sum(f1) / len(f1)
    return avg_accuracy, avg_f1
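A single call makes the interface concrete: split one stock's indicator table, keep the last column as the label, and pass the rest as features. A minimal sketch, assuming `stocks_indicators` was built with get_indicators(stocks, 1) as above:

# One-off run for AAPL at the 1-day horizon with 300 trees.
train, test = train_test_split(stocks_indicators['AAPL'])
acc, f1 = avg_score(train.iloc[:, :-1], train.iloc[:, -1],
                    test.iloc[:, :-1], test.iloc[:, -1], trees=300)
print('accuracy: %.3f, F1: %.3f' % (acc, f1))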
def accuracy(stocks, trees, period):
    """Average accuracy and F1 score for every stock at every prediction
    horizon from 1 to `period` days ahead."""
    table_accuracy = pd.DataFrame()
    table_f1 = pd.DataFrame()
    for j in stocks:
        accuracy_values = []
        f1_values = []
        for i in range(1, period + 1):
            stocks_indicators = get_indicators(stocks, i)
            # last column is the label, everything before it is a feature
            train, test = train_test_split(stocks_indicators[j])
            acc, f1 = avg_score(train.iloc[:, :-1], train.iloc[:, -1],
                                test.iloc[:, :-1], test.iloc[:, -1], trees)
            accuracy_values.append(acc)
            f1_values.append(f1)
        table_accuracy = pd.concat([table_accuracy, pd.DataFrame({j: accuracy_values})], axis=1)
        table_f1 = pd.concat([table_f1, pd.DataFrame({j: f1_values})], axis=1)
    table_accuracy.index = range(1, period + 1)
    table_f1.index = range(1, period + 1)
    return table_accuracy, table_f1
accuracy_table, f1_table = accuracy(stocks, 300, 20)
accuracy_table.iplot(dimensions=(950,400), xTitle='Days Ahead', yTitle='Average Score', title='Accuracy scores')
f1_table.iplot(dimensions=(950,400), xTitle='Days Ahead', yTitle='Average Score', title='F1 scores')
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]
accuracy_table.style.apply(highlight_max, axis=0)
| Days ahead | AAPL | GE | T | JNJ | VZ | XOM | MSFT | BRK-B |
|---|---|---|---|---|---|---|---|---|
| 1.0 | 0.519252 | 0.525421 | 0.517944 | 0.485047 | 0.494393 | 0.508785 | 0.494579 | 0.521308 |
| 2.0 | 0.612897 | 0.626168 | 0.611589 | 0.601869 | 0.594019 | 0.581495 | 0.606916 | 0.609346 |
| 3.0 | 0.652897 | 0.645047 | 0.643738 | 0.619626 | 0.627103 | 0.631589 | 0.655327 | 0.646542 |
| 4.0 | 0.70729 | 0.680374 | 0.674579 | 0.666542 | 0.673271 | 0.649159 | 0.672897 | 0.708972 |
| 5.0 | 0.723738 | 0.695514 | 0.674579 | 0.694393 | 0.687664 | 0.692897 | 0.722056 | 0.705607 |
| 6.0 | 0.762243 | 0.717757 | 0.708224 | 0.710093 | 0.686168 | 0.683738 | 0.710654 | 0.714019 |
| 7.0 | 0.756262 | 0.743551 | 0.680561 | 0.709533 | 0.714766 | 0.693271 | 0.733458 | 0.735701 |
| 8.0 | 0.773832 | 0.729346 | 0.689159 | 0.702243 | 0.728037 | 0.73271 | 0.74972 | 0.729159 |
| 9.0 | 0.791963 | 0.750841 | 0.696636 | 0.74785 | 0.72785 | 0.728785 | 0.776262 | 0.726729 |
| 10.0 | 0.792523 | 0.783925 | 0.722804 | 0.746168 | 0.768598 | 0.714953 | 0.786168 | 0.74 |
| 11.0 | 0.791776 | 0.765794 | 0.731589 | 0.763925 | 0.741495 | 0.733645 | 0.756262 | 0.76243 |
| 12.0 | 0.808037 | 0.78243 | 0.731589 | 0.759252 | 0.771963 | 0.730093 | 0.775514 | 0.742056 |
| 13.0 | 0.798692 | 0.79271 | 0.75271 | 0.745421 | 0.770654 | 0.743178 | 0.785794 | 0.755701 |
| 14.0 | 0.822243 | 0.785607 | 0.755701 | 0.739065 | 0.764673 | 0.742991 | 0.796075 | 0.74785 |
| 15.0 | 0.800374 | 0.786168 | 0.731963 | 0.754766 | 0.756262 | 0.766168 | 0.786542 | 0.782991 |
| 16.0 | 0.822056 | 0.809533 | 0.74729 | 0.753271 | 0.780561 | 0.763551 | 0.796262 | 0.771963 |
| 17.0 | 0.82 | 0.814206 | 0.733458 | 0.759439 | 0.782056 | 0.771776 | 0.807477 | 0.775888 |
| 18.0 | 0.845981 | 0.792897 | 0.730467 | 0.77514 | 0.774953 | 0.773458 | 0.771963 | 0.793832 |
| 19.0 | 0.815327 | 0.817383 | 0.758318 | 0.781495 | 0.776822 | 0.777944 | 0.794579 | 0.808224 |
| 20.0 | 0.830654 | 0.803925 | 0.76243 | 0.777944 | 0.778318 | 0.755701 | 0.825234 | 0.828972 |
f1_table.style.apply(highlight_max, axis=0)
| Days ahead | AAPL | GE | T | JNJ | VZ | XOM | MSFT | BRK-B |
|---|---|---|---|---|---|---|---|---|
| 1.0 | 0.55655 | 0.499288 | 0.511822 | 0.493808 | 0.496157 | 0.521275 | 0.482192 | 0.457299 |
| 2.0 | 0.653846 | 0.621222 | 0.621617 | 0.607392 | 0.59568 | 0.600957 | 0.601147 | 0.594455 |
| 3.0 | 0.687908 | 0.652629 | 0.648996 | 0.658133 | 0.623337 | 0.666124 | 0.655587 | 0.642572 |
| 4.0 | 0.744953 | 0.677195 | 0.680722 | 0.701584 | 0.69286 | 0.679973 | 0.681106 | 0.711727 |
| 5.0 | 0.76905 | 0.692699 | 0.665386 | 0.714906 | 0.700688 | 0.728731 | 0.727292 | 0.7141 |
| 6.0 | 0.802115 | 0.710061 | 0.70913 | 0.724623 | 0.705367 | 0.720978 | 0.72058 | 0.728237 |
| 7.0 | 0.793349 | 0.739073 | 0.670981 | 0.741261 | 0.713483 | 0.727011 | 0.742396 | 0.750602 |
| 8.0 | 0.819336 | 0.73535 | 0.684991 | 0.729119 | 0.728154 | 0.758769 | 0.770754 | 0.736687 |
| 9.0 | 0.834055 | 0.753007 | 0.698364 | 0.777638 | 0.724964 | 0.766157 | 0.784511 | 0.736289 |
| 10.0 | 0.824203 | 0.789509 | 0.725813 | 0.780898 | 0.763915 | 0.743292 | 0.801674 | 0.765148 |
| 11.0 | 0.830385 | 0.766178 | 0.74261 | 0.786836 | 0.736925 | 0.769218 | 0.778984 | 0.781447 |
| 12.0 | 0.843659 | 0.789659 | 0.738803 | 0.781764 | 0.783221 | 0.765203 | 0.786406 | 0.764902 |
| 13.0 | 0.839664 | 0.803679 | 0.758625 | 0.772321 | 0.773991 | 0.769918 | 0.807715 | 0.783345 |
| 14.0 | 0.856713 | 0.787158 | 0.766072 | 0.767028 | 0.772862 | 0.776607 | 0.808835 | 0.763452 |
| 15.0 | 0.843354 | 0.792747 | 0.736083 | 0.779941 | 0.766301 | 0.798573 | 0.802415 | 0.797844 |
| 16.0 | 0.855012 | 0.816624 | 0.754357 | 0.789945 | 0.782349 | 0.794004 | 0.809895 | 0.784531 |
| 17.0 | 0.858311 | 0.826139 | 0.747061 | 0.797161 | 0.789372 | 0.795992 | 0.821733 | 0.806891 |
| 18.0 | 0.879672 | 0.800857 | 0.746119 | 0.806321 | 0.779313 | 0.801505 | 0.783988 | 0.818966 |
| 19.0 | 0.853244 | 0.834159 | 0.764946 | 0.805528 | 0.780322 | 0.800532 | 0.810352 | 0.830749 |
| 20.0 | 0.86735 | 0.816957 | 0.759221 | 0.805166 | 0.784369 | 0.784572 | 0.836171 | 0.845565 |
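The yellow highlighting marks each stock's best horizon; the same information can be pulled out programmatically with idxmax, which returns the number of days ahead at which each stock scored highest:

# Horizon (days ahead) with the highest average score for each stock.
print(accuracy_table.idxmax())
print(f1_table.idxmax())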