import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
# Load the S&P 500 price history from disk and parse the Date column.
data = pd.read_csv('sphist.csv')
data['Date'] = pd.to_datetime(data['Date'])
# Flag rows dated after 2015-04-01, then order the rows oldest-first.
data = (
    data
    .assign(Recent=data['Date'] > datetime(2015, 4, 1))
    .sort_values('Date')
)
# Stock market rows are not independent observations: each one comes some time
# after the previous row. Features for a given day must therefore be built
# from earlier rows only, so no future knowledge leaks into a prediction.
data.head(10)
Date | Open | High | Low | Close | Volume | Adj Close | Recent | |
---|---|---|---|---|---|---|---|---|
16589 | 1950-01-03 | 16.660000 | 16.660000 | 16.660000 | 16.660000 | 1260000.0 | 16.660000 | False |
16588 | 1950-01-04 | 16.850000 | 16.850000 | 16.850000 | 16.850000 | 1890000.0 | 16.850000 | False |
16587 | 1950-01-05 | 16.930000 | 16.930000 | 16.930000 | 16.930000 | 2550000.0 | 16.930000 | False |
16586 | 1950-01-06 | 16.980000 | 16.980000 | 16.980000 | 16.980000 | 2010000.0 | 16.980000 | False |
16585 | 1950-01-09 | 17.080000 | 17.080000 | 17.080000 | 17.080000 | 2520000.0 | 17.080000 | False |
16584 | 1950-01-10 | 17.030001 | 17.030001 | 17.030001 | 17.030001 | 2160000.0 | 17.030001 | False |
16583 | 1950-01-11 | 17.090000 | 17.090000 | 17.090000 | 17.090000 | 2630000.0 | 17.090000 | False |
16582 | 1950-01-12 | 16.760000 | 16.760000 | 16.760000 | 16.760000 | 2970000.0 | 16.760000 | False |
16581 | 1950-01-13 | 16.670000 | 16.670000 | 16.670000 | 16.670000 | 3330000.0 | 16.670000 | False |
16580 | 1950-01-16 | 16.719999 | 16.719999 | 16.719999 | 16.719999 | 1460000.0 | 16.719999 | False |
# Rolling indicators. Every series is shifted down by one row so that the
# value stored on a given day is computed from the preceding days only.
closing = data['Close']
for window in (5, 30):
    rolled = closing.rolling(window)
    data['{}_day'.format(window)] = rolled.mean().shift(1)
    data['{}_day_std'.format(window)] = rolled.std().shift(1)
data['5_day_avg_vol'] = data['Volume'].rolling(5).mean().shift(1)

# Keep only rows dated after 1951-01-02. This drops the earliest part of the
# history, including the leading rows where the rolling windows are not yet
# filled and the indicators are NaN.
df = data[data['Date'] > datetime(1951, 1, 2)]
df.head(2)
Date | Open | High | Low | Close | Volume | Adj Close | Recent | 5_day | 30_day | 5_day_std | 30_day_std | 5_day_avg_vol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16339 | 1951-01-03 | 20.690001 | 20.690001 | 20.690001 | 20.690001 | 3370000.0 | 20.690001 | False | 20.360 | 19.815000 | 0.304385 | 0.40385 | 3126000.0 |
16338 | 1951-01-04 | 20.870001 | 20.870001 | 20.870001 | 20.870001 | 3390000.0 | 20.870001 | False | 20.514 | 19.842666 | 0.204524 | 0.43432 | 3268000.0 |
# Chronological split: rows before 2013-01-01 train the model, rows from
# 2013-01-01 onward are held out for testing. The test bound is inclusive so
# that the two frames together cover every row of df (with strict < and > on
# both sides, a row dated exactly on the cutoff would land in neither frame).
split_date = datetime(year=2013, month=1, day=1)
train = df[df['Date'] < split_date]
test = df[df['Date'] >= split_date]
# Fit a linear model on the training rows and report its error on both splits.
def prediction_func(cols):
    """Fit a linear regression on ``train`` and print train/test MAE.

    Uses the module-level ``train`` and ``test`` DataFrames: the model is
    fitted on ``train[cols]`` against ``train['Close']``, then used to predict
    ``Close`` for both ``train`` and ``test``. The mean absolute error of each
    set of predictions is printed.

    Args:
        cols: List of column names to use as model features. Every name must
            be a column of both ``train`` and ``test``.

    Returns:
        None. The two error values are only printed.
    """
    features = train[cols]
    # Select the target as a 1-D Series, the same way the test target is
    # selected below, so predictions and targets have matching shapes.
    target = train['Close']
    reg = LinearRegression()
    reg.fit(features, target)
    train_predictions = reg.predict(features)
    test_predictions = reg.predict(test[cols])
    train_mae = mean_absolute_error(target, train_predictions)
    test_mae = mean_absolute_error(test['Close'], test_predictions)
    print('Training dataset mean absolute error = {} \nTest dataset mean absolute error = {}'.format(train_mae, test_mae))
# Model 1: features are the 5-day average, 30-day average, 5-day standard
# deviation and 30-day standard deviation.
cols = ['5_day', '30_day', '5_day_std', '30_day_std']
prediction_func(cols)
Training dataset mean absolute error = 4.951234200140633 Test dataset mean absolute error = 16.238284175900816
# Model 2: features are the 5-day average, 30-day average and 5-day standard
# deviation (the 30-day standard deviation is left out).
cols = ['5_day', '30_day', '5_day_std']
prediction_func(cols)
Training dataset mean absolute error = 4.9486853679305955 Test dataset mean absolute error = 16.14272308471701
# Model 3: features are the 5-day average, 5-day standard deviation and
# 5-day average volume.
cols = ['5_day', '5_day_std', '5_day_avg_vol' ]
prediction_func(cols)
Training dataset mean absolute error = 4.993960591777934 Test dataset mean absolute error = 16.137448310877698