Required Libraries

  • pandas
  • numpy
  • matplotlib
  • scikit-learn
  • statsmodels

Note: at one point I used all of these, but over time I dropped the ones that turned out to be unhelpful (statsmodels ends up commented out below).

In [8]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
# import statsmodels.formula.api as smf
import time
from itertools import chain
%matplotlib inline

Functions that split the datetime column into date and time and create the submission file

The complete breakdown of what I plan on doing, for my own clarity:

  1. preprocessing of data
    • convert data to a "clean" form
    • split datetime into year, month, day, and hour, and convert them to int
  2. plot a day to get a general idea
    • plot 2 working days and 2 non-working days
  3. run algorithms
    • random forest
    • AdaBoost
    • something else
  4. plot algorithm predictions
    • compared to a nearby date
  5. create submission file
    • strip the original datetime since it's already in submittable format
    • so long as rows keep their dataframe row index this shouldn't be a problem
In [2]:
# Here I have created two functions: one to clean the data and one to create submissions.
# nicedata() is a bit sloppy with column names and getting them into the order I wanted.
def nicedata(data):
    '''
    Change data into a good format. Use for both train and test.
    '''
    # split datetime into a date part and an hour part
    yearmonthday, time = zip(*[item.split(' ') for item in data['datetime']])
    year, month, day = zip(*[item.split('-') for item in yearmonthday])
    data2 = data.drop('datetime', axis=1)
    data2['year'] = year
    data2['month'] = month
    data2['day'] = day
    data2['time'] = time
    # rotate the last four columns (year, month, day, time) to the front
    col = data2.columns.tolist()
    for x in range(4):
        col = col[-1:] + col[:-1]
    data2 = data2[col]
    # strip ':00:00' so 'time' is just the hour, and convert the date parts to int
    data2['time'] = data2['time'].str.replace(':00:00', '').astype(int)
    data2[['year', 'month', 'day']] = data2[['year', 'month', 'day']].astype(int)

    return data2
    
# data = pd.read_csv('data/train.csv')
# data = nicedata(data)
    
def submitdata(pred, name='submission'):
    '''
    Pass the predictions in here. It pulls 'datetime' from the original test
    file rather than recombining year/month/day/time back into a string.
    '''
    # pull the original 'datetime' column from the test file
    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']

    # save to file, timestamped so submissions don't overwrite each other
    submit = pd.concat([keep, pred], axis=1)
    submit.columns = ['datetime', 'count']
    submit['count'] = submit['count'].astype(int)
#     submit.loc[submit['count'] <= 0, 'count'] = 1

    timestr = time.strftime("%m-%d")
    submit.to_csv('data/' + name + timestr + '.csv', index=False)
    

def normalizedata(data):
    '''
    Use if you wish to normalize any of the features to [0, 1].
    '''
    features_to_norm = ['temp', 'atemp', 'humidity', 'windspeed']
    for x in features_to_norm:
        # min-max normalize to [0, 1]; the range could be changed later
        data[x] = (data[x] - min(data[x])) / (max(data[x]) - min(data[x]))
    return data

# suppress the chained-assignment warning that would otherwise appear
pd.options.mode.chained_assignment = None
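
As an aside, the same split can be done more concisely with pandas' datetime accessor. This is just a sketch of the idea and is not what the cells below use (they rely on the string-splitting nicedata() above); note it leaves the new columns at the end instead of rotating them to the front:

def nicedata_dt(data):
    # hypothetical alternative to nicedata(): let pandas parse the timestamps
    stamps = pd.to_datetime(data['datetime'])
    data2 = data.drop('datetime', axis=1)
    data2['year'] = stamps.dt.year
    data2['month'] = stamps.dt.month
    data2['day'] = stamps.dt.day
    data2['time'] = stamps.dt.hour  # hour of day, same meaning as 'time' above
    return data2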

Plot Data

Here we plot the data for two days, one non-working and one working. Days do NOT always have 24 hours of data; sometimes hours are missing. On the lower panel we also overlay the average over January 2011's working days, just to show that there is some similarity (i.e. the first few days are not so different from the month overall).

In [48]:
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
data = nicedata(data)
test2 = nicedata(test)

day_1 = data[0:24]
day_2 = data[24:47]
day_3 = data[47:69]
day_4 = data[69:92]

# re-add (ax2, ax3) to the subplots call and uncomment the lines below to make 4 graphs
f, (ax1, ax4) = plt.subplots(2, sharex=True, sharey=True)
ax1.plot(day_1.time,day_1['count'])
ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(day_2.time,day_2['count'])
# ax3.plot(day_3.time,day_3['count'])
ax4.plot(day_4.time,day_4['count'])
f.subplots_adjust(hspace=0)
f.set_size_inches(12,7)

jan2011 = data[(data['month']==1) & (data['year']==2011) & (data['workingday']==1)]
grouped = jan2011.groupby('time').mean()
plt.plot(grouped['count'], c='r')
# plt.figure()
# below plot if necessary
# plt.plot(data['time'],data['count'].values)
Out[48]:
[<matplotlib.lines.Line2D at 0x10c693bd0>]

As you can see, for working days (the lower panel, plus the red January working-day average) there is a clear spike during the morning and afternoon rushes.

Next I use two different models, SVR and RandomForestRegressor, and then compare their cross-validation scores. Note that you cannot use data from the future (i.e. you can't use data from February 2011 to predict January 2011 dates), so I just iterate over each month of each year and fit a separate model for each. An interesting idea would be to use all previous months to build the model for each month, but it did not work as well as I expected; a sketch of that idea appears after the SVR cell below.

In [96]:
test = pd.read_csv('data/test.csv')
data = pd.read_csv('data/train.csv')
data = nicedata(data)
test= nicedata(test)

test = normalizedata(test)
data = normalizedata(data)


years = [2011, 2012]
months = list(range(1, 13))
pred_months_years = []
for y in years:
    for m in months:
#         print(y, m)
        # fit a separate SVR per (year, month) and predict that month's test rows
        monthSVR = SVR(kernel='rbf', C=1e3, gamma=1)
        muse = data[(data['month']==m) & (data['year']==y)]
        tuse = test[(test['month']==m) & (test['year']==y)]
        monthSVR.fit(muse[['time','workingday','atemp','humidity','windspeed','weather']], muse['count'])
        predmonthSVR = monthSVR.predict(tuse[['time','workingday','atemp','humidity','windspeed','weather']])
        pred_months_years.append(predmonthSVR)

# flatten the per-month prediction arrays into one Series
merged = list(chain(*pred_months_years))
z = pd.Series(merged, name="count").astype(int)
predictionsSVR = pd.DataFrame(z)
# clip any prediction at or below 1 up to 1 (counts must be positive)
predictionsSVR.loc[predictionsSVR['count'] <= 1, 'count'] = 1
# submitdata(predictionsSVR['count'], name='SVRrbf')
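
For reference, here is a minimal sketch of the "all previous months" idea mentioned above: train each month's model on every month up to and including it, so no future data leaks in. This is hypothetical and is not what produced the submissions below; it reuses the same feature list and a random forest for simplicity:

feats = ['time','workingday','atemp','humidity','windspeed','weather']
expanding_preds = []
for y in [2011, 2012]:
    for m in range(1, 13):
        # all training data up to and including this month, never anything later
        hist = data[(data['year'] < y) | ((data['year'] == y) & (data['month'] <= m))]
        tuse = test[(test['month'] == m) & (test['year'] == y)]
        model = RandomForestRegressor(n_estimators=250)
        model.fit(hist[feats], hist['count'])
        expanding_preds.append(model.predict(tuse[feats]))
expanding_series = pd.Series(list(chain(*expanding_preds)))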

Compared to a random forest:

In [58]:
test = pd.read_csv('data/test.csv')
data = pd.read_csv('data/train.csv')
data = nicedata(data)
test= nicedata(test)

test = normalizedata(test)
data = normalizedata(data)

pred_months_years = []
for y in [2011, 2012]:
    for m in range(1, 13):
#         print(y, m)
        # fit a separate random forest per (year, month) and predict that month's test rows
        monthforest = RandomForestRegressor(n_estimators=250)
        muse = data[(data['month']==m) & (data['year']==y)]
        tuse = test[(test['month']==m) & (test['year']==y)]
        monthforest.fit(muse[['time','workingday','atemp','humidity','windspeed','weather']], muse['count'])
        predmonth = monthforest.predict(tuse[['time','workingday','atemp','humidity','windspeed','weather']])
        pred_months_years.append(predmonth)

# flatten the per-month prediction arrays into one Series
merged = list(chain(*pred_months_years))
predictionsRF = pd.Series(merged).astype(int)
submitdata(predictionsRF, name='timeinforest')

Here we plot the random forest and SVR predictions and compare them to a nearby day from the training data. In the future I would like to have it pick a random day from the predictions and then compare it with the nearest training day; a sketch of that is below the plot.

In [99]:
fig = plt.figure()
fig.set_size_inches(10, 7)
rf1 = plt.subplot2grid((3, 4), (0, 0), colspan=2)
svr2 = plt.subplot2grid((3, 4), (0, 2), colspan=2)
ax3 = plt.subplot2grid((3, 4), (1, 0), colspan=4, rowspan=1)

# Jan 19, 2011 from the training data, to compare against the Jan 20 predictions
day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]

rf1.plot(test.time[0:24], predictionsRF[0:24])
svr2.plot(test.time[0:24], predictionsSVR['count'][0:24])
ax3.plot(day_19.time, day_19['count'])

ax3.set_xlabel('Time')
ax3.set_ylabel('Count')

ax3.set_title('Day 19')
rf1.set_title('Day 20 Random Forest')
svr2.set_title('Day 20 SVR')



plt.tight_layout()
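
Below is a rough sketch of that "random day vs. nearest training day" idea. It is hypothetical (the helper name is made up) and assumes the usual split for this dataset, where the training set covers days 1-19 of each month and the test set covers day 20 onward:

import random

def compare_random_day(pred, test_df, train_df):
    # pick a random (year, month, day) that appears in the test set
    days = test_df[['year', 'month', 'day']].drop_duplicates()
    y, m, d = days.iloc[random.randrange(len(days))]
    # predicted counts for that day (pred is aligned row-for-row with test_df)
    mask = (test_df['year']==y) & (test_df['month']==m) & (test_df['day']==d)
    # nearest earlier day in train is the largest training day in that month
    train_month = train_df[(train_df['year']==y) & (train_df['month']==m)]
    nearest = train_month[train_month['day'] == train_month['day'].max()]
    f, (a1, a2) = plt.subplots(2, sharex=True, sharey=True)
    a1.plot(test_df.time[mask], pred[mask.values])
    a1.set_title('predicted %d-%02d-%02d' % (y, m, d))
    a2.plot(nearest.time, nearest['count'])
    a2.set_title('nearest training day (day %d)' % nearest['day'].iloc[0])

# compare_random_day(predictionsRF, test, data)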

Cross-validation. Note that since monthforest, monthSVR, and muse are simply left over from the loops above, these scores are for the last month fitted (December 2012) only:

In [95]:
scoreMF = cross_val_score(monthforest, muse[['time','workingday','atemp','humidity','windspeed','weather']],
                         muse['count'], cv=5)
scoreSVR = cross_val_score(monthSVR, muse[['time','workingday','atemp','humidity','windspeed','weather']],
                         muse['count'], cv=5)

print("Accuracy of Random Forest Regressor: %0.2f (+/- %0.2f)" % (scoreMF.mean(), scoreMF.std() * 2))
print("Accuracy of SVR: %0.2f (+/- %0.2f)" % (scoreSVR.mean(), scoreSVR.std() * 2))
Accuracy of Random Forest Regressor: 0.87 (+/- 0.09)
Accuracy of SVR: 0.81 (+/- 0.10)
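
The scores above only cover that one month. A sketch of averaging 5-fold scores over every (year, month) group might look like this (hypothetical, reusing the same feature list and model settings as above):

feats = ['time','workingday','atemp','humidity','windspeed','weather']
rf_scores, svr_scores = [], []
for (y, m), grp in data.groupby(['year', 'month']):
    rf_scores.append(cross_val_score(RandomForestRegressor(n_estimators=250),
                                     grp[feats], grp['count'], cv=5).mean())
    svr_scores.append(cross_val_score(SVR(kernel='rbf', C=1e3, gamma=1),
                                      grp[feats], grp['count'], cv=5).mean())
print("Random Forest, mean R^2 over all months: %0.2f" % np.mean(rf_scores))
print("SVR, mean R^2 over all months: %0.2f" % np.mean(svr_scores))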
In [94]:
# from sklearn import tree
# dt = tree.ExtraTreeRegressor()
# #set target, train and test. train and test must have same number of features
# df = data
# target = df['count']
# train  = df[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# test   = test2[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# dt.fit(train,target)


# predicted_probs = dt.predict(test)
# predicted_probs = pd.Series(predicted_probs)
# predicted_probs = predicted_probs.map(lambda x: int(x))

# keep = pd.read_csv('data/test.csv')
# keep = keep['datetime']
# #save to file
# submit = pd.concat([keep,predicted_probs],axis=1)
# # print(forest.feature_importances_)
# submit.columns=['datetime','count']
# submit.to_csv('data/submissiondtree.csv',index=False)

# plt.figure()
# # pl.scatter(tr, y, c="k", label="data")
# plt.plot(train['time'], target, c="g", label="train", linewidth=2)
# # plt.plot(test['time'], predicted_probs, c="r", label="test", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Decision Tree Regression")
# plt.legend()
# plt.show()

#removed plot for SVM

# day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],subsub[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)



#removed plot for random forest

# day_19rf = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],z[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)
# submitdata(z,name='timeinforest')