"""Bike-sharing demand: hourly exploration plus per-month SVR / random-forest models.

Reads ``data/train.csv`` and ``data/test.csv``, draws a few exploratory
plots, fits one model per (year, month) for two regressors, writes the
random-forest predictions as a submission file and prints cross-validation
scores for the last fitted month.
"""

import time
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

try:
    from pandas.plotting import scatter_matrix
except ImportError:  # older pandas kept it under pandas.tools
    from pandas.tools.plotting import scatter_matrix

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn kept it under cross_validation
    from sklearn.cross_validation import cross_val_score

# import statsmodels.formula.api as smf

try:
    # Equivalent of the notebook magic ``%matplotlib inline``; only
    # available when running under IPython/Jupyter.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# Two helpers deal with the data layout and with writing submissions.
def nicedata(data):
    '''
    Split the 'datetime' column into integer year/month/day/time columns.

    Use for both train and test frames. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'datetime' column of 'YYYY-MM-DD HH:MM:SS' strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without 'datetime' and with the integer columns
        'year', 'month', 'day', 'time' (hour of day) placed first, in that
        order, followed by the remaining original columns.
    '''
    stamps = pd.to_datetime(data['datetime'])
    data2 = data.drop('datetime', axis=1)
    fields = (
        ('year', stamps.dt.year),
        ('month', stamps.dt.month),
        ('day', stamps.dt.day),
        ('time', stamps.dt.hour),
    )
    # Insert at positions 0..3 so the date parts lead the column order.
    for position, (name, values) in enumerate(fields):
        data2.insert(position, name, values.astype(int))
    return data2


# data = pd.read_csv('data/train.csv')
# data = nicedata(data)
def submitdata(pred, name='submission'):
    '''
    Write predictions next to the original test timestamps as a CSV file.

    The 'datetime' column is pulled from 'data/test.csv' rather than being
    recombined from year/month/day/time. ``pred`` is joined to it on the
    index, cast to int, and the result is saved as
    'data/<name><MM-DD>.csv' where MM-DD is today's date.

    Parameters
    ----------
    pred : pandas.Series or single-column pandas.DataFrame
        Predicted counts, indexed like the rows of 'data/test.csv'.
    name : str
        Prefix of the output file name.
    '''
    # extract the original 'datetime' feature
    keep = pd.read_csv('data/test.csv', usecols=['datetime'])['datetime']
    # save to file
    submit = pd.concat([keep, pred], axis=1)
    submit.columns = ['datetime', 'count']
    submit['count'] = submit['count'].astype(int)
    # submit.ix[submit['count'] <= 0, :] = 1
    timestr = time.strftime("%m-%d")
    submit.to_csv('data/' + name + timestr + '.csv', index=False)


def normalizedata(data, reference=None):
    '''
    Min-max scale the weather features of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding 'temp', 'atemp', 'humidity' and 'windspeed'; these
        columns are overwritten.
    reference : pandas.DataFrame, optional
        Frame whose per-column min/max define the scale. Defaults to
        ``data`` itself, which maps every column onto [0, 1]. Pass the raw
        training frame when scaling a test frame so both share one scale.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    '''
    features_to_norm = ['temp', 'atemp', 'humidity', 'windspeed']
    source = data if reference is None else reference
    for feature in features_to_norm:
        low = source[feature].min()
        span = source[feature].max() - low
        if span == 0:
            # Constant column: avoid 0/0 and map everything to 0.
            data[feature] = 0.0
        else:
            data[feature] = (data[feature] - low) / span
    return data


FEATURES = ['time', 'workingday', 'atemp', 'humidity', 'windspeed', 'weather']


def _predict_by_month(make_model, train, test, years, months):
    '''
    Fit a fresh model per (year, month) and predict the matching test rows.

    Returns ``(predictions, last_model, last_train)`` where ``predictions``
    is a Series indexed like ``test`` (sorted by index) and the other two
    are the estimator and training subset of the last month that was fitted.
    Months without training or test rows are skipped.
    '''
    parts = []
    last_model = None
    last_train = None
    for year in years:
        for month in months:
            month_train = train[(train['month'] == month) & (train['year'] == year)]
            month_test = test[(test['month'] == month) & (test['year'] == year)]
            if month_train.empty or month_test.empty:
                continue
            model = make_model()
            model.fit(month_train[FEATURES], month_train['count'])
            parts.append(
                pd.Series(model.predict(month_test[FEATURES]), index=month_test.index)
            )
            last_model, last_train = model, month_train
    if not parts:
        raise ValueError('no (year, month) has both training and test rows')
    return pd.concat(parts).sort_index(), last_model, last_train


# this removes the chained-assignment warning that would otherwise come about
pd.options.mode.chained_assignment = None

# ---------------------------------------------------------------------------
# Exploration: hourly counts for the first days and the January 2011 profile
# ---------------------------------------------------------------------------
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
data = nicedata(data)
test2 = nicedata(test)

# Positional slices of the first four days of the training data.
day_1 = data[0:24]
day_2 = data[24:47]
day_3 = data[47:69]
day_4 = data[69:92]

# re-add (ax2, ax3) to the command below and uncomment the plots to make 4 graphs
f, (ax1, ax4) = plt.subplots(2, sharex=True, sharey=True)
ax1.plot(day_1.time, day_1['count'])
ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(day_2.time,day_2['count'])
# ax3.plot(day_3.time,day_3['count'])
ax4.plot(day_4.time, day_4['count'])
f.subplots_adjust(hspace=0)
f.set_size_inches(12, 7)

# Each comparison is parenthesised: '&' binds tighter than '=='.
jan2011 = data[
    (data['month'] == 1) & (data['year'] == 2011) & (data['workingday'] == 1)
]
grouped = jan2011.groupby('time').mean()
# NOTE(review): this draws on the current axes; outside a notebook that is
# the last subplot above -- confirm whether a new figure was intended.
plt.plot(grouped['count'], c='r')
# plt.figure()
# below plot if necessary
# plt.plot(data['time'],data['count'].values)

# ---------------------------------------------------------------------------
# Modelling: one SVR and one random forest per (year, month)
# ---------------------------------------------------------------------------
test = pd.read_csv('data/test.csv')
data = pd.read_csv('data/train.csv')
data = nicedata(data)
test = nicedata(test)
# Scale the test features with the *training* min/max (and do so before the
# training frame is overwritten) so both frames share the same scale.
test = normalizedata(test, reference=data)
data = normalizedata(data)

years = [2011, 2012]
months = list(range(1, 13, 1))
hours = list(range(0, 24, 1))

z, monthSVR, muse = _predict_by_month(
    lambda: SVR(kernel='rbf', C=1e3, gamma=1), data, test, years, months
)
# Integer counts, floored at 1.
z = z.astype(int).clip(lower=1)
z.name = "count"
predictionsSVR = pd.DataFrame(z)
# submitdata(subsub,name='SVRrbf')

predictionsRF, monthforest, muse = _predict_by_month(
    lambda: RandomForestRegressor(n_estimators=250), data, test, years, months
)
predictionsRF = predictionsRF.astype(int)
submitdata(predictionsRF, name='timeinforest')

# ---------------------------------------------------------------------------
# Compare the first predicted day of both models with the last training day
# ---------------------------------------------------------------------------
day_19 = data[(data['month'] == 1) & (data['year'] == 2011) & (data['day'] == 19)]

fig = plt.figure()
fig.set_size_inches(10, 7)
rf1 = plt.subplot2grid((3, 4), (0, 0), colspan=2)
svr2 = plt.subplot2grid((3, 4), (0, 2), colspan=2)
ax3 = plt.subplot2grid((3, 4), (1, 0), colspan=4, rowspan=1)
rf1.plot(test.time[0:24], predictionsRF[0:24])
svr2.plot(test.time[0:24], predictionsSVR['count'][0:24])
# example_plot(ax3)
ax3.plot(day_19.time, day_19['count'])
ax3.set_xlabel('Time')
ax3.set_ylabel('Count')
ax3.set_title('Day 19')
rf1.set_title('Day 20 Random Forest')
svr2.set_title('Day 20 SVR')
plt.tight_layout()

# Cross-validate both estimators on the last fitted month. For regressors
# cross_val_score reports the estimator's default score (R^2), not accuracy.
scoreMF = cross_val_score(monthforest, muse[FEATURES], muse['count'], cv=5)
scoreSVR = cross_val_score(monthSVR, muse[FEATURES], muse['count'], cv=5)
print("Accuracy of Random Forest Regressor: %0.2f (+/- %0.2f)" % (scoreMF.mean(), scoreMF.std() * 2))
print("Accuracy of SVR: %0.2f (+/- %0.2f)" % (scoreSVR.mean(), scoreSVR.std() * 2))

# ---------------------------------------------------------------------------
# Archived experiments (not executed)
# ---------------------------------------------------------------------------
# from sklearn import tree
# dt = tree.ExtraTreeRegressor()
# #set target, train and test. train and test must have same number of features
# df = data
# target = df['count']
# train = df[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# test = test2[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# dt.fit(train,target)
# predicted_probs = dt.predict(test)
# predicted_probs = pd.Series(predicted_probs)
# predicted_probs = predicted_probs.map(lambda x: int(x))
# keep = pd.read_csv('data/test.csv')
# keep = keep['datetime']
# #save to file
# submit = pd.concat([keep,predicted_probs],axis=1)
# # print(forest.feature_importances_)
# submit.columns=['datetime','count']
# submit.to_csv('data/submissiondtree.csv',index=False)
# plt.figure()
# # pl.scatter(tr, y, c="k", label="data")
# plt.plot(train['time'], target, c="g", label="train", linewidth=2)
# # plt.plot(test['time'], predicted_probs, c="r", label="test", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Decision Tree Regression")
# plt.legend()
# plt.show()

# removed plot for SVM
# day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],subsub[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)

# removed plot for random forest
# day_19rf = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],z[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)
# submitdata(z,name='timeinforest')