import pandas as pd
from pandas.tools.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import cross_val_score
# import statsmodels.formula.api as smf
import time
from itertools import chain
%matplotlib inline

# Helper functions for cleaning up the data and for creating submissions.
# nicedata() is a bit sloppy about column names and about getting the columns into the order I wanted.
def nicedata(data):
    '''
    Split the 'datetime' column into integer year/month/day/time columns.
    Use for both train and test.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'datetime' column of strings laid out as
        'YYYY-MM-DD HH:MM:SS'. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'datetime'. Its first four columns are 'year',
        'month', 'day' and 'time' (the hour part of the stamp), all integers,
        followed by the remaining columns in their original order.

    Raises
    ------
    ValueError
        If a 'datetime' string does not split into the expected parts or
        a part is not an integer.
    '''
    years, months, days, hours = [], [], [], []
    for stamp in data['datetime']:
        date_part, clock_part = stamp.split(' ')
        year, month, day = date_part.split('-')
        years.append(int(year))
        months.append(int(month))
        days.append(int(day))
        # keep only the hour, so a stamp with non-zero minutes still parses
        hours.append(int(clock_part.split(':')[0]))

    # drop() returns a new frame, so the caller's frame is left untouched
    data2 = data.drop('datetime', axis=1)

    # insert the new columns directly at the front instead of appending them
    # and rotating the column list afterwards
    front_columns = (('year', years), ('month', months), ('day', days), ('time', hours))
    for position, (label, values) in enumerate(front_columns):
        # explicit integer dtype keeps the columns integral even for an empty frame
        data2.insert(position, label, np.asarray(values, dtype=int))

    return data2
    
# data = pd.read_csv('data/train.csv')
# data = nicedata(data)
    
def submitdata(pred,name='submission',test_path='data/test.csv',out_dir='data'):
    '''
    Write predictions to a date-stamped submission CSV.

    The 'datetime' column is taken from the original test file rather than
    being rebuilt from the split year/month/day/time columns.

    Parameters
    ----------
    pred : sequence, numpy array, pandas Series or one-column DataFrame
        One prediction per row of the test file, in the same row order.
        Values are cast to int before they are written.
    name : str
        Prefix of the output file name; the current month and day
        ("%m-%d") and '.csv' are appended.
    test_path : str
        CSV file that provides the 'datetime' column.
    out_dir : str
        Directory the submission is written into.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    '''
    import os  # local import: only needed to build the output path

    # the original 'datetime' strings identify the rows of the submission
    keep = pd.read_csv(test_path)['datetime']

    # flatten to a plain 1-D array so rows are paired by position rather
    # than by index label (index alignment silently produces NaN rows)
    counts = np.asarray(pred).ravel()
    if len(counts) != len(keep):
        raise ValueError('got %d predictions for %d test rows'
                         % (len(counts), len(keep)))

    submit = pd.DataFrame(
        {'datetime': keep.values,
         'count': pd.Series(counts).astype(int).values},
        columns=['datetime', 'count'])
#     submit.ix[submit['count'] <= 0, :] = 1

    #save to file
    timestr = time.strftime("%m-%d")
    submit.to_csv(os.path.join(out_dir, name + timestr + '.csv'), index=False)
    

def normalizedata(data, features=None):
    '''
    Min-max scale feature columns of data to the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to scale. It is modified in place.
    features : list of str, optional
        Columns to scale. Defaults to
        ['temp', 'atemp', 'humidity', 'windspeed'].

    Returns
    -------
    pandas.DataFrame
        The same data object that was passed in, for convenient chaining.

    Raises
    ------
    KeyError
        If one of the requested columns is missing.
    '''
    if features is None:
        features = ['temp','atemp','humidity','windspeed']
    for x in features:
        column = data[x]
        # Series.min()/max() are vectorised and skip NaN, unlike the builtins
        lowest = column.min()
        span = column.max() - lowest
        if span == 0:
            # constant column: 0/0 would turn every value into NaN, so map
            # it to 0 instead (values that were already NaN stay NaN)
            data[x] = column * 0.0
        else:
            data[x] = (column - lowest) / span
    return data
# Switch off pandas' chained-assignment warning, which would otherwise be raised
pd.set_option('mode.chained_assignment', None)

# `test` keeps the raw test frame; its reformatted version goes to `test2`.
test = pd.read_csv('data/test.csv')
test2 = nicedata(test)
# The training frame is reformatted straight away.
data = nicedata(pd.read_csv('data/train.csv'))

# Four consecutive row ranges from the start of the training frame.
# NOTE(review): presumably one calendar day each (24/23/22/23 rows) - confirm.
day_1 = data.iloc[:24]
day_2 = data.iloc[24:47]
day_3 = data.iloc[47:69]
day_4 = data.iloc[69:92]

# Only the first and the last range are drawn, stacked with shared axes.
# To get 4 graphs, re-add ax2 and ax3 to the unpacking below (with a matching
# subplot count) and uncomment the two plot calls.
f, (ax1, ax4) = plt.subplots(nrows=2, sharex=True, sharey=True)
f.set_size_inches(12, 7)
f.subplots_adjust(hspace=0)
ax1.set_title('2 Weekends, 2 Working days')
ax1.plot(day_1['time'], day_1['count'])
# ax2.plot(day_2.time,day_2['count'])
# ax3.plot(day_3.time,day_3['count'])
ax4.plot(day_4['time'], day_4['count'])

# Mean hourly values over the working days of January 2011.
# Every comparison is parenthesised: `&` binds tighter than `==`, so without
# the parentheses the whole `a & b & workingday` expression is compared to
# True, which only gives the intended rows while 'workingday' holds 0/1.
jan2011 = data[
    (data['month'] == 1)
    & (data['year'] == 2011)
    & (data['workingday'] == 1)
]
grouped = jan2011.groupby('time').mean()
plt.plot(grouped['count'],c='r')
# plt.figure()
# below plot if necessary
# plt.plot(data['time'],data['count'].values)

# Reload both sets from disk, reformat them and scale the weather features.
test = normalizedata(nicedata(pd.read_csv('data/test.csv')))
data = normalizedata(nicedata(pd.read_csv('data/train.csv')))

# range() instead of xrange() so the cell runs on Python 2 and 3 alike;
# `hours` is not used in this section.
years = [2011, 2012]
months = list(range(1, 13))
hours = list(range(0, 24))
svr_features = ['time','workingday','atemp','humidity','windspeed','weather']

# Fit one RBF-kernel SVR per (year, month) on that month's training rows and
# predict the matching rows of the test set.
pred_months_years = []
for y in years:
    for m in months:
#         print(y,m)
        monthSVR = SVR(kernel='rbf', C=1e3, gamma=1)
        muse = data[(data['month']==m) & (data['year']==y)]
        tuse = test[(test['month']==m) & (test['year']==y)]
        monthSVR.fit(muse[svr_features], muse['count'])
        predmonthSVR = monthSVR.predict(tuse[svr_features])
        pred_months_years.append(predmonthSVR)

# Concatenate the monthly predictions in loop order (year, then month).
# NOTE(review): assumes the test rows are in chronological order so that the
# positions line up with the test file - confirm.
merged = list(chain.from_iterable(pred_months_years))
# astype() returns a new Series; the result has to be kept for the cast to apply
z = pd.Series(merged, name="count").astype(int)
predictionsSVR = pd.DataFrame(z)
# Floor the predictions at 1: every value <= 1 (including negative SVR
# outputs) is replaced by 1. `.loc` replaces the removed `.ix` indexer.
predictionsSVR.loc[predictionsSVR['count'] <= 1, 'count'] = 1
# submitdata(subsub,name='SVRrbf')

# Reload both sets from disk, reformat them and scale the weather features.
test = normalizedata(nicedata(pd.read_csv('data/test.csv')))
data = normalizedata(nicedata(pd.read_csv('data/train.csv')))

rf_features = ['time','workingday','atemp','humidity','windspeed','weather']

# Fit one 250-tree random forest per (year, month) on that month's training
# rows and predict the matching rows of the test set.
pred_months_years = []
for y in (2011, 2012):
    for m in range(1, 13):
#         print(y,m)
        monthforest = RandomForestRegressor(n_estimators=250)
        muse = data[(data['month']==m) & (data['year']==y)]
        tuse = test[(test['month']==m) & (test['year']==y)]
        monthforest.fit(muse[rf_features], muse['count'])
        predmonth = monthforest.predict(tuse[rf_features])
        pred_months_years.append(predmonth)

# Concatenate the monthly predictions in loop order (year, then month).
# NOTE(review): assumes the test rows are in chronological order so that the
# positions line up with the test file - confirm.
z = pred_months_years
merged = list(chain.from_iterable(z))
# astype() returns a new Series; the result has to be kept for the cast to apply
predictionsRF = pd.Series(merged).astype(int)
submitdata(predictionsRF,name='timeinforest')

# Training rows of 19 January 2011, drawn as the reference curve below the
# two prediction plots. This frame must be built here: it was previously
# defined only in commented-out code, so the plot raised a NameError.
day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]

# Layout: both model predictions side by side on the top row, the
# reference day across the full width of the second row.
fig = plt.figure()
fig.set_size_inches(10,7)
rf1 = plt.subplot2grid((3, 4), (0, 0), colspan=2)
svr2 = plt.subplot2grid((3, 4), (0, 2), colspan=2)
ax3 = plt.subplot2grid((3, 4), (1, 0), colspan=4, rowspan=1)

# The first 24 test rows are what the titles label as 'Day 20'.
rf1.plot(test.time[0:24],predictionsRF[0:24])
svr2.plot(test.time[0:24],predictionsSVR[0:24])
ax3.plot(day_19.time,day_19['count'])

ax3.set_xlabel('Time')
ax3.set_ylabel('Count')

ax3.set_title('Day 19')
rf1.set_title('Day 20 Random Forest')
svr2.set_title('Day 20 SVR')


plt.tight_layout()

# 5-fold cross-validation of the most recently fitted monthly models.
# NOTE: `muse`, `monthforest` and `monthSVR` are whatever the last iteration
# of the fitting loops left behind, so both scores describe that single
# month's training subset only.
cv_features = ['time','workingday','atemp','humidity','windspeed','weather']
scoreMF = cross_val_score(monthforest, muse[cv_features], muse['count'], cv=5)
scoreSVR = cross_val_score(monthSVR, muse[cv_features], muse['count'], cv=5)

# Without a `scoring` argument cross_val_score falls back on the estimator's
# own score(), which for regressors is R^2 - not a classification accuracy.
print("R^2 of Random Forest Regressor: %0.2f (+/- %0.2f)" % (scoreMF.mean(), scoreMF.std() * 2))
print("R^2 of SVR: %0.2f (+/- %0.2f)" % (scoreSVR.mean(), scoreSVR.std() * 2))

# from sklearn import tree
# dt = tree.ExtraTreeRegressor()
# #set target, train and test. train and test must have same number of features
# df = data
# target = df['count']
# train  = df[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# test   = test2[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# dt.fit(train,target)


# predicted_probs = dt.predict(test)
# predicted_probs = pd.Series(predicted_probs)
# predicted_probs = predicted_probs.map(lambda x: int(x))

# keep = pd.read_csv('data/test.csv')
# keep = keep['datetime']
# #save to file
# submit = pd.concat([keep,predicted_probs],axis=1)
# # print(forest.feature_importances_)
# submit.columns=['datetime','count']
# submit.to_csv('data/submissiondtree.csv',index=False)

# plt.figure()
# # pl.scatter(tr, y, c="k", label="data")
# plt.plot(train['time'], target, c="g", label="train", linewidth=2)
# # plt.plot(test['time'], predicted_probs, c="r", label="test", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Decision Tree Regression")
# plt.legend()
# plt.show()

#removed plot for SVM

# day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],subsub[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)


#removed plot for random forest

# day_19rf = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],z[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)
# submitdata(z,name='timeinforest')