Given the Bike Sharing dataset, which contains hourly records of bike rentals along with weather and calendar attributes, build a model that predicts the hourly bike rental count.
%matplotlib inline
# data manipulation
import numpy as np
import pandas as pd
# modeling utilities
import scipy.stats as stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sn
sn.set_style('whitegrid')
sn.set_context('talk')
# global matplotlib styling: large fonts and a wide default figure
params = {'legend.fontsize': 'x-large',
'figure.figsize': (30, 10),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
plt.rcParams.update(params)
# load the UCI Bike Sharing hourly dataset from the working directory
# (output below shows 17379 rows x 17 columns)
hour_df = pd.read_csv('hour.csv')
print("Shape of dataset::{}".format(hour_df.shape))
Shape of dataset::(17379, 17)
# give the terse UCI column names readable aliases
hour_df.rename(columns={'instant': 'rec_id',
                        'dteday': 'datetime',
                        'holiday': 'is_holiday',
                        'workingday': 'is_workingday',
                        'weathersit': 'weather_condition',
                        'hum': 'humidity',
                        'mnth': 'month',
                        'cnt': 'total_count',
                        'hr': 'hour',
                        'yr': 'year'}, inplace=True)
# date time conversion
hour_df['datetime'] = pd.to_datetime(hour_df.datetime)
# mark every discrete attribute as a pandas categorical dtype
for cat_col in ['season', 'is_holiday', 'weekday', 'weather_condition',
                'is_workingday', 'month', 'year', 'hour']:
    hour_df[cat_col] = hour_df[cat_col].astype('category')
def fit_transform_ohe(df,col_name):
    """Fit label and one-hot encoders on ``col_name`` and encode it.

    Side effect: adds an integer ``<col_name>_label`` column to ``df``.

    Args:
        df(pandas.DataFrame): the data frame containing the mentioned column name
        col_name: the column to be one hot encoded
    Returns:
        tuple: (fitted LabelEncoder, fitted OneHotEncoder,
                one-hot features as a pandas DataFrame)
    """
    # integer-encode the raw category values first
    label_encoder = preprocessing.LabelEncoder()
    df[col_name + '_label'] = label_encoder.fit_transform(df[col_name])
    # expand the integer labels into one dummy column per class
    onehot_encoder = preprocessing.OneHotEncoder()
    encoded = onehot_encoder.fit_transform(df[[col_name + '_label']]).toarray()
    dummy_names = [col_name + '_' + str(cls) for cls in label_encoder.classes_]
    return label_encoder, onehot_encoder, pd.DataFrame(encoded, columns=dummy_names)
# given label encoder and one hot encoder objects,
# encode attribute to ohe
def transform_ohe(df,le,ohe,col_name):
    """One-hot encode ``col_name`` using already-fitted encoder objects.

    Side effect: adds an integer ``<col_name>_label`` column to ``df``.

    Args:
        df(pandas.DataFrame): the data frame containing the mentioned column name
        le(LabelEncoder): label encoder fitted on the training data
        ohe(OneHotEncoder): one hot encoder fitted on the training data
        col_name: the column to be one hot encoded
    Returns:
        pandas.DataFrame: one-hot features, one column per training-time
        class of ``col_name``
    """
    # label encode with the *training* mapping; an unseen category raises
    # here, which is preferable to silently mis-aligned dummy columns
    col_labels = le.transform(df[col_name])
    df[col_name+'_label'] = col_labels
    # BUG FIX: use transform(), not fit_transform() -- re-fitting the encoder
    # on the test data would relearn the category set from the test split and
    # could shift/omit dummy columns relative to the training feature matrix
    feature_arr = ohe.transform(df[[col_name+'_label']]).toarray()
    feature_labels = [col_name+'_'+str(cls_label) for cls_label in le.classes_]
    features_df = pd.DataFrame(feature_arr, columns=feature_labels)
    return features_df
# hold out 33% of the rows for final evaluation; iloc[:, 0:-3] drops the last
# three columns from the features (presumably casual, registered, total_count
# -- verify against hour.csv schema) to avoid leaking the target, and
# iloc[:, -1] selects total_count as the target
X, X_test, y, y_test = train_test_split(hour_df.iloc[:,0:-3], hour_df.iloc[:,-1],
test_size=0.33, random_state=42)
# reset to fresh 0..n-1 indexes so the later pd.concat with the one-hot
# DataFrames (which have default RangeIndex) aligns row-for-row
X.reset_index(inplace=True)
y = y.reset_index()
X_test.reset_index(inplace=True)
y_test = y_test.reset_index()
print("Training set::{}{}".format(X.shape,y.shape))
print("Testing set::{}".format(X_test.shape))
Training set::(11643, 15)(11643, 2) Testing set::(5736, 15)
# Q-Q plot of the target against a normal distribution; the OLS summary later
# reports skew ~1.2, i.e. the counts are right-skewed rather than normal
stats.probplot(y.total_count.tolist(), dist="norm", plot=plt)
plt.show()
# every categorical attribute gets a fitted encoder pair below ...
cat_attr_list = ['season','is_holiday',
'weather_condition','is_workingday',
'hour','weekday','month','year']
# ... but only the four in subset_cat_features enter the model as one-hot
# dummies; hour/weekday/month/year are fed in as plain numeric values instead
numeric_feature_cols = ['temp','humidity','windspeed','hour','weekday','month','year']
subset_cat_features = ['season','is_holiday','weather_condition','is_workingday']
# fit label + one-hot encoders on the TRAINING split only, and keep the
# fitted encoder objects so the test split can reuse the same mappings
encoded_attr_list = []
for col in cat_attr_list:
return_obj = fit_transform_ohe(X,col)
encoded_attr_list.append({'label_enc':return_obj[0],
'ohe_enc':return_obj[1],
'feature_df':return_obj[2],
'col_name':col})
# assemble the final training matrix: numeric columns + the selected dummies
feature_df_list = [X[numeric_feature_cols]]
feature_df_list.extend([enc['feature_df'] \
for enc in encoded_attr_list \
if enc['col_name'] in subset_cat_features])
train_df_new = pd.concat(feature_df_list, axis=1)
print("Shape::{}".format(train_df_new.shape))
Shape::(11643, 19)
X = train_df_new
# flatten the target into an (n, 1) column vector for sklearn
y= y.total_count.values.reshape(-1,1)
lin_reg = linear_model.LinearRegression()
# out-of-fold predictions via 10-fold CV: every row is predicted by a model
# that never saw it during fitting
predicted = cross_val_predict(lin_reg, X, y, cv=10)
# residuals vs. observed values for the out-of-fold predictions;
# a pattern here indicates the linear model is mis-specified
fig, ax = plt.subplots()
ax.scatter(y, y-predicted)
ax.axhline(lw=2,color='black')
ax.set_xlabel('Observed')
ax.set_ylabel('Residual')
plt.show()
# per-fold R^2 (default scorer) and negative MSE across the same 10 folds
r2_scores = cross_val_score(lin_reg, X, y, cv=10)
mse_scores = cross_val_score(lin_reg, X, y, cv=10,scoring='neg_mean_squared_error')
fig, ax = plt.subplots()
ax.plot([i for i in range(len(r2_scores))],r2_scores,lw=2)
ax.set_xlabel('Iteration')
ax.set_ylabel('R-Squared')
ax.title.set_text("Cross Validation Scores, Avg:{}".format(np.average(r2_scores)))
plt.show()
print("R-squared::{}".format(r2_scores))
print("MSE::{}".format(mse_scores))
R-squared::[ 0.39894459 0.35575732 0.3873037 0.38796861 0.42489499 0.41571164 0.37379762 0.39339864 0.39589746 0.40871611] MSE::[-19612.38349313 -20800.77110185 -20256.54013607 -18545.99033804 -18746.57816436 -21015.35560028 -21549.12876053 -21567.27946203 -21044.42416385 -18899.05989574]
# refit on the full training matrix for the final hold-out evaluation
lin_reg.fit(X,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
# re-encode the hold-out split using the encoder objects that were fitted
# on the training data, so both splits share identical dummy columns
test_encoded_attr_list = [
    {'feature_df': transform_ohe(X_test,
                                 enc['label_enc'],
                                 enc['ohe_enc'],
                                 enc['col_name']),
     'col_name': enc['col_name']}
    for enc in encoded_attr_list
]
# numeric columns first, then the same four one-hot groups as training
test_feature_df_list = [X_test[numeric_feature_cols]]
test_feature_df_list += [item['feature_df']
                         for item in test_encoded_attr_list
                         if item['col_name'] in subset_cat_features]
test_df_new = pd.concat(test_feature_df_list, axis=1)
print("Shape::{}".format(test_df_new.shape))
Shape::(5736, 19)
# peek at the assembled test feature matrix (table shown below)
test_df_new.head()
temp | humidity | windspeed | hour | weekday | month | year | season_1 | season_2 | season_3 | season_4 | is_holiday_0 | is_holiday_1 | weather_condition_1 | weather_condition_2 | weather_condition_3 | weather_condition_4 | is_workingday_0 | is_workingday_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.80 | 0.27 | 0.1940 | 19 | 6 | 6 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1 | 0.24 | 0.41 | 0.2239 | 20 | 1 | 1 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 0.32 | 0.66 | 0.2836 | 2 | 5 | 10 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 0.78 | 0.52 | 0.3582 | 19 | 2 | 5 | 1 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 0.26 | 0.56 | 0.3881 | 0 | 4 | 1 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
X_test = test_df_new
# flatten the test target into an (n, 1) column vector to match y
y_test = y_test.total_count.values.reshape(-1,1)
y_pred = lin_reg.predict(X_test)
residuals = y_test-y_pred
# LinearRegression.score() returns R^2 on the hold-out data
r2_score = lin_reg.score(X_test,y_test)
print("R-squared::{}".format(r2_score))
print("MSE: %.2f"
% metrics.mean_squared_error(y_test, y_pred))
R-squared::0.4024409682673428 MSE: 19063.00
# residuals vs. observed counts on the hold-out split
fig, ax = plt.subplots()
ax.scatter(y_test, residuals)
ax.axhline(lw=2,color='black')
ax.set_xlabel('Observed')
ax.set_ylabel('Residuals')
ax.title.set_text("Residual Plot with R-Squared={}".format(np.average(r2_score)))
plt.show()
import statsmodels.api as sm
# Set the independent variable
# NOTE(review): .values.tolist() discards the column names, which is why the
# OLS summary below reports anonymous x1..x19 coefficients
X = X.values.tolist()
# This handles the intercept.
# Statsmodel takes 0 intercept by default
X = sm.add_constant(X)
X_test = X_test.values.tolist()
X_test = sm.add_constant(X_test)
# Build OLS model
# NOTE(review): each one-hot group sums to 1, so together with the constant
# the design matrix is singular -- this is the multicollinearity warning in
# the summary output; dropping one dummy per group would remove it
model = sm.OLS(y, X)
results = model.fit()
# Get the predicted values for dependent variable
pred_y = results.predict(X_test)
# View Model stats
print(results.summary())
C:\Anaconda2\envs\python3\lib\site-packages\statsmodels\compat\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.396 Model: OLS Adj. R-squared: 0.395 Method: Least Squares F-statistic: 508.2 Date: Tue, 26 Sep 2017 Prob (F-statistic): 0.00 Time: 21:21:43 Log-Likelihood: -74221. No. Observations: 11643 AIC: 1.485e+05 Df Residuals: 11627 BIC: 1.486e+05 Df Model: 15 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -1.3509 11.086 -0.122 0.903 -23.081 20.380 x1 347.2038 11.464 30.286 0.000 324.732 369.675 x2 -194.4103 8.490 -22.900 0.000 -211.051 -177.769 x3 24.7546 11.587 2.136 0.033 2.042 47.467 x4 7.4108 0.204 36.366 0.000 7.011 7.810 x5 1.5378 0.659 2.334 0.020 0.246 2.829 x6 -0.0700 0.701 -0.100 0.920 -1.443 1.303 x7 81.9754 2.653 30.896 0.000 76.774 87.176 x8 -22.2212 4.323 -5.141 0.000 -30.694 -13.748 x9 0.0692 3.855 0.018 0.986 -7.488 7.626 x10 -24.8764 4.775 -5.210 0.000 -34.236 -15.517 x11 45.6775 4.864 9.391 0.000 36.144 55.211 x12 13.5188 6.450 2.096 0.036 0.875 26.163 x13 -14.8697 7.381 -2.015 0.044 -29.338 -0.402 x14 -6.3554 22.733 -0.280 0.780 -50.916 38.205 x15 2.5271 22.763 0.111 0.912 -42.092 47.146 x16 -32.4939 22.981 -1.414 0.157 -77.540 12.552 x17 34.9713 77.958 0.449 0.654 -117.840 187.782 x18 -2.3677 5.718 -0.414 0.679 -13.575 8.840 x19 1.0168 5.748 0.177 0.860 -10.249 12.283 ============================================================================== Omnibus: 2329.840 Durbin-Watson: 2.001 Prob(Omnibus): 0.000 Jarque-Bera (JB): 4560.310 Skew: 1.214 Prob(JB): 0.00 Kurtosis: 4.872 Cond. No. 3.42e+17 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 2.27e-29. 
This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
# predicted vs. observed counts for the statsmodels OLS fit
plt.scatter(pred_y,y_test)
<matplotlib.collections.PathCollection at 0x21104c5c080>