Given the Bike Sharing dataset, which contains hourly-level bike rental information along with weather and other attributes, build a model that can predict the hourly bike count.
%matplotlib inline
# data manuipulation
import numpy as np
import pandas as pd
# modeling utilities
import pydotplus
from sklearn import tree
from sklearn import metrics
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# plotting libraries
import seaborn as sn
import matplotlib.pyplot as plt
import seaborn as sn
sn.set_style('whitegrid')
sn.set_context('talk')
params = {'legend.fontsize': 'x-large',
'figure.figsize': (30, 10),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
plt.rcParams.update(params)
hour_df = pd.read_csv('hour.csv')
print("Shape of dataset::{}".format(hour_df.shape))
Shape of dataset::(17379, 17)
# give the raw UCI column names more readable aliases
hour_df.rename(columns={'instant': 'rec_id',
                        'dteday': 'datetime',
                        'holiday': 'is_holiday',
                        'workingday': 'is_workingday',
                        'weathersit': 'weather_condition',
                        'hum': 'humidity',
                        'mnth': 'month',
                        'cnt': 'total_count',
                        'hr': 'hour',
                        'yr': 'year'}, inplace=True)

# parse the timestamp column into datetime64
hour_df['datetime'] = pd.to_datetime(hour_df['datetime'])

# mark the discrete attributes as pandas categoricals
for categorical_col in ['season', 'is_holiday', 'weekday', 'weather_condition',
                        'is_workingday', 'month', 'year', 'hour']:
    hour_df[categorical_col] = hour_df[categorical_col].astype('category')
def fit_transform_ohe(df, col_name):
    """Fit label and one-hot encoders on ``col_name`` and return the encoding.

    NOTE: mutates ``df`` in place by adding a ``<col_name>_label`` column
    holding the integer label-encoded values.

    Args:
        df (pandas.DataFrame): data frame containing ``col_name``.
        col_name (str): name of the column to one hot encode.

    Returns:
        tuple: (fitted LabelEncoder, fitted OneHotEncoder,
            pandas.DataFrame of one-hot features with one column per class,
            named ``<col_name>_<class>``)
    """
    # label encode the column to integer codes
    le = preprocessing.LabelEncoder()
    le_labels = le.fit_transform(df[col_name])
    df[col_name + '_label'] = le_labels
    # one hot encode the integer codes
    ohe = preprocessing.OneHotEncoder()
    feature_arr = ohe.fit_transform(df[[col_name + '_label']]).toarray()
    feature_labels = [col_name + '_' + str(cls_label) for cls_label in le.classes_]
    features_df = pd.DataFrame(feature_arr, columns=feature_labels)
    return le, ohe, features_df
# given label encoder and one hot encoder objects,
# encode attribute to ohe
def transform_ohe(df, le, ohe, col_name):
    """One hot encode ``col_name`` using encoders fitted on the training data.

    NOTE: mutates ``df`` in place by adding a ``<col_name>_label`` column
    holding the integer label-encoded values.

    Args:
        df (pandas.DataFrame): data frame containing ``col_name``.
        le (LabelEncoder): label encoder fitted on the training data.
        ohe (OneHotEncoder): one hot encoder fitted on the training data.
        col_name (str): name of the column to one hot encode.

    Returns:
        pandas.DataFrame: one-hot features, one column per training-set class.
    """
    # label encode using the training-time mapping
    col_labels = le.transform(df[col_name])
    df[col_name + '_label'] = col_labels
    # BUG FIX: use transform, not fit_transform, so the encoder keeps the
    # category mapping learned on the training set instead of re-fitting on
    # this (test) data, which could silently reorder or drop categories.
    feature_arr = ohe.transform(df[[col_name + '_label']]).toarray()
    feature_labels = [col_name + '_' + str(cls_label) for cls_label in le.classes_]
    features_df = pd.DataFrame(feature_arr, columns=feature_labels)
    return features_df
# hold out 33% of rows for testing; the last 3 columns (casual, registered,
# total_count) are excluded from the features and the final column
# (total_count) is the regression target
X, X_test, y, y_test = train_test_split(hour_df.iloc[:,0:-3], hour_df.iloc[:,-1],
test_size=0.33, random_state=42)
# reset indices so positional concat with the one-hot frames lines up later
X.reset_index(inplace=True)
y = y.reset_index()
X_test.reset_index(inplace=True)
y_test = y_test.reset_index()
print("Training set::{}{}".format(X.shape,y.shape))
print("Testing set::{}".format(X_test.shape))
Training set::(11643, 15)(11643, 2) Testing set::(5736, 15)
# categorical attributes to encode, numeric attributes kept as-is, and the
# subset of categoricals whose one-hot columns go into the model matrix
cat_attr_list = ['season', 'is_holiday',
                 'weather_condition', 'is_workingday',
                 'hour', 'weekday', 'month', 'year']
numeric_feature_cols = ['temp', 'humidity', 'windspeed', 'hour', 'weekday', 'month', 'year']
subset_cat_features = ['season', 'is_holiday', 'weather_condition', 'is_workingday']

# fit an encoder pair per categorical column and keep everything needed to
# re-apply the identical transformation to the test split later
encoded_attr_list = []
for attr in cat_attr_list:
    enc_le, enc_ohe, enc_df = fit_transform_ohe(X, attr)
    encoded_attr_list.append({'label_enc': enc_le,
                              'ohe_enc': enc_ohe,
                              'feature_df': enc_df,
                              'col_name': attr})

# assemble the training matrix: raw numeric columns first, then the one-hot
# frames for the selected categorical subset
feature_df_list = [X[numeric_feature_cols]]
feature_df_list.extend(enc['feature_df']
                       for enc in encoded_attr_list
                       if enc['col_name'] in subset_cat_features)
train_df_new = pd.concat(feature_df_list, axis=1)
print("Shape::{}".format(train_df_new.shape))
Shape::(11643, 19)
# swap in the engineered feature matrix and extract the raw target values
X = train_df_new
# column vector of shape (n, 1) as expected by sklearn fit
y= y.total_count.values.reshape(-1,1)
X.shape,y.shape
((11643, 19), (11643, 1))
# baseline decision tree with hand-picked capacity limits to curb overfitting
dtr = DecisionTreeRegressor(max_depth=4,
min_samples_split=5,
max_leaf_nodes=10)
dtr.fit(X,y)
DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
# R-squared of the baseline tree on the training data itself
dtr.score(X,y)
0.60565765621037793
# export the fitted tree to graphviz dot format and render it as a PDF
dot_data = tree.export_graphviz(dtr, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("bikeshare.pdf")
True
# hyperparameter search space for the tree.
# NOTE(review): 'mse'/'mae' criterion names were renamed to
# 'squared_error'/'absolute_error' in newer scikit-learn — confirm the
# installed version before upgrading.
param_grid = {"criterion": ["mse", "mae"],
"min_samples_split": [10, 20, 40],
"max_depth": [2, 6, 8],
"min_samples_leaf": [20, 40, 100],
"max_leaf_nodes": [5, 20, 100, 500, 800],
}
# exhaustive grid search with 5-fold cross validation; refits the best model
grid_cv_dtr = GridSearchCV(dtr, param_grid, cv=5)
grid_cv_dtr.fit(X,y)
GridSearchCV(cv=5, error_score='raise', estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best'), fit_params={}, iid=True, n_jobs=1, param_grid={'min_samples_split': [10, 20, 40], 'max_leaf_nodes': [5, 20, 100, 500, 800], 'min_samples_leaf': [20, 40, 100], 'max_depth': [2, 6, 8], 'criterion': ['mse', 'mae']}, pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring=None, verbose=0)
# best mean cross-validated R-squared and the winning hyperparameters
print("R-Squared::{}".format(grid_cv_dtr.best_score_))
print("Best Hyperparameters::\n{}".format(grid_cv_dtr.best_params_))
R-Squared::0.85891903233008 Best Hyperparameters:: {'min_samples_split': 10, 'max_leaf_nodes': 500, 'min_samples_leaf': 20, 'max_depth': 8, 'criterion': 'mse'}
# tabulate the per-candidate cross-validation results for inspection
df = pd.DataFrame(data=grid_cv_dtr.cv_results_)
df.head()
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_criterion | param_max_depth | param_max_leaf_nodes | param_min_samples_leaf | param_min_samples_split | params | ... | split2_test_score | split2_train_score | split3_test_score | split3_train_score | split4_test_score | split4_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.025334 | 0.004203 | 0.48401 | 0.48875 | mse | 2 | 5 | 20 | 10 | {'min_samples_split': 10, 'max_leaf_nodes': 5,... | ... | 0.486478 | 0.48915 | 0.475286 | 0.491167 | 0.491096 | 0.487932 | 0.001725 | 0.000401 | 0.007223 | 0.002883 |
1 | 0.023615 | 0.004169 | 0.48401 | 0.48875 | mse | 2 | 5 | 20 | 20 | {'min_samples_split': 20, 'max_leaf_nodes': 5,... | ... | 0.486478 | 0.48915 | 0.475286 | 0.491167 | 0.491096 | 0.487932 | 0.001019 | 0.000494 | 0.007223 | 0.002883 |
2 | 0.024118 | 0.004603 | 0.48401 | 0.48875 | mse | 2 | 5 | 20 | 40 | {'min_samples_split': 40, 'max_leaf_nodes': 5,... | ... | 0.486478 | 0.48915 | 0.475286 | 0.491167 | 0.491096 | 0.487932 | 0.002801 | 0.000800 | 0.007223 | 0.002883 |
3 | 0.025617 | 0.004905 | 0.48401 | 0.48875 | mse | 2 | 5 | 40 | 10 | {'min_samples_split': 10, 'max_leaf_nodes': 5,... | ... | 0.486478 | 0.48915 | 0.475286 | 0.491167 | 0.491096 | 0.487932 | 0.002578 | 0.001360 | 0.007223 | 0.002883 |
4 | 0.025018 | 0.004804 | 0.48401 | 0.48875 | mse | 2 | 5 | 40 | 20 | {'min_samples_split': 20, 'max_leaf_nodes': 5,... | ... | 0.486478 | 0.48915 | 0.475286 | 0.491167 | 0.491096 | 0.487932 | 0.002550 | 0.000749 | 0.007223 | 0.002883 |
5 rows × 25 columns
# visualize how tree depth and the leaf-node budget affect the CV score
fig,ax = plt.subplots()
sn.pointplot(data=df[['mean_test_score',
'param_max_leaf_nodes',
'param_max_depth']],
y='mean_test_score',x='param_max_depth',
hue='param_max_leaf_nodes',ax=ax)
ax.set(title="Effect of Depth and Leaf Nodes on Model Performance")
[<matplotlib.text.Text at 0x1b0e5ddde80>]
# residual analysis on the training data for the tuned estimator
predicted = grid_cv_dtr.best_estimator_.predict(X)
residuals = y.flatten()-predicted
fig, ax = plt.subplots()
ax.scatter(y.flatten(), residuals)
# zero-residual reference line
ax.axhline(lw=2,color='black')
ax.set_xlabel('Observed')
ax.set_ylabel('Residual')
plt.show()
# 10-fold CV estimates of R-squared and (sign-negated) MSE for the tuned tree
r2_scores = cross_val_score(grid_cv_dtr.best_estimator_, X, y, cv=10)
mse_scores = cross_val_score(grid_cv_dtr.best_estimator_, X, y, cv=10,scoring='neg_mean_squared_error')
print("avg R-squared::{}".format(np.mean(r2_scores)))
print("MSE::{}".format(np.mean(mse_scores)))
avg R-squared::0.863424851864871 MSE::-4546.657974664084
# apply the training-time encoders to the test split so its feature columns
# line up exactly with the training matrix
best_dtr_model = grid_cv_dtr.best_estimator_
test_encoded_attr_list = []
for enc in encoded_attr_list:
    attr = enc['col_name']
    test_encoded_attr_list.append(
        {'feature_df': transform_ohe(X_test, enc['label_enc'], enc['ohe_enc'], attr),
         'col_name': attr})

# numeric columns first, then the one-hot frames for the selected subset
test_feature_df_list = [X_test[numeric_feature_cols]]
test_feature_df_list.extend(enc['feature_df']
                            for enc in test_encoded_attr_list
                            if enc['col_name'] in subset_cat_features)
test_df_new = pd.concat(test_feature_df_list, axis=1)
print("Shape::{}".format(test_df_new.shape))
Shape::(5736, 19)
# final evaluation on the held-out test split
X_test = test_df_new
# column vector of shape (n, 1) of true counts
y_test = y_test.total_count.values.reshape(-1,1)
y_pred = best_dtr_model.predict(X_test)
residuals = y_test.flatten() - y_pred
r2_score = best_dtr_model.score(X_test,y_test)
print("R-squared::{}".format(r2_score))
print("MSE: %.2f"
% metrics.mean_squared_error(y_test, y_pred))
R-squared::0.8722059567160857 MSE: 4076.82
# residual plot on the test split
fig, ax = plt.subplots()
ax.scatter(y_test.flatten(), residuals)
# zero-residual reference line
ax.axhline(lw=2,color='black')
ax.set_xlabel('Observed')
ax.set_ylabel('Residual')
plt.show()
# test-set R-squared from the refit best estimator (same model as above)
r2_score = grid_cv_dtr.best_estimator_.score(X_test,y_test)