Reference: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg,
# data manipulation
import numpy as np
import pandas as pd
# plotting
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
# setting params
params = {'legend.fontsize': 'x-large',
'figure.figsize': (30, 10),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
sn.set_style('whitegrid')
sn.set_context('talk')
plt.rcParams.update(params)
pd.options.display.max_colwidth = 600
# pandas display data frames as tables
from IPython.display import display, HTML
hour_df = pd.read_csv('hour.csv')
print("Shape of dataset::{}".format(hour_df.shape))
display(hour_df.head())
# data types of attributes
hour_df.dtypes
# dataset summary stats
hour_df.describe()
The dataset has:
hour_df.rename(columns={'instant':'rec_id',
'dteday':'datetime',
'holiday':'is_holiday',
'workingday':'is_workingday',
'weathersit':'weather_condition',
'hum':'humidity',
'mnth':'month',
'cnt':'total_count',
'hr':'hour',
'yr':'year'},inplace=True)
# date time conversion
hour_df['datetime'] = pd.to_datetime(hour_df.datetime)
# categorical variables
hour_df['season'] = hour_df.season.astype('category')
hour_df['is_holiday'] = hour_df.is_holiday.astype('category')
hour_df['weekday'] = hour_df.weekday.astype('category')
hour_df['weather_condition'] = hour_df.weather_condition.astype('category')
hour_df['is_workingday'] = hour_df.is_workingday.astype('category')
hour_df['month'] = hour_df.month.astype('category')
hour_df['year'] = hour_df.year.astype('category')
hour_df['hour'] = hour_df.hour.astype('category')
fig,ax = plt.subplots()
sn.pointplot(data=hour_df[['hour',
'total_count',
'season']],
x='hour',y='total_count',
hue='season',ax=ax)
ax.set(title="Season wise hourly distribution of counts")
fig,ax = plt.subplots()
sn.pointplot(data=hour_df[['hour','total_count','weekday']],x='hour',y='total_count',hue='weekday',ax=ax)
ax.set(title="Weekday wise hourly distribution of counts")
fig,ax = plt.subplots()
sn.boxplot(data=hour_df[['hour','total_count']],x="hour",y="total_count",ax=ax)
ax.set(title="Box Pot for hourly distribution of counts")
fig,ax = plt.subplots()
sn.barplot(data=hour_df[['month',
'total_count']],
x="month",y="total_count")
ax.set(title="Monthly distribution of counts")
df_col_list = ['month','weekday','total_count']
plot_col_list= ['month','total_count']
spring_df = hour_df[hour_df.season==1][df_col_list]
summer_df = hour_df[hour_df.season==2][df_col_list]
fall_df = hour_df[hour_df.season==3][df_col_list]
winter_df = hour_df[hour_df.season==4][df_col_list]
fig,ax= plt.subplots(nrows=2,ncols=2)
sn.barplot(data=spring_df[plot_col_list],x="month",y="total_count",ax=ax[0][0],)
ax[0][0].set(title="Spring")
sn.barplot(data=summer_df[plot_col_list],x="month",y="total_count",ax=ax[0][1])
ax[0][1].set(title="Summer")
sn.barplot(data=fall_df[plot_col_list],x="month",y="total_count",ax=ax[1][0])
ax[1][0].set(title="Fall")
sn.barplot(data=winter_df[plot_col_list],x="month",y="total_count",ax=ax[1][1])
ax[1][1].set(title="Winter")
sn.violinplot(data=hour_df[['year',
'total_count']],
x="year",y="total_count")
fig,(ax1,ax2) = plt.subplots(ncols=2)
sn.barplot(data=hour_df,x='is_holiday',y='total_count',hue='season',ax=ax1)
sn.barplot(data=hour_df,x='is_workingday',y='total_count',hue='season',ax=ax2)
fig,(ax1,ax2)= plt.subplots(ncols=2)
sn.boxplot(data=hour_df[['total_count',
'casual','registered']],ax=ax1)
sn.boxplot(data=hour_df[['temp','windspeed']],ax=ax2)
corrMatt = hour_df[["temp","atemp",
"humidity","windspeed",
"casual","registered",
"total_count"]].corr()
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
sn.heatmap(corrMatt, mask=mask,
vmax=.8, square=True,annot=True)