import pandas as pd
# some display options to make figures bigger
pd.set_option('display.max_columns', 15)
rcParams['figure.figsize'] = (17, 7)
bike_data = pd.read_csv("./2012.csv", encoding='latin1', sep=';', index_col='Date', parse_dates=True, dayfirst=True)
bike_data = bike_data.dropna(axis=1)
bike_data.head()
Berri 1 | Côte-Sainte-Catherine | Maisonneuve 1 | Maisonneuve 2 | du Parc | Pierre-Dupuy | Rachel1 | |
---|---|---|---|---|---|---|---|
Date | |||||||
2012-01-01 | 35 | 0 | 38 | 51 | 26 | 10 | 16 |
2012-01-02 | 83 | 1 | 68 | 153 | 53 | 6 | 43 |
2012-01-03 | 135 | 2 | 104 | 248 | 89 | 3 | 58 |
2012-01-04 | 144 | 1 | 116 | 318 | 111 | 8 | 61 |
2012-01-05 | 197 | 2 | 124 | 330 | 97 | 13 | 95 |
bike_data.plot()
<matplotlib.axes.AxesSubplot at 0x33afe50>
bike_data.describe()
Berri 1 | Côte-Sainte-Catherine | Maisonneuve 1 | Maisonneuve 2 | du Parc | Pierre-Dupuy | Rachel1 | |
---|---|---|---|---|---|---|---|
count | 310.000000 | 310.000000 | 310.000000 | 310.000000 | 310.000000 | 310.000000 | 310.000000 |
mean | 2985.048387 | 1233.351613 | 1983.325806 | 3510.261290 | 1862.983871 | 1054.306452 | 2873.483871 |
std | 2169.271062 | 944.643188 | 1450.715170 | 2484.959789 | 1332.543266 | 1064.029205 | 2039.315504 |
min | 32.000000 | 0.000000 | 33.000000 | 47.000000 | 18.000000 | 0.000000 | 0.000000 |
25% | 596.000000 | 243.250000 | 427.000000 | 831.000000 | 474.750000 | 53.250000 | 731.000000 |
50% | 3128.000000 | 1269.000000 | 2019.500000 | 3688.500000 | 1822.500000 | 704.000000 | 3223.500000 |
75% | 4973.250000 | 2003.000000 | 3168.250000 | 5731.750000 | 3069.000000 | 1818.500000 | 4717.250000 |
max | 7077.000000 | 3124.000000 | 4999.000000 | 8222.000000 | 4510.000000 | 4386.000000 | 6595.000000 |
bike_data[['Berri 1', 'Maisonneuve 2']].plot()
<matplotlib.axes.AxesSubplot at 0x3400850>
def get_weather_data(year):
url_template = "http://climate.weatheroffice.gc.ca/climateData/bulkdata_e.html?Prov=QC&StationID=5415&Year={year}&Month={month}&Day=14&timeframe=1&format=csv"
data_by_month = []
for month in range(1, 13):
url = url_template.format(year=year, month=month)
weather_data = pd.read_csv(url, skiprows=16, index_col='Date/Time', parse_dates=True).dropna(axis=1)
weather_data.columns = map(lambda x: x.replace('\xb0', ''), weather_data.columns)
weather_data = weather_data.drop(['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1)
data_by_month.append(weather_data.dropna())
return pd.concat(data_by_month)
weather_data = get_weather_data(2012)
print list(weather_data.columns)
['Temp (C)', 'Dew Point Temp (C)', 'Rel Hum (%)', 'Wind Dir (10s deg)', 'Wind Spd (km/h)', 'Visibility (km)', 'Stn Press (kPa)', 'Weather']
weather_data[['Temp (C)', 'Weather', 'Wind Spd (km/h)', 'Rel Hum (%)', 'Wind Spd (km/h)']].head()
Temp (C) | Weather | Wind Spd (km/h) | Rel Hum (%) | Wind Spd (km/h) | |
---|---|---|---|---|---|
Date/Time | |||||
2012-01-01 00:00:00 | -1.8 | Fog | 4 | 86 | 4 |
2012-01-01 01:00:00 | -1.8 | Fog | 4 | 87 | 4 |
2012-01-01 02:00:00 | -1.8 | Freezing Drizzle,Fog | 7 | 89 | 7 |
2012-01-01 03:00:00 | -1.5 | Freezing Drizzle,Fog | 6 | 88 | 6 |
2012-01-01 04:00:00 | -1.5 | Fog | 7 | 88 | 7 |
bike_data['mean temp'] = weather_data['Temp (C)'].resample('D', how='mean')
bike_data.head()
Berri 1 | Côte-Sainte-Catherine | Maisonneuve 1 | Maisonneuve 2 | du Parc | Pierre-Dupuy | Rachel1 | mean temp | |
---|---|---|---|---|---|---|---|---|
Date | ||||||||
2012-01-01 | 35 | 0 | 38 | 51 | 26 | 10 | 16 | 0.629167 |
2012-01-02 | 83 | 1 | 68 | 153 | 53 | 6 | 43 | 0.041667 |
2012-01-03 | 135 | 2 | 104 | 248 | 89 | 3 | 58 | -14.416667 |
2012-01-04 | 144 | 1 | 116 | 318 | 111 | 8 | 61 | -13.645833 |
2012-01-05 | 197 | 2 | 124 | 330 | 97 | 13 | 95 | -6.750000 |
bike_data[['Berri 1', 'mean temp']].plot(subplots=True)
array([Axes(0.125,0.581818;0.775x0.318182), Axes(0.125,0.2;0.775x0.318182)], dtype=object)
bike_data['Rain'] = weather_data['Weather'].str.contains('Rain').map(lambda x: int(x)).resample('D', how='mean')
bike_data[['Berri 1', 'Rain']].plot(subplots=True)
array([Axes(0.125,0.581818;0.775x0.318182), Axes(0.125,0.2;0.775x0.318182)], dtype=object)
bike_data['weekday'] = bike_data.index.weekday
bike_data.head()
Berri 1 | Côte-Sainte-Catherine | Maisonneuve 1 | Maisonneuve 2 | du Parc | Pierre-Dupuy | Rachel1 | mean temp | Rain | weekday | |
---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||
2012-01-01 | 35 | 0 | 38 | 51 | 26 | 10 | 16 | 0.629167 | 0.125000 | 6 |
2012-01-02 | 83 | 1 | 68 | 153 | 53 | 6 | 43 | 0.041667 | 0.041667 | 0 |
2012-01-03 | 135 | 2 | 104 | 248 | 89 | 3 | 58 | -14.416667 | 0.000000 | 1 |
2012-01-04 | 144 | 1 | 116 | 318 | 111 | 8 | 61 | -13.645833 | 0.000000 | 2 |
2012-01-05 | 197 | 2 | 124 | 330 | 97 | 13 | 95 | -6.750000 | 0.000000 | 3 |
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
bike_data['weekday'] = bike_data['weekday'].map(lambda x: days[x])
bike_data.head()
Berri 1 | Côte-Sainte-Catherine | Maisonneuve 1 | Maisonneuve 2 | du Parc | Pierre-Dupuy | Rachel1 | mean temp | Rain | weekday | |
---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||
2012-01-01 | 35 | 0 | 38 | 51 | 26 | 10 | 16 | 0.629167 | 0.125000 | Sunday |
2012-01-02 | 83 | 1 | 68 | 153 | 53 | 6 | 43 | 0.041667 | 0.041667 | Monday |
2012-01-03 | 135 | 2 | 104 | 248 | 89 | 3 | 58 | -14.416667 | 0.000000 | Tuesday |
2012-01-04 | 144 | 1 | 116 | 318 | 111 | 8 | 61 | -13.645833 | 0.000000 | Wednesday |
2012-01-05 | 197 | 2 | 124 | 330 | 97 | 13 | 95 | -6.750000 | 0.000000 | Thursday |
counts_by_day = bike_data.groupby('weekday').aggregate(numpy.sum)
counts_by_day.index = days
counts_by_day.plot()
<matplotlib.axes.AxesSubplot at 0x4b964d0>