#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# The old pandas option 'display.mpl_style' has been removed; use a matplotlib
# style directly instead.
plt.style.use('default')

plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'


# # Summary
#
# By the end of this chapter, we're going to have downloaded a full year of Canada's hourly weather data for 2012 (for a station in Montreal), and saved it to a CSV.
#
# We'll do this by downloading it one month at a time, and then combining all the months together.
#
# Here's the temperature every hour for 2012!

# In[2]:


weather_2012_final = pd.read_csv('../data/weather_2012.csv', index_col='Date/Time')
weather_2012_final['Temp (C)'].plot(figsize=(15, 6))


# # 5.1 Downloading one month of weather data
#
# When playing with the cycling data, I wanted temperature and precipitation data to find out if people like biking when it's raining. So I went to the site for [Canadian historical weather data](http://climate.weather.gc.ca/index_e.html#access), and figured out how to get it automatically.
#
# Here we're going to get the data for March 2012, and clean it up.
#
# Here's a URL template you can use to get data for Montreal.

# In[3]:


url_template = "http://climate.weather.gc.ca/climateData/bulkdata_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit=Download+Data"


# To get the data for March 2012, we need to format it with `month=3, year=2012`.

# In[4]:


url = url_template.format(month=3, year=2012)
weather_mar2012 = pd.read_csv(url, skiprows=16, index_col='Date/Time',
                              parse_dates=True, encoding='latin1')


# This is super great! We can just use the same `read_csv` function as before, and just give it a URL as a filename. Awesome.
#
# There are 16 rows of metadata at the top of this CSV, but pandas knows CSVs are weird, so there's a `skiprows` option. We parse the dates again, and set 'Date/Time' to be the index column. Here's the resulting dataframe.

# In[5]:


weather_mar2012


# Let's plot it!

# In[6]:


weather_mar2012[u"Temp (\xc2\xb0C)"].plot(figsize=(15, 5))


# Notice how it goes up to 25° C in the middle there? That was a big deal. It was March, and people were wearing shorts outside.
#
# And I was out of town and I missed it. Still sad, humans.
#
# I had to write `'\xc2\xb0'` for that degree character ° -- reading the file as latin1 mangles it into two characters. Let's fix up the columns. We're going to just print them out, copy, and fix them up by hand.

# In[7]:


weather_mar2012.columns = [
    u'Year', u'Month', u'Day', u'Time', u'Data Quality', u'Temp (C)',
    u'Temp Flag', u'Dew Point Temp (C)', u'Dew Point Temp Flag',
    u'Rel Hum (%)', u'Rel Hum Flag', u'Wind Dir (10s deg)', u'Wind Dir Flag',
    u'Wind Spd (km/h)', u'Wind Spd Flag', u'Visibility (km)', u'Visibility Flag',
    u'Stn Press (kPa)', u'Stn Press Flag', u'Hmdx', u'Hmdx Flag',
    u'Wind Chill', u'Wind Chill Flag', u'Weather']
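# Before dropping anything, it can be handy to see how many missing values each
# column actually has. This is a small optional check, not part of the original
# walkthrough: any column with a non-zero count here is one that
# `dropna(axis=1, how='any')` below is going to throw away.

# In[ ]:


# Count missing values per column, and only show the columns that have any.
null_counts = weather_mar2012.isnull().sum()
print(null_counts[null_counts > 0])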
# You'll notice in the summary above that there are a few columns which are either entirely empty or only have a few values in them. Let's get rid of all of those with `dropna`.
#
# The argument `axis=1` to `dropna` means "drop columns, not rows", and `how='any'` means "drop the column if any value is null".
#
# This is much better now -- we only have columns with real data.

# In[8]:


weather_mar2012 = weather_mar2012.dropna(axis=1, how='any')
weather_mar2012[:5]


# The Year/Month/Day/Time columns are redundant, though, and the Data Quality column doesn't look too useful. Let's get rid of those.
#
# The `axis=1` argument means "drop columns", like before. The default for operations like `dropna` and `drop` is always to operate on rows.

# In[9]:


weather_mar2012 = weather_mar2012.drop(['Year', 'Month', 'Day', 'Time', 'Data Quality'], axis=1)
weather_mar2012[:5]


# Awesome! We now only have the relevant columns, and it's much more manageable.

# # 5.2 Plotting the temperature by hour of day
#
# This one's just for fun -- we've already done this before, using groupby and aggregate! We will learn whether or not it gets colder at night. Well, obviously. But let's do it anyway.

# In[10]:


temperatures = weather_mar2012[[u'Temp (C)']].copy()
print(temperatures.head())
temperatures.loc[:, 'Hour'] = weather_mar2012.index.hour
temperatures.groupby('Hour').aggregate(np.median).plot()


# So it looks like the time with the highest median temperature is 2pm. Neat.

# # 5.3 Getting the whole year of data
#
# Okay, so what if we want the data for the whole year? Ideally the API would just let us download that, but I couldn't figure out a way to do that.
#
# First, let's put our work from above into a function that gets the weather for a given month.
#
# I noticed that there's an irritating bug where when I ask for January, it gives me data for the previous year, so we'll fix that too. [no, really. You can check =)]

# In[11]:


def download_weather_month(year, month):
    # Work around the API bug: asking for January returns the previous year's data.
    if month == 1:
        year += 1
    url = url_template.format(year=year, month=month)
    weather_data = pd.read_csv(url, skiprows=16, index_col='Date/Time',
                               parse_dates=True, encoding='latin1')
    # Drop any column that has missing values.
    weather_data = weather_data.dropna(axis=1)
    # Strip the mangled degree sign out of the column names.
    weather_data.columns = [col.replace('\xc2\xb0', '') for col in weather_data.columns]
    weather_data = weather_data.drop(['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1)
    return weather_data


# We can test that this function does the right thing:

# In[12]:


download_weather_month(2012, 1)[:5]


# Now we can get all the months at once. This will take a little while to run.

# In[13]:


data_by_month = [download_weather_month(2012, i) for i in range(1, 13)]


# Once we have this, it's easy to concatenate all the dataframes together into one big dataframe using `pd.concat`. And now we have the whole year's data!

# In[14]:


weather_2012 = pd.concat(data_by_month)
weather_2012


# # 5.4 Saving to a CSV
#
# It's slow and unnecessary to download the data every time, so let's save our dataframe for later use!

# In[15]:


weather_2012.to_csv('../data/weather_2012.csv')


# And we're done!
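# One last optional sanity check, not part of the original walkthrough: read the
# CSV we just wrote straight back in and make sure it round-trips -- this reuses
# the same '../data/weather_2012.csv' path from above.

# In[ ]:


# Reload the saved file with the same index column and date parsing as before.
weather_2012_check = pd.read_csv('../data/weather_2012.csv',
                                 index_col='Date/Time', parse_dates=True)
# The shapes should match, and the columns should be the ones we kept.
print(weather_2012.shape, weather_2012_check.shape)
print(weather_2012_check.columns.tolist())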