#!/usr/bin/env python
# coding: utf-8

# In[7]:

# `get_ipython` is only defined inside IPython/Jupyter. Guard the magic so
# this exported notebook can also be run as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# The pandas 'display.mpl_style' option was deprecated and later removed, so
# setting it fails on current pandas. matplotlib style sheets are the
# replacement; 'ggplot' is chosen here as a comparable look -- adjust to taste.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'

# We saw earlier that pandas is really good at dealing with dates. It is also
# amazing with strings! We're going to go back to our weather data from
# Chapter 5, here.

# In[8]:

weather_2012 = pd.read_csv(
    '../data/weather_2012.csv',
    parse_dates=True,
    index_col='Date/Time',
)
weather_2012[:5]

# # 6.1 String operations

# You'll see that the 'Weather' column has a text description of the weather
# that was going on each hour. We'll assume it's snowing if the text
# description contains "Snow".
#
# pandas provides vectorized string functions, to make it easy to operate on
# columns containing text. There are some great
# [examples](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html)
# in the documentation.

# In[9]:

weather_description = weather_2012['Weather']
is_snowing = weather_description.str.contains('Snow')

# This gives us a binary vector, which is a bit hard to look at, so we'll
# plot it.

# In[10]:

# Not super useful
is_snowing[:5]

# In[11]:

# More useful!
# Cast to float before plotting: some pandas versions refuse to plot a
# boolean Series ("no numeric data to plot"). `is_snowing` itself stays
# boolean for the cells below.
is_snowing.astype(float).plot()

# # 6.2 Use resampling to find the snowiest month

# If we wanted the median temperature each month, we could use the
# `resample()` method like this:

# In[12]:

# `resample(..., how=...)` was removed from pandas; call the aggregation on
# the resampler object instead.
# NOTE: pandas >= 2.2 spells the month-end alias 'ME' and deprecates 'M'.
weather_2012['Temp (C)'].resample('M').median().plot(kind='bar')

# Unsurprisingly, July and August are the warmest.

# So we can think of snowiness as being a bunch of 1s and 0s instead of
# `True`s and `False`s:

# In[13]:

is_snowing.astype(float)[:10]

# and then use `resample` to find the fraction of time it was snowing each
# month (the mean of the 0/1 values)

# In[14]:

is_snowing.astype(float).resample('M').mean()

# In[15]:

is_snowing.astype(float).resample('M').mean().plot(kind='bar')

# So now we know!
# In 2012, December was the snowiest month. Also, this graph suggests
# something that I feel -- it starts snowing pretty abruptly in November, and
# then tapers off slowly and takes a long time to stop, with the last snow
# usually being in April or May.

# # 6.3 Plotting temperature and snowiness stats together

# We can also combine these two statistics (temperature, and snowiness) into
# one dataframe and plot them together:

# In[16]:

# `resample(..., how=...)` was removed from pandas; aggregate via the
# resampler's own methods instead.
# NOTE: pandas >= 2.2 spells the month-end alias 'ME' and deprecates 'M'.
temperature = weather_2012['Temp (C)'].resample('M').median()
is_snowing = weather_2012['Weather'].str.contains('Snow')
# Mean of the 0/1 values = fraction of hourly observations mentioning snow.
snowiness = is_snowing.astype(float).resample('M').mean()

# Name the columns
temperature.name = "Temperature"
snowiness.name = "Snowiness"

# We'll use `concat` again to combine the two statistics into a single
# dataframe.

# In[17]:

stats = pd.concat([temperature, snowiness], axis=1)
stats

# In[18]:

stats.plot(kind='bar')

# Uh, that didn't work so well because the scale was wrong. We can do better
# by plotting them on two separate graphs:

# In[19]:

stats.plot(kind='bar', subplots=True, figsize=(15, 10))

#