#!/usr/bin/env python
# coding: utf-8

# # PyCon 2018: Using pandas for Better (and Worse) Data Science
# 
# ### GitHub repository: https://github.com/justmarkham/pycon-2018-tutorial
# 
# ### Instructor: Kevin Markham
# 
# - GitHub: https://github.com/justmarkham
# - Twitter: https://twitter.com/justmarkham
# - YouTube: https://www.youtube.com/dataschool
# - Website: http://www.dataschool.io

# In[1]:


import pandas as pd
pd.__version__


# In[2]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# # Dataset: Stanford Open Policing Project  ([video](https://www.youtube.com/watch?v=hl-TGI4550M&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=1))
# 
# https://openpolicing.stanford.edu/

# In[3]:


# ri stands for Rhode Island
ri = pd.read_csv('police.csv')


# In[4]:


# what does each row represent?
ri.head()


# In[5]:


# what do these numbers mean?
ri.shape


# In[6]:


# what do these types mean?
ri.dtypes


# - What does NaN mean?
# - Why might a value be missing?
# - Why mark it as NaN? Why not mark it as a 0 or an empty string or a string saying "Unknown"?

# In[7]:


# what are these counts? how does this work?
ri.isnull().sum()


# In[8]:


(True == 1) and (False == 0)


# ## 1. Remove the column that only contains missing values ([video](https://www.youtube.com/watch?v=TW5RqdDBasg&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=2))

# In[9]:


# axis=1 also works, inplace is False by default, inplace=True avoids assignment statement
ri.drop('county_name', axis='columns', inplace=True)


# In[10]:


ri.shape


# In[11]:


ri.columns


# In[12]:


# alternative method
ri.dropna(axis='columns', how='all').shape


# Lessons:
# 
# - Pay attention to default arguments
# - Check your work
# - There is more than one way to do everything in pandas

# ## 2. Do men or women speed more often? ([video](https://www.youtube.com/watch?v=d0oBRIONOEw&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=3))

# In[13]:


# when someone is stopped for speeding, how often is it a man or woman?
ri[ri.violation == 'Speeding'].driver_gender.value_counts(normalize=True)


# In[14]:


# alternative
ri.loc[ri.violation == 'Speeding', 'driver_gender'].value_counts(normalize=True)


# In[15]:


# when a man is pulled over, how often is it for speeding?
ri[ri.driver_gender == 'M'].violation.value_counts(normalize=True)


# In[16]:


# repeat for women
ri[ri.driver_gender == 'F'].violation.value_counts(normalize=True)


# In[17]:


# combines the two lines above
ri.groupby('driver_gender').violation.value_counts(normalize=True)


# What are some relevant facts that we don't know?
# 
# Lessons:
# 
# - There is more than one way to understand a question

# ## 3. Does gender affect who gets searched during a stop? ([video](https://www.youtube.com/watch?v=WzpGq1X5U1M&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=4))

# In[18]:


# ignore gender for the moment
ri.search_conducted.value_counts(normalize=True)


# In[19]:


# how does this work?
ri.search_conducted.mean()


# In[20]:


# search rate by gender
ri.groupby('driver_gender').search_conducted.mean()


# Does this prove that gender affects who gets searched?

# In[21]:


# include a second factor
ri.groupby(['violation', 'driver_gender']).search_conducted.mean()


# Does this prove causation?
# 
# Lessons:
# 
# - Causation is difficult to conclude, so focus on relationships
# - Include all relevant factors when studying a relationship

# ## 4. Why is search_type missing so often? ([video](https://www.youtube.com/watch?v=3D6smaE9c-g&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=5))

# In[22]:


ri.isnull().sum()


# In[23]:


# maybe search_type is missing any time search_conducted is False?
ri.search_conducted.value_counts()


# In[24]:


# test that theory, why is the Series empty?
ri[ri.search_conducted == False].search_type.value_counts()


# In[25]:


# value_counts ignores missing values by default
ri[ri.search_conducted == False].search_type.value_counts(dropna=False)


# In[26]:


# when search_conducted is True, search_type is never missing
ri[ri.search_conducted == True].search_type.value_counts(dropna=False)


# In[27]:


# alternative
ri[ri.search_conducted == True].search_type.isnull().sum()


# Lessons:
# 
# - Verify your assumptions about your data
# - pandas functions ignore missing values by default

# ## 5. During a search, how often is the driver frisked? ([video](https://www.youtube.com/watch?v=4tTO_xH4aQE&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=6))

# In[28]:


# multiple types are separated by commas
ri.search_type.value_counts(dropna=False)


# In[29]:


# use bracket notation when creating a column
ri['frisk'] = ri.search_type == 'Protective Frisk'


# In[30]:


ri.frisk.dtype


# In[31]:


# includes exact matches only
ri.frisk.sum()


# In[32]:


# is this the answer?
ri.frisk.mean()


# In[33]:


# uses the wrong denominator (includes stops that didn't involve a search)
ri.frisk.value_counts()


# In[34]:


161 / (91580 + 161)


# In[35]:


# includes partial matches
ri['frisk'] = ri.search_type.str.contains('Protective Frisk')


# In[36]:


# seems about right
ri.frisk.sum()


# In[37]:


# frisk rate during a search
ri.frisk.mean()


# In[38]:


# str.contains preserved missing values from search_type
ri.frisk.value_counts(dropna=False)


# In[39]:


# excludes stops that didn't involve a search
274 / (2922 + 274)


# Lessons:
# 
# - Use string methods to find partial matches
# - Use the correct denominator when calculating rates
# - pandas calculations ignore missing values
# - Apply the "smell test" to your results

# ## 6. Which year had the least number of stops? ([video](https://www.youtube.com/watch?v=W0zGzXQmE7c&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=7))

# In[40]:


# this works, but there's a better way
ri.stop_date.str.slice(0, 4).value_counts()


# In[41]:


# make sure you create this column
combined = ri.stop_date.str.cat(ri.stop_time, sep=' ')
ri['stop_datetime'] = pd.to_datetime(combined)


# In[42]:


ri.dtypes


# In[43]:


# why is 2005 so much smaller?
ri.stop_datetime.dt.year.value_counts()


# Lessons:
# 
# - Consider removing chunks of data that may be biased
# - Use the datetime data type for dates and times

# ## 7. How does drug activity change by time of day? ([video](https://www.youtube.com/watch?v=jV24N7SPXEU&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=8))

# In[44]:


ri.drugs_related_stop.dtype


# In[45]:


# baseline rate
ri.drugs_related_stop.mean()


# In[46]:


# can't groupby 'hour' unless you create it as a column
ri.groupby(ri.stop_datetime.dt.hour).drugs_related_stop.mean()


# In[47]:


# line plot by default (for a Series)
ri.groupby(ri.stop_datetime.dt.hour).drugs_related_stop.mean().plot()


# In[48]:


# alternative: count drug-related stops by hour
ri.groupby(ri.stop_datetime.dt.hour).drugs_related_stop.sum().plot()


# Lessons:
# 
# - Use plots to help you understand trends
# - Create exploratory plots using pandas one-liners

# ## 8. Do most stops occur at night? ([video](https://www.youtube.com/watch?v=GsQ6x3pt2w4&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=9))

# In[49]:


ri.stop_datetime.dt.hour.value_counts()


# In[50]:


ri.stop_datetime.dt.hour.value_counts().plot()


# In[51]:


ri.stop_datetime.dt.hour.value_counts().sort_index().plot()


# In[52]:


# alternative method
ri.groupby(ri.stop_datetime.dt.hour).stop_date.count().plot()


# Lessons:
# 
# - Be conscious of sorting when plotting

# ## 9. Find the bad data in the stop_duration column and fix it ([video](https://www.youtube.com/watch?v=8U8ob9bXakY&list=PL5-da3qGB5IBITZj_dYSFqnd_15JgqwA6&index=10))

# In[53]:


# mark bad data as missing
ri.stop_duration.value_counts()


# In[54]:


# what four things are wrong with this code?
# ri[ri.stop_duration == 1 | ri.stop_duration == 2].stop_duration = 'NaN'


# In[55]:


# what two things are still wrong with this code?
ri[(ri.stop_duration == '1') | (ri.stop_duration == '2')].stop_duration = 'NaN'


# In[56]:


# assignment statement did not work
ri.stop_duration.value_counts()


# In[57]:


# solves SettingWithCopyWarning
ri.loc[(ri.stop_duration == '1') | (ri.stop_duration == '2'), 'stop_duration'] = 'NaN'


# In[58]:


# confusing!
ri.stop_duration.value_counts(dropna=False)


# In[59]:


# replace 'NaN' string with actual NaN value
import numpy as np
ri.loc[ri.stop_duration == 'NaN', 'stop_duration'] = np.nan


# In[60]:


ri.stop_duration.value_counts(dropna=False)


# In[61]:


# alternative method
ri.stop_duration.replace(['1', '2'], value=np.nan, inplace=True)


# Lessons:
# 
# - Ambiguous data should be marked as missing
# - Don't ignore the SettingWithCopyWarning
# - NaN is not a string

# ## 10. What is the mean stop_duration for each violation_raw?

# In[62]:


# make sure you create this column
mapping = {'0-15 Min':8, '16-30 Min':23, '30+ Min':45}
ri['stop_minutes'] = ri.stop_duration.map(mapping)


# In[63]:


# matches value_counts for stop_duration
ri.stop_minutes.value_counts()


# In[64]:


ri.groupby('violation_raw').stop_minutes.mean()


# In[65]:


ri.groupby('violation_raw').stop_minutes.agg(['mean', 'count'])


# Lessons:
# 
# - Convert strings to numbers for analysis
# - Approximate when necessary
# - Use count with mean to looking for meaningless means

# ## 11. Plot the results of the first groupby from the previous exercise

# In[66]:


# what's wrong with this?
ri.groupby('violation_raw').stop_minutes.mean().plot()


# In[67]:


# how could this be made better?
ri.groupby('violation_raw').stop_minutes.mean().plot(kind='bar')


# In[68]:


ri.groupby('violation_raw').stop_minutes.mean().sort_values().plot(kind='barh')


# Lessons:
# 
# - Don't use a line plot to compare categories
# - Be conscious of sorting and orientation when plotting

# ## 12. Compare the age distributions for each violation

# In[69]:


# good first step
ri.groupby('violation').driver_age.describe()


# In[70]:


# histograms are excellent for displaying distributions
ri.driver_age.plot(kind='hist')


# In[71]:


# similar to a histogram
ri.driver_age.value_counts().sort_index().plot()


# In[72]:


# can't use the plot method
ri.hist('driver_age', by='violation')


# In[73]:


# what changed? how is this better or worse?
ri.hist('driver_age', by='violation', sharex=True)


# In[74]:


# what changed? how is this better or worse?
ri.hist('driver_age', by='violation', sharex=True, sharey=True)


# Lessons:
# 
# - Use histograms to show distributions
# - Be conscious of axes when using grouped plots

# ## 13. Pretend you don't have the driver_age column, and create it from driver_age_raw (and call it new_age)

# In[75]:


ri.head()


# In[76]:


# appears to be year of stop_date minus driver_age_raw
ri.tail()


# In[77]:


ri['new_age'] = ri.stop_datetime.dt.year - ri.driver_age_raw


# In[78]:


# compare the distributions
ri[['driver_age', 'new_age']].hist()


# In[79]:


# compare the summary statistics (focus on min and max)
ri[['driver_age', 'new_age']].describe()


# In[80]:


# calculate how many ages are outside that range
ri[(ri.new_age < 15) | (ri.new_age > 99)].shape


# In[81]:


# raw data given to the researchers
ri.driver_age_raw.isnull().sum()


# In[82]:


# age computed by the researchers (has more missing values)
ri.driver_age.isnull().sum()


# In[83]:


# what does this tell us? researchers set driver_age as missing if less than 15 or more than 99
5621-5327


# In[84]:


# driver_age_raw NOT MISSING, driver_age MISSING
ri[(ri.driver_age_raw.notnull()) & (ri.driver_age.isnull())].head()


# In[85]:


# set the ages outside that range as missing
ri.loc[(ri.new_age < 15) | (ri.new_age > 99), 'new_age'] = np.nan


# In[86]:


ri.new_age.equals(ri.driver_age)


# Lessons:
# 
# - Don't assume that the head and tail are representative of the data
# - Columns with missing values may still have bad data (driver_age_raw)
# - Data cleaning sometimes involves guessing (driver_age)
# - Use histograms for a sanity check