#!/usr/bin/env python
# coding: utf-8

# In[113]:


# The usual preamble
get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

# This is necessary to show lots of columns in pandas 0.12.
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)


# One of the main problems with messy data is: how do you know if it's messy or not?
#
# We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy.

# In[114]:


requests = pd.read_csv('../data/311-service-requests.csv', dtype='unicode')


# # 7.1 How do we know if it's messy?

# We're going to look at a few columns here. I already know that there are some problems with the zip code, so let's look at that first.
#
# To get a sense for whether a column has problems, I usually use `.unique()` to look at all its values. If it's a numeric column, I'll plot a histogram instead to get a sense of the distribution.
#
# When we look at the unique values in "Incident Zip", it quickly becomes clear that this column is a mess.
#
# Some of the problems:
#
# * Some values have been parsed as strings, and some as floats
# * There are `nan`s
# * Some of the zip codes are `29616-0759` or `83`
# * There are some N/A values that pandas didn't recognize, like 'N/A' and 'NO CLUE'
#
# What we can do:
#
# * Normalize 'N/A' and 'NO CLUE' into regular `nan` values
# * Look at what's up with the 83, and decide what to do
# * Make everything strings

# In[115]:


requests['Incident Zip'].unique()


# # 7.2 Fixing the nan values and string/float confusion

# We can pass a `na_values` option to `pd.read_csv` to clean this up a little bit. We can also specify that the type of Incident Zip is a string, not a float.

# In[116]:


na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv',
                       na_values=na_values,
                       dtype={'Incident Zip': str})


# In[117]:


requests['Incident Zip'].unique()


# # 7.3 What's up with the dashes?

# In[118]:


rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
len(requests[rows_with_dashes])


# In[119]:


requests[rows_with_dashes]


# I thought these were missing data and originally deleted them like this:
#
# `requests['Incident Zip'][rows_with_dashes] = np.nan`
#
# But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them.

# In[120]:


long_zip_codes = requests['Incident Zip'].str.len() > 5
requests.loc[long_zip_codes, 'Incident Zip'].unique()


# Those all look okay to truncate to me.

# In[121]:


requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)


# Done.
#
# Earlier I thought 00083 was a broken zip code, but it turns out that 00083 is Central Park's zip code! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at those.

# In[122]:


requests[requests['Incident Zip'] == '00000']


# This looks bad to me. Let's set these to nan.

# In[123]:


zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan


# Great. Let's see where we are now:

# In[124]:


# Drop the nans first so the remaining strings sort cleanly
unique_zips = requests['Incident Zip'].dropna().unique()
unique_zips.sort()
unique_zips


# Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google Maps, and that's in Texas.
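# Zip code prefixes roughly encode geography -- codes starting with '0' or '1'
# cover the Northeast (including New York), while '7' is around Texas and '9'
# is the West Coast. As a quick, rough check (a minimal sketch; the
# prefix-to-region mapping is approximate and not part of the original
# analysis), we can tally the first digit of each zip to see how many requests
# fall outside the NYC range:

# In[ ]:


requests['Incident Zip'].str[0].value_counts()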
# Let's take a closer look:

# In[125]:


zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (This isn't
# quite right -- 13221 is in Syracuse, which is hardly close to NYC, but it
# starts with a '1' so it slips through.)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so
# we'll say they're False
is_far = ~(is_close) & zips.notnull()


# In[126]:


zips[is_far]


# In[127]:


requests.loc[is_far, ['Incident Zip', 'Descriptor', 'City']].sort_values('Incident Zip')


# Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.

# In[128]:


requests['City'].str.upper().value_counts()


# It looks like these are legitimate complaints, so we'll just leave them alone.

# # 7.4 Putting it together

# Here's what we ended up doing to clean up our zip codes, all together:

# In[129]:


na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv',
                       na_values=na_values,
                       dtype={'Incident Zip': str})


# In[130]:


def fix_zip_codes(zips):
    # Truncate everything to length 5
    zips = zips.str.slice(0, 5)

    # Set 00000 zip codes to nan
    zero_zips = zips == '00000'
    zips.loc[zero_zips] = np.nan

    return zips


# In[131]:


requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])


# In[132]:


requests['Incident Zip'].unique()
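# As a final sanity check (a minimal sketch, assuming the cleanup above has
# already run), we can confirm that every non-null zip code is now exactly
# five digits. This should print True:

# In[ ]:


requests['Incident Zip'].dropna().str.match(r'^\d{5}$').all()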