#!/usr/bin/env python
# coding: utf-8

# In[26]:


# The usual preamble
# `%matplotlib inline` is an IPython magic: `get_ipython` is only defined when
# this exported notebook runs inside IPython/Jupyter. Skip it under plain
# Python instead of crashing with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import pandas as pd
import matplotlib.pyplot as plt

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')

# This is necessary to show lots of columns in pandas 0.12.
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)

plt.rcParams['figure.figsize'] = (15, 5)


# We're going to use a new dataset here, to demonstrate how to deal with
# larger datasets. This is a subset of the 311 service requests from
# [NYC Open Data](https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9).

# In[41]:


# Some columns have mixed types, so we specify dtype to prevent any errors.
complaints = pd.read_csv('../data/311-service-requests.csv', dtype='unicode')


# Without an explicit dtype, depending on your pandas version, you might see
# a warning like "DtypeWarning: Columns (8) have mixed types". This means that
# pandas encountered a problem reading in our data. In this case it almost
# certainly means that it has columns where some of the entries are strings
# and some are integers.
#
# For now we sidestep the warning by passing `dtype` above and hope we don't
# run into a problem, but in the long run we'd need to investigate which
# columns are mixed and why.

# # 2.1 What's even in it? (the summary)

# When you print a large dataframe, it will only show you the first few rows.
#
# If you don't see this, don't panic! The default behavior for large
# dataframes changed between pandas 0.12 and 0.13. Previous to 0.13 it would
# show you a summary of the dataframe. This includes all the columns, and how
# many non-null values there are in each column.

# In[28]:


complaints


# # 2.2 Selecting columns and rows

# To select a column, we index with the name of the column, like this:

# In[29]:


complaints['Complaint Type']


# To get the first 5 rows of a dataframe, we can use a slice: `df[:5]`.
#
# This is a great way to get a sense for what kind of information is in the
# dataframe -- take a minute to look at the contents and get a feel for this
# dataset.

# In[30]:


complaints[:5]


# We can combine these to get the first 5 rows of a column:

# In[31]:


complaints['Complaint Type'][:5]


# and it doesn't matter which direction we do it in:

# In[32]:


complaints[:5]['Complaint Type']


# # 2.3 Selecting multiple columns

# What if we just want to know the complaint type and the borough, but not
# the rest of the information? Pandas makes it really easy to select a subset
# of the columns: just index with a list of the columns you want.

# In[33]:


complaints[['Complaint Type', 'Borough']]


# That showed us a summary, and then we can look at the first 10 rows:

# In[34]:


complaints[['Complaint Type', 'Borough']][:10]


# # 2.4 What's the most common complaint type?

# This is a really easy question to answer! There's a `.value_counts()`
# method that we can use:

# In[35]:


complaints['Complaint Type'].value_counts()


# If we just wanted the top 10 most common complaints, we can do this:

# In[36]:


complaint_counts = complaints['Complaint Type'].value_counts()
complaint_counts[:10]


# But it gets better! We can plot them!

# In[37]:


complaint_counts[:10].plot(kind='bar')


#