Table of Contents
https://plot.ly/python/big-data-analytics-with-pandas-and-sqlite/
import pandas as pd
import datetime as dt
from IPython.display import display
import plotly.plotly as py # interactive graphing
from plotly.graph_objs import Bar, Scatter, Marker, Layout
from os import path
%%sh
ls -l ~/Downloads/*.csv
# a 9gb file!
-rw-r--r-- 1 takanori takanori 9087741962 Sep 15 02:28 /home/takanori/Downloads/311_Service_Requests_from_2010_to_Present.csv
# Path to the raw NYC 311 dataset (~9 GB csv per the `ls -l` output above).
filepath = path.expanduser('~/Downloads/311_Service_Requests_from_2010_to_Present.csv')
# Peek at just the first two rows (nrows=2) so we never load the full 9 GB file.
pd.read_csv(filepath,nrows=2).head()
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Description | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33504340 | 06/03/2016 08:51:27 PM | 06/09/2016 12:41:22 PM | HPD | Department of Housing Preservation and Develop... | UNSANITARY CONDITION | MOLD | RESIDENTIAL BUILDING | 10469 | 3471 MICKLE AVENUE | MICKLE AVENUE | NaN | NaN | NaN | NaN | ADDRESS | BRONX | NaN | NaN | Closed | NaN | The Department of Housing Preservation and Dev... | 06/09/2016 12:41:22 PM | 12 BRONX | BRONX | 1026814 | 259342 | Unspecified | BRONX | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.8784 | -73.8461 | (40.87840550821806, -73.84608300411983) |
1 | 33504341 | 06/02/2016 03:55:00 PM | 06/06/2016 12:00:00 PM | DSNY | A - Illegal Posting Staten Island, Queens and ... | Dirty Conditions | E15 Illegal Postering | Sidewalk | 11228 | NaN | NaN | NaN | NaN | 15 AVENUE | BAY RIDGE PARKWAY | INTERSECTION | BROOKLYN | NaN | NaN | Closed | NaN | The Department of Sanitation has removed illeg... | 06/06/2016 12:00:00 PM | 11 BROOKLYN | BROOKLYN | 983410 | 164301 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.6176 | -74.0030 | (40.61764469366998, -74.00302568205309) |
pd.read_csv(filepath,nrows=2).tail()
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Description | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33504340 | 06/03/2016 08:51:27 PM | 06/09/2016 12:41:22 PM | HPD | Department of Housing Preservation and Develop... | UNSANITARY CONDITION | MOLD | RESIDENTIAL BUILDING | 10469 | 3471 MICKLE AVENUE | MICKLE AVENUE | NaN | NaN | NaN | NaN | ADDRESS | BRONX | NaN | NaN | Closed | NaN | The Department of Housing Preservation and Dev... | 06/09/2016 12:41:22 PM | 12 BRONX | BRONX | 1026814 | 259342 | Unspecified | BRONX | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.8784 | -73.8461 | (40.87840550821806, -73.84608300411983) |
1 | 33504341 | 06/02/2016 03:55:00 PM | 06/06/2016 12:00:00 PM | DSNY | A - Illegal Posting Staten Island, Queens and ... | Dirty Conditions | E15 Illegal Postering | Sidewalk | 11228 | NaN | NaN | NaN | NaN | 15 AVENUE | BAY RIDGE PARKWAY | INTERSECTION | BROOKLYN | NaN | NaN | Closed | NaN | The Department of Sanitation has removed illeg... | 06/06/2016 12:00:00 PM | 11 BROOKLYN | BROOKLYN | 983410 | 164301 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.6176 | -74.0030 | (40.61764469366998, -74.00302568205309) |
%%bash
DATA_FILE="${HOME}/Downloads/311_Service_Requests_from_2010_to_Present.csv"
#wc -l < ${DATA_FILE}
Process is interrupted.
from sqlalchemy import create_engine
# SQLAlchemy engine backing both df.to_sql(...) writes and pd.read_sql_query(...) reads below.
disk_engine = create_engine('sqlite:///mytest.db') # Initializes database with filename mytest.db in current directory
!ls
bokeh-try-movies.ipynb plotly_js Choose my geolocator.ipynb plotly-sql.ipynb ipywidgets readme.rst matplotlib-demo RFXATDASL0S2I8PD3YMI4G53TBDSYK78.png nilearn sklearn old-stuffs sqlite3 Pandas-snippets.ipynb streaming plotly
start = dt.datetime.now()
chunksize = 2000
j = 0            # number of chunks written so far
index_start = 1  # next DataFrame index value, giving rows a global running index

# Columns worth keeping; everything else is dropped before writing to SQLite.
# (The original keep-list repeated 'CreatedDate'/'ClosedDate' and named a
# non-existent 'TimeToCompletion' column -- harmless in a keep-list, but
# misleading; both removed here.)
keep_columns = {'Agency', 'CreatedDate', 'ClosedDate', 'ComplaintType',
                'Descriptor', 'City'}

# Stream the 9 GB csv in 2000-row chunks so memory stays bounded.
for df in pd.read_csv(filepath, chunksize=chunksize, iterator=True, encoding='utf-8'):
    # Remove spaces from column names so they are friendly SQL identifiers.
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df['CreatedDate'] = pd.to_datetime(df['CreatedDate'])  # Convert to datetimes
    df['ClosedDate'] = pd.to_datetime(df['ClosedDate'])
    df.index += index_start

    # Keep only the interesting columns: one column selection instead of
    # the original per-column df.drop() loop.
    df = df[[c for c in df.columns if c in keep_columns]]

    j += 1
    print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j * chunksize))

    df.to_sql('data', disk_engine, if_exists='append')
    index_start = df.index[-1] + 1
    if j * chunksize >= 10000:
        # i don't want it to get TOO big for this prototype purpose
        break
0 seconds: completed 2000 rows 2 seconds: completed 4000 rows 3 seconds: completed 6000 rows 4 seconds: completed 8000 rows 5 seconds: completed 10000 rows
!du -h mytest.db
2.8M mytest.db
pd.read_sql_query('SELECT * FROM data LIMIT 1', disk_engine).columns.tolist()
[u'index', u'CreatedDate', u'ClosedDate', u'Agency', u'ComplaintType', u'Descriptor', u'City']
pd.read_sql_query('SELECT * FROM data LIMIT 3', disk_engine)
index | CreatedDate | ClosedDate | Agency | ComplaintType | Descriptor | City | |
---|---|---|---|---|---|---|---|
0 | 1 | 2016-06-03 20:51:27.000000 | 2016-06-09 12:41:22.000000 | HPD | UNSANITARY CONDITION | MOLD | BRONX |
1 | 2 | 2016-06-02 15:55:00.000000 | 2016-06-06 12:00:00.000000 | DSNY | Dirty Conditions | E15 Illegal Postering | BROOKLYN |
2 | 3 | 2016-06-03 06:24:00.000000 | 2016-06-03 12:00:00.000000 | DSNY | Dirty Conditions | E3 Dirty Sidewalk | STATEN ISLAND |
# Project just three columns; the adjacent string literals are concatenated
# by Python into one SQL statement.
pd.read_sql_query('SELECT ComplaintType, Descriptor, Agency '
'FROM data '
'LIMIT 10', disk_engine)
ComplaintType | Descriptor | Agency | |
---|---|---|---|
0 | UNSANITARY CONDITION | MOLD | HPD |
1 | Dirty Conditions | E15 Illegal Postering | DSNY |
2 | Dirty Conditions | E3 Dirty Sidewalk | DSNY |
3 | Missed Collection (All Materials) | 1 Missed Collection | DSNY |
4 | Missed Collection (All Materials) | 1 Missed Collection | DSNY |
5 | Missed Collection (All Materials) | 2 Bulk-Missed Collection | DSNY |
6 | Derelict Vehicles | 14 Derelict Vehicles | DSNY |
7 | Noise - Residential | Loud Music/Party | NYPD |
8 | Sanitation Condition | 15 Street Cond/Dump-Out/Drop-Off | DSNY |
9 | Sanitation Condition | 15 Street Cond/Dump-Out/Drop-Off | DSNY |
# Filter rows with WHERE.
# NOTE(review): double-quoted "NYPD" relies on SQLite's leniency (it falls back
# to treating an unknown double-quoted identifier as a string literal);
# standard SQL would use single quotes.
pd.read_sql_query('SELECT ComplaintType, Descriptor, Agency '
'FROM data '
'WHERE Agency = "NYPD" '
'LIMIT 10', disk_engine)
ComplaintType | Descriptor | Agency | |
---|---|---|---|
0 | Noise - Residential | Loud Music/Party | NYPD |
1 | Vending | Unlicensed | NYPD |
2 | Blocked Driveway | No Access | NYPD |
3 | Derelict Vehicle | With License Plate | NYPD |
4 | Illegal Parking | Blocked Hydrant | NYPD |
5 | Urinating in Public | None | NYPD |
6 | Noise - Commercial | Car/Truck Horn | NYPD |
7 | Noise - Vehicle | Car/Truck Horn | NYPD |
8 | Illegal Parking | Commercial Overnight Parking | NYPD |
9 | Noise - Street/Sidewalk | Loud Music/Party | NYPD |
# Filter on multiple agencies with IN; .T transposes the result for compact display.
# (There is no space between ')' and 'LIMIT' after concatenation, but the SQLite
# tokenizer still splits the tokens correctly -- output above confirms it ran.)
pd.read_sql_query('SELECT ComplaintType, Descriptor, Agency '
'FROM data '
'WHERE Agency IN ("NYPD", "DOB")'
'LIMIT 50', disk_engine).T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ComplaintType | Noise - Residential | Vending | Blocked Driveway | Derelict Vehicle | Illegal Parking | Urinating in Public | Noise - Commercial | Noise - Vehicle | Illegal Parking | Noise - Street/Sidewalk | Illegal Parking | Noise - Street/Sidewalk | Non-Emergency Police Matter | Illegal Parking | Noise - Residential | Homeless Encampment | Noise - Residential | Noise - Residential | Blocked Driveway | Illegal Parking | Blocked Driveway | Blocked Driveway | Illegal Parking | Illegal Parking | Noise - Residential | Noise - Residential | Noise - Residential | Noise - Street/Sidewalk | Blocked Driveway | Derelict Vehicle | Noise - Residential | Blocked Driveway | General Construction/Plumbing | Noise - Residential | Noise - Street/Sidewalk | Noise - Residential | Elevator | General Construction/Plumbing | General Construction/Plumbing | Noise - Residential | Noise - Residential | Plumbing | Building/Use | Special Enforcement | Building/Use | General Construction/Plumbing | General Construction/Plumbing | General Construction/Plumbing | Noise - Residential | Noise - Street/Sidewalk |
Descriptor | Loud Music/Party | Unlicensed | No Access | With License Plate | Blocked Hydrant | None | Car/Truck Horn | Car/Truck Horn | Commercial Overnight Parking | Loud Music/Party | Blocked Hydrant | Loud Music/Party | Trespassing | Overnight Commercial Storage | Loud Talking | None | Loud Music/Party | Loud Music/Party | No Access | Blocked Sidewalk | No Access | No Access | Double Parked Blocking Traffic | Blocked Hydrant | Loud Music/Party | Loud Music/Party | Loud Music/Party | Loud Music/Party | No Access | With License Plate | Loud Music/Party | No Access | Cons - Contrary/Beyond Approved Plans/Permits | Loud Talking | Loud Music/Party | Loud Music/Party | Elevator - Defective/Not Working | Egress - Doors Locked/Blocked/Improper/No Seco... | Working Contrary To Stop Work Order | Loud Music/Party | Loud Music/Party | Failure To Retain Water/Improper Drainage- (LL... | Illegal Conversion Of Residential Building/Space | SEP - Professional Certification Compliance Audit | Illegal Conversion Of Residential Building/Space | Site Conditions Endangering Workers | Working Contrary To Stop Work Order | Sidewalk Shed/Pipe Scafford - Inadequate Defec... | Loud Music/Party | Loud Music/Party |
Agency | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | NYPD | DOB | NYPD | NYPD | NYPD | DOB | DOB | DOB | NYPD | NYPD | DOB | DOB | DOB | DOB | DOB | DOB | DOB | NYPD | NYPD |
# unique values with DISTINCT (recall pyspark)
# Note the result mixes upper/lower-case duplicates ('Flushing' vs 'FLUSHING')
# and a None entry -- handled further below with COLLATE NOCASE.
pd.read_sql_query('SELECT DISTINCT City FROM data', disk_engine).T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
City | BRONX | BROOKLYN | STATEN ISLAND | South Richmond Hill | NEW YORK | Rego Park | CORONA | Maspeth | WOODHAVEN | ROCKAWAY PARK | None | QUEENS VILLAGE | FAR ROCKAWAY | ASTORIA | Jamaica | HOLLIS | Richmond Hill | SAINT ALBANS | EAST ELMHURST | Corona | Astoria | Whitestone | Queens Village | Elmhurst | Bayside | Fresh Meadows | Middle Village | Flushing | Jackson Heights | Arverne | Rosedale | Saint Albans | Woodside | Sunnyside | Forest Hills | FRESH MEADOWS | ELMHURST | SOUTH OZONE PARK | WOODSIDE | FLUSHING | LONG ISLAND CITY | RICHMOND HILL | RIDGEWOOD | LITTLE NECK | OZONE PARK | MIDDLE VILLAGE | HOWARD BEACH | FOREST HILLS | South Ozone Park | SUNNYSIDE | JAMAICA | BAYSIDE | Far Rockaway | KEW GARDENS | OAKLAND GARDENS | Long Island City | Woodhaven | Cambria Heights | SOUTH RICHMOND HILL | FLORAL PARK | Springfield Gardens | East Elmhurst | Bellerose | Little Neck | Ozone Park | MASPETH | Ridgewood | Glen Oaks | WHITESTONE | CAMBRIA HEIGHTS | JACKSON HEIGHTS | SPRINGFIELD GARDENS | COLLEGE POINT | Oakland Gardens | Hollis | College Point | ROSEDALE | GLEN OAKS | REGO PARK | Howard Beach | ARVERNE | Kew Gardens | BELLEROSE | NEW HYDE PARK | Rockaway Park | NEWARK | Floral Park | BELLMORE | PARSIPPANY | PELHAM MANOR |
# Count complaints per agency with GROUP BY + COUNT(*).
# (The closing backtick abuts 'FROM' after concatenation; SQLite tokenizes it fine.)
pd.read_sql_query('SELECT Agency, COUNT(*) as `num_complaints`'
'FROM data '
'GROUP BY Agency ', disk_engine).T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Agency | 3-1-1 | DCA | DEP | DFTA | DHS | DOB | DOE | DOF | DOHMH | DOITT | DOT | DPR | DSNY | EDC | FDNY | HPD | HRA | NYCEM | NYPD | OPS | TLC |
num_complaints | 21 | 183 | 1673 | 59 | 514 | 716 | 22 | 503 | 739 | 20 | 2290 | 1053 | 1945 | 4 | 87 | 3230 | 181 | 2 | 8989 | 2 | 267 |
# Same per-agency counts, sorted descending: ORDER BY -num_complaints negates
# the count so ascending order of the negation gives largest-first.
df = pd.read_sql_query('SELECT Agency, COUNT(*) as `num_complaints`'
'FROM data '
'GROUP BY Agency '
'ORDER BY -num_complaints', disk_engine)
df.T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Agency | NYPD | HPD | DOT | DSNY | DEP | DPR | DOHMH | DOB | DHS | DOF | TLC | DCA | HRA | FDNY | DFTA | DOE | 3-1-1 | DOITT | EDC | NYCEM | OPS |
num_complaints | 8989 | 3230 | 2290 | 1945 | 1673 | 1053 | 739 | 716 | 514 | 503 | 267 | 183 | 181 | 87 | 59 | 22 | 21 | 20 | 4 | 2 | 2 |
py.iplot([Bar(x=df.Agency, y=df.num_complaints)], filename='311/most common complaints by agency')
# Count complaints by type, largest first, and plot as a bar chart.
df = pd.read_sql_query('SELECT ComplaintType, COUNT(*) as `num_complaints`, Agency '
'FROM data '
'GROUP BY `ComplaintType` '
'ORDER BY -num_complaints', disk_engine)
most_common_complaints = df # used later
py.iplot({
'data': [Bar(x=df['ComplaintType'], y=df.num_complaints)],
'layout': {
'margin': {'b': 150}, # Make the bottom margin a bit bigger to handle the long text
'xaxis': {'tickangle': 40}} # Angle the labels a bit
}, filename='311/most common complaints by complaint type')
# number of cities recorded in the data
# (90 here -- includes case-variant duplicates and a None entry, per the DISTINCT list above)
len(pd.read_sql_query('SELECT DISTINCT City FROM data', disk_engine))
90
# Top 25 cities by complaint count (case-SENSITIVE grouping: 'Jamaica' and
# 'JAMAICA' count separately here).
pd.read_sql_query('SELECT City, COUNT(*) as `num_complaints` '
'FROM data '
'GROUP BY `City` '
'ORDER BY -num_complaints '
'LIMIT 25 ', disk_engine).T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
City | BROOKLYN | NEW YORK | BRONX | None | STATEN ISLAND | ASTORIA | JAMAICA | FLUSHING | Jamaica | RIDGEWOOD | CORONA | OZONE PARK | Astoria | Flushing | ELMHURST | WOODSIDE | SOUTH RICHMOND HILL | LONG ISLAND CITY | RICHMOND HILL | QUEENS VILLAGE | Ridgewood | SOUTH OZONE PARK | EAST ELMHURST | FRESH MEADOWS | JACKSON HEIGHTS |
num_complaints | 6793 | 4655 | 3571 | 1071 | 1036 | 296 | 249 | 227 | 215 | 205 | 172 | 153 | 145 | 144 | 125 | 117 | 107 | 106 | 104 | 102 | 96 | 95 | 91 | 91 | 89 |
# repeat above but with case insensitivity:
# GROUP BY `City` COLLATE NOCASE merges case-variant spellings ('Jamaica' + 'JAMAICA').
df = pd.read_sql_query('SELECT City, COUNT(*) as `num_complaints` '
'FROM data '
'GROUP BY `City` '
'COLLATE NOCASE '
'ORDER BY -num_complaints '
'LIMIT 11 ', disk_engine)
df
City | num_complaints | |
---|---|---|
0 | BROOKLYN | 6793 |
1 | NEW YORK | 4655 |
2 | BRONX | 3571 |
3 | None | 1071 |
4 | STATEN ISLAND | 1036 |
5 | JAMAICA | 464 |
6 | ASTORIA | 441 |
7 | FLUSHING | 371 |
8 | RIDGEWOOD | 301 |
9 | CORONA | 253 |
10 | OZONE PARK | 221 |
# Pull the top-11 city names into a plain Python list (still contains a None entry).
cities = list(df.City)
print cities
[u'BROOKLYN', u'NEW YORK', u'BRONX', None, u'STATEN ISLAND', u'JAMAICA', u'ASTORIA', u'FLUSHING', u'RIDGEWOOD', u'CORONA', u'OZONE PARK']
# oh, remove NONE!
# Use a filter instead of cities.remove(None): remove() raises ValueError when
# no None is present (e.g. if this cell is re-run), while filtering is idempotent
# and also handles multiple Nones.
cities = [city for city in cities if city is not None]
from pprint import pprint
# Pretty-print the cleaned city list.
pprint(cities)
[u'BROOKLYN', u'NEW YORK', u'BRONX', u'STATEN ISLAND', u'JAMAICA', u'ASTORIA', u'FLUSHING', u'RIDGEWOOD', u'CORONA', u'OZONE PARK']
traces = []  # the series in the graph - one trace for each city
for city in cities:
    # Parameterized query (params=) instead of interpolating the city name into
    # the SQL text with str.format -- avoids quoting/injection problems if a
    # name ever contains quotes.
    df = pd.read_sql_query('SELECT ComplaintType, COUNT(*) as `num_complaints` '
                           'FROM data '
                           'WHERE City = ? COLLATE NOCASE '
                           'GROUP BY `ComplaintType` '
                           'ORDER BY -num_complaints',
                           disk_engine, params=(city,))
    traces.append(Bar(x=df['ComplaintType'], y=df.num_complaints, name=city.capitalize()))
# Stacked bar chart: one colored segment per city within each complaint-type bar.
py.iplot({'data': traces,
'layout': Layout(barmode='stack',
xaxis={'tickangle': 40},
margin={'b': 150})},
filename='311/complaints by city stacked')
# Normalize each city's counts to percentages so cities of different sizes
# are comparable.
for bar_trace in traces:
    total_count = sum(bar_trace['y'])
    bar_trace['y'] = 100. * bar_trace['y'] / total_count
# Grouped (side-by-side) bars of the normalized percentages; the x-range is
# clamped to the first ~16 complaint types.
py.iplot({'data': traces,
'layout': Layout(
barmode='group',
xaxis={'tickangle': 40, 'autorange': False, 'range': [-0.5, 16]},
yaxis={'title': 'Percent of Complaints by City'},
margin={'b': 150},
title='Relative Number of 311 Complaints by City')
}, filename='311/relative complaints by city', validate=False)
New York is loud
Staten Island is moldy, wet, and vacant
Flushing's muni meters are broken
Trash collection is great in the Bronx
Woodside doesn't like its graffiti
Filter SQLite rows with timestamp strings: YYYY-MM-DD hh:mm:ss
# String comparison works for filtering because 'YYYY-MM-DD hh:mm:ss' sorts
# lexicographically in chronological order.
# (Empty result here: this prototype DB only contains 2016 rows, and the
# filter window is in 2014.)
pd.read_sql_query('SELECT ComplaintType, CreatedDate, City '
'FROM data '
'WHERE CreatedDate < "2014-11-16 23:47:00" '
'AND CreatedDate > "2014-11-16 23:45:00"', disk_engine)
ComplaintType | CreatedDate | City |
---|
# Pull out the hour unit from timestamps with strftime
# strftime('%H', ...) yields a zero-padded two-character hour string ('00'-'23').
df = pd.read_sql_query('SELECT CreatedDate, '
'strftime(\'%H\', CreatedDate) as hour, '
'ComplaintType '
'FROM data '
'LIMIT 5 ', disk_engine)
df.head()
CreatedDate | hour | ComplaintType | |
---|---|---|---|
0 | 2016-06-03 20:51:27.000000 | 20 | UNSANITARY CONDITION |
1 | 2016-06-02 15:55:00.000000 | 15 | Dirty Conditions |
2 | 2016-06-03 06:24:00.000000 | 06 | Dirty Conditions |
3 | 2016-06-03 09:10:00.000000 | 09 | Missed Collection (All Materials) |
4 | 2016-06-03 10:39:00.000000 | 10 | Missed Collection (All Materials) |
# Complaint counts bucketed by hour of day.
# NOTE: the bare CreatedDate column in this aggregate query comes from an
# arbitrary row of each hour group (SQLite's bare-column-in-aggregate behavior);
# only `hour` and the count are meaningful.
df = pd.read_sql_query('SELECT CreatedDate, '
'strftime(\'%H\', CreatedDate) as hour, '
'count(*) as `Complaints per Hour`'
'FROM data '
'GROUP BY hour', disk_engine)
df.T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CreatedDate | 2016-06-05 00:00:00.000000 | 2016-06-05 01:26:07.000000 | 2016-06-05 02:03:50.000000 | 2016-06-04 03:51:13.000000 | 2016-06-04 04:38:14.000000 | 2016-06-04 05:40:45.000000 | 2016-06-04 06:30:39.000000 | 2016-06-04 07:59:53.000000 | 2016-06-04 08:33:42.000000 | 2016-06-04 09:17:24.000000 | 2016-06-04 10:20:14.000000 | 2016-06-04 11:52:55.000000 | 2016-06-04 12:42:13.000000 | 2016-06-04 13:19:24.000000 | 2016-06-04 14:36:15.000000 | 2016-06-04 15:33:31.000000 | 2016-06-04 16:21:14.000000 | 2016-06-04 17:32:36.000000 | 2016-06-04 18:29:05.000000 | 2016-06-04 19:24:21.000000 | 2016-06-03 20:25:00.000000 | 2016-06-04 21:00:38.000000 | 2016-06-04 22:21:44.000000 | 2016-06-04 23:22:55.000000 |
hour | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
Complaints per Hour | 1622 | 889 | 377 | 186 | 146 | 176 | 287 | 599 | 995 | 1224 | 1613 | 1430 | 1319 | 1027 | 1185 | 1078 | 1241 | 1087 | 870 | 722 | 866 | 1002 | 1262 | 1297 |
# Bar chart of total complaints for each hour of the day.
py.iplot({
'data': [Bar(x=df['hour'], y=df['Complaints per Hour'])],
'layout': Layout(xaxis={'title': 'Hour in Day'},
yaxis={'title': 'Number of Complaints'})}, filename='311/complaints per hour')
# Same hourly bucketing, restricted to the noise-related complaint types.
df = pd.read_sql_query('SELECT CreatedDate, '
'strftime(\'%H\', CreatedDate) as `hour`, '
'count(*) as `Complaints per Hour`'
'FROM data '
'WHERE ComplaintType IN ("Noise", '
'"Noise - Street/Sidewalk", '
'"Noise - Commercial", '
'"Noise - Vehicle", '
'"Noise - Park", '
'"Noise - House of Worship", '
'"Noise - Helicopter", '
'"Collection Truck Noise") '
'GROUP BY hour', disk_engine)
display(df.head(n=2))
# Plot noise complaints by hour of day.
py.iplot({
'data': [Bar(x=df['hour'], y=df['Complaints per Hour'])],
'layout': Layout(xaxis={'title': 'Hour in Day'},
yaxis={'title': 'Number of Complaints'},
title='Number of Noise Complaints in NYC by Hour in Day'
)}, filename='311/noise complaints per hour')
CreatedDate | hour | Complaints per Hour | |
---|---|---|---|
0 | 2016-06-05 00:15:44.000000 | 00 | 383 |
1 | 2016-06-05 01:51:56.000000 | 01 | 291 |
complaint_traces = {}  # Each series in the graph will represent a complaint
complaint_traces['Other'] = {}  # catch-all bucket for everything outside the top 7

# BUG FIX: the original iterated range(1, 24), silently skipping hour 00.
for hour in range(24):
    hour_str = str(hour).zfill(2)  # match strftime('%H') zero-padding, e.g. '07'
    # Parameterized query instead of str.format interpolation, and filter on the
    # strftime expression itself rather than the SELECT alias (using an alias
    # in WHERE is a non-standard SQLite extension).
    df = pd.read_sql_query('SELECT CreatedDate, '
                           'ComplaintType ,'
                           'strftime(\'%H\', CreatedDate) as `hour`, '
                           'COUNT(*) as num_complaints '
                           'FROM data '
                           'WHERE strftime(\'%H\', CreatedDate) = ? '
                           'GROUP BY ComplaintType '
                           'ORDER BY -num_complaints',
                           disk_engine, params=(hour_str,))
    # Start 'Other' at the full hourly total; the top-7 counts are subtracted below.
    complaint_traces['Other'][hour] = sum(df.num_complaints)
    # Grab the 7 most common complaints for that hour
    for i in range(7):
        complaint = df.at[i, 'ComplaintType']   # .at replaces the deprecated .get_value()
        count = df.at[i, 'num_complaints']
        complaint_traces['Other'][hour] -= count
        if complaint in complaint_traces:
            complaint_traces[complaint][hour] = count
        else:
            complaint_traces[complaint] = {hour: count}
traces = []
for complaint in complaint_traces:
    traces.append({
        # x/y cover the 24 hours of a day. (The original used range(25),
        # adding a phantom 25th hour whose y was always None.)
        'x': range(24),
        'y': [complaint_traces[complaint].get(h, None) for h in range(24)],
        'name': complaint,
        'type': 'bar'
    })
# Stacked bars: the 7 most common complaint types per hour plus the 'Other' remainder.
py.iplot({
'data': traces,
'layout': {
'barmode': 'stack',
'xaxis': {'title': 'Hour in Day'},
'yaxis': {'title': 'Number of Complaints'},
'title': 'The 7 Most Common 311 Complaints by Hour in a Day'
}}, filename='311/most common complaints by hour')
# First, create a new column with timestamps rounded to the previous 15 minute interval
minutes = 15
seconds = minutes * 60  # derive from the `minutes` knob (was hard-coded 15*60)
# Integer-dividing the unix-epoch seconds by the bucket width and multiplying
# back floors each timestamp to the start of its interval.
df = pd.read_sql_query('SELECT CreatedDate, '
                       'datetime(('
                       'strftime(\'%s\', CreatedDate) / {seconds}) * {seconds}, \'unixepoch\') interval '
                       'FROM data '
                       'LIMIT 10 '.format(seconds=seconds), disk_engine)
display(df.head())
CreatedDate | interval | |
---|---|---|
0 | 2016-06-03 20:51:27.000000 | 2016-06-03 20:45:00 |
1 | 2016-06-02 15:55:00.000000 | 2016-06-02 15:45:00 |
2 | 2016-06-03 06:24:00.000000 | 2016-06-03 06:15:00 |
3 | 2016-06-03 09:10:00.000000 | 2016-06-03 09:00:00 |
4 | 2016-06-03 10:39:00.000000 | 2016-06-03 10:30:00 |
# Then, GROUP BY that interval and COUNT(*)
minutes = 15
seconds = minutes*60
# Counts per 15-minute bucket, ordered chronologically (string order == time order
# for the 'YYYY-MM-DD hh:mm:ss' format produced by datetime()).
df = pd.read_sql_query('SELECT datetime(('
'strftime(\'%s\', CreatedDate) / {seconds}) * {seconds}, \'unixepoch\') interval ,'
'COUNT(*) as "Complaints / interval"'
'FROM data '
'GROUP BY interval '
'ORDER BY interval '
'LIMIT 500'.format(seconds=seconds), disk_engine)
display(df.head())
display(df.tail())
interval | Complaints / interval | |
---|---|---|
0 | 2016-05-25 00:00:00 | 2 |
1 | 2016-05-31 20:30:00 | 2 |
2 | 2016-05-31 22:30:00 | 3 |
3 | 2016-06-01 08:00:00 | 4 |
4 | 2016-06-01 08:45:00 | 3 |
interval | Complaints / interval | |
---|---|---|
249 | 2016-06-05 01:30:00 | 103 |
250 | 2016-06-05 01:45:00 | 77 |
251 | 2016-06-05 02:00:00 | 57 |
252 | 2016-06-05 12:45:00 | 2 |
253 | 2016-06-05 17:15:00 | 2 |
# === the data is too dense at 15-minute resolution to plot usefully ... ===
# py.iplot(
# {
# 'data': [{
# 'x': df.interval,
# 'y': df['Complaints / interval'],
# 'type': 'bar'
# }],
# 'layout': {
# 'title': 'Number of 311 Complaints per 15 Minutes'
# }
# }, filename='311/complaints per 15 minutes')