import os.path
import urllib

import pandas as pd
from IPython.display import HTML

# NOTE(review): `np`, `plt` and `rcParams` are used below but are not imported
# anywhere in this file; presumably they are supplied by an IPython pylab
# session (`%pylab inline`) -- confirm before running this as a plain script.
#
# Bare expressions such as `data` or `data.head()` are notebook cell outputs:
# they display a value interactively and have no effect in a plain script.

# Python 2 exposes urlretrieve directly on `urllib`; Python 3 moved it to
# `urllib.request`. Resolve it once so the download works on both.
try:
    urlretrieve = urllib.urlretrieve
except AttributeError:
    import urllib.request
    urlretrieve = urllib.request.urlretrieve

DATA_URL = 'http://nypd.openscrape.com/data/collisions.csv.gz'
DATA_FILE = 'collisions.csv.gz'

# From Pandas 0.12 onwards, you can just use a single line:
#   data = pd.read_csv('http://nypd.openscrape.com/data/collisions.csv.gz', compression='gzip', sep='\t')
# But for older versions, we need to download the file first.

# Download the file only if it does not already exist, to save bandwidth.
if not os.path.exists(DATA_FILE):
    urlretrieve(DATA_URL, DATA_FILE)

# The download is a gzip-compressed, tab-separated table.
data = pd.read_csv(DATA_FILE, compression='gzip', sep='\t')
data

# Render the first rows as an HTML table.
HTML(data.head().to_html())

# Widen the display limits so that all columns are shown.
pd.set_option('display.max_columns', 100)
pd.set_option('display.line_width', 10000)
data.head()

# Rows lacking either coordinate are dropped before any binning is done.
locations = data[['lon', 'lat']].dropna()
locations

heatmap, xedges, yedges = np.histogram2d(locations['lon'], locations['lat'],
                                         bins=50)
plt.imshow(heatmap, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

# Look at the distribution of each coordinate on its own.
locations['lon'].hist()

rcParams['figure.figsize'] = 15, 5
locations['lon'].hist(bins=100)
locations['lat'].hist(bins=100)

# Same two histograms, side by side with a shared y axis.
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True)
locations['lon'].hist(bins=100, ax=axes[0])
locations['lat'].hist(bins=100, ax=axes[1])

# Keep only points strictly inside the box
# -74.5 < lon < -73.5 and 40.5 < lat < 41.0.
coords = locations[(locations['lon'] < -73.5) &
                   (locations['lon'] > -74.5) &
                   (locations['lat'] < +41.0) &
                   (locations['lat'] > +40.5)]
coords

heatmap, xedges, yedges = np.histogram2d(coords['lon'], coords['lat'], bins=50)
plt.imshow(heatmap, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

# Finer grid on a square figure. Longitude is negated before binning, and the
# square root of the counts is plotted, which compresses the range between
# sparse and dense bins.
rcParams['figure.figsize'] = [10, 10]
heatmap, xedges, yedges = np.histogram2d(-coords['lon'], coords['lat'],
                                         bins=500)
plt.imshow(heatmap ** .5, cmap='YlOrBr', aspect=1.5,
           extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

# How many collisions?
data.collisions.sum()
# How many were injured?
data.total_injured.sum()
# How many were killed?
data.total_killed.sum()

# Let's break this up by month.
# NOTE(review): `Series.order()` and `DataFrame.sort(column)` below are the
# legacy pandas spellings of `sort_values`; they are kept because this script
# targets old pandas releases. Bare expressions are notebook cell outputs.

# Totals per (year, month). The columns are picked with a list, which is the
# supported way to select several columns from a groupby.
monthly = data.groupby(['year', 'month'])[
    ['collisions', 'total_injured', 'total_killed']].sum()
monthly
monthly.plot(subplots=True, kind='line', figsize=(15, 9))

# Collision totals per borough code, sorted ascending.
data.groupby('borocode')['collisions'].sum().order().plot(
    kind='barh', figsize=[10, 3])

# Collision totals per precinct, sorted ascending; computed once and reused
# for both plots and for the per-capita table further down.
precinct_totals = data.groupby('precinct')['collisions'].sum().order()
precinct_totals.plot(kind='barh', figsize=[10, 15])

precinct = data['precinct'].unique()
precinct

# Precinct numbers that are labelled by name instead of by ordinal.
_NAMED_PRECINCTS = {
    14: 'Midtown South',
    18: 'Midtown North',
    22: 'Central Park',
}

# English ordinal suffix keyed by the last digit; every other digit is 'th'.
_ORDINAL_SUFFIXES = {1: 'st', 2: 'nd', 3: 'rd'}


def precinct_name(number):
    """Return the display label for a precinct number.

    Precincts 14, 18 and 22 map to 'Midtown South', 'Midtown North' and
    'Central Park'. Any other number becomes '<ordinal> precinct', for
    example '1st precinct', '23rd precinct' or '111th precinct'.
    """
    if number in _NAMED_PRECINCTS:
        return _NAMED_PRECINCTS[number]
    if number % 100 in (11, 12, 13):
        # 11, 12 and 13 take 'th' despite ending in 1, 2 and 3.
        suffix = 'th'
    else:
        suffix = _ORDINAL_SUFFIXES.get(number % 10, 'th')
    return '%d%s precinct' % (number, suffix)


precinct_totals.rename(precinct_name).plot(kind='barh', figsize=[10, 15])

# cStringIO exists only on Python 2; io.StringIO is the Python 3 equivalent.
try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

# Inline precinct -> population table, one row per line. The delimiter is
# given explicitly as "any run of whitespace" so that parsing does not depend
# on invisible tab characters surviving in the literal.
population = pd.read_table(StringIO('''precinct population
1 66679
5 52568
6 62226
7 56355
9 76443
10 50180
13 93640
14 20651
17 79126
18 54066
19 208259
20 102624
22 25
23 73106
24 106460
25 47405
26 49508
28 44781
30 60685
32 70942
33 77645
34 112375
40 91497
41 52246
42 79762
43 172122
44 146441
45 120833
46 128200
47 152374
48 83266
49 114712
50 101720
52 139307
60 104278
61 159645
62 181981
63 108646
66 191382
67 155252
68 124491
69 84480
70 160664
71 98429
72 126230
73 86468
75 183328
76 43694
77 96309
78 61099
79 90263
81 62722
83 112634
84 48196
88 51421
90 116836
94 56247
100 47913
101 67065
102 144008
103 105803
104 170190
105 188582
106 122441
107 151107
108 113200
109 247354
110 172634
111 116431
112 112277
113 120132
114 202766
115 171576
120 175876
122 194822
123 98032
'''), sep=r'\s+')
population.sort('population').head()

collisions = pd.DataFrame({'collisions': precinct_totals})
collisions.head()

# Index the population table by precinct so that the column assignment below
# aligns on precinct number.
population = population.set_index('precinct')
population.head()

collisions['population'] = population['population']
collisions

# Precincts that have collisions but no row in the population table.
collisions[collisions['population'].isnull()]

# Collisions per resident, for every precinct.
(collisions['collisions'].astype(float) / collisions['population']).rename(
    precinct_name).order().plot(kind='barh', figsize=[10, 15])

# Same plot without precinct 22 (population 25 in the table above) and
# precinct 121 (no row in the table above).
precincts = collisions.drop([22, 121])
(precincts['collisions'].astype(float) / precincts['population']).rename(
    precinct_name).order().plot(kind='barh', figsize=[10, 15])