import pandas as pd import requests from cStringIO import StringIO shortnames = {'Murder and nonnegligent manslaughter rate': 'Murder', 'Forcible rape rate': 'Rape', 'Robbery rate': 'Robbery', 'Aggravated assault rate': 'Assault', 'Property crime rate': 'Property', 'Burglary rate': 'Burglary', 'Larceny-theft rate': 'Larceny', 'Motor vehicle theft rate': 'Vehicular'} def city_data(year): """ Grab crime statistics by city, for a single year. Return as a CSV string """ url = 'http://www.ucrdatatool.gov/Search/Crime/Local/DownCrimeOneYearofDataLarge.cfm/LocalCrimeOneYearofData.csv' form = {"CrimeCrossId": "40,136,310,375,381,725,886,910,911,929,945,946,961,971,973,978,1010,1013,1028,1042,1056,1103,1116,1117,1135,1143,1146,1158,1177,1227,1233,1240,1245,1252,1261,1265,1267,1269,1272,1289,1306,1317,1322,1362,1380,1391,1434,1452,1471,1476,1477,1481,1486,1491,1496,1503,1524,1531,1540,1541,1550,1557,1562,1563,1574,1584,1596,1610,1630,1639,1642,1647,1652,1672,1750,1753,1763,1765,1767,1817,1820,1823,1825,1833,1874,1878,1888,22564,1918,1941,1969,2024,2089,2122,2141,2157,2194,2214,2244,2257,2277,2428,2456,2463,2487,2704,2724,2793,2811,2560,2976,2983,3045,3056,3081,3104,3182,3201,3204,3274,3310,3312,3436,3858,4037,4101,4134,4141,4411,4583,4701,4846,5032,5216,5303,5368,5453,5774,5789,5876,6134,6306,6430,6523,6612,6643,6970,22787,6985,7244,7391,7453,7506,7633,7655,7798,7948,8036,8055,8451,8621,8670,8713,8836,9144,9195,9539,9627,9672,9883,10027,10043,10291,10306,10518,10702,11291,11304,11386,11418,11472,11509,11763,11948,11956,12012,11017,11063,12262,12517,12520,12629,12750,12797,13003,13015,13168,13171,13183,13187,13222,13307,13794,13987,14155,14294,14305,14463,14477,14499,14532,15146,15319,15559,15574,15692,15826,15847,15954,15968,16061,16513,17184,17198,17736,17815,17847,18014,18293,18378,18383,18510,18552,18573,18576,18713,18733,18752,18763,18784,18843,18874,18902,18916,18965,19000,19016,19061,19109,19123,19144,19242,19262,19320,19366,19378,19411,19444,19449,19462,19466,19543,19575,19592,19632,19662,19680,19863,19941,20075,20085,20132,20133,20142,20220,20332,20430,20436,20501,20576,20718,20786,20816,20915,20930,20942,20959,21179,21256,21295", "YearStart": str(year), "YearEnd": str(year), "DataType":"1,2,3,4"} return requests.post(url, data=form).content def parse_city(year): """ Download crime data for 1 year across cities, and parse into a DataFrame """ print year, data = city_data(year) cols = ['Agency', u'State', u'Population', 'Violent Crime rate', 'Murder and nonnegligent manslaughter rate', 'Forcible rape rate', 'Robbery rate', u'Aggravated assault rate', 'Property crime rate', u'Burglary rate', 'Larceny-theft rate', u'Motor vehicle theft rate'] # seek to first line of data data = data[data.find('Agency,'):] df = pd.read_csv(StringIO(data)) df = df[cols] # focus on relevant columns... df = df[df.Population > 100000] # ...and rows df = df.rename_axis(shortnames, axis=1) # relabel df['Year'] = year return df city = pd.concat(map(parse_city, range(1985, 2013)), ignore_index=True) city.to_csv('city_crime.csv', index=False) city.head(20) url = 'http://www.ucrdatatool.gov/Search/Crime/State/DownCrimeStatebyState.cfm/CrimeStatebyState.csv' form = {"StateId":"52,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51", "YearStart":"1960", "YearEnd":"2012", "DataType":"1,2,3,4"} def parse_state(txt): cols = ['Year', 'Population', 'Violent Crime rate', 'Murder and nonnegligent manslaughter rate', 'Forcible rape rate', 'Robbery rate', 'Aggravated assault rate', 'Property crime rate', 'Burglary rate', 'Larceny-theft rate', 'Motor vehicle theft rate'] state = txt.split('\n')[0].strip() df = pd.read_csv(StringIO(txt), skiprows=2) df = df[cols] df['State'] = state df = df.rename_axis(shortnames, axis=1) df = df[df.Population.convert_objects(convert_numeric=True) > 0] # remove notes at end of document return df data = requests.post(url, data=form).content data = data.split('Estimated crime in')[1:] states = pd.concat(map(parse_state, data), ignore_index=True) states.to_csv('state_crime.csv', index=False) states.tail(20)