Scraping Crime Data from the FBI

The Uniform Crime Reports is an FBI program to collect standardized crime statistics across the United States. This notebook collects crime information from states and large cities, and compiles them into two CSV files.

In [11]:
import pandas as pd
import requests

from cStringIO import StringIO

City-Level Data

In [142]:
shortnames = {'Murder and nonnegligent manslaughter rate': 'Murder',
              'Forcible rape rate': 'Rape',
              'Robbery rate': 'Robbery',
              'Aggravated assault rate': 'Assault',
              'Property crime rate': 'Property',
              'Burglary rate': 'Burglary',
              'Larceny-theft rate': 'Larceny',
              'Motor vehicle theft rate': 'Vehicular'}

def city_data(year):
    """
    Grab crime statistics by city, for a single year. Return as a CSV string
    """
    url = 'http://www.ucrdatatool.gov/Search/Crime/Local/DownCrimeOneYearofDataLarge.cfm/LocalCrimeOneYearofData.csv'
    form = {"CrimeCrossId": "40,136,310,375,381,725,886,910,911,929,945,946,961,971,973,978,1010,1013,1028,1042,1056,1103,1116,1117,1135,1143,1146,1158,1177,1227,1233,1240,1245,1252,1261,1265,1267,1269,1272,1289,1306,1317,1322,1362,1380,1391,1434,1452,1471,1476,1477,1481,1486,1491,1496,1503,1524,1531,1540,1541,1550,1557,1562,1563,1574,1584,1596,1610,1630,1639,1642,1647,1652,1672,1750,1753,1763,1765,1767,1817,1820,1823,1825,1833,1874,1878,1888,22564,1918,1941,1969,2024,2089,2122,2141,2157,2194,2214,2244,2257,2277,2428,2456,2463,2487,2704,2724,2793,2811,2560,2976,2983,3045,3056,3081,3104,3182,3201,3204,3274,3310,3312,3436,3858,4037,4101,4134,4141,4411,4583,4701,4846,5032,5216,5303,5368,5453,5774,5789,5876,6134,6306,6430,6523,6612,6643,6970,22787,6985,7244,7391,7453,7506,7633,7655,7798,7948,8036,8055,8451,8621,8670,8713,8836,9144,9195,9539,9627,9672,9883,10027,10043,10291,10306,10518,10702,11291,11304,11386,11418,11472,11509,11763,11948,11956,12012,11017,11063,12262,12517,12520,12629,12750,12797,13003,13015,13168,13171,13183,13187,13222,13307,13794,13987,14155,14294,14305,14463,14477,14499,14532,15146,15319,15559,15574,15692,15826,15847,15954,15968,16061,16513,17184,17198,17736,17815,17847,18014,18293,18378,18383,18510,18552,18573,18576,18713,18733,18752,18763,18784,18843,18874,18902,18916,18965,19000,19016,19061,19109,19123,19144,19242,19262,19320,19366,19378,19411,19444,19449,19462,19466,19543,19575,19592,19632,19662,19680,19863,19941,20075,20085,20132,20133,20142,20220,20332,20430,20436,20501,20576,20718,20786,20816,20915,20930,20942,20959,21179,21256,21295",
            "YearStart": str(year),
            "YearEnd": str(year),
            "DataType":"1,2,3,4"}
    return requests.post(url, data=form).content

def parse_city(year):
    """
    Download crime data for 1 year across cities, and parse into a DataFrame
    """
    print year,
    data = city_data(year)

    cols = ['Agency', u'State', u'Population', 'Violent Crime rate', 
            'Murder and nonnegligent manslaughter rate', 'Forcible rape rate', 
            'Robbery rate', u'Aggravated assault rate', 
            'Property crime rate', u'Burglary rate', 'Larceny-theft rate', u'Motor vehicle theft rate']
    
    # seek to first line of data
    data = data[data.find('Agency,'):]    
    df = pd.read_csv(StringIO(data))
        
    df = df[cols] # focus on relevant columns...
    df = df[df.Population > 100000] # ...and rows
    df = df.rename_axis(shortnames, axis=1) # relabel
    df['Year'] = year
    return df
In [119]:
city = pd.concat(map(parse_city, range(1985, 2013)), ignore_index=True)
city.to_csv('city_crime.csv', index=False)
1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
In [123]:
city.head(20)
Out[123]:
Agency State Population Violent Crime rate Murder Rape Robbery Assault Property Burglary Larceny Vehicular Year
0 Abilene Police Dept TX 111317 318.9 7.2 32.3 86.2 193.1 5530.2 1458.0 3697.5 374.6 1985
1 Akron City Police Dept OH 226704 826.2 7.5 69.7 225.4 523.6 5849.5 1410.2 4025.5 413.8 1985
2 Albuquerque Police Dept NM 357051 1149.7 11.8 66.7 349.2 722.0 8136.9 2572.7 5023.1 541.1 1985
3 Alexandria Police Dept VA 108346 740.2 6.5 48.0 392.3 293.5 6339.0 1343.8 4378.6 616.5 1985
4 Allentown City Police Dept PA 103483 382.7 3.9 45.4 184.6 148.8 5014.4 1331.6 3426.6 256.1 1985
5 Amarillo Police Dept TX 166747 455.2 10.2 35.4 97.2 312.4 6047.5 1612.6 4116.4 318.4 1985
6 Amherst Town Police Dept NY 108570 210.9 0.9 2.8 38.7 168.6 2441.7 446.7 1760.2 234.9 1985
7 Anaheim Police Dept CA 240283 483.6 6.7 44.5 250.1 182.3 6829.0 2147.5 3971.6 710.0 1985
8 Anchorage Police Dept AK 229579 515.7 6.1 84.1 161.6 264.0 5853.3 1259.7 3987.3 606.3 1985
9 Ann Arbor Police Dept MI 107832 588.9 3.7 34.3 132.6 418.2 7936.4 1448.5 6025.1 462.8 1985
10 Arlington Police Dept TX 218931 507.0 8.2 48.0 136.6 314.3 8461.1 1599.6 6065.8 795.7 1985
11 Atlanta Police Dept GA 436214 2661.5 33.2 156.6 1098.5 1373.2 10521.2 2795.0 6733.4 992.9 1985
12 Aurora Police Dept CO 198033 1136.2 5.0 44.9 219.2 867.0 7400.8 2000.2 4916.9 483.8 1985
13 Austin Police Dept TX 406469 581.3 12.8 120.3 264.7 183.5 9024.3 2524.4 5980.3 519.6 1985
14 Bakersfield Police Dept CA 133983 897.1 12.7 51.5 447.1 385.9 8746.6 2878.0 5241.0 627.7 1985
15 Baltimore City Police Dept MD 771097 2009.9 27.6 76.8 1007.8 897.7 6565.1 1799.0 3985.5 780.6 1985
16 Baton Rouge Police Dept LA 245830 1761.8 14.6 51.3 395.0 1300.9 10962.0 3247.4 6729.9 984.8 1985
17 Beaumont Police Dept TX 126298 772.0 8.7 103.7 269.2 390.3 6500.5 2323.9 3669.9 506.7 1985
18 Berkeley Police Dept CA 106768 1090.2 10.3 40.3 618.2 421.5 11272.1 2744.3 7713.9 813.9 1985
19 Birmingham Police Dept AL 281973 1188.1 34.4 100.4 508.9 544.4 8768.9 2543.9 4979.9 1245.2 1985

State-Level Data

In [161]:
url = 'http://www.ucrdatatool.gov/Search/Crime/State/DownCrimeStatebyState.cfm/CrimeStatebyState.csv'
form = {"StateId":"52,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51",
        "YearStart":"1960",
        "YearEnd":"2012",
        "DataType":"1,2,3,4"}

def parse_state(txt):
    cols = ['Year', 'Population', 
            'Violent Crime rate', 
            'Murder and nonnegligent manslaughter rate', 
            'Forcible rape rate', 
            'Robbery rate', 
            'Aggravated assault rate', 
            'Property crime rate', 
            'Burglary rate', 
            'Larceny-theft rate', 
            'Motor vehicle theft rate']

    state = txt.split('\n')[0].strip()
    df = pd.read_csv(StringIO(txt), skiprows=2)
    df = df[cols]
    df['State'] = state
    df = df.rename_axis(shortnames, axis=1)
    df = df[df.Population.convert_objects(convert_numeric=True) > 0] # remove notes at end of document
    
    return df

data = requests.post(url, data=form).content
data = data.split('Estimated crime in')[1:]
states = pd.concat(map(parse_state, data), ignore_index=True)
states.to_csv('state_crime.csv', index=False)
In [162]:
states.tail(20)
Out[162]:
Year Population Violent Crime rate Murder Rape Robbery Assault Property Burglary Larceny Vehicular State
2731 1993 257782608 747.1 9.5 41.1 256.0 440.5 4740.0 1099.7 3033.9 606.3 United States-Total
2732 1994 260327021 713.6 9.0 39.3 237.8 427.6 4660.2 1042.1 3026.9 591.3 United States-Total
2733 1995 262803276 684.5 8.2 37.1 220.9 418.3 4590.5 987.0 3043.2 560.3 United States-Total
2734 1996 265228572 636.6 7.4 36.3 201.9 391.0 4451.0 945.0 2980.3 525.7 United States-Total
2735 1997 267783607 611.0 6.8 35.9 186.2 382.1 4316.3 918.8 2891.8 505.7 United States-Total
2736 1998 270248003 567.6 6.3 34.5 165.5 361.4 4052.5 863.2 2729.5 459.9 United States-Total
2737 1999 272690813 523.0 5.7 32.8 150.1 334.3 3743.6 770.4 2550.7 422.5 United States-Total
2738 2000 281421906 506.5 5.5 32.0 145.0 324.0 3618.3 728.8 2477.3 412.2 United States-Total
2739 2001 285317559 504.5 5.6 31.8 148.5 318.6 3658.1 741.8 2485.7 430.5 United States-Total
2740 2002 287973924 494.4 5.6 33.1 146.1 309.5 3630.6 747.0 2450.7 432.9 United States-Total
2741 2003 290788976 475.8 5.7 32.3 142.5 295.4 3591.2 741.0 2416.5 433.7 United States-Total
2742 2004 293656842 463.2 5.5 32.4 136.7 288.6 3514.1 730.3 2362.3 421.5 United States-Total
2743 2005 296507061 469.0 5.6 31.8 140.8 290.8 3431.5 726.9 2287.8 416.8 United States-Total
2744 2006 299398484 479.3 5.8 31.6 150.0 292.0 3346.6 733.1 2213.2 400.2 United States-Total
2745 2007 301621157 471.8 5.7 30.6 148.3 287.2 3276.4 726.1 2185.4 364.9 United States-Total
2746 2008 304059724 458.6 5.4 29.8 145.9 277.5 3214.6 733.0 2166.1 315.4 United States-Total
2747 2009 307006550 431.9 5.0 29.1 133.1 264.7 3041.3 717.7 2064.5 259.2 United States-Total
2748 2010 309330219 404.5 4.8 27.7 119.3 252.8 2945.9 701.0 2005.8 239.1 United States-Total
2749 2011 311587816 387.1 4.7 27.0 113.9 241.5 2905.4 701.3 1974.1 230.0 United States-Total
2750 2012 313914040 386.9 4.7 26.9 112.9 242.3 2859.2 670.2 1959.3 229.7 United States-Total
In [ ]: