# Exploratory clean-up of the New Jersey accident data set.
#
# This section loads the raw CSV, inspects a few columns, normalises the
# county and police-department fields, and writes a reduced table to
# 'smallertab.csv', which is then read back as `newframe`.
#
# Bare expressions such as df.head() are notebook-style inspections: they
# display a value in an interactive session and have no effect when the file
# is run as a plain script.
from datetime import datetime

import pandas as pd

datafile = "njaccidents.csv"
df = pd.read_csv(datafile)
df.head()

# this is a list that you can pluck items out of.
df.columns
df.describe()
df['Severity'].describe()
df['County Name'].describe()

# Compare an exact match against a substring match on the county name.
# NOTE(review): presumably the raw values are upper-case and padded with
# whitespace, which is why the column is stripped below -- confirm on the data.
df[df['County Name'] == 'Passaic']
df[df['County Name'].str.contains('PASSAIC')].head()

# Look at one raw value by its index label. `.loc` replaces the removed `.ix`
# accessor and is label-based, like `.ix` was on an integer index.
df.loc[df['County Name'].str.contains('PASSAIC'), 'County Name'].loc[242727]

# `.str.strip()` trims surrounding whitespace and, unlike `map(str.strip)`,
# leaves missing values alone instead of raising.
df['County Name'] = df['County Name'].str.strip()

df.loc[df['Police Department'].str.contains('BLOOMINGDALE'), 'Police Department']
df[df['County Name'] == 'PASSAIC']

df['Police Dept Code'].unique()
df.loc[df['Police Dept Code'] == 99, ['Police Dept Code', 'Police Department']]

# Treat the department code as text, then label the blank code explicitly.
df['Police Dept Code'] = df['Police Dept Code'].astype(str)
df['Police Dept Code'].unique()
# A single `.loc` assignment writes into `df` itself; the chained form
# `df[col][mask] = value` may assign to a temporary copy and be lost.
df.loc[df['Police Dept Code'] == ' ', 'Police Dept Code'] = 'Unknown'

# Create a smaller frame
myframe = df[[
    'County Name',
    'Municipality Name',
    'Crash Date',
    'Crash Day Of Week',
    'Crash Time',
    'Total Killed',
    'Total Injured',
    'Pedestrians Killed',
    'Pedestrians Injured',
    'Total Vehicles Involved',
    'Crash Type Code',
    'Alcohol Involved',
    'Environmental Condition',
    'Light Condition',
    'Cell Phone In Use Flag',
]]
# The index is written too, so the file gains a leading unnamed column.
myframe.to_csv('smallertab.csv')

newframe = pd.read_csv('smallertab.csv')
newframe.groupby('County Name')

#What county had the most accident deaths?
# County-level and date-level summaries of the reduced accident table,
# followed by a few plots. Uses `newframe` and `datetime` from earlier in
# this script.
import matplotlib.pyplot as plt

# numeric_only=True keeps text columns out of the per-county sums.
newframe.groupby('County Name').sum(numeric_only=True)

# Deaths per county, largest first. The column is picked by name rather than
# by position, so the result does not depend on which columns survive the sum.
# `sort_values` replaces the removed `Series.order`.
countydeaths = (
    newframe.groupby('County Name')['Total Killed']
    .sum()
    .sort_values(ascending=False)
)
countydeaths

#we use lambda, which creates an unnamed, one-line function and applies it to each item in the column
newframe['Crash Date'] = newframe['Crash Date'].apply(
    lambda x: datetime.strptime(x, "%m/%d/%Y").date()
)

# Number of crash records per calendar date.
crashesbydate = newframe.groupby('Crash Date').size()

newframe['Total Involved'] = newframe['Total Killed'] + newframe['Total Injured']

# `.str.strip()` trims surrounding whitespace and tolerates missing values.
newframe['Crash Day Of Week'] = newframe['Crash Day Of Week'].str.strip()

# Per-county totals of the numeric columns (including 'Total Involved');
# numeric_only=True skips the text columns and the date objects created above.
countyframe = newframe.groupby('County Name').sum(numeric_only=True)

# The notebook ran the IPython magic `%pylab inline` at this point. It is not
# valid Python outside IPython, so it is kept only as this note; run it in a
# notebook cell if inline figures are wanted.

# Each Series plot gets its own figure; otherwise successive plots would be
# drawn onto the same axes when this runs as a script.
plt.figure()
countydeaths.plot(kind='bar')

plt.figure()
crashesbydate.plot()

# Say you want to look at the distribution of the number of people injured or
# killed in accidents. You can run the `.hist()` function on a column in the
# DataFrame and see that in the vast majority of accidents, no one was hurt.
plt.figure()
newframe['Total Involved'].hist(bins=50)

#let's scatter plot county name by total killed and pedestrians killed.
countyframe.plot(kind='scatter', x='Total Killed', y='Pedestrians Killed')

plt.show()