import pandas as pd


# Load the raw crash records and take a first look at them.
# The bare expressions below only display something in an interactive
# session (notebook / IPython); they do not modify df.
datafile = "njaccidents.csv"

df = pd.read_csv(datafile)

df.head()

# df.columns is an Index of the column labels; it can be indexed like a list.
df.columns

df.describe()

df['Severity'].describe()

df['County Name'].describe()

# Exact-match filter on the mixed-case spelling of the county name.
df[df['County Name'] == 'Passaic']

# Substring match on the upper-case spelling instead.
df[df['County Name'].str.contains('PASSAIC')].head()

# Look at one raw value by its row label. `.ix` has been removed from pandas;
# `.loc` is the label-based equivalent (df has the default integer index
# produced by read_csv, so 242727 is a row label).
df['County Name'][df['County Name'].str.contains('PASSAIC')].loc[242727]

# Strip surrounding whitespace from the county names so that exact comparisons
# such as == 'PASSAIC' match. The vectorised .str accessor leaves missing
# values untouched, whereas map(str.strip) would raise on them.
df['County Name'] = df['County Name'].str.strip()

df['Police Department'][df['Police Department'].str.contains('BLOOMINGDALE')]

df[df['County Name'] == 'PASSAIC']

df['Police Dept Code'].unique()

# Rows whose department code is the number 99, showing code and name together.
df.loc[df['Police Dept Code'] == 99, ['Police Dept Code', 'Police Department']]

# Treat the code as text from here on.
df['Police Dept Code'] = df['Police Dept Code'].astype(str)
df['Police Dept Code'].unique()

# Replace the two-space placeholder with an explicit label. A single .loc
# assignment writes into df itself; the chained form df[col][mask] = value
# may assign to a temporary copy and leave df unchanged.
df.loc[df['Police Dept Code'] == '  ', 'Police Dept Code'] = 'Unknown'


# Keep only the columns needed for the analysis below and save them to disk.
myframe = df.loc[:, [
    'County Name',
    'Municipality Name',
    'Crash Date',
    'Crash Day Of Week',
    'Crash Time',
    'Total Killed',
    'Total Injured',
    'Pedestrians Killed',
    'Pedestrians Injured',
    'Total Vehicles Involved',
    'Crash Type Code',
    'Alcohol Involved',
    'Environmental Condition',
    'Light Condition',
    'Cell Phone In Use Flag',
]]

# Written with default options, so the row index is saved as the first column.
myframe.to_csv('smallertab.csv')

from datetime import datetime

# Reload the reduced table. The file was saved together with its row index,
# so that index comes back as an extra leading 'Unnamed: 0' column.
newframe = pd.read_csv('smallertab.csv')

newframe.groupby('County Name')

#What county had the most accident deaths?
# numeric_only=True restricts the sum to numeric columns; summing the text
# columns is either an error or string concatenation on current pandas.
newframe.groupby('County Name').sum(numeric_only=True)

# Select the deaths column by name rather than by position, and sort with
# sort_values (Series.order has been removed from pandas).
newframe.groupby('County Name')['Total Killed'].sum().sort_values(ascending=False)


countydeaths = (
    newframe.groupby('County Name')['Total Killed']
    .sum()
    .sort_values(ascending=False)
)

#we use lambda, which creates an unnamed, one-line function and applies it to each item in the column
newframe['Crash Date'] = newframe['Crash Date'].apply(
    lambda x: datetime.strptime(x, "%m/%d/%Y").date()
)

# Number of crash rows per calendar date.
crashesbydate = newframe.groupby('Crash Date').size()

# People killed or injured in each crash.
newframe['Total Involved'] = newframe['Total Killed'] + newframe['Total Injured']

# .str.strip() trims padding and leaves missing values untouched, whereas
# map(str.strip) would raise on them.
newframe['Crash Day Of Week'] = newframe['Crash Day Of Week'].str.strip()

# Per-county totals of every numeric column. 'Crash Date' now holds date
# objects, so the sum must be limited to numeric columns.
countyframe = newframe.groupby('County Name').sum(numeric_only=True)

%pylab inline

import matplotlib.pyplot as plt

# Bar chart of deaths per county, one bar per county.
countydeaths.plot.bar()

# Line chart of the number of crashes on each date.
crashesbydate.plot.line()
Say you want to look at the distribution of the number of people injured or killed in accidents. You can run the `.hist()` method on a column in the DataFrame and see that in the vast majority of accidents, no one was hurt.
# Histogram of killed-plus-injured per crash, split into 50 bins.
newframe['Total Involved'].hist(bins=50)

# One point per county: total deaths (x) against pedestrian deaths (y).
countyframe.plot.scatter(x='Total Killed', y='Pedestrians Killed')