In [3]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import pandas 
import numpy as np

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('erikrood','3eqsrype8v')
#source: http://www.cpsc.gov/en/Research--Statistics/NEISS-Injury-Data/
#source u.s. pop. http://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?src=bkmk
In [234]:
NEISS = pandas.read_excel('/Users/erikrood/desktop/ipython_datasets/NEISS_data_04_16.xlsx') 
In [5]:
NEISS.head()
Out[5]:
CPSC Case # trmt_date psu weight stratum age sex race race_other diag diag_other body_part disposition location fmv prod1 prod2 narr1 narr2
0 150733174 2015-07-11 38 15.7762 V 5 1 0 NaN 57 NaN 33 1 9 0 1267 NaN 5 YR OLD MALE PLAYING SOCCER BLOCKING BALL AND... X IT
1 150734723 2015-07-06 43 83.2157 S 36 1 1 NaN 57 NaN 34 1 1 0 1439 4057.0 L WRIST FX(?)/36YOWM 2 WKS AGO@G-FRIENDS HM,MO... AN OBJECT&TWISTED WRIST TRYING TO CONTOL MOWER...
2 150817487 2015-08-02 51 74.8813 L 20 2 0 NaN 71 OTITIS EXTERNA 94 1 0 0 3274 NaN 20 YO F C/O EAR PAIN 1 DAY SAS WAS SWIMMING YE... T TRIED TO USE ***S TO REMOVE WAX MADE IT WORS...
3 150717776 2015-06-26 41 15.7762 V 61 1 0 NaN 71 PAIN 35 1 0 0 611 NaN 61YOM W/KNEE PAIN S/P FALLING IN SHOWER 2 DAYS... NaN
4 150721694 2015-07-04 42 74.8813 L 88 2 3 HISPANIC 62 NaN 75 1 0 0 1893 1807.0 88YOF PAIN TO HEAD WHEN FALL TO FLOOR WHEN WAL... DOOR AND FELL.DENIES LOC. DX HEAD INJURY
In [6]:
len(NEISS)
Out[6]:
359129
In [ ]:
#sex: male = 1, female = 2
df = NEISS[[5,6,15]]
df.head(10)
In [62]:
df1 = df[[2,0]]
df1.head()
Out[62]:
prod1 age
0 1267 5
1 1439 36
2 3274 20
3 611 61
4 1893 88
In [38]:
#most common product
df2 = df1.groupby('prod1').count().sort_values(by='age',ascending=False).round(decimals=0).reset_index()
df2[['prod1']] = df2[['prod1']].astype(str)
df2.head(5)
Out[38]:
prod1 age
0 1842 28712
1 1807 28351
2 4076 16784
3 1205 14147
4 5040 12787
In [11]:
len(df1)
Out[11]:
359129
In [166]:
#reading in product key/description map
NEISS_key = pandas.read_csv('/Users/erikrood/desktop/ipython_datasets/NEISS_key.csv') 
#renaming columns
NEISS_key.columns = ['prod1', 'Description'] 
#ensuring prod1 is same data type as main table
NEISS_key[['prod1']] = NEISS_key[['prod1']].astype(str)
#need to do this prior to performing the join
NEISS_key1 = NEISS_key.groupby('prod1')
In [167]:
#left join to pull in product description
df3 = df2.merge(NEISS_key, how='left', on="prod1")
#df3 = pandas.merge(df2, NEISS_key, left_index='prod1', right_index='clean', how='left')
In [66]:
df3.head(5)
Out[66]:
prod1 age Description
0 1842 28712 Stairs or steps
1 1807 28351 Floors or flooring materials
2 4076 16784 Beds or bedframes, other or not specified
3 1205 14147 Basketball
4 5040 12787 Bicycles or accessories
In [168]:
#renaming columns in df3
df3.columns = ['product_key', 'total_injuries','description'] 
df3.head(5)
Out[168]:
product_key total_injuries description
0 1842 28712 Stairs or steps
1 1807 28351 Floors or flooring materials
2 4076 16784 Beds or bedframes
3 1205 14147 Basketball
4 5040 12787 Bicycles or accessories

could add as a % of total reported injuries column

box and whisker of top 10 issues cut by age distribution

most common age agnostic of product

male v. female agnostic of product

In [102]:
#setting up top 10
df4 = df3[[2,1]].head(10)
df4.head(11)
Out[102]:
description total_injuries
0 Stairs or steps 28712
1 Floors or flooring materials 28351
2 Beds or bedframes 16784
3 Basketball 14147
4 Bicycles or accessories 12787
5 Football 11664
6 Chairs 8271
7 Ceilings and walls 7783
8 Doors 7723
9 Tables 7354
In [103]:
#top 10 overall

x = df4['description']
y = df4['total_injuries']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='top U.S. consumer product injuries by type',
    xaxis=dict(
        title='product type',
        tickangle = 47,
    ),
    yaxis=dict(
        title='total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries')
Out[103]:
In [ ]:
#box and whisker of top 10 issues cut by age distribution
#setting up y values for box plot (step 0), loading in the data I want
df5 = df[[2,0]]
#ensuring same data type for merge
df5[['prod1']] = df5[['prod1']].astype(str)

df5 = df5[(df5['age'] <= 100)].reset_index()
df5.drop('index', axis=1, inplace=True)
#need to do this prior to performing the join
#df7 = df5.groupby('prod1')
#df7.head()
In [ ]:
df6 = df5.merge(NEISS_key, how='left', on="prod1")
df6.head(10)
In [171]:
product_injuries_age = df6[[2,1]]
product_injuries_age[['age']] = product_injuries_age[['age']].astype(float)
product_injuries_age.head()
Out[171]:
Description age
0 Soccer 5.0
1 Lawn mowers, not specified 74-- 74-- 36.0
2 Swimming (activity, apparel or equipment; 20.0
3 Bathtubs or showers 61.0
4 Doors 88.0
In [114]:
#setting up x variables for box plot
x_data1 = {'Stairs or steps','Floors or flooring materials','Beds or bedframes','Basketball',\
         'Bicycles or accessories', 'Football','Chairs',\
          'Ceilings and walls','Doors','Tables'}
x_data1
Out[114]:
{'Basketball',
 'Beds or bedframes',
 'Bicycles or accessories',
 'Ceilings and walls',
 'Chairs',
 'Doors',
 'Floors or flooring materials',
 'Football',
 'Stairs or steps',
 'Tables'}
In [120]:
#setting up y values for box plot (step 1)
product_injuries_age_bball = product_injuries_age[product_injuries_age['Description'] == 'Basketball'].copy()
Basketball = product_injuries_age_bball[['age']].copy()

product_injuries_age_bed = product_injuries_age[product_injuries_age['Description']== 'Beds or bedframes'].copy()
Beds = product_injuries_age_bed[['age']].copy()

product_injuries_age_bike = product_injuries_age[product_injuries_age['Description'] == 'Bicycles or accessories'].copy()
Bicycles = product_injuries_age_bike[['age']].copy()

product_injuries_age_ceiling = product_injuries_age[product_injuries_age['Description'] == 'Ceilings and walls'].copy()
Ceilings = product_injuries_age_ceiling[['age']].copy()

product_injuries_age_chair = product_injuries_age[product_injuries_age['Description'] == 'Chairs'].copy()
Chairs = product_injuries_age_chair[['age']].copy()

product_injuries_age_door = product_injuries_age[product_injuries_age['Description'] == 'Doors'].copy()
Doors = product_injuries_age_door[['age']].copy()

product_injuries_age_floor = product_injuries_age[product_injuries_age['Description'] == 'Floors or flooring materials'].copy()
Floors = product_injuries_age_floor[['age']].copy()

product_injuries_age_football = product_injuries_age[product_injuries_age['Description'] == 'Football'].copy()
Football = product_injuries_age_football[['age']].copy()

product_injuries_age_stairs = product_injuries_age[product_injuries_age['Description'] == 'Stairs or steps'].copy()
Stairs = product_injuries_age_stairs[['age']].copy()

product_injuries_age_tables = product_injuries_age[product_injuries_age['Description'] == 'Tables'].copy()
Tables = product_injuries_age_tables[['age']].copy()
In [131]:
#setting up y values for box plot (step 2)
y_data = [
    Bicycles.values,
    Tables.values,
    Basketball.values,
    Chairs.values,
    Football.values,
    Floors.values,
    Ceilings.values,
    Stairs.values,
    Doors.values,
    Beds.values

]
In [133]:
#putting together the box plot
traces = []
for xd, yd in zip(x_data1, y_data):
        traces.append(go.Box(
            y=yd,
            name=xd,
            #boxpoints='all',
            #jitter=0.5,
            whiskerwidth=0.2,
            marker=dict(
                size=2,
            ),
            line=dict(width=1),
        ))

layout = go.Layout(
    title='injury distribution by age',
    yaxis=dict(
        range=[0, 110],
        showgrid=True,
        zeroline=True,
        #dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=go.Margin(
        l=50,
        r=90,
        b=200,
        t=60,
        pad=4
    ),
    width=800,
    height=650,
    showlegend=False
)

    
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename = 'injury_age_distribution')
Out[133]:
In [129]:
product_injuries_age_mean = product_injuries_age.groupby('Description').median().round(decimals=0).reset_index()
product_injuries_age_mean_tables = product_injuries_age_mean[product_injuries_age_mean.Description == 'Tables']
#df = df[df.Clean_launch_year != 'None']
product_injuries_age_mean_tables.head()
Out[129]:
Description age
700 Tables 17.0
In [172]:
df6[['age']] = df6[['age']].astype(float)
df_18under = df6[(df6['age'] <= 18)].reset_index()
df_19_40 = df6[(df6['age'] <= 40) & (df6['age'] >=19)].reset_index()
df_41_65 = df6[(df6['age'] <= 65) & (df6['age'] >=41)].reset_index()
df_65over = df6[(df6['age'] >= 65)].reset_index()

df_18under.drop('index', axis=1, inplace=True)
df_19_40.drop('index', axis=1, inplace=True)
df_41_65.drop('index', axis=1, inplace=True)
df_65over.drop('index', axis=1, inplace=True)
In [173]:
df_18under.head()
Out[173]:
prod1 age Description
0 1267 5.0 Soccer
1 3274 11.0 Swimming (activity, apparel or equipment;
2 380 2.0 Fans 83-- 83-- Combines 0111, 0344 & 0359
3 5036 16.0 Two-wheeled, powered, off-road vehicles (incl.
4 1329 7.0 Scooters, unpowered 72-- 72--
In [174]:
df_18under1 = df_18under.groupby('Description').count().sort_values(by='age',ascending=False).round(decimals=0).reset_index()
df_18under1 = df_18under1[[0,2]].head(10)
df_18under1.columns = ['Description', 'Total_injuries'] 
df_18under1.head()
Out[174]:
Description Total_injuries
0 Football 9966
1 Basketball 9503
2 Bicycles or accessories 6165
3 Stairs or steps 5698
4 Soccer 5540
In [175]:
df_41_651 = df_41_65.groupby('Description').count().sort_values(by='age',ascending=False).round(decimals=0).reset_index()
df_41_651 = df_41_651[[0,2]].head(10)
df_41_651.columns = ['Description', 'Total_injuries'] 
df_41_651.head()
Out[175]:
Description Total_injuries
0 Stairs or steps 8586
1 Floors or flooring materials 6267
2 Bicycles or accessories 2730
3 Beds or bedframes 2694
4 Bathtubs or showers 2167
In [176]:
df_19_401 = df_19_40.groupby('Description').count().sort_values(by='age',ascending=False).round(decimals=0).reset_index()
df_19_401 = df_19_401[[0,2]].head(10)
df_19_401.columns = ['Description', 'Total_injuries'] 
df_19_401.head()
Out[176]:
Description Total_injuries
0 Stairs or steps 8067
1 Basketball 4062
2 Knives 3529
3 Exercise 3451
4 Floors or flooring materials 3408
In [177]:
df_65over1 = df_65over.groupby('Description').count().sort_values(by='age',ascending=False).round(decimals=0).reset_index()
df_65over1 = df_65over1[[0,2]].head(10)
df_65over1.columns = ['Description', 'Total_injuries'] 
df_65over1.head()
Out[177]:
Description Total_injuries
0 Floors or flooring materials 12643
1 Beds or bedframes 5226
2 Stairs or steps 5129
3 Chairs 2574
4 Bathtubs or showers 1616
In [187]:
#top 10 overall

x = df_18under1['Description']
y = df_18under1['Total_injuries']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(247, 135, 22)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='top U.S. consumer product injuries by type (18 and under)',
    xaxis=dict(
        title='description',
        tickangle = 47,
    ),
    yaxis=dict(
        title='total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries_18under')
Out[187]:
In [186]:
#top 10 overall

x = df_19_401['Description']
y = df_19_401['Total_injuries']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='top U.S. consumer product injuries by type (19 to 40 years old)',
    xaxis=dict(
        title='description',
        tickangle = 47,
    ),
    yaxis=dict(
        title='total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries_19to40')
Out[186]:
In [185]:
#top 10 overall

x = df_41_651['Description']
y = df_41_651['Total_injuries']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(178, 227, 250)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='top U.S. consumer product injuries by type (41 to 65 years old)',
    xaxis=dict(
        title='description',
        tickangle = 47,
    ),
    yaxis=dict(
        title='total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries_41to65')
Out[185]:
In [184]:
#top 10 overall

x = df_65over1['Description']
y = df_65over1['Total_injuries']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgba(0, 0, 0, 0.52)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='top U.S. consumer product injuries by type (>=65 years old)',
    xaxis=dict(
        title='description',
        tickangle = 47,
    ),
    yaxis=dict(
        title='total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries_over65')
Out[184]:
In [190]:
#Creating a histogram of injuries by age
x = df6['age']
data = [
    go.Histogram(
        x=x,
        marker=dict(
        color='rgb(255, 217, 102)'
    ),
    opacity=.45
)
    ]

layout = go.Layout(
    title='Age distribution of injuries',
    xaxis=dict(
        title='Age bin'
    ),
    yaxis=dict(
        title='total_injuries'
    ))

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='injury_histogram')
Out[190]:
In [189]:
len(df6)
Out[189]:
346280
In [232]:
df_2to4 = df6[(df6['age'] <= 4)].reset_index()
df_2to4.drop('index', axis=1, inplace=True)
In [233]:
#splitting out 2 to 4 year-olds since they have the highest injury counts
df_2to41 = df_2to4.groupby('Description').count().sort_values(by='age',ascending=False).round(decimals=0).reset_index()
df_2to41 = df_2to41[[0,2]].head(10)
df_2to41.columns = ['Description', 'Total_injuries'] 
df_2to41.head(10)
Out[233]:
Description Total_injuries
0 Beds or bedframes 2365
1 Tables 1804
2 Stairs or steps 1645
3 Floors or flooring materials 1614
4 Sofas, couches, davenports, divans or studio 1246
5 Chairs 1184
6 Doors 1110
7 Jewelry (excluding watches) 72-- 72-- 785
8 Ceilings and walls 688
9 Coins 95-- 95-- Previously included in code 1630. 625
In [205]:
#take groups % of total population, multiple the number of injuries * the % of total pop to normalize
#reading in product key/description map
Age_perc_pop = pandas.read_csv('/Users/erikrood/desktop/ipython_datasets/Age_perc_of_pop.csv') 
#ensuring prod1 is same data type as main table
Age_perc_pop[['Age']] = Age_perc_pop[['Age']].astype(float)
Age_perc_pop.columns = ['age', 'perc_of_population'] 

Age_perc_pop.head()
Out[205]:
age perc_of_population
0 0.0 0.012775
1 1.0 0.012885
2 2.0 0.013270
3 3.0 0.013341
4 4.0 0.013160
In [209]:
#normalizing age groups to account for u.s. population distribution 
age_data = df6[[1,2]]
age_data = age_data.groupby('age').count().reset_index()
normalized_age_data = age_data.merge(Age_perc_pop, how='left', on="age")
normalized_age_data.columns = ['age', 'total_injuries','perc_of_population'] 
normalized_age_data.head()
Out[209]:
age total_injuries perc_of_population
0 0.0 37 0.012775
1 2.0 12830 0.013270
2 3.0 10793 0.013341
3 4.0 8663 0.013160
4 5.0 7851 0.013140
In [210]:
normalized_age_data['normalized'] = normalized_age_data['total_injuries']*normalized_age_data['perc_of_population']
In [216]:
normalized_age_data['normalized_perc'] = normalized_age_data['normalized']/normalized_age_data['normalized'].sum()
In [220]:
#top 10 overall

x = normalized_age_data['age']
y = normalized_age_data['normalized_perc']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(13, 24, 67)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='consumer product injuries by age (normalized for population distribution)',
    xaxis=dict(
        title='age',
        tickangle = 47,
    ),
    yaxis=dict(
        title='percent of total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries_pop_dist_normalized')
Out[220]:
In [218]:
normalized_age_data['normalized_perc'].sum()
Out[218]:
1.0000000000000004
In [221]:
#top 10 overall

x = df_2to41['Description']
y = df_2to41['Total_injuries']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(17, 84, 37)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='top U.S. consumer product injuries by type (2 to 4 years old)',
    xaxis=dict(
        title='description',
        tickangle = 47,
    ),
    yaxis=dict(
        title='total injuries'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='top10_productinjuries_2to4')
Out[221]: