In [1]:

import json
import pandas as pd
import numpy as np
import missingno as msno
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

# Read the column-name -> dtype-name mapping that is later handed to pd.read_csv.
with open('../dtypes.json', 'r') as dtype_file:
    dtyp = json.loads(dtype_file.read())

In [3]:

# Echo the loaded mapping so the dtype assigned to each column is visible in the notebook.
dtyp

Out[3]:

{'id': 'float',
 'flag_tsunami': 'str',
 'year': 'float',
 'month': 'float',
 'day': 'float',
 'hour': 'float',
 'minute': 'float',
 'second': 'float',
 'focal_depth': 'float',
 'eq_primary': 'float',
 'eq_mag_mw': 'float',
 'eq_mag_ms': 'float',
 'eq_mag_mb': 'float',
 'intensity': 'float',
 'country': 'str',
 'state': 'str',
 'location_name': 'str',
 'latitude': 'float',
 'longitude': 'float',
 'region_code': 'str',
 'injuries': 'float',
 'injuries_description': 'str',
 'damage_millions_dollars': 'float',
 'damage_description': 'str',
 'total_injuries': 'float',
 'total_injuries_description': 'str',
 'total_damage_millions_dollars': 'float',
 'total_damage_description': 'str'}

In [4]:

# Parse the earthquake CSV, applying the per-column dtypes loaded from dtypes.json.
data = pd.read_csv(
    filepath_or_buffer='../Datasets/earthquake_data.csv',
    dtype=dtyp,
)

In [5]:

# Columns holding the coded "description" categories; these are imputed together below.
description_features = [
    'injuries_description',
    'damage_description',
    'total_injuries_description',
    'total_damage_description',
]

In [6]:

# Imputer that swaps every NaN for the literal string 'NA'.
imp = SimpleImputer(
    strategy='constant',
    fill_value='NA',
    missing_values=np.nan,
)

In [7]:

# Fit the constant imputer on the description columns and write the filled values back.
filled_descriptions = imp.fit_transform(data[description_features])
data[description_features] = filled_descriptions

In [8]:

# Mean damage (in millions of dollars) within each damage_description category.
# Double brackets keep the result a one-column DataFrame rather than a Series.
category_means = (
    data
    .groupby('damage_description')[['damage_millions_dollars']]
    .mean()
)
category_means

Out[8]:

	damage_millions_dollars
damage_description
1	0.417211
2	3.078840
3	13.818806
4	3574.998799
NA	NaN

In [9]:

# Category -> fill value lookup, seeded with the per-category means.
replacement_values = category_means['damage_millions_dollars'].to_dict()
# 'NA' (whose mean is NaN) becomes the sentinel -1; category '0' maps to 0.
replacement_values.update({'NA': -1, '0': 0})
replacement_values

Out[9]:

{'1': 0.4172105263157895,
 '2': 3.0788402777777772,
 '3': 13.818805970149256,
 '4': 3574.9987991266385,
 'NA': -1,
 '0': 0}

In [10]:

# Per-row fill value obtained by looking up each row's damage_description category.
imputed_values = data['damage_description'].map(replacement_values)

In [11]:

# Keep every recorded damage figure; where it is missing, substitute the value
# that replacement_values assigns to the row's damage_description category.
# NOTE(review): rows whose category is 'NA' therefore receive the -1 sentinel.
category_fill = data['damage_description'].map(replacement_values)
data['damage_millions_dollars'] = data['damage_millions_dollars'].fillna(category_fill)

In [12]:

# Subset of columns with a numeric dtype ('number' is pandas' alias for np.number).
numeric_variables = data.select_dtypes(include='number')
numeric_variables.columns

Out[12]:

Index(['id', 'year', 'month', 'day', 'hour', 'minute', 'second', 'focal_depth',
       'eq_primary', 'eq_mag_mw', 'eq_mag_ms', 'eq_mag_mb', 'intensity',
       'latitude', 'longitude', 'injuries', 'damage_millions_dollars',
       'total_injuries', 'total_damage_millions_dollars'],
      dtype='object')

In [13]:

# Subset of columns stored with the generic object dtype (the string-typed columns here).
# The dtype is named by its string alias because the `np.object` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where referencing it raises AttributeError.
object_variables = data.select_dtypes(include=['object'])
object_variables.columns

Out[13]:

Index(['flag_tsunami', 'country', 'state', 'location_name', 'region_code',
       'injuries_description', 'damage_description',
       'total_injuries_description', 'total_damage_description'],
      dtype='object')

In [14]:

# Frequency of each injuries_description category; dropna=False would also list NaN.
counts = data['injuries_description'].value_counts(dropna=False)
counts

Out[14]:

NA    4723
1      666
3      347
2      193
4      143
Name: injuries_description, dtype: int64

In [15]:

# Frequency of each damage_description category, ordered by category label
# rather than by count. This rebinds `counts` from the injuries tally above.
counts = data['damage_description'].value_counts().sort_index()

In [16]:

# Keep only the rows where both the injury count and the primary magnitude are present.
data_to_plot = data.dropna(subset=['injuries', 'eq_primary'])

In [17]:

# Scatter of injuries against primary magnitude, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 9))

ax.scatter(x=data_to_plot.eq_primary, y=data_to_plot.injuries)

ax.set_xlabel('Primary earthquake magnitude')
ax.set_ylabel('No. of injuries')
plt.show()

In [ ]: