import json
import pandas as pd
import numpy as np
import missingno as msno
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
# Load the column-name -> dtype-name mapping that is later handed to
# pd.read_csv. The body of the `with` must be indented, and the file is
# opened as UTF-8 explicitly so decoding does not depend on the platform's
# default locale encoding.
with open('../dtypes.json', 'r', encoding='utf-8') as jsonfile:
    dtyp = json.load(jsonfile)
dtyp
{'id': 'float', 'flag_tsunami': 'str', 'year': 'float', 'month': 'float', 'day': 'float', 'hour': 'float', 'minute': 'float', 'second': 'float', 'focal_depth': 'float', 'eq_primary': 'float', 'eq_mag_mw': 'float', 'eq_mag_ms': 'float', 'eq_mag_mb': 'float', 'intensity': 'float', 'country': 'str', 'state': 'str', 'location_name': 'str', 'latitude': 'float', 'longitude': 'float', 'region_code': 'str', 'injuries': 'float', 'injuries_description': 'str', 'damage_millions_dollars': 'float', 'damage_description': 'str', 'total_injuries': 'float', 'total_injuries_description': 'str', 'total_damage_millions_dollars': 'float', 'total_damage_description': 'str'}
# Read the earthquake records, parsing each column with the dtype named in dtyp.
data = pd.read_csv('../Datasets/earthquake_data.csv', dtype=dtyp)

# Columns holding the categorical "description" codes.
description_features = [
    'injuries_description',
    'damage_description',
    'total_injuries_description',
    'total_damage_description',
]

# Replace every missing description with the constant string 'NA' so that
# "no description" becomes an explicit category instead of NaN.
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='NA')
data[description_features] = imp.fit_transform(data[description_features])

# Mean of damage_millions_dollars within each damage_description category
# (NaN values are skipped by mean()).
category_means = (
    data.groupby('damage_description')[['damage_millions_dollars']].mean()
)
category_means
| damage_description | damage_millions_dollars |
|---|---|
| 1 | 0.417211 |
| 2 | 3.078840 |
| 3 | 13.818806 |
| 4 | 3574.998799 |
| NA | NaN |
# Lookup table: damage_description category -> mean damage_millions_dollars.
replacement_values = category_means['damage_millions_dollars'].to_dict()
# Override / extend the table with fixed values: 'NA' (whose group mean is
# NaN) maps to the sentinel -1, and category '0' maps to 0.
replacement_values.update({'NA': -1, '0': 0})
replacement_values
{'1': 0.4172105263157895, '2': 3.0788402777777772, '3': 13.818805970149256, '4': 3574.9987991266385, 'NA': -1, '0': 0}
# Per-row replacement value, looked up from the row's damage_description
# category via replacement_values.
imputed_values = data.damage_description.map(replacement_values)
# Fill only the rows whose damage_millions_dollars is missing; rows that
# already have a value keep it. The mapping computed above is reused here
# instead of running the identical map() a second time.
data['damage_millions_dollars'] = np.where(
    data.damage_millions_dollars.isnull(),
    imputed_values,
    data.damage_millions_dollars,
)
# Keep only the numeric columns; 'number' is the string spelling of np.number
# accepted by select_dtypes.
numeric_variables = data.select_dtypes(include='number')
numeric_variables.columns
Index(['id', 'year', 'month', 'day', 'hour', 'minute', 'second', 'focal_depth', 'eq_primary', 'eq_mag_mw', 'eq_mag_ms', 'eq_mag_mb', 'intensity', 'latitude', 'longitude', 'injuries', 'damage_millions_dollars', 'total_injuries', 'total_damage_millions_dollars'], dtype='object')
# Keep only the object-dtype (non-numeric) columns. The builtin `object` is
# used because the `np.object` alias was deprecated in NumPy 1.20 and removed
# in NumPy 1.24, where referencing it raises AttributeError.
object_variables = data.select_dtypes(include=[object])
object_variables.columns
Index(['flag_tsunami', 'country', 'state', 'location_name', 'region_code', 'injuries_description', 'damage_description', 'total_injuries_description', 'total_damage_description'], dtype='object')
# Frequency of each injuries_description value; dropna=False keeps a row for
# missing values too, if any remain.
counts = data['injuries_description'].value_counts(dropna=False)
counts
NA 4723 1 666 3 347 2 193 4 143 Name: injuries_description, dtype: int64
# Frequency of each damage_description value, ordered by category label
# rather than by count.
counts = data['damage_description'].value_counts().sort_index()
# Restrict the plot to rows where both the injury count and the primary
# magnitude are present.
has_both_values = data.injuries.notnull() & data.eq_primary.notnull()
data_to_plot = data[has_both_values]

# Scatter of injuries against primary magnitude.
plt.figure(figsize=(12, 9))
plt.scatter(data_to_plot.eq_primary, data_to_plot.injuries)
plt.xlabel('Primary earthquake magnitude')
plt.ylabel('No. of injuries')
plt.show()