#!/usr/bin/env python
# coding: utf-8
# # Global Shark Attacks EDA
# #### Who is the deadliest shark and what activities should you avoid to stay safe?
# Today we are looking at global shark attacks.
# Source: https://www.kaggle.com/teajay/global-shark-attacks
# In[1]:
import pandas as pd
# Load the attack records; the CSV is decoded as ISO-8859-1.
df = pd.read_csv("attacks.csv", encoding="ISO-8859-1")

# Keep only years after 1930. Rows that fail the filter end up as NaN because
# the filtered Series is aligned back onto the frame's index on assignment.
df['Year_cleaned'] = df.loc[df['Year'] > 1930, 'Year']
Years_values = df['Year_cleaned'].value_counts().sort_index()

# `get_ipython` is only defined inside IPython/Jupyter. Guard the magic so the
# exported script does not die with a NameError under a plain interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Line plot: number of recorded attacks for each year.
Years_values.plot(title="Shark Attacks per year")
# ## What is the age of shark victims?
# In[2]:
# Age of the people who were attacked.
# Take the first run of digits from each 'Age' string. Entries without digits
# are dropped before the integer cast and come back as NaN once the result is
# aligned onto the frame.
age_digits = df['Age'].str.extract("([0-9]+)", expand=False)
df['Age_Clean'] = age_digits.dropna().astype(int)

# Number of victims per age, ordered by age, drawn as individual points.
age_values = df['Age_Clean'].value_counts().sort_index()
age_values.plot(style=".", title="Age of victims")
# ## During which activities do Shark Attacks happen?
# In[4]:
# Bar chart of attack counts per activity, limited to activities that appear
# in more than 100 rows.
frequent_activity_rows = df.groupby("Activity").filter(lambda rows: len(rows) > 100)
activity_counts = frequent_activity_rows["Activity"].value_counts()
activity_counts.plot.bar(color='grey')
# ## Which Sharks attack?
# In[5]:
# Attack counts per species (the column name really has a trailing space).
sharks = df['Species '].value_counts()

# Horizontal bars for species with more than 50 recorded attacks.
frequent_sharks = sharks.loc[sharks > 50]
frequent_sharks.plot.barh(color='purple')
# ## Are the victims male or female?
# Funfact: This plot inspired the makers of pacman to create a popular game.
# In[7]:
# Pie chart of victim gender; only rows whose 'Sex ' value is exactly 'M' or
# 'F' are counted. A table of the counts is drawn below the chart.
known_sex = df['Sex '].isin(['M', 'F'])
sex_counts = df.loc[known_sex, "Sex "].value_counts()
sex_counts.plot(
    kind='pie',
    title="Gender of victims",
    colors=['yellow', 'black'],
    table=True,
)
# ## Are Shark Attacks deadly?
# In[14]:
# Bar chart of fatal vs. non-fatal attacks; rows whose 'Fatal (Y/N)' value is
# anything other than 'Y' or 'N' are ignored.
has_clear_outcome = df['Fatal (Y/N)'].isin(['Y', 'N'])
outcome_counts = df.loc[has_clear_outcome, "Fatal (Y/N)"].value_counts()
outcome_counts.plot(
    kind='bar',
    title="Was the attack fatal?",
    legend=True,
    color=['green', 'red'],
)
# ## At what time do Shark Attacks happen?
# In[19]:
# Take the first run of digits from each 'Time' string as the hour; entries
# without digits become NaN after alignment onto the frame.
hour_digits = df['Time'].str.extract("([0-9]+)", expand=False)
df['Hour_Clean'] = hour_digits.dropna().astype(int)

# Blank out values of 25 and above, keeping everything below 25.
df['Hour_Clean'] = df['Hour_Clean'].where(df['Hour_Clean'] < 25)

# Histogram of the remaining hour values.
df['Hour_Clean'].plot(kind='hist', color='lightblue', title="Time of attack.")
# ## Where do Shark Attacks happen?
# In[20]:
# Horizontal bar chart of attack counts per country, limited to countries
# that appear in more than 100 rows.
frequent_country_rows = df.groupby("Country").filter(lambda rows: len(rows) > 100)
country_counts = frequent_country_rows["Country"].value_counts()
country_counts.plot.barh(color='gold')
# ## In the winter or the summer?
# In[24]:
# Month of the attack: the first run of digits enclosed by two dots in
# 'Case Number' (presumably a 'YYYY.MM.DD'-style identifier -- verify against
# the data). The pattern is a raw string so that `\.` is not treated as an
# invalid string escape, which newer Python versions warn about.
case_month = df['Case Number'].str.extract(r"\.([0-9]+)\.", expand=False)
df['Month'] = case_month.dropna().astype(int)

# Keep only values in 1..12; everything else becomes NaN on re-alignment.
df['Month'] = df[(df['Month'] > 0) & (df['Month'] < 13)]['Month']

# Stacked bars of attacks per month, one colour per country, limited to
# countries that appear in more than 300 rows.
df.groupby("Country").filter(lambda x: len(x) > 300)\
    .groupby(["Country", "Month"])\
    .size().to_frame().unstack(level="Country")[0].plot.bar(stacked=True)
# ## Where do the Sharkies live?
# In[26]:
# Restrict to species with more than 35 rows, then to countries that still
# have more than 20 rows after that first filter.
common_species_rows = df.groupby("Species ").filter(lambda rows: len(rows) > 35)
busy_country_rows = common_species_rows.groupby("Country").filter(lambda rows: len(rows) > 20)

# Count attacks per (country, species) and pivot the countries into columns,
# so each bar is a species and each stacked segment a country.
species_by_country = (
    busy_country_rows
    .groupby(["Country", "Species "])
    .size()
    .to_frame()
    .unstack(level=0)[0]
)
species_by_country.plot.barh(
    title="Where do what species attack?",
    stacked=True,
    color=['aqua', 'orange', 'blue'],
)
# ## What months are the deadliest?
# In[30]:
# Build a numeric fatality flag: 'Y' -> 1, 'N' -> 0, anything else -> NaN.
is_yes_or_no = df['Fatal (Y/N)'].isin(['Y', 'N'])
df['Fatal_Binary'] = df.loc[is_yes_or_no, 'Fatal (Y/N)']
fatal_as_text = df['Fatal_Binary'].str.replace('N', "0").str.replace('Y', "1")
df['Fatal_Binary'] = fatal_as_text.dropna().astype(int)

# Grouped bars per month: one bar for each value of the fatality flag.
fatal_by_month = (
    df.groupby(["Month", "Fatal_Binary"])
    .size()
    .to_frame()
    .unstack(level="Fatal_Binary")[0]
)
fatal_by_month.plot.bar(title="What months are the deadliest?")
# ## Who are the killer sharks?
# In[31]:
# Deaths caused per species.
# Only fatal attacks are considered, and only species with more than 10 of them.
fatal_attacks = df[df["Fatal_Binary"] == 1]
deadly_species_rows = fatal_attacks.groupby("Species ").filter(lambda rows: len(rows) > 10)

# Pivot the species into columns so each one gets its own bar.
deaths_by_species = (
    deadly_species_rows
    .groupby(["Species ", "Fatal_Binary"])
    .size()
    .to_frame()
    .unstack(level=0)[0]
)
deaths_by_species.plot.bar(title='Death by species', figsize=[10, 5])
# ## Which activities are most fatal?
#
# WOW, this is a real discovery. Even though most attacks happen while surfing, swimming is the most fatal!!
# In[34]:
# Stacked bars of attack counts per fatality flag, one segment per activity,
# limited to activities that appear in more than 300 rows.
major_activity_rows = df.groupby("Activity").filter(lambda rows: len(rows) > 300)
fatality_by_activity = (
    major_activity_rows
    .groupby(["Activity", "Fatal_Binary"])
    .size()
    .to_frame()
    .unstack(level=0)[0]
)
fatality_by_activity.plot.bar(stacked=True, title="Which activities are most fatal?")
# ## Thanks for reading ❤️
# ###### by http://franz.media/