#!/usr/bin/env python
# coding: utf-8
"""Global Shark Attacks EDA.

Who is the deadliest shark, and which activities should you avoid to stay
safe?  This script explores a data set of global shark attacks.

Source: https://www.kaggle.com/teajay/global-shark-attacks

The code started life as a Jupyter notebook.  The data cleaning lives in
small helper functions; reading the CSV and drawing the plots only
happens when the file is run as a script (or via ``%run`` in IPython).
"""

import pandas as pd

# First run of digits found anywhere in a free-text field.
FIRST_NUMBER = r"([0-9]+)"
# First run of digits enclosed by two dots in the case number; the analysis
# treats it as the month (presumably case numbers look like YYYY.MM.DD --
# TODO confirm against the data set).
CASE_NUMBER_MONTH = r"\.([0-9]+)\."


def extract_int(text: pd.Series, pattern: str = FIRST_NUMBER) -> pd.Series:
    """Return the first capture of *pattern* in each string as a number.

    Rows that are missing or do not match become NaN; the result keeps the
    index of *text* so it can be stored as a new column.
    """
    found = text.str.extract(pattern, expand=False)
    return found.dropna().astype(int).reindex(text.index)


def encode_fatal(flags: pd.Series) -> pd.Series:
    """Map the fatality flag to 1 ('Y') / 0 ('N'); anything else is NaN."""
    return flags.map({"N": 0, "Y": 1})


def frequent_values(column: pd.Series, min_count: int) -> pd.Series:
    """Return the value counts of *column* that are above *min_count*."""
    counts = column.value_counts()
    return counts[counts > min_count]


def rows_in_frequent_groups(
    frame: pd.DataFrame, key: str, min_count: int
) -> pd.DataFrame:
    """Return the rows whose *key* value occurs more than *min_count* times.

    Rows with a missing *key* are dropped, and the original row order is
    kept.
    """
    counts = frame[key].value_counts()
    # Missing keys map to NaN, and NaN > min_count is False.
    return frame[frame[key].map(counts) > min_count]


def add_clean_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *frame* with the derived analysis columns added.

    Adds ``Year_cleaned``, ``Age_Clean``, ``Hour_Clean``, ``Month`` and
    ``Fatal_Binary``.  Values that cannot be parsed or fall outside the
    accepted range are NaN.
    """
    cleaned = frame.copy()

    # Only attacks after 1930 are kept for the per-year chart.
    cleaned["Year_cleaned"] = cleaned["Year"].where(cleaned["Year"] > 1930)

    # Note: age of the people who were attacked.
    cleaned["Age_Clean"] = extract_int(cleaned["Age"])

    # The first number in the time text is read as the hour; 25 and above
    # is discarded.
    hour = extract_int(cleaned["Time"])
    cleaned["Hour_Clean"] = hour.where(hour < 25)

    month = extract_int(cleaned["Case Number"], CASE_NUMBER_MONTH)
    cleaned["Month"] = month.where((month > 0) & (month < 13))

    cleaned["Fatal_Binary"] = encode_fatal(cleaned["Fatal (Y/N)"])
    return cleaned


def enable_inline_plots() -> bool:
    """Switch on ``%matplotlib inline`` when running inside IPython.

    Returns True when the magic was run, and False when no IPython shell
    is available (for example under a plain ``python`` interpreter), so
    the script no longer fails with NameError there.
    """
    try:
        shell = get_ipython()  # noqa: F821 -- injected by IPython only
    except NameError:
        return False
    if shell is None:
        return False
    shell.run_line_magic("matplotlib", "inline")
    return True


def plot_report(df: pd.DataFrame) -> None:
    """Draw every chart of the analysis from the cleaned frame *df*."""
    # Shark attacks per year.
    yearly = df["Year_cleaned"].value_counts().sort_index()
    yearly.plot(title="Shark Attacks per year")

    # ## What is the age of shark victims?
    ages = df["Age_Clean"].value_counts().sort_index()
    ages.plot(style=".", title="Age of victims")

    # ## During which activities do shark attacks happen?
    frequent_values(df["Activity"], 100).plot.bar(color="grey")

    # ## What sharks attack?
    frequent_values(df["Species "], 50).plot.barh(color="purple")

    # ## Are the victims male or female?
    # Fun fact: this plot inspired the makers of Pac-Man to create a
    # popular game.
    sex = df["Sex "]
    sex[sex.isin(["M", "F"])].value_counts().plot(
        kind="pie",
        title="Gender of victims",
        colors=["yellow", "black"],
        table=True,
    )

    # ## Are shark attacks deadly?
    fatal = df["Fatal (Y/N)"]
    fatal[fatal.isin(["Y", "N"])].value_counts().plot(
        kind="bar",
        title="Was the attack fatal?",
        legend=True,
        color=["green", "red"],
    )

    # ## At what time are shark attacks happening?
    df["Hour_Clean"].plot(
        kind="hist", color="lightblue", title="Time of attack."
    )

    # ## Where do shark attacks happen?
    frequent_values(df["Country"], 100).plot.barh(color="gold")

    # ## In the winter or the summer?
    (
        rows_in_frequent_groups(df, "Country", 300)
        .groupby(["Country", "Month"])
        .size()
        .unstack(level="Country")
        .plot.bar(stacked=True)
    )

    # ## Where do the sharks live?
    # The country filter is applied to the rows that survive the species
    # filter, not to the full data set.
    common_species = rows_in_frequent_groups(df, "Species ", 35)
    (
        rows_in_frequent_groups(common_species, "Country", 20)
        .groupby(["Country", "Species "])
        .size()
        .unstack(level=0)
        .plot.barh(
            title="Where do what species attack?",
            stacked=True,
            color=["aqua", "orange", "blue"],
        )
    )

    # ## What months are the deadliest?
    (
        df.groupby(["Month", "Fatal_Binary"])
        .size()
        .unstack(level="Fatal_Binary")
        .plot.bar(title="What months are the deadliest?")
    )

    # ## Who are the killer sharks?
    # Note: deaths caused by each species.
    (
        rows_in_frequent_groups(df[df["Fatal_Binary"] == 1], "Species ", 10)
        .groupby(["Species ", "Fatal_Binary"])
        .size()
        .unstack(level=0)
        .plot.bar(title="Death by species", figsize=[10, 5])
    )

    # ## Which activities are most fatal?
    #
    # WOW, this is a real discovery. Even though most attacks happen while
    # surfing, swimming is the most fatal!!
    (
        rows_in_frequent_groups(df, "Activity", 300)
        .groupby(["Activity", "Fatal_Binary"])
        .size()
        .unstack(level=0)
        .plot.bar(stacked=True, title="Which activities are most fatal?")
    )


def main(csv_path: str = "attacks.csv") -> pd.DataFrame:
    """Load the attacks CSV, clean it, draw all charts, return the frame."""
    enable_inline_plots()
    attacks = add_clean_columns(pd.read_csv(csv_path, encoding="ISO-8859-1"))
    plot_report(attacks)
    return attacks


if __name__ == "__main__":
    # Keep ``df`` bound at module level so it can be explored interactively
    # after ``%run``.
    df = main()

# ## Thanks for reading ❤️
# ###### by http://franz.media/