Today we are looking at global shark attacks.
Source: https://www.kaggle.com/teajay/global-shark-attacks
import pandas as pd
# Load the shark-attack records from the local CSV file, decoding it as ISO-8859-1.
df = pd.read_csv("attacks.csv", encoding = "ISO-8859-1")
# Keep only years after 1930; every other row becomes NaN in the new column.
df['Year_cleaned'] = df['Year'].where(df['Year'] > 1930)
# Number of recorded attacks per year, ordered chronologically.
Years_values = (
    df['Year_cleaned']
    .value_counts()
    .sort_index()
)
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot the per-year attack counts held in Years_values.
Years_values.plot(title="Shark Attacks per year")
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f3d262748>
# Age of the attacked people.
# Take the first run of digits from the string 'Age' column; rows without any
# digits end up as NaN once the result is aligned back onto df.
df['Age_Clean'] = (
    df['Age']
    .str.extract("([0-9]+)", expand=False)
    .dropna()
    .astype(int)
)
# Number of victims per distinct age, ordered by age.
age_values = df['Age_Clean'].value_counts().sort_index()
# One dot per age instead of a connected line.
age_values.plot(title="Age of victims", style=".")
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f35404be0>
# Attack counts per activity, restricted to activities with more than 100 records.
(
    df.groupby("Activity")
    .filter(lambda grp: len(grp) > 100)["Activity"]
    .value_counts()
    .plot(kind="bar", color='grey')
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f3533ba90>
# Frequency of each entry in the 'Species ' column (the column name has a trailing space).
sharks = df['Species '].value_counts()
# Horizontal bars for the species entries that occur more than 50 times.
sharks.loc[sharks > 50].plot(kind='barh', color='purple')
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f353722b0>
Fun fact: this plot inspired the makers of Pac-Man to create a popular game.
# Pie chart of victim sex, using only rows recorded as exactly 'M' or 'F'.
(
    df.loc[df['Sex '].isin(['M', 'F']), 'Sex ']
    .value_counts()
    .plot.pie(title="Gender of victims", colors=['yellow', 'black'], table=True)
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f352377b8>
# Bar chart of fatal vs. non-fatal attacks, using only rows recorded as exactly 'Y' or 'N'.
# NOTE(review): the colours follow the order returned by value_counts(), not the labels.
(
    df.loc[df['Fatal (Y/N)'].isin(['Y', 'N']), 'Fatal (Y/N)']
    .value_counts()
    .plot.bar(title="Was the attack fatal?", legend=True, color=['green', 'red'])
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f34f80550>
# Hour of the attack: the first run of digits found in the string 'Time' column.
df['Hour_Clean'] = (
    df['Time']
    .str.extract("([0-9]+)", expand=False)
    .dropna()
    .astype(int)
)
# Values of 25 and above are not usable as an hour, so they are turned into NaN.
df['Hour_Clean'] = df['Hour_Clean'].where(df['Hour_Clean'] < 25)
# Histogram of the remaining hour values.
df['Hour_Clean'].plot.hist(color='lightblue', title="Time of attack.")
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f349e6710>
# Attack counts per country, restricted to countries with more than 100 records.
(
    df.groupby("Country")
    .filter(lambda grp: len(grp) > 100)["Country"]
    .value_counts()
    .plot(kind="barh", color='gold')
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f34ce2588>
# Month of the attack, taken from the number between the first two dots of
# 'Case Number' (presumably formatted like YYYY.MM.DD -- verify against the data).
# The pattern is a raw string: in a plain literal "\." is an invalid escape
# sequence, which newer Python versions warn about.
df['Month'] = (
    df['Case Number']
    .str.extract(r"\.([0-9]+)\.", expand=False)
    .dropna()
    .astype(int)
)
# Keep only calendar months 1-12; every other value becomes NaN.
df['Month'] = df['Month'].where(df['Month'].between(1, 12))
# Attacks per month, stacked by country, for countries with more than 300 records.
(
    df.groupby("Country")
    .filter(lambda grp: len(grp) > 300)
    .groupby(["Country", "Month"])
    .size()
    .unstack(level="Country")
    .plot(kind="bar", stacked=True)
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f34cf9c50>
# Which species entries are reported in which countries.
# First keep species entries with more than 35 records, then keep countries
# that still have more than 20 of the remaining records.
(
    df.groupby("Species ")
    .filter(lambda grp: len(grp) > 35)
    .groupby("Country")
    .filter(lambda grp: len(grp) > 20)
    .groupby(["Country", "Species "])
    .size()
    .unstack(level=0)  # level 0 is "Country": one stacked segment per country
    .plot(
        kind="barh",
        title="Where do what species attack?",
        stacked=True,
        color=['aqua', 'orange', 'blue'],
    )
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f34d06908>
# Numeric fatality flag: 'N' -> 0, 'Y' -> 1; any other entry becomes NaN.
df['Fatal_Binary'] = df['Fatal (Y/N)'].map({'N': 0, 'Y': 1})
# Attack counts per month, with one bar per Fatal_Binary value (0 = non-fatal, 1 = fatal).
(
    df.groupby(["Month", "Fatal_Binary"])
    .size()
    .unstack(level="Fatal_Binary")
    .plot(kind="bar", title="What months are the deadliest?")
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f3457d6a0>
# Deaths caused by each species.
# Only fatal attacks are considered, and only species entries with more than
# 10 fatal records.
(
    df.loc[df["Fatal_Binary"] == 1]
    .groupby("Species ")
    .filter(lambda grp: len(grp) > 10)
    .groupby(["Species ", "Fatal_Binary"])
    .size()
    .unstack(level=0)  # level 0 is "Species ": one bar per species entry
    .plot(kind="bar", title='Death by species', figsize=[10, 5])
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f34658ba8>
WOW, this is a real discovery. Even though most attacks happen while surfing, swimming is the most fatal!!
# Fatal vs. non-fatal counts, stacked by activity, for activities with more than 300 records.
(
    df.groupby("Activity")
    .filter(lambda grp: len(grp) > 300)
    .groupby(["Activity", "Fatal_Binary"])
    .size()
    .unstack(level=0)  # level 0 is "Activity": one stacked segment per activity
    .plot(kind="bar", stacked=True, title="Which activities are most fatal?")
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3f3449e4e0>