#!/usr/bin/env python
# coding: utf-8

# # Titanic Dataset: Basic Data Exploration
#
#

# In[1]:


# Import libraries
from matplotlib import pyplot as plt
import sklearn
import pandas as pd
import numpy as np
from scipy.stats import pearsonr


# In[2]:


# Import dataset
train_dataset = pd.read_csv("dataset/train.csv")
# NOTE: the variable name is misspelled ("datset"); it is left unchanged so
# that any code referring to it keeps working.
test_datset = pd.read_csv("dataset/test.csv")


# In[3]:


# Change male/female to numeric classes (female -> 0, male -> 1).
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on the `train_dataset['Sex']` selection is
# chained assignment, which pandas warns about and which leaves the frame
# untouched once copy-on-write is active.
# astype(int) pins a numeric dtype for the comparisons and pearsonr below,
# and keeps the cell safe to re-run on an already encoded column.
train_dataset['Sex'] = (
    train_dataset['Sex'].replace({'female': 0, 'male': 1}).astype(int)
)
train_dataset.head()


# In[4]:


train_dataset.groupby('Survived', as_index=False).describe()


# ## Exploratory Data Analysis

# ### How does Gender affect chances of Survival?

# In[6]:


# Passenger counts per gender; position i of each list holds the number of
# rows with Survived == i.
is_male = train_dataset['Sex'] == 1
is_female = train_dataset['Sex'] == 0
male = [
    int((is_male & (train_dataset['Survived'] == i)).sum()) for i in (0, 1)
]
female = [
    int((is_female & (train_dataset['Survived'] == i)).sum()) for i in (0, 1)
]

ind = np.arange(len(male))
width = 0.3

# Grouped bars: the two genders sit side by side around each Survived tick.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(ind - width/2, male, width, color='SkyBlue', label='Men')
ax.bar(ind + width/2, female, width, color='IndianRed', label='Women')
ax.set_ylabel('Number of people')
ax.set_xlabel('Survived')
ax.set_title('Survivors by gender')
ax.set_xticks(ind)
ax.set_xticklabels(["0", "1"])
ax.legend()
# Show the figure explicitly so it also appears when run as a plain script.
plt.show()

print("Pearson Correlation Test = {}".format(
    pearsonr(train_dataset["Sex"], train_dataset["Survived"])[0]))


# ### How does Class affect chances of Survival?

# In[8]:


uniq_pclass = [1, 2, 3]

survived_mask = train_dataset['Survived'] == 1
# Survivors per passenger class.
Pclass_survivors = [
    int((survived_mask & (train_dataset['Pclass'] == i)).sum())
    for i in uniq_pclass
]
# Despite its name, this holds ALL passengers of each class (survivors or
# not). It is drawn first so the survivors bar overlays it; the part of the
# red bar that stays visible is the passengers who did not survive.
total_survivors = [
    int((train_dataset['Pclass'] == i).sum()) for i in uniq_pclass
]

ind = np.arange(len(Pclass_survivors))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(ind, total_survivors, width, color='IndianRed', label="dead folks")
ax.bar(ind, Pclass_survivors, width, color='SkyBlue', label="survivors")
ax.set_ylabel('Survivors')
ax.set_xlabel('Pclass')
ax.set_title('Amount of Survivors per class')
ax.set_xticks(ind)
ax.set_xticklabels(uniq_pclass)
ax.legend()
plt.show()

print("Pearson Correlation Test = {}".format(
    pearsonr(train_dataset["Pclass"], train_dataset["Survived"])[0]))


# ### The effect of Age on Survival?

# In[9]:


# One single-column frame of ages per outcome. concat(axis=1) aligns them on
# the row index, giving one column per outcome for the stacked histogram.
survived_age = train_dataset.loc[train_dataset['Survived'] == 1, ['Age']]
dead_age = train_dataset.loc[train_dataset['Survived'] == 0, ['Age']]
ages = pd.concat([survived_age, dead_age], axis=1)
ages.columns = ["Alive", "Dead"]

ax = ages.plot(kind='hist', alpha=0.5, stacked=True, bins=15, figsize=(8, 5))
ax.set_xlabel("Age")
ax.set_ylabel("Number of People")
plt.show()

# Drop only the rows whose Age is missing. A bare dropna() would also throw
# away every row with a gap in any other column, so the correlation would be
# computed on an unintended subset of the passengers.
tra = train_dataset.dropna(subset=["Age"])
print("Pearson Correlation Test = {}".format(
    pearsonr(tra["Age"], tra["Survived"])[0]))


# ### Does Fare have an effect on who survives?

# In[10]:


df1 = train_dataset[["Survived", "Age", "Fare"]]

# Plot each outcome as its own series so the legend names both colours;
# a single scatter of mixed colours can only carry one legend label.
fig, d = plt.subplots(figsize=(8, 5))
for outcome, colour, label in ((0, 'r', "Dead"), (1, 'b', "Survived")):
    group = df1.loc[df1['Survived'] == outcome]
    d.scatter(group['Age'], group['Fare'], c=colour, label=label)
d.set_xlabel('Age')
d.set_ylabel('Fare')
d.legend()
plt.show()

# Pearson correlation test: only rows lacking a Fare are excluded, rather
# than every row with a missing value in some unrelated column.
tra = train_dataset.dropna(subset=["Fare"])
print("Pearson Correlation Test = {}".format(
    pearsonr(tra["Fare"], tra["Survived"])[0]))


# ### Embarked?

# In[55]:


# Keep the rows with a known port of embarkation. Only the Embarked column is
# checked: a bare dropna() would also discard every row with a gap in any
# other column. .copy() makes `dropped_` an independent frame so the column
# assignment below is well defined.
dropped_ = train_dataset.dropna(subset=['Embarked']).copy()

# Encode the port letters as integers (S -> 0, C -> 1, Q -> 2). The result is
# assigned back instead of using replace(..., inplace=True) on a column
# selection, which is chained assignment and does not reliably update the
# frame.
dropped_['Embarked'] = dropped_['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
dropped_.head()

# Passengers per port, split by outcome. The masks are built from `dropped_`
# itself so they always line up with the rows being filtered.
df_s = (
    dropped_.loc[dropped_['Survived'] == 1]
    .groupby('Embarked')
    .size()
    .rename('Survived')
    .reset_index()
)
df_d = (
    dropped_.loc[dropped_['Survived'] == 0]
    .groupby('Embarked')
    .size()
    .rename('Dead')
    .reset_index()
)

# An outer merge keeps a port even when one of the two outcomes has no rows
# for it; the resulting gaps are real zero counts.
df_embarked = (
    pd.merge(df_s, df_d, on='Embarked', how='outer')
    .fillna(0)
    .sort_values('Embarked')
)
df_embarked.plot(x="Embarked", kind="bar")
# Show the figure explicitly so it also appears when run as a plain script.
plt.show()