import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)
# Load the datasets
train = pd.read_csv("train.csv")   # labelled training data (contains "Survived")
holdout = pd.read_csv("test.csv")  # unlabelled set used for the Kaggle submission
# Display the head of train
display(train.head())
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# %load functions.py
def process_missing(df, fare_fill=None):
    """Handle missing Fare and Embarked values in the data set.

    Args:
        df: dataframe to clean (modified in place and returned)
        fare_fill: value used for missing fares; when None (the default)
            the mean fare of the module-level ``train`` dataframe is used,
            so the holdout set is imputed with training-set statistics.
    Returns:
        df: the dataframe with Fare/Embarked NaNs filled
    Usage
    ------
    holdout = process_missing(holdout)
    """
    if fare_fill is None:
        # Deliberately uses the global `train`: holdout must be filled
        # with statistics computed on the training data only.
        fare_fill = train["Fare"].mean()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    df["Embarked"] = df["Embarked"].fillna("S")
    return df
def process_age(df):
    """Bin the Age column into pre-defined labelled categories.

    Missing ages are filled with -0.5 so they fall into their own
    "Missing" bin before cutting.
    Usage
    ------
    train = process_age(train)
    """
    df["Age"] = df["Age"].fillna(-0.5)
    edges = [-1, 0, 5, 12, 18, 35, 60, 100]
    labels = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]
    df["Age_categories"] = pd.cut(df["Age"], edges, labels=labels)
    return df
def process_fare(df):
    """Bin the Fare column into pre-defined labelled price ranges.
    Usage
    ------
    train = process_fare(train)
    """
    edges = [-1, 12, 50, 100, 1000]
    names = ["0-12", "12-50", "50-100", "100+"]
    df["Fare_categories"] = pd.cut(df["Fare"], edges, labels=names)
    return df
def process_cabin(df):
    """Reduce the Cabin column to its deck letter, then drop it.

    The first character of the cabin code becomes ``Cabin_type``;
    missing cabins become "Unknown".
    Usage
    ------
    train = process_cabin(train)
    """
    df["Cabin_type"] = df["Cabin"].str[0]
    df["Cabin_type"] = df["Cabin_type"].fillna("Unknown")
    df = df.drop('Cabin', axis=1)
    return df
def process_titles(df):
    """Extract and categorize the title from the name column.

    Titles not present in the mapping produce NaN in ``Title``.
    Usage
    ------
    train = process_titles(train)
    """
    titles = {
        "Mr" : "Mr",
        "Mme": "Mrs",
        "Ms": "Mrs",
        "Mrs" : "Mrs",
        "Master" : "Master",
        "Mlle": "Miss",
        "Miss" : "Miss",
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Dr": "Officer",
        "Rev": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir" : "Royalty",
        "Countess": "Royalty",
        "Dona": "Royalty",
        "Lady" : "Royalty"
    }
    # Raw string: '\.' in a plain string is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+); the pattern itself is unchanged.
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df
def create_dummies(df, column_name):
    """Create dummy columns (one-hot encoding) from a single column.
    Usage
    ------
    train = create_dummies(train,"Age")
    """
    encoded = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, encoded], axis=1)
def process_dataframe(df):
    """
    Run the full cleaning / feature-engineering pipeline on a dataframe.
    Args:
        df: the dataframe to transform
    Returns:
        df: the dataframe after the modifications have been made
    """
    # Apply each transformation step in order
    for step in (process_missing, process_age, process_fare, process_titles, process_cabin):
        df = step(df)
    # One-hot encode every categorical feature
    for col in ["Age_categories","Fare_categories", "Title","Cabin_type","Sex"]:
        df = create_dummies(df, col)
    # Drop one dummy per category to avoid collinearity
    redundant = ["Age_categories_Missing", "Fare_categories_100+", "Title_Royalty", "Cabin_type_Unknown", "Sex_male"]
    df.drop(redundant, axis=1, inplace=True)
    return df
# Apply the function to both dataframes
# (process_missing fills holdout fares with the *train* mean, so the
# holdout set is imputed with training-set statistics)
train = process_dataframe(train)
holdout = process_dataframe(holdout)
# Display the results
display(holdout.head())
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_categories | Fare_categories | Title | Cabin_type | Age_categories_Infant | Age_categories_Child | Age_categories_Teenager | Age_categories_Young Adult | Age_categories_Adult | Age_categories_Senior | Fare_categories_0-12 | Fare_categories_12-50 | Fare_categories_50-100 | Title_Master | Title_Miss | Title_Mr | Title_Mrs | Title_Officer | Cabin_type_A | Cabin_type_B | Cabin_type_C | Cabin_type_D | Cabin_type_E | Cabin_type_F | Cabin_type_G | Sex_female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | Q | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | S | Adult | 0-12 | Mrs | Unknown | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | Q | Senior | 0-12 | Mr | Unknown | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | S | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | S | Young Adult | 12-50 | Mrs | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
features = ["SibSp", "Parch"]
features_dict = {"SibSp": "siblings / spouses",
                 "Parch": "parents / children"}
# Display the type of the features
display(train[features].dtypes)
# Grid of plots: top row = value histograms, bottom row = survival rate per value
plt.figure(figsize=(10, 7))
# Keep the pivot tables in a plain dict instead of injecting dynamically
# named globals through vars() — easier to read and debug.
pivots = {}
for id_val, feature in enumerate(features):
    # Generate a histogram of the feature
    plt.subplot(2, 2, id_val + 1)
    plt.hist(train[feature], bins=np.arange(10) - .5)
    plt.xticks(range(9))
    plt.xlim([-1, 9])
    plt.xlabel("# of " + features_dict[feature] + " aboard the Titanic")
    plt.ylim([0, 700])
    plt.ylabel("# of observations")
    # Create a pivot table (survival proportion per value) and plot it
    pivots[feature] = train.pivot_table(index=feature, values="Survived")
    plt.subplot(2, 2, id_val + 3)
    plt.bar(x=pivots[feature].index, height=pivots[feature]["Survived"])
    plt.xticks(range(9))
    plt.xlim([-1, 9])
    plt.xlabel("# of " + features_dict[feature] + " aboard the Titanic")
    plt.ylim([0, 1])
    plt.ylabel("proportion of survivors")
plt.show()
SibSp int64 Parch int64 dtype: object
The features "SibSp" and "Parch" could be added together to create a feature that indicates the family size. Therefore, we could differentiate people traveling with their families from those traveling without them.
# Generate a feature "family_size" that adds "SibSp" and "Parch" features
for frame in (train, holdout):
    frame["family_size"] = frame[["SibSp", "Parch"]].sum(axis=1)
# Side-by-side plots: family-size histogram and survival rate per size
plt.figure(figsize=(10, 3))
plt.subplot(1, 2, 1)
plt.hist(train["family_size"], bins=np.arange(10) - .5)
plt.xticks(range(9))
plt.xlim([-1, 9])
plt.xlabel("# of family members aboard the Titanic")
plt.ylim([0, 700])
plt.ylabel("# of observations")
# Survival proportion per family size
plt.subplot(1, 2, 2)
surv_by_family = train.pivot_table(index="family_size", values="Survived")
plt.bar(surv_by_family.index, height=surv_by_family["Survived"])
plt.xticks(range(9))
plt.xlim([-1, 9])
plt.xlabel("# of family members aboard the Titanic")
plt.ylim([0, 1])
plt.ylabel("proportion of survivors")
plt.show()
The data above shows that most people (500+ observations) didn't have any family members aboard the Titanic. Also, these people's chances of surviving seemed lower than people traveling with their families.
It might appear that people traveling with 4+ family members had even lower chances of surviving, but due to the low number of people in these groups, we shouldn't trust this effect just based on this data.
# Generate a new feature "is_alone" for train and holdout datasets
def isalone(df):
    """
    Add a new feature "is_alone" that equals 1 if the passenger has NO
    family members aboard (family_size == 0), 0 otherwise.
    (The previous docstring stated the inverse of the actual behavior.)
    Args:
        df: dataframe with the feature "family_size" in it
    Returns:
        df: modified dataframe
    """
    # Vectorized comparison instead of a per-row apply/lambda
    df["is_alone"] = (df["family_size"] == 0).astype(int)
    return df
# isalone mutates the frame in place; the returned value is unused here
isalone(train)
isalone(holdout)
# Display the results
display(train.head())
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_categories | Fare_categories | Title | Cabin_type | Age_categories_Infant | Age_categories_Child | Age_categories_Teenager | Age_categories_Young Adult | Age_categories_Adult | Age_categories_Senior | Fare_categories_0-12 | Fare_categories_12-50 | Fare_categories_50-100 | Title_Master | Title_Miss | Title_Mr | Title_Mrs | Title_Officer | Cabin_type_A | Cabin_type_B | Cabin_type_C | Cabin_type_D | Cabin_type_E | Cabin_type_F | Cabin_type_G | Cabin_type_T | Sex_female | family_size | is_alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | Adult | 50-100 | Mrs | C | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | Young Adult | 0-12 | Miss | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | Young Adult | 50-100 | Mrs | C | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
def select_features(df):
    """
    Uses RFECV with a Random Forest Classifier to select the optimal columns.
    Args:
        df: dataframe to estimate best columns from (must contain "Survived")
    Returns:
        optimized_columns: Index of column names that the selector finds optimal
    """
    features = ['Pclass', 'Age_categories_Infant', 'Age_categories_Child', 'Age_categories_Teenager',
                'Age_categories_Young Adult', 'Age_categories_Adult', 'Age_categories_Senior', 'Fare_categories_0-12',
                'Fare_categories_12-50', 'Fare_categories_50-100', 'Title_Master', 'Title_Miss', 'Title_Mr',
                'Title_Mrs', 'Title_Officer', 'Cabin_type_A', 'Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D', 'Cabin_type_E',
                'Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Sex_female', 'is_alone']
    # Assign the result instead of calling dropna(inplace=True) on the
    # select_dtypes() result — avoids chained-assignment warnings and never
    # mutates the caller's dataframe.
    df = df.select_dtypes(exclude=["object", "category"]).dropna(axis=0)
    all_X = df[features]
    all_y = df["Survived"]
    rf = RandomForestClassifier(random_state=1)
    selector = RFECV(rf, cv=10)
    selector.fit(all_X, all_y)
    optimized_columns = all_X.columns[selector.support_]
    display("The optimal columns to use are:", optimized_columns)
    return optimized_columns
# Apply the function to select the best columns (RFECV on the training set)
best_columns = select_features(train)
'The optimal columns to use are:'
Index(['Pclass', 'Age_categories_Infant', 'Age_categories_Young Adult', 'Fare_categories_0-12', 'Fare_categories_12-50', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Sex_female', 'is_alone'], dtype='object')
def select_model(df, features):
    """
    Grid-search several classifier families and report the best
    hyperparameters and cross-validated score for each.
    Args:
        df: dataframe holding the predictors and the "Survived" target
        features: the predictor variables
    Returns:
        models: list of model dictionaries, each augmented with the best
                parameters, best score and best fitted estimator
    """
    # Split the dataframe into predictors and target
    predictors = df[features]
    target = df["Survived"]
    # One dictionary per candidate model, with the hyperparameter grid to search
    models = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(),
            "hyperparameters":
            {
                "solver": ["newton-cg", "lbfgs", "liblinear"]
            }
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters":
            {
                "n_neighbors": range(1,20,2),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1,2]
            }
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters":
            {
                "n_estimators": [4, 6, 9],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5]
            }
        }
    ]
    # Grid search each candidate, store the winners, and display the results
    for entry in models:
        display(entry["name"])
        search = GridSearchCV(entry["estimator"], param_grid=entry["hyperparameters"], cv=10)
        search.fit(predictors, target)
        entry["best_params"] = search.best_params_
        entry["best_score"] = search.best_score_
        entry["best_estimator"] = search.best_estimator_
        display(entry["best_params"], entry["best_score"])
    return models
# Grid-search all three candidate models on the selected columns
best_models = select_model(train, best_columns)
'LogisticRegression'
{'solver': 'newton-cg'}
0.8046941323345816
'KNeighborsClassifier'
{'algorithm': 'ball_tree', 'n_neighbors': 13, 'p': 1, 'weights': 'uniform'}
0.819338327091136
'RandomForestClassifier'
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 6}
0.8226716604244693
The displayed info above shows that the best model is the Random Forest, with a cross-validated accuracy score of about 82.3%.
# Final model: Random Forest with the tuned hyperparameters.
# NOTE(review): min_samples_split=2 and n_estimators=4 differ from the
# grid-search best printed above ({'min_samples_split': 3, 'n_estimators': 6})
# — confirm this deviation is intentional before relying on the scores.
rf = RandomForestClassifier(criterion = 'gini',
                            max_depth = 10,
                            max_features = 'log2',
                            min_samples_leaf = 5,
                            min_samples_split = 2,
                            n_estimators = 4,
                            random_state = 1)
rf.fit(train[best_columns], train["Survived"])
RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=5, n_estimators=4, random_state=1)
def save_submission_file(trained_model, columns, name, df=None):
    """
    Write a Kaggle submission CSV from a trained model's predictions.
    Args:
        trained_model: an already-fitted model exposing ``predict``
        columns: feature columns the model was trained on
        name: file name, without the ".csv" extension
        df: dataframe to predict on; defaults to the module-level
            ``holdout`` dataframe (kept for backward compatibility)
    Returns:
        None
    """
    if df is None:
        # Backward compatible: fall back to the global holdout set
        df = holdout
    predictions = trained_model.predict(df[columns])
    submission = pd.DataFrame({
        "PassengerId": df["PassengerId"],
        "Survived": predictions,
    })
    submission.to_csv(name + ".csv", index=False)
    return None
# Apply the function: predict on the holdout set and write "final_submission.csv"
save_submission_file(rf, best_columns, "final_submission")