import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)
# Load the datasets
train = pd.read_csv("train.csv")   # labelled training data (contains "Survived")
holdout = pd.read_csv("test.csv")  # unlabelled set used for the Kaggle submission
# Display the head of train
display(train.head())
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# %load functions.py
def process_missing(df, fare_fill=None):
    """Handle missing Fare and Embarked values in the data set.

    Args:
        df: dataframe to clean (modified in place and returned)
        fare_fill: value used for missing fares; when None (the default)
            the mean fare of the module-level ``train`` dataframe is used,
            so the holdout set is imputed with training-set statistics.
    Returns:
        df: the dataframe with Fare/Embarked NaNs filled
    Usage
    ------
    holdout = process_missing(holdout)
    """
    if fare_fill is None:
        # Deliberately uses the global `train`: holdout must be filled
        # with statistics computed on the training data only.
        fare_fill = train["Fare"].mean()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    df["Embarked"] = df["Embarked"].fillna("S")
    return df
def process_age(df):
    """Bin the Age column into pre-defined labelled categories.

    Missing ages are filled with -0.5 so they fall into their own
    "Missing" bin before cutting.
    Usage
    ------
    train = process_age(train)
    """
    df["Age"] = df["Age"].fillna(-0.5)
    edges = [-1, 0, 5, 12, 18, 35, 60, 100]
    labels = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]
    df["Age_categories"] = pd.cut(df["Age"], edges, labels=labels)
    return df
def process_fare(df):
    """Bin the Fare column into pre-defined labelled price ranges.
    Usage
    ------
    train = process_fare(train)
    """
    edges = [-1, 12, 50, 100, 1000]
    names = ["0-12", "12-50", "50-100", "100+"]
    df["Fare_categories"] = pd.cut(df["Fare"], edges, labels=names)
    return df
def process_cabin(df):
    """Reduce the Cabin column to its deck letter, then drop it.

    The first character of the cabin code becomes ``Cabin_type``;
    missing cabins become "Unknown".
    Usage
    ------
    train = process_cabin(train)
    """
    df["Cabin_type"] = df["Cabin"].str[0]
    df["Cabin_type"] = df["Cabin_type"].fillna("Unknown")
    df = df.drop('Cabin', axis=1)
    return df
def process_titles(df):
    """Extract and categorize the title from the name column.

    Titles not present in the mapping produce NaN in ``Title``.
    Usage
    ------
    train = process_titles(train)
    """
    titles = {
        "Mr" : "Mr",
        "Mme": "Mrs",
        "Ms": "Mrs",
        "Mrs" : "Mrs",
        "Master" : "Master",
        "Mlle": "Miss",
        "Miss" : "Miss",
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Dr": "Officer",
        "Rev": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir" : "Royalty",
        "Countess": "Royalty",
        "Dona": "Royalty",
        "Lady" : "Royalty"
    }
    # Raw string: '\.' in a plain string is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+); the pattern itself is unchanged.
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df
def create_dummies(df, column_name):
    """Create dummy columns (one-hot encoding) from a single column.
    Usage
    ------
    train = create_dummies(train,"Age")
    """
    encoded = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, encoded], axis=1)
def process_dataframe(df):
    """
    Run the full cleaning / feature-engineering pipeline on a dataframe.
    Args:
        df: the dataframe to transform
    Returns:
        df: the dataframe after the modifications have been made
    """
    # Apply each transformation step in order
    for step in (process_missing, process_age, process_fare, process_titles, process_cabin):
        df = step(df)
    # One-hot encode every categorical feature
    for col in ["Age_categories","Fare_categories", "Title","Cabin_type","Sex"]:
        df = create_dummies(df, col)
    # Drop one dummy per category to avoid collinearity
    redundant = ["Age_categories_Missing", "Fare_categories_100+", "Title_Royalty", "Cabin_type_Unknown", "Sex_male"]
    df.drop(redundant, axis=1, inplace=True)
    return df
# Apply the function to both dataframes
# (process_missing fills holdout fares with the *train* mean, so the
# holdout set is imputed with training-set statistics)
train = process_dataframe(train)
holdout = process_dataframe(holdout)
# Display the results
display(holdout.head())
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_categories | Fare_categories | Title | Cabin_type | Age_categories_Infant | Age_categories_Child | Age_categories_Teenager | Age_categories_Young Adult | Age_categories_Adult | Age_categories_Senior | Fare_categories_0-12 | Fare_categories_12-50 | Fare_categories_50-100 | Title_Master | Title_Miss | Title_Mr | Title_Mrs | Title_Officer | Cabin_type_A | Cabin_type_B | Cabin_type_C | Cabin_type_D | Cabin_type_E | Cabin_type_F | Cabin_type_G | Sex_female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | Q | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | S | Adult | 0-12 | Mrs | Unknown | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | Q | Senior | 0-12 | Mr | Unknown | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | S | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | S | Young Adult | 12-50 | Mrs | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
features = ["SibSp", "Parch"]
features_dict = {"SibSp": "siblings / spouses",
                 "Parch": "parents / children"}
# Display the type of the features
display(train[features].dtypes)
# Grid of plots: top row = value histograms, bottom row = survival rate per value
plt.figure(figsize=(10, 7))
# Keep the pivot tables in a plain dict instead of injecting dynamically
# named globals through vars() — easier to read and debug.
pivots = {}
for id_val, feature in enumerate(features):
    # Generate a histogram of the feature
    plt.subplot(2, 2, id_val + 1)
    plt.hist(train[feature], bins=np.arange(10) - .5)
    plt.xticks(range(9))
    plt.xlim([-1, 9])
    plt.xlabel("# of " + features_dict[feature] + " aboard the Titanic")
    plt.ylim([0, 700])
    plt.ylabel("# of observations")
    # Create a pivot table (survival proportion per value) and plot it
    pivots[feature] = train.pivot_table(index=feature, values="Survived")
    plt.subplot(2, 2, id_val + 3)
    plt.bar(x=pivots[feature].index, height=pivots[feature]["Survived"])
    plt.xticks(range(9))
    plt.xlim([-1, 9])
    plt.xlabel("# of " + features_dict[feature] + " aboard the Titanic")
    plt.ylim([0, 1])
    plt.ylabel("proportion of survivors")
plt.show()
SibSp int64 Parch int64 dtype: object
The features "SibSp" and "Parch" could be added together to create a feature that indicates the family size. Therefore, we could differentiate people traveling with their families from those traveling without them.
# Generate a feature "family_size" that adds "SibSp" and "Parch" features
for frame in (train, holdout):
    frame["family_size"] = frame[["SibSp", "Parch"]].sum(axis=1)
# Side-by-side plots: family-size histogram and survival rate per size
plt.figure(figsize=(10, 3))
plt.subplot(1, 2, 1)
plt.hist(train["family_size"], bins=np.arange(10) - .5)
plt.xticks(range(9))
plt.xlim([-1, 9])
plt.xlabel("# of family members aboard the Titanic")
plt.ylim([0, 700])
plt.ylabel("# of observations")
# Survival proportion per family size
plt.subplot(1, 2, 2)
surv_by_family = train.pivot_table(index="family_size", values="Survived")
plt.bar(surv_by_family.index, height=surv_by_family["Survived"])
plt.xticks(range(9))
plt.xlim([-1, 9])
plt.xlabel("# of family members aboard the Titanic")
plt.ylim([0, 1])
plt.ylabel("proportion of survivors")
plt.show()
The data above shows that most people (500+ observations) didn't have any family members aboard the Titanic. Also, these people's chances of surviving seemed lower than people traveling with their families.
It might appear that people traveling with 4+ family members had even lower chances of surviving, but due to the low number of people in these groups, we shouldn't trust this effect just based on this data.
# Generate a new feature "is_alone" for train and holdout datasets
def isalone(df):
    """
    Add a new feature "is_alone" that equals 1 if the passenger has NO
    family members aboard (family_size == 0), 0 otherwise.
    (The previous docstring stated the inverse of the actual behavior.)
    Args:
        df: dataframe with the feature "family_size" in it
    Returns:
        df: modified dataframe
    """
    # Vectorized comparison instead of a per-row apply/lambda
    df["is_alone"] = (df["family_size"] == 0).astype(int)
    return df
# isalone mutates the frame in place; the returned value is unused here
isalone(train)
isalone(holdout)
# Display the results
display(train.head())
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_categories | Fare_categories | Title | Cabin_type | Age_categories_Infant | Age_categories_Child | Age_categories_Teenager | Age_categories_Young Adult | Age_categories_Adult | Age_categories_Senior | Fare_categories_0-12 | Fare_categories_12-50 | Fare_categories_50-100 | Title_Master | Title_Miss | Title_Mr | Title_Mrs | Title_Officer | Cabin_type_A | Cabin_type_B | Cabin_type_C | Cabin_type_D | Cabin_type_E | Cabin_type_F | Cabin_type_G | Cabin_type_T | Sex_female | family_size | is_alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | Adult | 50-100 | Mrs | C | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | Young Adult | 0-12 | Miss | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | Young Adult | 50-100 | Mrs | C | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | Young Adult | 0-12 | Mr | Unknown | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
def select_features(df):
    """
    Uses RFECV with a Random Forest Classifier to select the optimal columns.
    Args:
        df: dataframe to estimate best columns from (must contain "Survived")
    Returns:
        optimized_columns: Index of column names that the selector finds optimal
    """
    features = ['Pclass', 'Age_categories_Infant', 'Age_categories_Child', 'Age_categories_Teenager',
                'Age_categories_Young Adult', 'Age_categories_Adult', 'Age_categories_Senior', 'Fare_categories_0-12',
                'Fare_categories_12-50', 'Fare_categories_50-100', 'Title_Master', 'Title_Miss', 'Title_Mr',
                'Title_Mrs', 'Title_Officer', 'Cabin_type_A', 'Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D', 'Cabin_type_E',
                'Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Sex_female', 'is_alone']
    # Assign the result instead of calling dropna(inplace=True) on the
    # select_dtypes() result — avoids chained-assignment warnings and never
    # mutates the caller's dataframe.
    df = df.select_dtypes(exclude=["object", "category"]).dropna(axis=0)
    all_X = df[features]
    all_y = df["Survived"]
    rf = RandomForestClassifier(random_state=1)
    selector = RFECV(rf, cv=10)
    selector.fit(all_X, all_y)
    optimized_columns = all_X.columns[selector.support_]
    display("The optimal columns to use are:", optimized_columns)
    return optimized_columns
# Apply the function to select the best columns (RFECV on the training set)
best_columns = select_features(train)
'The optimal columns to use are:'
Index(['Pclass', 'Age_categories_Infant', 'Age_categories_Young Adult', 'Fare_categories_0-12', 'Fare_categories_12-50', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Sex_female', 'is_alone'], dtype='object')
def select_model(df, features):
    """
    Grid-search several classifier families and report the best
    hyperparameters and cross-validated score for each.
    Args:
        df: dataframe holding the predictors and the "Survived" target
        features: the predictor variables
    Returns:
        models: list of model dictionaries, each augmented with the best
                parameters, best score and best fitted estimator
    """
    # Split the dataframe into predictors and target
    predictors = df[features]
    target = df["Survived"]
    # One dictionary per candidate model, with the hyperparameter grid to search
    models = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(),
            "hyperparameters":
            {
                "solver": ["newton-cg", "lbfgs", "liblinear"]
            }
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters":
            {
                "n_neighbors": range(1,20,2),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1,2]
            }
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(),
            "hyperparameters":
            {
                "n_estimators": [4, 6, 9],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5]
            }
        }
    ]
    # Grid search each candidate, store the winners, and display the results
    for entry in models:
        display(entry["name"])
        search = GridSearchCV(entry["estimator"], param_grid=entry["hyperparameters"], cv=10)
        search.fit(predictors, target)
        entry["best_params"] = search.best_params_
        entry["best_score"] = search.best_score_
        entry["best_estimator"] = search.best_estimator_
        display(entry["best_params"], entry["best_score"])
    return models
# Grid-search all three candidate models on the selected columns
best_models = select_model(train, best_columns)
'LogisticRegression'
{'solver': 'newton-cg'}
0.8046941323345816
'KNeighborsClassifier'
{'algorithm': 'ball_tree', 'n_neighbors': 13, 'p': 1, 'weights': 'uniform'}
0.819338327091136
'RandomForestClassifier'
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 6}
0.8226716604244693
The displayed info above shows that the best model is the Random Forest, with a cross-validated accuracy score of about 82.3%.
# Final model: Random Forest with the tuned hyperparameters.
# NOTE(review): min_samples_split=2 and n_estimators=4 differ from the
# grid-search best printed above ({'min_samples_split': 3, 'n_estimators': 6})
# — confirm this deviation is intentional before relying on the scores.
rf = RandomForestClassifier(criterion = 'gini',
                            max_depth = 10,
                            max_features = 'log2',
                            min_samples_leaf = 5,
                            min_samples_split = 2,
                            n_estimators = 4,
                            random_state = 1)
rf.fit(train[best_columns], train["Survived"])
RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=5, n_estimators=4, random_state=1)
def save_submission_file(trained_model, columns, name, df=None):
    """
    Write a Kaggle submission CSV from a trained model's predictions.
    Args:
        trained_model: an already-fitted model exposing ``predict``
        columns: feature columns the model was trained on
        name: file name, without the ".csv" extension
        df: dataframe to predict on; defaults to the module-level
            ``holdout`` dataframe (kept for backward compatibility)
    Returns:
        None
    """
    if df is None:
        # Backward compatible: fall back to the global holdout set
        df = holdout
    predictions = trained_model.predict(df[columns])
    submission = pd.DataFrame({
        "PassengerId": df["PassengerId"],
        "Survived": predictions,
    })
    submission.to_csv(name + ".csv", index=False)
    return None
# Apply the function: predict on the holdout set and write "final_submission.csv"
save_submission_file(rf, best_columns, "final_submission")