%matplotlib inline
from pyspark.sql import SparkSession
import pandas as pd
import sys
sys.path.append('..')
from utils.pysparkutils import *
spark = SparkSession.builder.appName("titanic").getOrCreate()
train = spark.read.csv('./train.csv', header="true", inferSchema="true")
test = spark.read.csv('./test.csv', header="true", inferSchema="true")
train.printSchema()
test.printSchema()
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
train.limit(20).toPandas()
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | None | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | None | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | None | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | None | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | None | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | None | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | None | C |
10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
11 | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 113783 | 26.5500 | C103 | S |
12 | 13 | 0 | 3 | Saundercock, Mr. William Henry | male | 20.0 | 0 | 0 | A/5. 2151 | 8.0500 | None | S |
13 | 14 | 0 | 3 | Andersson, Mr. Anders Johan | male | 39.0 | 1 | 5 | 347082 | 31.2750 | None | S |
14 | 15 | 0 | 3 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0 | 0 | 0 | 350406 | 7.8542 | None | S |
15 | 16 | 1 | 2 | Hewlett, Mrs. (Mary D Kingcome) | female | 55.0 | 0 | 0 | 248706 | 16.0000 | None | S |
16 | 17 | 0 | 3 | Rice, Master. Eugene | male | 2.0 | 4 | 1 | 382652 | 29.1250 | None | Q |
17 | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | NaN | 0 | 0 | 244373 | 13.0000 | None | S |
18 | 19 | 0 | 3 | Vander Planke, Mrs. Julius (Emelia Maria Vande... | female | 31.0 | 1 | 0 | 345763 | 18.0000 | None | S |
19 | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | NaN | 0 | 0 | 2649 | 7.2250 | None | C |
In this section we will explore missing data.
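The findMissingValuesCols helper comes from the pysparkutils.py file in the utils directory and is not reproduced here. A rough sketch of what such a helper could look like, counting nulls per column and reporting the fraction missing, is shown below; the actual implementation may differ in its details.

# Sketch of a findMissingValuesCols-style helper (assumption: the real one
# in utils/pysparkutils.py may handle NaNs and its return type differently).
from pyspark.sql import functions as F

def find_missing_fractions(df):
    total = df.count()
    fractions = []
    for column in df.columns:
        missing = df.filter(F.col(column).isNull()).count()
        if missing > 0:
            fractions.append((column, missing / float(total)))
    return fractions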
findMissingValuesCols(train)
[('Age', 0.19865319865319866), ('Cabin', 0.7710437710437711), ('Embarked', 0.002244668911335578)]
Almost 80% of the Cabin column is missing, so we drop that column entirely. Only a handful of rows are missing the Embarked value, so we simply drop those rows. Age is missing in roughly 20% of the rows; rather than dropping them, we impute the missing ages with the median using Spark's Imputer.
from pyspark.ml.feature import Imputer
ageImputer = Imputer(inputCols=['Age'], outputCols=['imputedAge'], strategy='median')
train = train.filter(train.Embarked.isNotNull())
train = train.drop('Cabin')
train.printSchema()
train.count()
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
889
In the next few sections, we explore the training data and the relationship between the individual features and the label. As is well known, most passengers on the Titanic did not survive, and the training data reflects this: only around one-third of the passengers survived (340 of the 889 remaining rows). Survival also varies strongly with passenger sex and class, as the crosstabs below show.
labelCol = 'Survived'
train.groupby(labelCol).count().toPandas()
 | Survived | count |
---|---|---|
0 | 1 | 340 |
1 | 0 | 549 |
train.crosstab(labelCol, 'Sex').toPandas()
 | Survived_Sex | female | male |
---|---|---|---|
0 | 1 | 231 | 109 |
1 | 0 | 81 | 468 |
Pointwise Mutual Information (PMI) is a useful metric for exploring the relationship between two categorical features. PMI assigns a scalar value to each pair of feature values, quantifying how much knowing one value tells us about the other: $\mathrm{pmi}(x; y) = \log \frac{p(x, y)}{p(x)\,p(y)}$.

PMI can be normalized to the range [-1, +1]; the result is called Normalized PMI: $\mathrm{npmi}(x; y) = \frac{\mathrm{pmi}(x; y)}{-\log p(x, y)}$. The calcNormalizedPointwiseMutualInformation function is implemented in the pysparkutils.py file in the utils directory.
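For reference, a minimal sketch of such a normalized-PMI computation over two categorical columns, using empirical probabilities from the DataFrame, might look like the following; the actual helper may differ in its details and in the shape of its return value.

# Minimal sketch of normalized PMI between two categorical columns
# (assumption: empirical probabilities; the real helper may differ).
import math

def normalized_pmi_sketch(df, col1, col2):
    n = float(df.count())
    joint = {(row[col1], row[col2]): row['count'] / n
             for row in df.groupby(col1, col2).count().collect()}
    marg1 = {row[col1]: row['count'] / n for row in df.groupby(col1).count().collect()}
    marg2 = {row[col2]: row['count'] / n for row in df.groupby(col2).count().collect()}
    npmis = {}
    for (x, y), pxy in joint.items():
        pmi = math.log(pxy / (marg1[x] * marg2[y]))
        npmis[(x, y)] = pmi / (-math.log(pxy))
    return npmis

# e.g. normalized_pmi_sketch(train, 'Sex', labelCol)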
pmis = calcNormalizedPointwiseMutualInformation(train, 'Sex', labelCol)
toPandasDF(pmis, 'Normalized PMI', 'Sex', labelCol)
 | Sex | Survived | Normalized PMI |
---|---|---|---|
2 | female | 0 | -0.361721 |
1 | female | 1 | 0.490151 |
0 | male | 0 | 0.424895 |
3 | male | 1 | -0.336078 |
train.crosstab(labelCol, 'Pclass').toPandas()
 | Survived_Pclass | 1 | 2 | 3 |
---|---|---|---|---|
0 | 1 | 134 | 87 | 119 |
1 | 0 | 80 | 97 | 372 |
pmis = calcNormalizedPointwiseMutualInformation(train, 'Pclass', labelCol)
toPandasDF(pmis, 'Normalized PMI', 'Pclass', labelCol)
 | Pclass | Survived | Normalized PMI |
---|---|---|---|
0 | 1 | 0 | -0.208445 |
2 | 1 | 1 | 0.260544 |
4 | 2 | 0 | -0.071421 |
3 | 2 | 1 | 0.091268 |
5 | 3 | 0 | 0.234674 |
1 | 3 | 1 | -0.226840 |
train.crosstab(labelCol, 'Embarked').toPandas()
 | Survived_Embarked | C | Q | S |
---|---|---|---|---|
0 | 1 | 93 | 30 | 217 |
1 | 0 | 75 | 47 | 427 |
pmis = calcNormalizedPointwiseMutualInformation(train, 'Embarked', labelCol)
toPandasDF(pmis, 'Normalized PMI', 'Embarked', labelCol)
 | Embarked | Survived | Normalized PMI |
---|---|---|---|
5 | C | 0 | -0.131229 |
3 | C | 1 | 0.163804 |
4 | Q | 0 | -0.003966 |
0 | Q | 1 | 0.005472 |
1 | S | 0 | 0.096935 |
2 | S | 1 | -0.089810 |
train.crosstab(labelCol, 'SibSp').toPandas()
 | Survived_SibSp | 0 | 1 | 2 | 3 | 4 | 5 | 8 |
---|---|---|---|---|---|---|---|---|
0 | 1 | 208 | 112 | 13 | 4 | 3 | 0 | 0 |
1 | 0 | 398 | 97 | 15 | 12 | 15 | 5 | 7 |
pmis = calcNormalizedPointwiseMutualInformation(train, 'SibSp', labelCol)
toPandasDF(pmis, 'Normalized PMI', 'SibSp', labelCol)
 | SibSp | Survived | Normalized PMI |
---|---|---|---|
4 | 0 | 0 | 0.076614 |
6 | 0 | 1 | -0.074483 |
0 | 1 | 0 | -0.128928 |
3 | 1 | 1 | 0.162829 |
7 | 2 | 0 | -0.034825 |
5 | 2 | 1 | 0.045891 |
10 | 3 | 0 | 0.045135 |
1 | 3 | 1 | -0.078675 |
2 | 4 | 0 | 0.073413 |
11 | 4 | 1 | -0.145939 |
8 | 5 | 0 | 0.093038 |
9 | 8 | 0 | 0.099500 |
train.crosstab(labelCol, 'Parch').toPandas()
 | Survived_Parch | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|---|---|
0 | 1 | 231 | 65 | 40 | 3 | 0 | 1 | 0 |
1 | 0 | 445 | 53 | 40 | 2 | 4 | 4 | 1 |
pmis = calcNormalizedPointwiseMutualInformation(train, 'Parch', labelCol)
toPandasDF(pmis, 'Normalized PMI', 'Parch', labelCol)
 | Parch | Survived | Normalized PMI |
---|---|---|---|
5 | 0 | 0 | 0.092309 |
7 | 0 | 1 | -0.083569 |
0 | 1 | 0 | -0.112913 |
4 | 1 | 1 | 0.139486 |
8 | 2 | 0 | -0.068086 |
6 | 2 | 1 | 0.086419 |
11 | 3 | 0 | -0.071231 |
1 | 3 | 1 | 0.079123 |
3 | 4 | 0 | 0.089196 |
9 | 5 | 0 | 0.047902 |
10 | 5 | 1 | -0.095475 |
2 | 6 | 0 | 0.070986 |
Now we calculate the normalized entropy of the categorical features. Entropy plays a role similar to variance for numeric features: it measures how spread out a feature's values are, so a feature with near-zero entropy is almost constant and carries little information.
columns = ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
entropies = calcNormalizedEntropy(train, *columns)
dictToPandasDF(entropies, 'Feature', 'Entropy')
 | Feature | Entropy |
---|---|---|
0 | Sex | 0.934919 |
1 | Pclass | 0.907245 |
2 | Embarked | 0.692048 |
3 | SibSp | 0.477435 |
4 | Parch | 0.402510 |
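As with the PMI helper, calcNormalizedEntropy is implemented in pysparkutils.py. A plausible sketch, assuming it divides the Shannon entropy of a column's empirical distribution by the log of the number of categories so the result lies in [0, 1], is:

# Plausible sketch of a normalized-entropy helper (assumption: normalized
# by the log of the number of distinct categories).
import math

def normalized_entropy_sketch(df, column):
    n = float(df.count())
    probs = [row['count'] / n for row in df.groupby(column).count().collect()]
    entropy = -sum(p * math.log(p) for p in probs)
    return entropy / math.log(len(probs)) if len(probs) > 1 else 0.0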
Next, we test whether each categorical feature is independent of the label using a chi-square test; a small p-value indicates that the feature and Survived are unlikely to be independent.
from pyspark.ml import Pipeline
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator, StringIndexer, VectorAssembler, VectorIndexer
edaEmbarkedIndexer = StringIndexer(inputCol='Embarked', outputCol='indexedEmbarked')
edaSexIndexer = StringIndexer(inputCol='Sex', outputCol='indexedSex')
edaAgeImputer = Imputer(inputCols=['Age'], outputCols=['imputedAge'], strategy='median')
ageSplits = [0, 16, 32, 48, 64, 200]
edaAgeBucketizer = Bucketizer(splits=ageSplits, inputCol='imputedAge', outputCol='bucketedAge')
fareSplits = [-float('inf'), 7.91, 14.454, 31, float('inf')]
edaFareBucketizer = Bucketizer(splits=fareSplits, inputCol='Fare', outputCol='bucketedFare')
oneHotEncoderEstimator = OneHotEncoderEstimator(inputCols=['indexedSex', 'indexedEmbarked', 'bucketedFare', 'bucketedAge'],
outputCols=['oneHotSex', 'oneHotEmbarked','oneHotFare', 'oneHotAge'])
inputCols=['Pclass', 'oneHotSex', 'oneHotEmbarked','oneHotFare', 'oneHotAge']
edaAssembler = VectorAssembler(inputCols=inputCols, outputCol='features')
pipeline = Pipeline(stages=[edaEmbarkedIndexer, edaSexIndexer, edaAgeImputer, edaAgeBucketizer,
edaFareBucketizer, oneHotEncoderEstimator, edaAssembler])
chiSqTrain = pipeline.fit(train).transform(train)
r = ChiSquareTest.test(chiSqTrain, 'features', 'Survived').head()
print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))
pValues: [0.0,0.0,6.02813466444e-06,4.02603175464e-07,4.93843854699e-11,0.0101897422598,0.0315461645121,4.25298058386e-05,0.00150622342036,0.612884928604,0.116808580457]
degreesOfFreedom: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
statistics: [100.980407261,260.756342249,20.4792462347,25.6818141585,43.2014376768,6.60142083307,4.62299205997,16.755010532,10.0709867693,0.255995235761,2.45959925254]
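The p-values are reported in the same order as the slots of the assembled features vector (Pclass, then the one-hot Sex, Embarked, Fare and Age slots). A small sketch like the one below pairs each p-value with its slot name using the ml_attr metadata that VectorAssembler attaches in Spark 2.x; the exact metadata layout is an assumption and may vary across versions.

# Pair each chi-square p-value with the name of the corresponding slot in
# the assembled 'features' vector (metadata layout assumed as in Spark 2.x).
attrs = chiSqTrain.schema['features'].metadata['ml_attr']['attrs']
slots = sorted((a for group in attrs.values() for a in group), key=lambda a: a['idx'])
for slot, p in zip(slots, r.pValues):
    print('{:<25s} p-value = {:.3g}'.format(slot['name'], p))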
from pyspark.ml.feature import StringIndexer
embarkedIndexer = StringIndexer(inputCol='Embarked', outputCol='indexedEmbarked', handleInvalid='skip')
sexFeatureIndexer = StringIndexer(inputCol='Sex', outputCol='indexedSex', handleInvalid='skip')
from pyspark.ml.feature import Bucketizer
ageSplits = [0, 16, 32, 48, 64, 200]
ageBucketizer = Bucketizer(splits=ageSplits, inputCol='imputedAge', outputCol='bucketedAge', handleInvalid='skip')
fareSplits = [-float('inf'), 7.91, 14.454, 31, float('inf')]
fareBucketizer = Bucketizer(splits=fareSplits, inputCol='Fare', outputCol='bucketedFare', handleInvalid='skip')
from pyspark.ml.feature import OneHotEncoderEstimator, VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
oneHotEncoderEstimator = OneHotEncoderEstimator(inputCols=['indexedSex', 'indexedEmbarked', 'bucketedFare', 'bucketedAge'],
outputCols=['oneHotSex', 'oneHotEmbarked','oneHotFare', 'oneHotAge'])
assembler = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Parch', 'bucketedAge',
'bucketedFare', 'indexedEmbarked', 'indexedSex'], outputCol='features')
rf = RandomForestClassifier(labelCol=labelCol, featuresCol='features')
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[ageImputer, embarkedIndexer, sexFeatureIndexer, ageBucketizer,
fareBucketizer, oneHotEncoderEstimator, assembler, rf])
grid = ParamGridBuilder().addGrid(rf.numTrees, [15, 20, 25, 30])\
.addGrid(rf.maxDepth, [5, 8])\
.build()
cv = CrossValidator(estimator=pipeline,
estimatorParamMaps=grid,
evaluator=BinaryClassificationEvaluator(labelCol=labelCol, metricName='areaUnderROC'),
numFolds=10)
model = cv.fit(train)
train = model.transform(train)
evaluator = model.getEvaluator()
evaluator.evaluate(train)
0.9265509482481523
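Note that this score is computed on the training data itself (we evaluated model.transform(train)). For a held-out view, model.avgMetrics lines up one-to-one with the parameter maps in grid, so the best cross-validated combination can be inspected with a small sketch like this:

# avgMetrics[i] is the mean cross-validated AUC for grid[i]; zipping them
# recovers the best parameter combination found by the grid search.
best_metric, best_params = max(zip(model.avgMetrics, grid), key=lambda t: t[0])
print('best mean AUC over folds: {:.4f}'.format(best_metric))
for param, value in best_params.items():
    print('  {} = {}'.format(param.name, value))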
test = model.transform(test)
test.limit(20).toPandas()
 | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | ... | bucketedAge | bucketedFare | oneHotSex | oneHotEmbarked | oneHotFare | oneHotAge | features | rawPrediction | probability | prediction |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | None | ... | 2.0 | 0.0 | (1.0) | (0.0, 0.0) | (1.0, 0.0, 0.0) | (0.0, 0.0, 1.0, 0.0) | (3.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0) | [29.2960574888, 0.703942511236] | [0.976535249625, 0.0234647503745] | 0.0 |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | None | ... | 2.0 | 0.0 | (0.0) | (1.0, 0.0) | (1.0, 0.0, 0.0) | (0.0, 0.0, 1.0, 0.0) | [3.0, 1.0, 0.0, 2.0, 0.0, 0.0, 1.0] | [17.9363522365, 12.0636477635] | [0.597878407882, 0.402121592118] | 0.0 |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | None | ... | 3.0 | 1.0 | (1.0) | (0.0, 0.0) | (0.0, 1.0, 0.0) | (0.0, 0.0, 0.0, 1.0) | [2.0, 0.0, 0.0, 3.0, 1.0, 2.0, 0.0] | [26.2911066277, 3.70889337228] | [0.876370220924, 0.123629779076] | 0.0 |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | None | ... | 1.0 | 1.0 | (1.0) | (1.0, 0.0) | (0.0, 1.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | (3.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0) | [25.9086506078, 4.09134939223] | [0.863621686926, 0.136378313074] | 0.0 |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | None | ... | 1.0 | 1.0 | (0.0) | (1.0, 0.0) | (0.0, 1.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | [3.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0] | [21.2731321988, 8.72686780122] | [0.709104406626, 0.290895593374] | 0.0 |
5 | 897 | 3 | Svensson, Mr. Johan Cervin | male | 14.0 | 0 | 0 | 7538 | 9.2250 | None | ... | 0.0 | 1.0 | (1.0) | (1.0, 0.0) | (0.0, 1.0, 0.0) | (1.0, 0.0, 0.0, 0.0) | (3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0) | [21.79357844, 8.20642155996] | [0.726452614668, 0.273547385332] | 0.0 |
6 | 898 | 3 | Connolly, Miss. Kate | female | 30.0 | 0 | 0 | 330972 | 7.6292 | None | ... | 1.0 | 0.0 | (0.0) | (0.0, 0.0) | (1.0, 0.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | [3.0, 0.0, 0.0, 1.0, 0.0, 2.0, 1.0] | [6.58546403436, 23.4145359656] | [0.219515467812, 0.780484532188] | 1.0 |
7 | 899 | 2 | Caldwell, Mr. Albert Francis | male | 26.0 | 1 | 1 | 248738 | 29.0000 | None | ... | 1.0 | 2.0 | (1.0) | (1.0, 0.0) | (0.0, 0.0, 1.0) | (0.0, 1.0, 0.0, 0.0) | [2.0, 1.0, 1.0, 1.0, 2.0, 0.0, 0.0] | [26.9219114219, 3.07808857809] | [0.897397047397, 0.102602952603] | 0.0 |
8 | 900 | 3 | Abrahim, Mrs. Joseph (Sophie Halaut Easu) | female | 18.0 | 0 | 0 | 2657 | 7.2292 | None | ... | 1.0 | 0.0 | (0.0) | (0.0, 1.0) | (1.0, 0.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | [3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0] | [5.95376028838, 24.0462397116] | [0.198458676279, 0.801541323721] | 1.0 |
9 | 901 | 3 | Davies, Mr. John Samuel | male | 21.0 | 2 | 0 | A/4 48871 | 24.1500 | None | ... | 1.0 | 2.0 | (1.0) | (1.0, 0.0) | (0.0, 0.0, 1.0) | (0.0, 1.0, 0.0, 0.0) | [3.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0] | [27.683257006, 2.31674299405] | [0.922775233532, 0.0772247664683] | 0.0 |
10 | 902 | 3 | Ilieff, Mr. Ylio | male | NaN | 0 | 0 | 349220 | 7.8958 | None | ... | 1.0 | 0.0 | (1.0) | (1.0, 0.0) | (1.0, 0.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | (3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0) | [27.1796680333, 2.8203319667] | [0.905988934443, 0.0940110655565] | 0.0 |
11 | 903 | 1 | Jones, Mr. Charles Cresson | male | 46.0 | 0 | 0 | 694 | 26.0000 | None | ... | 2.0 | 2.0 | (1.0) | (1.0, 0.0) | (0.0, 0.0, 1.0) | (0.0, 0.0, 1.0, 0.0) | (1.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0) | [14.2653782322, 15.7346217678] | [0.475512607739, 0.524487392261] | 1.0 |
12 | 904 | 1 | Snyder, Mrs. John Pillsbury (Nelle Stevenson) | female | 23.0 | 1 | 0 | 21228 | 82.2667 | B45 | ... | 1.0 | 3.0 | (0.0) | (1.0, 0.0) | (0.0, 0.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | [1.0, 1.0, 0.0, 1.0, 3.0, 0.0, 1.0] | [0.295454545455, 29.7045454545] | [0.00984848484848, 0.990151515152] | 1.0 |
13 | 905 | 2 | Howard, Mr. Benjamin | male | 63.0 | 1 | 0 | 24065 | 26.0000 | None | ... | 3.0 | 2.0 | (1.0) | (1.0, 0.0) | (0.0, 0.0, 1.0) | (0.0, 0.0, 0.0, 1.0) | [2.0, 1.0, 0.0, 3.0, 2.0, 0.0, 0.0] | [27.5151896549, 2.4848103451] | [0.917172988497, 0.0828270115032] | 0.0 |
14 | 906 | 1 | Chaffee, Mrs. Herbert Fuller (Carrie Constance... | female | 47.0 | 1 | 0 | W.E.P. 5734 | 61.1750 | E31 | ... | 2.0 | 3.0 | (0.0) | (1.0, 0.0) | (0.0, 0.0, 0.0) | (0.0, 0.0, 1.0, 0.0) | [1.0, 1.0, 0.0, 2.0, 3.0, 0.0, 1.0] | [0.0, 30.0] | [0.0, 1.0] | 1.0 |
15 | 907 | 2 | del Carlo, Mrs. Sebastiano (Argenia Genovesi) | female | 24.0 | 1 | 0 | SC/PARIS 2167 | 27.7208 | None | ... | 1.0 | 2.0 | (0.0) | (0.0, 1.0) | (0.0, 0.0, 1.0) | (0.0, 1.0, 0.0, 0.0) | [2.0, 1.0, 0.0, 1.0, 2.0, 1.0, 1.0] | [2.02801517479, 27.9719848252] | [0.0676005058263, 0.932399494174] | 1.0 |
16 | 908 | 2 | Keane, Mr. Daniel | male | 35.0 | 0 | 0 | 233734 | 12.3500 | None | ... | 2.0 | 1.0 | (1.0) | (0.0, 0.0) | (0.0, 1.0, 0.0) | (0.0, 0.0, 1.0, 0.0) | [2.0, 0.0, 0.0, 2.0, 1.0, 2.0, 0.0] | [26.5861882672, 3.41381173276] | [0.886206275575, 0.113793724425] | 0.0 |
17 | 909 | 3 | Assaf, Mr. Gerios | male | 21.0 | 0 | 0 | 2692 | 7.2250 | None | ... | 1.0 | 0.0 | (1.0) | (0.0, 1.0) | (1.0, 0.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | (3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0) | [25.8019873912, 4.19801260875] | [0.860066246375, 0.139933753625] | 0.0 |
18 | 910 | 3 | Ilmakangas, Miss. Ida Livija | female | 27.0 | 1 | 0 | STON/O2. 3101270 | 7.9250 | None | ... | 1.0 | 1.0 | (0.0) | (1.0, 0.0) | (0.0, 1.0, 0.0) | (0.0, 1.0, 0.0, 0.0) | [3.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0] | [20.8412096143, 9.15879038571] | [0.694706987143, 0.305293012857] | 0.0 |
19 | 911 | 3 | "Assaf Khalil, Mrs. Mariana (Miriam"")""" | female | 45.0 | 0 | 0 | 2696 | 7.2250 | None | ... | 2.0 | 0.0 | (0.0) | (0.0, 1.0) | (1.0, 0.0, 0.0) | (0.0, 0.0, 1.0, 0.0) | [3.0, 0.0, 0.0, 2.0, 0.0, 1.0, 1.0] | [17.3386478658, 12.6613521342] | [0.577954928861, 0.422045071139] | 0.0 |
20 rows × 24 columns
Finally, we write the predictions to a CSV file in the Kaggle-specified format: a PassengerId column and an integer Survived column.
from pyspark.sql.types import IntegerType
csvPath = 'prediction.csv'
test.select('PassengerId', 'prediction')\
.coalesce(1)\
.withColumn('Survived', test['prediction'].cast(IntegerType()))\
.drop('prediction')\
.write.csv(csvPath, header='true', mode='ignore')
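Note that write.csv produces a directory named prediction.csv containing part files; coalesce(1) only ensures there is a single part file inside it. If a single flat file is preferred, one alternative sketch, assuming the predictions fit in driver memory and using submission.csv as an illustrative file name, is to collect to pandas:

# Alternative sketch: collect the predictions to pandas and write a single
# Kaggle-style CSV with PassengerId and Survived columns.
submission = test.select(
    'PassengerId',
    test['prediction'].cast(IntegerType()).alias('Survived')
).toPandas()
submission.to_csv('submission.csv', index=False)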