Recap from Lecture 29 on using Pandas¶

Import Excel table as Pandas DataFrame object¶

In [1]:

import pandas as pd
xls = pd.ExcelFile('Lecture30_data/titanic3.xls')
sheet_1 = xls.parse(0)
sheet_1

Out[1]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
3	1	0	Allison, Mr. Hudson Joshua Creighton	male	30.0000	1	2	113781	151.5500	C22 C26	S	NaN	135.0	Montreal, PQ / Chesterville, ON
4	1	0	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
5	1	1	Anderson, Mr. Harry	male	48.0000	0	0	19952	26.5500	E12	S	3	NaN	New York, NY
6	1	1	Andrews, Miss. Kornelia Theodosia	female	63.0000	1	0	13502	77.9583	D7	S	10	NaN	Hudson, NY
7	1	0	Andrews, Mr. Thomas Jr	male	39.0000	0	0	112050	0.0000	A36	S	NaN	NaN	Belfast, NI
8	1	1	Appleton, Mrs. Edward Dale (Charlotte Lamson)	female	53.0000	2	0	11769	51.4792	C101	S	D	NaN	Bayside, Queens, NY
9	1	0	Artagaveytia, Mr. Ramon	male	71.0000	0	0	PC 17609	49.5042	NaN	C	NaN	22.0	Montevideo, Uruguay
10	1	0	Astor, Col. John Jacob	male	47.0000	1	0	PC 17757	227.5250	C62 C64	C	NaN	124.0	New York, NY
11	1	1	Astor, Mrs. John Jacob (Madeleine Talmadge Force)	female	18.0000	1	0	PC 17757	227.5250	C62 C64	C	4	NaN	New York, NY
12	1	1	Aubart, Mme. Leontine Pauline	female	24.0000	0	0	PC 17477	69.3000	B35	C	9	NaN	Paris, France
13	1	1	Barber, Miss. Ellen "Nellie"	female	26.0000	0	0	19877	78.8500	NaN	S	6	NaN	NaN
14	1	1	Barkworth, Mr. Algernon Henry Wilson	male	80.0000	0	0	27042	30.0000	A23	S	B	NaN	Hessle, Yorks
15	1	0	Baumann, Mr. John D	male	NaN	0	0	PC 17318	25.9250	NaN	S	NaN	NaN	New York, NY
16	1	0	Baxter, Mr. Quigg Edmond	male	24.0000	0	1	PC 17558	247.5208	B58 B60	C	NaN	NaN	Montreal, PQ
17	1	1	Baxter, Mrs. James (Helene DeLaudeniere Chaput)	female	50.0000	0	1	PC 17558	247.5208	B58 B60	C	6	NaN	Montreal, PQ
18	1	1	Bazzani, Miss. Albina	female	32.0000	0	0	11813	76.2917	D15	C	8	NaN	NaN
19	1	0	Beattie, Mr. Thomson	male	36.0000	0	0	13050	75.2417	C6	C	A	NaN	Winnipeg, MN
20	1	1	Beckwith, Mr. Richard Leonard	male	37.0000	1	1	11751	52.5542	D35	S	5	NaN	New York, NY
21	1	1	Beckwith, Mrs. Richard Leonard (Sallie Monypeny)	female	47.0000	1	1	11751	52.5542	D35	S	5	NaN	New York, NY
22	1	1	Behr, Mr. Karl Howell	male	26.0000	0	0	111369	30.0000	C148	C	5	NaN	New York, NY
23	1	1	Bidois, Miss. Rosalie	female	42.0000	0	0	PC 17757	227.5250	NaN	C	4	NaN	NaN
24	1	1	Bird, Miss. Ellen	female	29.0000	0	0	PC 17483	221.7792	C97	S	8	NaN	NaN
25	1	0	Birnbaum, Mr. Jakob	male	25.0000	0	0	13905	26.0000	NaN	C	NaN	148.0	San Francisco, CA
26	1	1	Bishop, Mr. Dickinson H	male	25.0000	1	0	11967	91.0792	B49	C	7	NaN	Dowagiac, MI
27	1	1	Bishop, Mrs. Dickinson H (Helen Walton)	female	19.0000	1	0	11967	91.0792	B49	C	7	NaN	Dowagiac, MI
28	1	1	Bissette, Miss. Amelia	female	35.0000	0	0	PC 17760	135.6333	C99	S	8	NaN	NaN
29	1	1	Bjornstrom-Steffansson, Mr. Mauritz Hakan	male	28.0000	0	0	110564	26.5500	C52	S	D	NaN	Stockholm, Sweden / Washington, DC
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1279	3	0	Vestrom, Miss. Hulda Amanda Adolfina	female	14.0000	0	0	350406	7.8542	NaN	S	NaN	NaN	NaN
1280	3	0	Vovk, Mr. Janko	male	22.0000	0	0	349252	7.8958	NaN	S	NaN	NaN	NaN
1281	3	0	Waelens, Mr. Achille	male	22.0000	0	0	345767	9.0000	NaN	S	NaN	NaN	Antwerp, Belgium / Stanton, OH
1282	3	0	Ware, Mr. Frederick	male	NaN	0	0	359309	8.0500	NaN	S	NaN	NaN	NaN
1283	3	0	Warren, Mr. Charles William	male	NaN	0	0	C.A. 49867	7.5500	NaN	S	NaN	NaN	NaN
1284	3	0	Webber, Mr. James	male	NaN	0	0	SOTON/OQ 3101316	8.0500	NaN	S	NaN	NaN	NaN
1285	3	0	Wenzel, Mr. Linhart	male	32.5000	0	0	345775	9.5000	NaN	S	NaN	298.0	NaN
1286	3	1	Whabee, Mrs. George Joseph (Shawneene Abi-Saab)	female	38.0000	0	0	2688	7.2292	NaN	C	C	NaN	NaN
1287	3	0	Widegren, Mr. Carl/Charles Peter	male	51.0000	0	0	347064	7.7500	NaN	S	NaN	NaN	NaN
1288	3	0	Wiklund, Mr. Jakob Alfred	male	18.0000	1	0	3101267	6.4958	NaN	S	NaN	314.0	NaN
1289	3	0	Wiklund, Mr. Karl Johan	male	21.0000	1	0	3101266	6.4958	NaN	S	NaN	NaN	NaN
1290	3	1	Wilkes, Mrs. James (Ellen Needs)	female	47.0000	1	0	363272	7.0000	NaN	S	NaN	NaN	NaN
1291	3	0	Willer, Mr. Aaron ("Abi Weller")	male	NaN	0	0	3410	8.7125	NaN	S	NaN	NaN	NaN
1292	3	0	Willey, Mr. Edward	male	NaN	0	0	S.O./P.P. 751	7.5500	NaN	S	NaN	NaN	NaN
1293	3	0	Williams, Mr. Howard Hugh "Harry"	male	NaN	0	0	A/5 2466	8.0500	NaN	S	NaN	NaN	NaN
1294	3	0	Williams, Mr. Leslie	male	28.5000	0	0	54636	16.1000	NaN	S	NaN	14.0	NaN
1295	3	0	Windelov, Mr. Einar	male	21.0000	0	0	SOTON/OQ 3101317	7.2500	NaN	S	NaN	NaN	NaN
1296	3	0	Wirz, Mr. Albert	male	27.0000	0	0	315154	8.6625	NaN	S	NaN	131.0	NaN
1297	3	0	Wiseman, Mr. Phillippe	male	NaN	0	0	A/4. 34244	7.2500	NaN	S	NaN	NaN	NaN
1298	3	0	Wittevrongel, Mr. Camille	male	36.0000	0	0	345771	9.5000	NaN	S	NaN	NaN	NaN
1299	3	0	Yasbeck, Mr. Antoni	male	27.0000	1	0	2659	14.4542	NaN	C	C	NaN	NaN
1300	3	1	Yasbeck, Mrs. Antoni (Selini Alexander)	female	15.0000	1	0	2659	14.4542	NaN	C	NaN	NaN	NaN
1301	3	0	Youseff, Mr. Gerious	male	45.5000	0	0	2628	7.2250	NaN	C	NaN	312.0	NaN
1302	3	0	Yousif, Mr. Wazli	male	NaN	0	0	2647	7.2250	NaN	C	NaN	NaN	NaN
1303	3	0	Yousseff, Mr. Gerious	male	NaN	0	0	2627	14.4583	NaN	C	NaN	NaN	NaN
1304	3	0	Zabour, Miss. Hileni	female	14.5000	1	0	2665	14.4542	NaN	C	NaN	328.0	NaN
1305	3	0	Zabour, Miss. Thamine	female	NaN	1	0	2665	14.4542	NaN	C	NaN	NaN	NaN
1306	3	0	Zakarian, Mr. Mapriededer	male	26.5000	0	0	2656	7.2250	NaN	C	NaN	304.0	NaN
1307	3	0	Zakarian, Mr. Ortin	male	27.0000	0	0	2670	7.2250	NaN	C	NaN	NaN	NaN
1308	3	0	Zimmerman, Mr. Leo	male	29.0000	0	0	315082	7.8750	NaN	S	NaN	NaN	NaN

1309 rows × 14 columns

Side-track: Counter class (a subclass of dictionary)¶

Counter is a subclass of Dictionary class that we are more familiar with. It provides faster way to tally distinct values and saved the results as a dictionary.

In [2]:

from collections import Counter
c = Counter('baracadabra')
print(c)
d = c.most_common(3)
print(d)

Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})
[('a', 5), ('b', 2), ('r', 2)]

We want the Counter to return a dictionary subclass with key as the values of "survived" (i.e., either 0 or 1) and values as the number of passengers who have the value.

In [3]:

counter = Counter(sheet_1.loc[sheet_1["body"].notna(), "survived"].values)
print(counter)

Counter({0: 121})

This means that all of the passengers who have non NA 'body' value have 'survived' value equal to 0 (means that they did not survive), which obviously makes sense.

Count the number of passengers who have non-NaN values in each input variable¶

In [4]:

print(sheet_1.shape)
sheet_1.count()

(1309, 14)

Out[4]:

pclass       1309
survived     1309
name         1309
sex          1309
age          1046
sibsp        1309
parch        1309
ticket       1309
fare         1308
cabin         295
embarked     1307
boat          486
body          121
home.dest     745
dtype: int64

We now want ot obtain all of the columns with no missing information

In [5]:

criteria = sheet_1.count() == sheet_1.shape[0]
data = sheet_1[criteria.index[criteria]]
data=data.drop(['name','ticket'],axis=1)
data

Out[5]:

	pclass	survived	sex	sibsp	parch
0	1	1	female	0	0
1	1	1	male	1	2
2	1	0	female	1	2
3	1	0	male	1	2
4	1	0	female	1	2
5	1	1	male	0	0
6	1	1	female	1	0
7	1	0	male	0	0
8	1	1	female	2	0
9	1	0	male	0	0
10	1	0	male	1	0
11	1	1	female	1	0
12	1	1	female	0	0
13	1	1	female	0	0
14	1	1	male	0	0
15	1	0	male	0	0
16	1	0	male	0	1
17	1	1	female	0	1
18	1	1	female	0	0
19	1	0	male	0	0
20	1	1	male	1	1
21	1	1	female	1	1
22	1	1	male	0	0
23	1	1	female	0	0
24	1	1	female	0	0
25	1	0	male	0	0
26	1	1	male	1	0
27	1	1	female	1	0
28	1	1	female	0	0
29	1	1	male	0	0
...	...	...	...	...	...
1279	3	0	female	0	0
1280	3	0	male	0	0
1281	3	0	male	0	0
1282	3	0	male	0	0
1283	3	0	male	0	0
1284	3	0	male	0	0
1285	3	0	male	0	0
1286	3	1	female	0	0
1287	3	0	male	0	0
1288	3	0	male	1	0
1289	3	0	male	1	0
1290	3	1	female	1	0
1291	3	0	male	0	0
1292	3	0	male	0	0
1293	3	0	male	0	0
1294	3	0	male	0	0
1295	3	0	male	0	0
1296	3	0	male	0	0
1297	3	0	male	0	0
1298	3	0	male	0	0
1299	3	0	male	1	0
1300	3	1	female	1	0
1301	3	0	male	0	0
1302	3	0	male	0	0
1303	3	0	male	0	0
1304	3	0	female	1	0
1305	3	0	female	1	0
1306	3	0	male	0	0
1307	3	0	male	0	0
1308	3	0	male	0	0

1309 rows × 5 columns

In [6]:

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data.loc[0:data.shape[0],'sex'] = le.fit_transform(data.sex)
data

Out[6]:

	pclass	survived	sex	sibsp	parch
0	1	1	0	0	0
1	1	1	1	1	2
2	1	0	0	1	2
3	1	0	1	1	2
4	1	0	0	1	2
5	1	1	1	0	0
6	1	1	0	1	0
7	1	0	1	0	0
8	1	1	0	2	0
9	1	0	1	0	0
10	1	0	1	1	0
11	1	1	0	1	0
12	1	1	0	0	0
13	1	1	0	0	0
14	1	1	1	0	0
15	1	0	1	0	0
16	1	0	1	0	1
17	1	1	0	0	1
18	1	1	0	0	0
19	1	0	1	0	0
20	1	1	1	1	1
21	1	1	0	1	1
22	1	1	1	0	0
23	1	1	0	0	0
24	1	1	0	0	0
25	1	0	1	0	0
26	1	1	1	1	0
27	1	1	0	1	0
28	1	1	0	0	0
29	1	1	1	0	0
...	...	...	...	...	...
1279	3	0	0	0	0
1280	3	0	1	0	0
1281	3	0	1	0	0
1282	3	0	1	0	0
1283	3	0	1	0	0
1284	3	0	1	0	0
1285	3	0	1	0	0
1286	3	1	0	0	0
1287	3	0	1	0	0
1288	3	0	1	1	0
1289	3	0	1	1	0
1290	3	1	0	1	0
1291	3	0	1	0	0
1292	3	0	1	0	0
1293	3	0	1	0	0
1294	3	0	1	0	0
1295	3	0	1	0	0
1296	3	0	1	0	0
1297	3	0	1	0	0
1298	3	0	1	0	0
1299	3	0	1	1	0
1300	3	1	0	1	0
1301	3	0	1	0	0
1302	3	0	1	0	0
1303	3	0	1	0	0
1304	3	0	0	1	0
1305	3	0	0	1	0
1306	3	0	1	0	0
1307	3	0	1	0	0
1308	3	0	1	0	0

1309 rows × 5 columns

Back to the Lecture 30 slide¶

Split data into training and test¶

In [ ]:

from sklearn import model_selection
X = data.drop(["survived"], axis=1).values
y = data["survived"].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, shuffle=True)

Training classifier¶

In [ ]:

from sklearn.linear_model import LogisticRegression
logitreg = LogisticRegression(solver='liblinear')
fit = logitreg.fit(X_train, y_train)

Get predicted probabilities¶

First column contains the probabilities for non-survived

Second column contains the probabilities for survived

In each row, the two probabilities should sum to one

In [ ]:

y_test_prob = fit.predict_proba(X_test)
y_test_prob

To check our predictions, add probabilities for survied (i.e., 2nd column) to the testing data together with the true survived label

In [ ]:

pred = y_test_prob[:,[1]]

true_label = y_test.reshape((y_test.shape[0], 1))

testdata=np.append(X_test, pred, axis=1)

testdata=np.append(testdata, true_label, axis=1)

testdata_df = pd.DataFrame(testdata, columns=['pclass', 'sex', 'sibsp', 'parch', 'pred_prob', 'true_label'])

testdata_df

In [ ]:

print(testdata_df.iloc[0:10].to_latex(index=False))

Receiver Operator Characteristic (ROC) curve¶

Often we don't want to set a single threshold but rather evaluate the model based on all thresholds. ROC is the a way to this.

At each threshold, it calculates two rates:

True Positive Rate = True Positives / (True Positives + False Negatives)

False Positive Rate = False Positives / (False Positives + True Negatives)

In [ ]:

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# calculate AUC
probs = y_test_prob[:,1]
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)

fpr, tpr, thresholds = roc_curve(y_test, probs)

roc_table = np.append(tpr.reshape((tpr.shape[0], 1)),
 fpr.reshape((fpr.shape[0], 1)), axis=1)
roc_table = np.append(roc_table, thresholds.reshape((thresholds.shape[0], 1)), axis=1)

roc_df = pd.DataFrame(roc_table, columns=['TPR', 'FPR', 'Threshold'])

roc_df

In [ ]:

print(roc_df.to_latex(index=False))

In [ ]:

pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
pyplot.plot(fpr, tpr, marker='.')
pyplot.xlabel("False Positive Rate")
pyplot.ylabel("True Positive Rate")
pyplot.savefig('figures/roc.eps')

In [ ]: