import pandas as pd
xls = pd.ExcelFile('Lecture30_data/titanic3.xls')
sheet_1 = xls.parse(0)
sheet_1
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
5 | 1 | 1 | Anderson, Mr. Harry | male | 48.0000 | 0 | 0 | 19952 | 26.5500 | E12 | S | 3 | NaN | New York, NY |
6 | 1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.0000 | 1 | 0 | 13502 | 77.9583 | D7 | S | 10 | NaN | Hudson, NY |
7 | 1 | 0 | Andrews, Mr. Thomas Jr | male | 39.0000 | 0 | 0 | 112050 | 0.0000 | A36 | S | NaN | NaN | Belfast, NI |
8 | 1 | 1 | Appleton, Mrs. Edward Dale (Charlotte Lamson) | female | 53.0000 | 2 | 0 | 11769 | 51.4792 | C101 | S | D | NaN | Bayside, Queens, NY |
9 | 1 | 0 | Artagaveytia, Mr. Ramon | male | 71.0000 | 0 | 0 | PC 17609 | 49.5042 | NaN | C | NaN | 22.0 | Montevideo, Uruguay |
10 | 1 | 0 | Astor, Col. John Jacob | male | 47.0000 | 1 | 0 | PC 17757 | 227.5250 | C62 C64 | C | NaN | 124.0 | New York, NY |
11 | 1 | 1 | Astor, Mrs. John Jacob (Madeleine Talmadge Force) | female | 18.0000 | 1 | 0 | PC 17757 | 227.5250 | C62 C64 | C | 4 | NaN | New York, NY |
12 | 1 | 1 | Aubart, Mme. Leontine Pauline | female | 24.0000 | 0 | 0 | PC 17477 | 69.3000 | B35 | C | 9 | NaN | Paris, France |
13 | 1 | 1 | Barber, Miss. Ellen "Nellie" | female | 26.0000 | 0 | 0 | 19877 | 78.8500 | NaN | S | 6 | NaN | NaN |
14 | 1 | 1 | Barkworth, Mr. Algernon Henry Wilson | male | 80.0000 | 0 | 0 | 27042 | 30.0000 | A23 | S | B | NaN | Hessle, Yorks |
15 | 1 | 0 | Baumann, Mr. John D | male | NaN | 0 | 0 | PC 17318 | 25.9250 | NaN | S | NaN | NaN | New York, NY |
16 | 1 | 0 | Baxter, Mr. Quigg Edmond | male | 24.0000 | 0 | 1 | PC 17558 | 247.5208 | B58 B60 | C | NaN | NaN | Montreal, PQ |
17 | 1 | 1 | Baxter, Mrs. James (Helene DeLaudeniere Chaput) | female | 50.0000 | 0 | 1 | PC 17558 | 247.5208 | B58 B60 | C | 6 | NaN | Montreal, PQ |
18 | 1 | 1 | Bazzani, Miss. Albina | female | 32.0000 | 0 | 0 | 11813 | 76.2917 | D15 | C | 8 | NaN | NaN |
19 | 1 | 0 | Beattie, Mr. Thomson | male | 36.0000 | 0 | 0 | 13050 | 75.2417 | C6 | C | A | NaN | Winnipeg, MN |
20 | 1 | 1 | Beckwith, Mr. Richard Leonard | male | 37.0000 | 1 | 1 | 11751 | 52.5542 | D35 | S | 5 | NaN | New York, NY |
21 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.0000 | 1 | 1 | 11751 | 52.5542 | D35 | S | 5 | NaN | New York, NY |
22 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0000 | 0 | 0 | 111369 | 30.0000 | C148 | C | 5 | NaN | New York, NY |
23 | 1 | 1 | Bidois, Miss. Rosalie | female | 42.0000 | 0 | 0 | PC 17757 | 227.5250 | NaN | C | 4 | NaN | NaN |
24 | 1 | 1 | Bird, Miss. Ellen | female | 29.0000 | 0 | 0 | PC 17483 | 221.7792 | C97 | S | 8 | NaN | NaN |
25 | 1 | 0 | Birnbaum, Mr. Jakob | male | 25.0000 | 0 | 0 | 13905 | 26.0000 | NaN | C | NaN | 148.0 | San Francisco, CA |
26 | 1 | 1 | Bishop, Mr. Dickinson H | male | 25.0000 | 1 | 0 | 11967 | 91.0792 | B49 | C | 7 | NaN | Dowagiac, MI |
27 | 1 | 1 | Bishop, Mrs. Dickinson H (Helen Walton) | female | 19.0000 | 1 | 0 | 11967 | 91.0792 | B49 | C | 7 | NaN | Dowagiac, MI |
28 | 1 | 1 | Bissette, Miss. Amelia | female | 35.0000 | 0 | 0 | PC 17760 | 135.6333 | C99 | S | 8 | NaN | NaN |
29 | 1 | 1 | Bjornstrom-Steffansson, Mr. Mauritz Hakan | male | 28.0000 | 0 | 0 | 110564 | 26.5500 | C52 | S | D | NaN | Stockholm, Sweden / Washington, DC |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1279 | 3 | 0 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0000 | 0 | 0 | 350406 | 7.8542 | NaN | S | NaN | NaN | NaN |
1280 | 3 | 0 | Vovk, Mr. Janko | male | 22.0000 | 0 | 0 | 349252 | 7.8958 | NaN | S | NaN | NaN | NaN |
1281 | 3 | 0 | Waelens, Mr. Achille | male | 22.0000 | 0 | 0 | 345767 | 9.0000 | NaN | S | NaN | NaN | Antwerp, Belgium / Stanton, OH |
1282 | 3 | 0 | Ware, Mr. Frederick | male | NaN | 0 | 0 | 359309 | 8.0500 | NaN | S | NaN | NaN | NaN |
1283 | 3 | 0 | Warren, Mr. Charles William | male | NaN | 0 | 0 | C.A. 49867 | 7.5500 | NaN | S | NaN | NaN | NaN |
1284 | 3 | 0 | Webber, Mr. James | male | NaN | 0 | 0 | SOTON/OQ 3101316 | 8.0500 | NaN | S | NaN | NaN | NaN |
1285 | 3 | 0 | Wenzel, Mr. Linhart | male | 32.5000 | 0 | 0 | 345775 | 9.5000 | NaN | S | NaN | 298.0 | NaN |
1286 | 3 | 1 | Whabee, Mrs. George Joseph (Shawneene Abi-Saab) | female | 38.0000 | 0 | 0 | 2688 | 7.2292 | NaN | C | C | NaN | NaN |
1287 | 3 | 0 | Widegren, Mr. Carl/Charles Peter | male | 51.0000 | 0 | 0 | 347064 | 7.7500 | NaN | S | NaN | NaN | NaN |
1288 | 3 | 0 | Wiklund, Mr. Jakob Alfred | male | 18.0000 | 1 | 0 | 3101267 | 6.4958 | NaN | S | NaN | 314.0 | NaN |
1289 | 3 | 0 | Wiklund, Mr. Karl Johan | male | 21.0000 | 1 | 0 | 3101266 | 6.4958 | NaN | S | NaN | NaN | NaN |
1290 | 3 | 1 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0000 | 1 | 0 | 363272 | 7.0000 | NaN | S | NaN | NaN | NaN |
1291 | 3 | 0 | Willer, Mr. Aaron ("Abi Weller") | male | NaN | 0 | 0 | 3410 | 8.7125 | NaN | S | NaN | NaN | NaN |
1292 | 3 | 0 | Willey, Mr. Edward | male | NaN | 0 | 0 | S.O./P.P. 751 | 7.5500 | NaN | S | NaN | NaN | NaN |
1293 | 3 | 0 | Williams, Mr. Howard Hugh "Harry" | male | NaN | 0 | 0 | A/5 2466 | 8.0500 | NaN | S | NaN | NaN | NaN |
1294 | 3 | 0 | Williams, Mr. Leslie | male | 28.5000 | 0 | 0 | 54636 | 16.1000 | NaN | S | NaN | 14.0 | NaN |
1295 | 3 | 0 | Windelov, Mr. Einar | male | 21.0000 | 0 | 0 | SOTON/OQ 3101317 | 7.2500 | NaN | S | NaN | NaN | NaN |
1296 | 3 | 0 | Wirz, Mr. Albert | male | 27.0000 | 0 | 0 | 315154 | 8.6625 | NaN | S | NaN | 131.0 | NaN |
1297 | 3 | 0 | Wiseman, Mr. Phillippe | male | NaN | 0 | 0 | A/4. 34244 | 7.2500 | NaN | S | NaN | NaN | NaN |
1298 | 3 | 0 | Wittevrongel, Mr. Camille | male | 36.0000 | 0 | 0 | 345771 | 9.5000 | NaN | S | NaN | NaN | NaN |
1299 | 3 | 0 | Yasbeck, Mr. Antoni | male | 27.0000 | 1 | 0 | 2659 | 14.4542 | NaN | C | C | NaN | NaN |
1300 | 3 | 1 | Yasbeck, Mrs. Antoni (Selini Alexander) | female | 15.0000 | 1 | 0 | 2659 | 14.4542 | NaN | C | NaN | NaN | NaN |
1301 | 3 | 0 | Youseff, Mr. Gerious | male | 45.5000 | 0 | 0 | 2628 | 7.2250 | NaN | C | NaN | 312.0 | NaN |
1302 | 3 | 0 | Yousif, Mr. Wazli | male | NaN | 0 | 0 | 2647 | 7.2250 | NaN | C | NaN | NaN | NaN |
1303 | 3 | 0 | Yousseff, Mr. Gerious | male | NaN | 0 | 0 | 2627 | 14.4583 | NaN | C | NaN | NaN | NaN |
1304 | 3 | 0 | Zabour, Miss. Hileni | female | 14.5000 | 1 | 0 | 2665 | 14.4542 | NaN | C | NaN | 328.0 | NaN |
1305 | 3 | 0 | Zabour, Miss. Thamine | female | NaN | 1 | 0 | 2665 | 14.4542 | NaN | C | NaN | NaN | NaN |
1306 | 3 | 0 | Zakarian, Mr. Mapriededer | male | 26.5000 | 0 | 0 | 2656 | 7.2250 | NaN | C | NaN | 304.0 | NaN |
1307 | 3 | 0 | Zakarian, Mr. Ortin | male | 27.0000 | 0 | 0 | 2670 | 7.2250 | NaN | C | NaN | NaN | NaN |
1308 | 3 | 0 | Zimmerman, Mr. Leo | male | 29.0000 | 0 | 0 | 315082 | 7.8750 | NaN | S | NaN | NaN | NaN |
1309 rows × 14 columns
Counter is a subclass of Dictionary class that we are more familiar with. It provides faster way to tally distinct values and saved the results as a dictionary.
from collections import Counter
c = Counter('baracadabra')
print(c)
d = c.most_common(3)
print(d)
Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1}) [('a', 5), ('b', 2), ('r', 2)]
We want the Counter to return a dictionary subclass with key as the values of "survived" (i.e., either 0 or 1) and values as the number of passengers who have the value.
counter = Counter(sheet_1.loc[sheet_1["body"].notna(), "survived"].values)
print(counter)
Counter({0: 121})
This means that all of the passengers who have non NA 'body' value have 'survived' value equal to 0 (means that they did not survive), which obviously makes sense.
print(sheet_1.shape)
sheet_1.count()
(1309, 14)
pclass 1309 survived 1309 name 1309 sex 1309 age 1046 sibsp 1309 parch 1309 ticket 1309 fare 1308 cabin 295 embarked 1307 boat 486 body 121 home.dest 745 dtype: int64
We now want ot obtain all of the columns with no missing information
criteria = sheet_1.count() == sheet_1.shape[0]
data = sheet_1[criteria.index[criteria]]
data=data.drop(['name','ticket'],axis=1)
data
pclass | survived | sex | sibsp | parch | |
---|---|---|---|---|---|
0 | 1 | 1 | female | 0 | 0 |
1 | 1 | 1 | male | 1 | 2 |
2 | 1 | 0 | female | 1 | 2 |
3 | 1 | 0 | male | 1 | 2 |
4 | 1 | 0 | female | 1 | 2 |
5 | 1 | 1 | male | 0 | 0 |
6 | 1 | 1 | female | 1 | 0 |
7 | 1 | 0 | male | 0 | 0 |
8 | 1 | 1 | female | 2 | 0 |
9 | 1 | 0 | male | 0 | 0 |
10 | 1 | 0 | male | 1 | 0 |
11 | 1 | 1 | female | 1 | 0 |
12 | 1 | 1 | female | 0 | 0 |
13 | 1 | 1 | female | 0 | 0 |
14 | 1 | 1 | male | 0 | 0 |
15 | 1 | 0 | male | 0 | 0 |
16 | 1 | 0 | male | 0 | 1 |
17 | 1 | 1 | female | 0 | 1 |
18 | 1 | 1 | female | 0 | 0 |
19 | 1 | 0 | male | 0 | 0 |
20 | 1 | 1 | male | 1 | 1 |
21 | 1 | 1 | female | 1 | 1 |
22 | 1 | 1 | male | 0 | 0 |
23 | 1 | 1 | female | 0 | 0 |
24 | 1 | 1 | female | 0 | 0 |
25 | 1 | 0 | male | 0 | 0 |
26 | 1 | 1 | male | 1 | 0 |
27 | 1 | 1 | female | 1 | 0 |
28 | 1 | 1 | female | 0 | 0 |
29 | 1 | 1 | male | 0 | 0 |
... | ... | ... | ... | ... | ... |
1279 | 3 | 0 | female | 0 | 0 |
1280 | 3 | 0 | male | 0 | 0 |
1281 | 3 | 0 | male | 0 | 0 |
1282 | 3 | 0 | male | 0 | 0 |
1283 | 3 | 0 | male | 0 | 0 |
1284 | 3 | 0 | male | 0 | 0 |
1285 | 3 | 0 | male | 0 | 0 |
1286 | 3 | 1 | female | 0 | 0 |
1287 | 3 | 0 | male | 0 | 0 |
1288 | 3 | 0 | male | 1 | 0 |
1289 | 3 | 0 | male | 1 | 0 |
1290 | 3 | 1 | female | 1 | 0 |
1291 | 3 | 0 | male | 0 | 0 |
1292 | 3 | 0 | male | 0 | 0 |
1293 | 3 | 0 | male | 0 | 0 |
1294 | 3 | 0 | male | 0 | 0 |
1295 | 3 | 0 | male | 0 | 0 |
1296 | 3 | 0 | male | 0 | 0 |
1297 | 3 | 0 | male | 0 | 0 |
1298 | 3 | 0 | male | 0 | 0 |
1299 | 3 | 0 | male | 1 | 0 |
1300 | 3 | 1 | female | 1 | 0 |
1301 | 3 | 0 | male | 0 | 0 |
1302 | 3 | 0 | male | 0 | 0 |
1303 | 3 | 0 | male | 0 | 0 |
1304 | 3 | 0 | female | 1 | 0 |
1305 | 3 | 0 | female | 1 | 0 |
1306 | 3 | 0 | male | 0 | 0 |
1307 | 3 | 0 | male | 0 | 0 |
1308 | 3 | 0 | male | 0 | 0 |
1309 rows × 5 columns
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data.loc[0:data.shape[0],'sex'] = le.fit_transform(data.sex)
data
pclass | survived | sex | sibsp | parch | |
---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 |
1 | 1 | 1 | 1 | 1 | 2 |
2 | 1 | 0 | 0 | 1 | 2 |
3 | 1 | 0 | 1 | 1 | 2 |
4 | 1 | 0 | 0 | 1 | 2 |
5 | 1 | 1 | 1 | 0 | 0 |
6 | 1 | 1 | 0 | 1 | 0 |
7 | 1 | 0 | 1 | 0 | 0 |
8 | 1 | 1 | 0 | 2 | 0 |
9 | 1 | 0 | 1 | 0 | 0 |
10 | 1 | 0 | 1 | 1 | 0 |
11 | 1 | 1 | 0 | 1 | 0 |
12 | 1 | 1 | 0 | 0 | 0 |
13 | 1 | 1 | 0 | 0 | 0 |
14 | 1 | 1 | 1 | 0 | 0 |
15 | 1 | 0 | 1 | 0 | 0 |
16 | 1 | 0 | 1 | 0 | 1 |
17 | 1 | 1 | 0 | 0 | 1 |
18 | 1 | 1 | 0 | 0 | 0 |
19 | 1 | 0 | 1 | 0 | 0 |
20 | 1 | 1 | 1 | 1 | 1 |
21 | 1 | 1 | 0 | 1 | 1 |
22 | 1 | 1 | 1 | 0 | 0 |
23 | 1 | 1 | 0 | 0 | 0 |
24 | 1 | 1 | 0 | 0 | 0 |
25 | 1 | 0 | 1 | 0 | 0 |
26 | 1 | 1 | 1 | 1 | 0 |
27 | 1 | 1 | 0 | 1 | 0 |
28 | 1 | 1 | 0 | 0 | 0 |
29 | 1 | 1 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... |
1279 | 3 | 0 | 0 | 0 | 0 |
1280 | 3 | 0 | 1 | 0 | 0 |
1281 | 3 | 0 | 1 | 0 | 0 |
1282 | 3 | 0 | 1 | 0 | 0 |
1283 | 3 | 0 | 1 | 0 | 0 |
1284 | 3 | 0 | 1 | 0 | 0 |
1285 | 3 | 0 | 1 | 0 | 0 |
1286 | 3 | 1 | 0 | 0 | 0 |
1287 | 3 | 0 | 1 | 0 | 0 |
1288 | 3 | 0 | 1 | 1 | 0 |
1289 | 3 | 0 | 1 | 1 | 0 |
1290 | 3 | 1 | 0 | 1 | 0 |
1291 | 3 | 0 | 1 | 0 | 0 |
1292 | 3 | 0 | 1 | 0 | 0 |
1293 | 3 | 0 | 1 | 0 | 0 |
1294 | 3 | 0 | 1 | 0 | 0 |
1295 | 3 | 0 | 1 | 0 | 0 |
1296 | 3 | 0 | 1 | 0 | 0 |
1297 | 3 | 0 | 1 | 0 | 0 |
1298 | 3 | 0 | 1 | 0 | 0 |
1299 | 3 | 0 | 1 | 1 | 0 |
1300 | 3 | 1 | 0 | 1 | 0 |
1301 | 3 | 0 | 1 | 0 | 0 |
1302 | 3 | 0 | 1 | 0 | 0 |
1303 | 3 | 0 | 1 | 0 | 0 |
1304 | 3 | 0 | 0 | 1 | 0 |
1305 | 3 | 0 | 0 | 1 | 0 |
1306 | 3 | 0 | 1 | 0 | 0 |
1307 | 3 | 0 | 1 | 0 | 0 |
1308 | 3 | 0 | 1 | 0 | 0 |
1309 rows × 5 columns
from sklearn import model_selection
X = data.drop(["survived"], axis=1).values
y = data["survived"].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, shuffle=True)
from sklearn.linear_model import LogisticRegression
logitreg = LogisticRegression(solver='liblinear')
fit = logitreg.fit(X_train, y_train)
First column contains the probabilities for non-survived
Second column contains the probabilities for survived
In each row, the two probabilities should sum to one
y_test_prob = fit.predict_proba(X_test)
y_test_prob
To check our predictions, add probabilities for survied (i.e., 2nd column) to the testing data together with the true survived label
pred = y_test_prob[:,[1]]
true_label = y_test.reshape((y_test.shape[0], 1))
testdata=np.append(X_test, pred, axis=1)
testdata=np.append(testdata, true_label, axis=1)
testdata_df = pd.DataFrame(testdata, columns=['pclass', 'sex', 'sibsp', 'parch', 'pred_prob', 'true_label'])
testdata_df
print(testdata_df.iloc[0:10].to_latex(index=False))
Often we don't want to set a single threshold but rather evaluate the model based on all thresholds. ROC is the a way to this.
At each threshold, it calculates two rates:
True Positive Rate = True Positives / (True Positives + False Negatives)
False Positive Rate = False Positives / (False Positives + True Negatives)
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
# calculate AUC
probs = y_test_prob[:,1]
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)
fpr, tpr, thresholds = roc_curve(y_test, probs)
roc_table = np.append(tpr.reshape((tpr.shape[0], 1)),
fpr.reshape((fpr.shape[0], 1)), axis=1)
roc_table = np.append(roc_table, thresholds.reshape((thresholds.shape[0], 1)), axis=1)
roc_df = pd.DataFrame(roc_table, columns=['TPR', 'FPR', 'Threshold'])
roc_df
print(roc_df.to_latex(index=False))
pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
pyplot.plot(fpr, tpr, marker='.')
pyplot.xlabel("False Positive Rate")
pyplot.ylabel("True Positive Rate")
pyplot.savefig('figures/roc.eps')