In [3]:
import pandas as pd
xls = pd.ExcelFile('code/titanic3.xls')
sheet_1 = xls.parse(0)
In [4]:
sheet_1.columns
Out[4]:
Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')
In [5]:
print(type(sheet_1))
<class 'pandas.core.frame.DataFrame'>
In [6]:
sheet_1
Out[6]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
5 1 1 Anderson, Mr. Harry male 48.0000 0 0 19952 26.5500 E12 S 3 NaN New York, NY
6 1 1 Andrews, Miss. Kornelia Theodosia female 63.0000 1 0 13502 77.9583 D7 S 10 NaN Hudson, NY
7 1 0 Andrews, Mr. Thomas Jr male 39.0000 0 0 112050 0.0000 A36 S NaN NaN Belfast, NI
8 1 1 Appleton, Mrs. Edward Dale (Charlotte Lamson) female 53.0000 2 0 11769 51.4792 C101 S D NaN Bayside, Queens, NY
9 1 0 Artagaveytia, Mr. Ramon male 71.0000 0 0 PC 17609 49.5042 NaN C NaN 22.0 Montevideo, Uruguay
10 1 0 Astor, Col. John Jacob male 47.0000 1 0 PC 17757 227.5250 C62 C64 C NaN 124.0 New York, NY
11 1 1 Astor, Mrs. John Jacob (Madeleine Talmadge Force) female 18.0000 1 0 PC 17757 227.5250 C62 C64 C 4 NaN New York, NY
12 1 1 Aubart, Mme. Leontine Pauline female 24.0000 0 0 PC 17477 69.3000 B35 C 9 NaN Paris, France
13 1 1 Barber, Miss. Ellen "Nellie" female 26.0000 0 0 19877 78.8500 NaN S 6 NaN NaN
14 1 1 Barkworth, Mr. Algernon Henry Wilson male 80.0000 0 0 27042 30.0000 A23 S B NaN Hessle, Yorks
15 1 0 Baumann, Mr. John D male NaN 0 0 PC 17318 25.9250 NaN S NaN NaN New York, NY
16 1 0 Baxter, Mr. Quigg Edmond male 24.0000 0 1 PC 17558 247.5208 B58 B60 C NaN NaN Montreal, PQ
17 1 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) female 50.0000 0 1 PC 17558 247.5208 B58 B60 C 6 NaN Montreal, PQ
18 1 1 Bazzani, Miss. Albina female 32.0000 0 0 11813 76.2917 D15 C 8 NaN NaN
19 1 0 Beattie, Mr. Thomson male 36.0000 0 0 13050 75.2417 C6 C A NaN Winnipeg, MN
20 1 1 Beckwith, Mr. Richard Leonard male 37.0000 1 1 11751 52.5542 D35 S 5 NaN New York, NY
21 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0000 1 1 11751 52.5542 D35 S 5 NaN New York, NY
22 1 1 Behr, Mr. Karl Howell male 26.0000 0 0 111369 30.0000 C148 C 5 NaN New York, NY
23 1 1 Bidois, Miss. Rosalie female 42.0000 0 0 PC 17757 227.5250 NaN C 4 NaN NaN
24 1 1 Bird, Miss. Ellen female 29.0000 0 0 PC 17483 221.7792 C97 S 8 NaN NaN
25 1 0 Birnbaum, Mr. Jakob male 25.0000 0 0 13905 26.0000 NaN C NaN 148.0 San Francisco, CA
26 1 1 Bishop, Mr. Dickinson H male 25.0000 1 0 11967 91.0792 B49 C 7 NaN Dowagiac, MI
27 1 1 Bishop, Mrs. Dickinson H (Helen Walton) female 19.0000 1 0 11967 91.0792 B49 C 7 NaN Dowagiac, MI
28 1 1 Bissette, Miss. Amelia female 35.0000 0 0 PC 17760 135.6333 C99 S 8 NaN NaN
29 1 1 Bjornstrom-Steffansson, Mr. Mauritz Hakan male 28.0000 0 0 110564 26.5500 C52 S D NaN Stockholm, Sweden / Washington, DC
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1279 3 0 Vestrom, Miss. Hulda Amanda Adolfina female 14.0000 0 0 350406 7.8542 NaN S NaN NaN NaN
1280 3 0 Vovk, Mr. Janko male 22.0000 0 0 349252 7.8958 NaN S NaN NaN NaN
1281 3 0 Waelens, Mr. Achille male 22.0000 0 0 345767 9.0000 NaN S NaN NaN Antwerp, Belgium / Stanton, OH
1282 3 0 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S NaN NaN NaN
1283 3 0 Warren, Mr. Charles William male NaN 0 0 C.A. 49867 7.5500 NaN S NaN NaN NaN
1284 3 0 Webber, Mr. James male NaN 0 0 SOTON/OQ 3101316 8.0500 NaN S NaN NaN NaN
1285 3 0 Wenzel, Mr. Linhart male 32.5000 0 0 345775 9.5000 NaN S NaN 298.0 NaN
1286 3 1 Whabee, Mrs. George Joseph (Shawneene Abi-Saab) female 38.0000 0 0 2688 7.2292 NaN C C NaN NaN
1287 3 0 Widegren, Mr. Carl/Charles Peter male 51.0000 0 0 347064 7.7500 NaN S NaN NaN NaN
1288 3 0 Wiklund, Mr. Jakob Alfred male 18.0000 1 0 3101267 6.4958 NaN S NaN 314.0 NaN
1289 3 0 Wiklund, Mr. Karl Johan male 21.0000 1 0 3101266 6.4958 NaN S NaN NaN NaN
1290 3 1 Wilkes, Mrs. James (Ellen Needs) female 47.0000 1 0 363272 7.0000 NaN S NaN NaN NaN
1291 3 0 Willer, Mr. Aaron ("Abi Weller") male NaN 0 0 3410 8.7125 NaN S NaN NaN NaN
1292 3 0 Willey, Mr. Edward male NaN 0 0 S.O./P.P. 751 7.5500 NaN S NaN NaN NaN
1293 3 0 Williams, Mr. Howard Hugh "Harry" male NaN 0 0 A/5 2466 8.0500 NaN S NaN NaN NaN
1294 3 0 Williams, Mr. Leslie male 28.5000 0 0 54636 16.1000 NaN S NaN 14.0 NaN
1295 3 0 Windelov, Mr. Einar male 21.0000 0 0 SOTON/OQ 3101317 7.2500 NaN S NaN NaN NaN
1296 3 0 Wirz, Mr. Albert male 27.0000 0 0 315154 8.6625 NaN S NaN 131.0 NaN
1297 3 0 Wiseman, Mr. Phillippe male NaN 0 0 A/4. 34244 7.2500 NaN S NaN NaN NaN
1298 3 0 Wittevrongel, Mr. Camille male 36.0000 0 0 345771 9.5000 NaN S NaN NaN NaN
1299 3 0 Yasbeck, Mr. Antoni male 27.0000 1 0 2659 14.4542 NaN C C NaN NaN
1300 3 1 Yasbeck, Mrs. Antoni (Selini Alexander) female 15.0000 1 0 2659 14.4542 NaN C NaN NaN NaN
1301 3 0 Youseff, Mr. Gerious male 45.5000 0 0 2628 7.2250 NaN C NaN 312.0 NaN
1302 3 0 Yousif, Mr. Wazli male NaN 0 0 2647 7.2250 NaN C NaN NaN NaN
1303 3 0 Yousseff, Mr. Gerious male NaN 0 0 2627 14.4583 NaN C NaN NaN NaN
1304 3 0 Zabour, Miss. Hileni female 14.5000 1 0 2665 14.4542 NaN C NaN 328.0 NaN
1305 3 0 Zabour, Miss. Thamine female NaN 1 0 2665 14.4542 NaN C NaN NaN NaN
1306 3 0 Zakarian, Mr. Mapriededer male 26.5000 0 0 2656 7.2250 NaN C NaN 304.0 NaN
1307 3 0 Zakarian, Mr. Ortin male 27.0000 0 0 2670 7.2250 NaN C NaN NaN NaN
1308 3 0 Zimmerman, Mr. Leo male 29.0000 0 0 315082 7.8750 NaN S NaN NaN NaN

1309 rows × 14 columns

In [7]:
sheet_1.iloc[0]
Out[7]:
pclass                                   1
survived                                 1
name         Allen, Miss. Elisabeth Walton
sex                                 female
age                                     29
sibsp                                    0
parch                                    0
ticket                               24160
fare                               211.338
cabin                                   B5
embarked                                 S
boat                                     2
body                                   NaN
home.dest                     St Louis, MO
Name: 0, dtype: object
In [40]:
from collections import Counter
c = Counter('baracadabra')
print(c)
d = c.most_common(3)
print(d)
Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})
[('a', 5), ('b', 2), ('r', 2)]
In [34]:
counter = Counter(sheet_1["survived"].values)
print(counter)
print("survived:", counter[1])
print("survival rate:", counter[1]/(counter[0]+counter[1]))
Counter({0: 809, 1: 500})
survived: 500
survival rate: 0.3819709702062643
In [9]:
counter = Counter(sheet_1.loc[sheet_1["body"].notna(), "survived"].values)
print(counter)
Counter({0: 121})
In [17]:
print(sheet_1.groupby('pclass')['survived'].mean())
Out[17]:
pclass
1    0.619195
2    0.429603
3    0.255289
Name: survived, dtype: float64
In [10]:
print(sheet_1.groupby('sex')['survived'].mean())
sex
female    0.727468
male      0.190985
Name: survived, dtype: float64
In [144]:
print(sheet_1.groupby(["pclass","sex"])["survived"].mean())
sex
female    0.727468
male      0.190985
Name: survived, dtype: float64
pclass  sex   
1       female    0.965278
        male      0.340782
2       female    0.886792
        male      0.146199
3       female    0.490741
        male      0.152130
Name: survived, dtype: float64
In [22]:
print(sheet_1.groupby(["age"])["survived"].mean())
age
0.1667     1.000000
0.3333     0.000000
0.4167     1.000000
0.6667     1.000000
0.7500     0.666667
0.8333     1.000000
0.9167     1.000000
1.0000     0.700000
2.0000     0.333333
3.0000     0.714286
4.0000     0.700000
5.0000     0.800000
6.0000     0.500000
7.0000     0.500000
8.0000     0.666667
9.0000     0.400000
10.0000    0.000000
11.0000    0.250000
11.5000    0.000000
12.0000    1.000000
13.0000    0.600000
14.0000    0.500000
14.5000    0.000000
15.0000    0.833333
16.0000    0.421053
17.0000    0.350000
18.0000    0.358974
18.5000    0.000000
19.0000    0.379310
20.0000    0.347826
             ...   
46.0000    0.000000
47.0000    0.214286
48.0000    0.714286
49.0000    0.555556
50.0000    0.400000
51.0000    0.375000
52.0000    0.500000
53.0000    1.000000
54.0000    0.500000
55.0000    0.500000
55.5000    0.000000
56.0000    0.500000
57.0000    0.000000
58.0000    0.666667
59.0000    0.333333
60.0000    0.571429
60.5000    0.000000
61.0000    0.000000
62.0000    0.400000
63.0000    0.500000
64.0000    0.400000
65.0000    0.000000
66.0000    0.000000
67.0000    0.000000
70.0000    0.000000
70.5000    0.000000
71.0000    0.000000
74.0000    0.000000
76.0000    1.000000
80.0000    1.000000
Name: survived, Length: 98, dtype: float64
In [25]:
Counter(sheet_1["age"])
Out[25]:
Counter({29.0: 30,
         0.9167: 2,
         2.0: 12,
         30.0: 40,
         25.0: 34,
         48.0: 14,
         63.0: 4,
         39.0: 20,
         53.0: 4,
         71.0: 2,
         47.0: 14,
         18.0: 39,
         24.0: 47,
         26.0: 30,
         80.0: 1,
         nan: 1,
         50.0: 15,
         32.0: 24,
         36.0: 31,
         37.0: 9,
         42.0: 18,
         19.0: 29,
         35.0: 23,
         28.0: 32,
         45.0: 21,
         40.0: 18,
         58.0: 6,
         22.0: 43,
         nan: 1,
         41.0: 11,
         nan: 1,
         44.0: 10,
         59.0: 3,
         60.0: 7,
         nan: 1,
         33.0: 21,
         17.0: 20,
         11.0: 4,
         14.0: 8,
         49.0: 9,
         nan: 1,
         76.0: 1,
         46.0: 6,
         27.0: 30,
         nan: 1,
         nan: 1,
         nan: 1,
         64.0: 5,
         55.0: 8,
         nan: 1,
         70.0: 2,
         38.0: 14,
         51.0: 8,
         31.0: 23,
         4.0: 10,
         54.0: 10,
         23.0: 26,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         43.0: 9,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         52.0: 6,
         nan: 1,
         16.0: 19,
         nan: 1,
         32.5: 4,
         nan: 1,
         nan: 1,
         nan: 1,
         21.0: 41,
         15.0: 6,
         nan: 1,
         nan: 1,
         65.0: 3,
         nan: 1,
         28.5: 3,
         nan: 1,
         45.5: 2,
         56.0: 4,
         nan: 1,
         nan: 1,
         nan: 1,
         13.0: 5,
         61.0: 5,
         nan: 1,
         nan: 1,
         34.0: 16,
         nan: 1,
         6.0: 6,
         57.0: 5,
         nan: 1,
         62.0: 5,
         nan: 1,
         67.0: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         1.0: 10,
         12.0: 3,
         20.0: 23,
         0.8333: 3,
         nan: 1,
         8.0: 6,
         nan: 1,
         nan: 1,
         nan: 1,
         0.6667: 1,
         7.0: 4,
         nan: 1,
         nan: 1,
         nan: 1,
         3.0: 7,
         nan: 1,
         nan: 1,
         nan: 1,
         36.5: 2,
         nan: 1,
         nan: 1,
         nan: 1,
         18.5: 3,
         nan: 1,
         5.0: 5,
         66.0: 1,
         nan: 1,
         nan: 1,
         9.0: 10,
         0.75: 3,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         70.5: 1,
         22.5: 1,
         0.3333: 1,
         nan: 1,
         nan: 1,
         0.1667: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         40.5: 3,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         10.0: 4,
         nan: 1,
         nan: 1,
         nan: 1,
         23.5: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         34.5: 2,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         20.5: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         30.5: 2,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         55.5: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         38.5: 1,
         nan: 1,
         14.5: 2,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         24.5: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         60.5: 1,
         74.0: 1,
         0.4167: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         11.5: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         26.5: 1})
In [47]:
print(sheet_1.shape)
sheet_1.count()
(1309, 14)
Out[47]:
pclass       1309
survived     1309
name         1309
sex          1309
age          1046
sibsp        1309
parch        1309
ticket       1309
fare         1308
cabin         295
embarked     1307
boat          486
body          121
home.dest     745
dtype: int64
In [14]:
criteria = sheet_1.count() == sheet_1.shape[0]
In [15]:
print(criteria)
pclass        True
survived      True
name          True
sex           True
age          False
sibsp         True
parch         True
ticket        True
fare         False
cabin        False
embarked     False
boat         False
body         False
home.dest    False
dtype: bool
In [83]:
df = sheet_1[criteria.index[criteria]]
df
Out[83]:
pclass survived name sex sibsp parch ticket
0 1 1 Allen, Miss. Elisabeth Walton female 0 0 24160
1 1 1 Allison, Master. Hudson Trevor male 1 2 113781
2 1 0 Allison, Miss. Helen Loraine female 1 2 113781
3 1 0 Allison, Mr. Hudson Joshua Creighton male 1 2 113781
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 1 2 113781
5 1 1 Anderson, Mr. Harry male 0 0 19952
6 1 1 Andrews, Miss. Kornelia Theodosia female 1 0 13502
7 1 0 Andrews, Mr. Thomas Jr male 0 0 112050
8 1 1 Appleton, Mrs. Edward Dale (Charlotte Lamson) female 2 0 11769
9 1 0 Artagaveytia, Mr. Ramon male 0 0 PC 17609
10 1 0 Astor, Col. John Jacob male 1 0 PC 17757
11 1 1 Astor, Mrs. John Jacob (Madeleine Talmadge Force) female 1 0 PC 17757
12 1 1 Aubart, Mme. Leontine Pauline female 0 0 PC 17477
13 1 1 Barber, Miss. Ellen "Nellie" female 0 0 19877
14 1 1 Barkworth, Mr. Algernon Henry Wilson male 0 0 27042
15 1 0 Baumann, Mr. John D male 0 0 PC 17318
16 1 0 Baxter, Mr. Quigg Edmond male 0 1 PC 17558
17 1 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) female 0 1 PC 17558
18 1 1 Bazzani, Miss. Albina female 0 0 11813
19 1 0 Beattie, Mr. Thomson male 0 0 13050
20 1 1 Beckwith, Mr. Richard Leonard male 1 1 11751
21 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 1 1 11751
22 1 1 Behr, Mr. Karl Howell male 0 0 111369
23 1 1 Bidois, Miss. Rosalie female 0 0 PC 17757
24 1 1 Bird, Miss. Ellen female 0 0 PC 17483
25 1 0 Birnbaum, Mr. Jakob male 0 0 13905
26 1 1 Bishop, Mr. Dickinson H male 1 0 11967
27 1 1 Bishop, Mrs. Dickinson H (Helen Walton) female 1 0 11967
28 1 1 Bissette, Miss. Amelia female 0 0 PC 17760
29 1 1 Bjornstrom-Steffansson, Mr. Mauritz Hakan male 0 0 110564
... ... ... ... ... ... ... ...
1279 3 0 Vestrom, Miss. Hulda Amanda Adolfina female 0 0 350406
1280 3 0 Vovk, Mr. Janko male 0 0 349252
1281 3 0 Waelens, Mr. Achille male 0 0 345767
1282 3 0 Ware, Mr. Frederick male 0 0 359309
1283 3 0 Warren, Mr. Charles William male 0 0 C.A. 49867
1284 3 0 Webber, Mr. James male 0 0 SOTON/OQ 3101316
1285 3 0 Wenzel, Mr. Linhart male 0 0 345775
1286 3 1 Whabee, Mrs. George Joseph (Shawneene Abi-Saab) female 0 0 2688
1287 3 0 Widegren, Mr. Carl/Charles Peter male 0 0 347064
1288 3 0 Wiklund, Mr. Jakob Alfred male 1 0 3101267
1289 3 0 Wiklund, Mr. Karl Johan male 1 0 3101266
1290 3 1 Wilkes, Mrs. James (Ellen Needs) female 1 0 363272
1291 3 0 Willer, Mr. Aaron ("Abi Weller") male 0 0 3410
1292 3 0 Willey, Mr. Edward male 0 0 S.O./P.P. 751
1293 3 0 Williams, Mr. Howard Hugh "Harry" male 0 0 A/5 2466
1294 3 0 Williams, Mr. Leslie male 0 0 54636
1295 3 0 Windelov, Mr. Einar male 0 0 SOTON/OQ 3101317
1296 3 0 Wirz, Mr. Albert male 0 0 315154
1297 3 0 Wiseman, Mr. Phillippe male 0 0 A/4. 34244
1298 3 0 Wittevrongel, Mr. Camille male 0 0 345771
1299 3 0 Yasbeck, Mr. Antoni male 1 0 2659
1300 3 1 Yasbeck, Mrs. Antoni (Selini Alexander) female 1 0 2659
1301 3 0 Youseff, Mr. Gerious male 0 0 2628
1302 3 0 Yousif, Mr. Wazli male 0 0 2647
1303 3 0 Yousseff, Mr. Gerious male 0 0 2627
1304 3 0 Zabour, Miss. Hileni female 1 0 2665
1305 3 0 Zabour, Miss. Thamine female 1 0 2665
1306 3 0 Zakarian, Mr. Mapriededer male 0 0 2656
1307 3 0 Zakarian, Mr. Ortin male 0 0 2670
1308 3 0 Zimmerman, Mr. Leo male 0 0 315082

1309 rows × 7 columns

In [75]:
print(sheet_1.shape)
print(df.shape)
(1309, 14)
(1309, 7)
In [11]:
from sklearn import preprocessing
In [12]:
le = preprocessing.LabelEncoder()
In [16]:
data = sheet_1[criteria.index[criteria]]
data.loc[0:data.shape[0],'sex'] = le.fit_transform(data.sex)
/Users/yueli/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py:543: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
In [17]:
data = data.drop(['name','ticket'],axis=1)
In [18]:
data
Out[18]:
pclass survived sex sibsp parch
0 1 1 0 0 0
1 1 1 1 1 2
2 1 0 0 1 2
3 1 0 1 1 2
4 1 0 0 1 2
5 1 1 1 0 0
6 1 1 0 1 0
7 1 0 1 0 0
8 1 1 0 2 0
9 1 0 1 0 0
10 1 0 1 1 0
11 1 1 0 1 0
12 1 1 0 0 0
13 1 1 0 0 0
14 1 1 1 0 0
15 1 0 1 0 0
16 1 0 1 0 1
17 1 1 0 0 1
18 1 1 0 0 0
19 1 0 1 0 0
20 1 1 1 1 1
21 1 1 0 1 1
22 1 1 1 0 0
23 1 1 0 0 0
24 1 1 0 0 0
25 1 0 1 0 0
26 1 1 1 1 0
27 1 1 0 1 0
28 1 1 0 0 0
29 1 1 1 0 0
... ... ... ... ... ...
1279 3 0 0 0 0
1280 3 0 1 0 0
1281 3 0 1 0 0
1282 3 0 1 0 0
1283 3 0 1 0 0
1284 3 0 1 0 0
1285 3 0 1 0 0
1286 3 1 0 0 0
1287 3 0 1 0 0
1288 3 0 1 1 0
1289 3 0 1 1 0
1290 3 1 0 1 0
1291 3 0 1 0 0
1292 3 0 1 0 0
1293 3 0 1 0 0
1294 3 0 1 0 0
1295 3 0 1 0 0
1296 3 0 1 0 0
1297 3 0 1 0 0
1298 3 0 1 0 0
1299 3 0 1 1 0
1300 3 1 0 1 0
1301 3 0 1 0 0
1302 3 0 1 0 0
1303 3 0 1 0 0
1304 3 0 0 1 0
1305 3 0 0 1 0
1306 3 0 1 0 0
1307 3 0 1 0 0
1308 3 0 1 0 0

1309 rows × 5 columns

In [19]:
from sklearn import model_selection
X = data.drop(["survived"], axis=1).values
y = data["survived"].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, shuffle=True)
In [20]:
X_train
Out[20]:
array([[1, 0, 1, 2],
       [2, 0, 0, 0],
       [3, 1, 0, 0],
       ...,
       [3, 1, 0, 0],
       [1, 1, 0, 0],
       [3, 0, 0, 0]])
In [21]:
from sklearn.linear_model import LogisticRegression
In [22]:
logitreg = LogisticRegression(solver='liblinear')
fit = logitreg.fit(X_train, y_train)
In [23]:
fit.coef_
Out[23]:
array([[-0.80616445, -2.53244105, -0.20512107,  0.04807778]])
In [24]:
effect_size = pd.DataFrame(fit.coef_)
effect_size.columns = data.drop(["survived"], axis=1).columns
effect_size
Out[24]:
pclass sex sibsp parch
0 -0.806164 -2.532441 -0.205121 0.048078
In [25]:
y_train_pred = fit.predict(X_train)
y_test_pred = fit.predict(X_test)

The predictions are already thesholded to be either 0 or 1

In [26]:
print(y_test_pred)
[0 0 1 1 0 1 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 0 1 1 1 1 0
 1 1 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1
 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 1
 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 1 0 1 0 1
 1 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0
 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0
 0 0 0]
In [27]:
acc_train = sum(y_train_pred==y_train)/len(y_train)
acc_test = sum(y_test_pred==y_test)/len(y_test)
print(f"train accuracy: {acc_train:.3f}; \
test accuracy: {acc_test:.3f}")
train accuracy: 0.799; test accuracy: 0.760

To get probabilities from logistic regression, we need to call the following function

In [28]:
y_test_prob = fit.predict_proba(X_test)
In [29]:
y_test_prob
Out[29]:
array([[0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.38132888, 0.61867112],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.48159095, 0.51840905],
       [0.75881787, 0.24118213],
       [0.77597541, 0.22402459],
       [0.92120054, 0.07879946],
       [0.24360185, 0.75639815],
       [0.38132888, 0.61867112],
       [0.77597541, 0.22402459],
       [0.25256948, 0.74743052],
       [0.25256948, 0.74743052],
       [0.35891714, 0.64108286],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.77597541, 0.22402459],
       [0.80961123, 0.19038877],
       [0.43075031, 0.56924969],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.60957787, 0.39042213],
       [0.12573608, 0.87426392],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.23485257, 0.76514743],
       [0.43075031, 0.56924969],
       [0.38132888, 0.61867112],
       [0.13111688, 0.86888312],
       [0.90496496, 0.09503504],
       [0.12573608, 0.87426392],
       [0.13111688, 0.86888312],
       [0.10486335, 0.89513665],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.38132888, 0.61867112],
       [0.10043541, 0.89956459],
       [0.38132888, 0.61867112],
       [0.38132888, 0.61867112],
       [0.13111688, 0.86888312],
       [0.88579894, 0.11420106],
       [0.4190028 , 0.5809972 ],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.59583071, 0.40416929],
       [0.88579894, 0.11420106],
       [0.8870884 , 0.1129116 ],
       [0.77597541, 0.22402459],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.21583994, 0.78416006],
       [0.88579894, 0.11420106],
       [0.90496496, 0.09503504],
       [0.88579894, 0.11420106],
       [0.65505215, 0.34494785],
       [0.60735332, 0.39264668],
       [0.90496496, 0.09503504],
       [0.90496496, 0.09503504],
       [0.77597541, 0.22402459],
       [0.60735332, 0.39264668],
       [0.59583071, 0.40416929],
       [0.77597541, 0.22402459],
       [0.55981643, 0.44018357],
       [0.88579894, 0.11420106],
       [0.13111688, 0.86888312],
       [0.13111688, 0.86888312],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.90496496, 0.09503504],
       [0.43075031, 0.56924969],
       [0.37005301, 0.62994699],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.21583994, 0.78416006],
       [0.40734641, 0.59265359],
       [0.88579894, 0.11420106],
       [0.90074883, 0.09925117],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.38132888, 0.61867112],
       [0.90496496, 0.09503504],
       [0.21583994, 0.78416006],
       [0.94379673, 0.05620327],
       [0.64410908, 0.35589092],
       [0.38132888, 0.61867112],
       [0.38132888, 0.61867112],
       [0.64410908, 0.35589092],
       [0.77597541, 0.22402459],
       [0.25256948, 0.74743052],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.32926956, 0.67073044],
       [0.38132888, 0.61867112],
       [0.90496496, 0.09503504],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.90074883, 0.09925117],
       [0.10946276, 0.89053724],
       [0.88579894, 0.11420106],
       [0.65505215, 0.34494785],
       [0.88579894, 0.11420106],
       [0.10946276, 0.89053724],
       [0.35891714, 0.64108286],
       [0.37005301, 0.62994699],
       [0.13111688, 0.86888312],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.12573608, 0.87426392],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.67924126, 0.32075874],
       [0.77597541, 0.22402459],
       [0.88579894, 0.11420106],
       [0.90496496, 0.09503504],
       [0.58420145, 0.41579855],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.90496496, 0.09503504],
       [0.88579894, 0.11420106],
       [0.10946276, 0.89053724],
       [0.38132888, 0.61867112],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.43075031, 0.56924969],
       [0.43075031, 0.56924969],
       [0.60735332, 0.39264668],
       [0.77597541, 0.22402459],
       [0.13111688, 0.86888312],
       [0.90496496, 0.09503504],
       [0.38132888, 0.61867112],
       [0.94379673, 0.05620327],
       [0.77597541, 0.22402459],
       [0.21583994, 0.78416006],
       [0.43075031, 0.56924969],
       [0.21583994, 0.78416006],
       [0.13111688, 0.86888312],
       [0.88579894, 0.11420106],
       [0.25256948, 0.74743052],
       [0.88579894, 0.11420106],
       [0.10946276, 0.89053724],
       [0.80208997, 0.19791003],
       [0.60735332, 0.39264668],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.77597541, 0.22402459],
       [0.38132888, 0.61867112],
       [0.77597541, 0.22402459],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.63301335, 0.36698665],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.43075031, 0.56924969],
       [0.88579894, 0.11420106],
       [0.94379673, 0.05620327],
       [0.24360185, 0.75639815],
       [0.20781382, 0.79218618],
       [0.43075031, 0.56924969],
       [0.88579894, 0.11420106],
       [0.91393181, 0.08606819],
       [0.19242812, 0.80757188],
       [0.94379673, 0.05620327],
       [0.52083139, 0.47916861],
       [0.38132888, 0.61867112],
       [0.60735332, 0.39264668],
       [0.21583994, 0.78416006],
       [0.77597541, 0.22402459],
       [0.10486335, 0.89513665],
       [0.40734641, 0.59265359],
       [0.88579894, 0.11420106],
       [0.24360185, 0.75639815],
       [0.88579894, 0.11420106],
       [0.21583994, 0.78416006],
       [0.13111688, 0.86888312],
       [0.88579894, 0.11420106],
       [0.92120054, 0.07879946],
       [0.77597541, 0.22402459],
       [0.17121611, 0.82878389],
       [0.35891714, 0.64108286],
       [0.88579894, 0.11420106],
       [0.59583071, 0.40416929],
       [0.88579894, 0.11420106],
       [0.13111688, 0.86888312],
       [0.92120054, 0.07879946],
       [0.11071774, 0.88928226],
       [0.21583994, 0.78416006],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.60735332, 0.39264668],
       [0.77597541, 0.22402459],
       [0.88579894, 0.11420106],
       [0.64410908, 0.35589092],
       [0.90074883, 0.09925117],
       [0.59583071, 0.40416929],
       [0.77597541, 0.22402459],
       [0.88579894, 0.11420106],
       [0.80208997, 0.19791003],
       [0.38132888, 0.61867112],
       [0.38132888, 0.61867112],
       [0.48159095, 0.51840905],
       [0.38132888, 0.61867112],
       [0.38132888, 0.61867112],
       [0.88579894, 0.11420106],
       [0.65505215, 0.34494785],
       [0.88579894, 0.11420106],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.13111688, 0.86888312],
       [0.65505215, 0.34494785],
       [0.10946276, 0.89053724],
       [0.88579894, 0.11420106],
       [0.38132888, 0.61867112],
       [0.92875611, 0.07124389],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.55981643, 0.44018357],
       [0.37005301, 0.62994699],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.88579894, 0.11420106],
       [0.87570526, 0.12429474],
       [0.35891714, 0.64108286],
       [0.90496496, 0.09503504],
       [0.60735332, 0.39264668],
       [0.92875611, 0.07124389],
       [0.95156933, 0.04843067],
       [0.65505215, 0.34494785],
       [0.77597541, 0.22402459],
       [0.77597541, 0.22402459],
       [0.77597541, 0.22402459],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.80961123, 0.19038877],
       [0.88579894, 0.11420106],
       [0.33710487, 0.66289513],
       [0.88579894, 0.11420106],
       [0.38132888, 0.61867112],
       [0.80961123, 0.19038877],
       [0.12573608, 0.87426392],
       [0.60735332, 0.39264668],
       [0.88579894, 0.11420106],
       [0.64410908, 0.35589092],
       [0.80208997, 0.19791003],
       [0.90074883, 0.09925117],
       [0.10946276, 0.89053724],
       [0.74286276, 0.25713724],
       [0.88579894, 0.11420106],
       [0.93187249, 0.06812751],
       [0.60735332, 0.39264668]])
In [30]:
y_test_prob[:,1]
Out[30]:
array([0.11420106, 0.11420106, 0.61867112, 0.61867112, 0.11420106,
       0.51840905, 0.24118213, 0.22402459, 0.07879946, 0.75639815,
       0.61867112, 0.22402459, 0.74743052, 0.74743052, 0.64108286,
       0.11420106, 0.11420106, 0.11420106, 0.22402459, 0.19038877,
       0.56924969, 0.39264668, 0.11420106, 0.39042213, 0.87426392,
       0.39264668, 0.11420106, 0.76514743, 0.56924969, 0.61867112,
       0.86888312, 0.09503504, 0.87426392, 0.86888312, 0.89513665,
       0.61867112, 0.11420106, 0.61867112, 0.89956459, 0.61867112,
       0.61867112, 0.86888312, 0.11420106, 0.5809972 , 0.11420106,
       0.11420106, 0.40416929, 0.11420106, 0.1129116 , 0.22402459,
       0.61867112, 0.11420106, 0.11420106, 0.11420106, 0.78416006,
       0.11420106, 0.09503504, 0.11420106, 0.34494785, 0.39264668,
       0.09503504, 0.09503504, 0.22402459, 0.39264668, 0.40416929,
       0.22402459, 0.44018357, 0.11420106, 0.86888312, 0.86888312,
       0.11420106, 0.11420106, 0.09503504, 0.56924969, 0.62994699,
       0.11420106, 0.11420106, 0.78416006, 0.59265359, 0.11420106,
       0.09925117, 0.39264668, 0.11420106, 0.61867112, 0.09503504,
       0.78416006, 0.05620327, 0.35589092, 0.61867112, 0.61867112,
       0.35589092, 0.22402459, 0.74743052, 0.11420106, 0.11420106,
       0.11420106, 0.11420106, 0.11420106, 0.67073044, 0.61867112,
       0.09503504, 0.11420106, 0.11420106, 0.09925117, 0.89053724,
       0.11420106, 0.34494785, 0.11420106, 0.89053724, 0.64108286,
       0.62994699, 0.86888312, 0.61867112, 0.11420106, 0.87426392,
       0.11420106, 0.11420106, 0.11420106, 0.32075874, 0.22402459,
       0.11420106, 0.09503504, 0.41579855, 0.61867112, 0.11420106,
       0.09503504, 0.11420106, 0.89053724, 0.61867112, 0.39264668,
       0.11420106, 0.56924969, 0.56924969, 0.39264668, 0.22402459,
       0.86888312, 0.09503504, 0.61867112, 0.05620327, 0.22402459,
       0.78416006, 0.56924969, 0.78416006, 0.86888312, 0.11420106,
       0.74743052, 0.11420106, 0.89053724, 0.19791003, 0.39264668,
       0.39264668, 0.11420106, 0.22402459, 0.61867112, 0.22402459,
       0.11420106, 0.11420106, 0.11420106, 0.11420106, 0.36698665,
       0.11420106, 0.11420106, 0.11420106, 0.11420106, 0.56924969,
       0.11420106, 0.05620327, 0.75639815, 0.79218618, 0.56924969,
       0.11420106, 0.08606819, 0.80757188, 0.05620327, 0.47916861,
       0.61867112, 0.39264668, 0.78416006, 0.22402459, 0.89513665,
       0.59265359, 0.11420106, 0.75639815, 0.11420106, 0.78416006,
       0.86888312, 0.11420106, 0.07879946, 0.22402459, 0.82878389,
       0.64108286, 0.11420106, 0.40416929, 0.11420106, 0.86888312,
       0.07879946, 0.88928226, 0.78416006, 0.61867112, 0.11420106,
       0.39264668, 0.22402459, 0.11420106, 0.35589092, 0.09925117,
       0.40416929, 0.22402459, 0.11420106, 0.19791003, 0.61867112,
       0.61867112, 0.51840905, 0.61867112, 0.61867112, 0.11420106,
       0.34494785, 0.11420106, 0.39264668, 0.11420106, 0.11420106,
       0.86888312, 0.34494785, 0.89053724, 0.11420106, 0.61867112,
       0.07124389, 0.11420106, 0.11420106, 0.44018357, 0.62994699,
       0.11420106, 0.11420106, 0.11420106, 0.12429474, 0.64108286,
       0.09503504, 0.39264668, 0.07124389, 0.04843067, 0.34494785,
       0.22402459, 0.22402459, 0.22402459, 0.39264668, 0.11420106,
       0.19038877, 0.11420106, 0.66289513, 0.11420106, 0.61867112,
       0.19038877, 0.87426392, 0.39264668, 0.11420106, 0.35589092,
       0.19791003, 0.09925117, 0.89053724, 0.25713724, 0.11420106,
       0.06812751, 0.39264668])

Receiver Operator Characteristic (ROC) curve

Often we don't want to set a single threshold but rather evaluate the model based on all thresholds. ROC is the a way to this.

At each threshold, it calculates two rates:

True Positive Rate = True Positives / (True Positives + False Negatives) False Positive Rate = False Positives / (False Positives + True Negatives)

In [32]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# calculate AUC
probs = y_test_prob[:,1]
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)
pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
fpr, tpr, thresholds = roc_curve(y_test, probs)
pyplot.plot(fpr, tpr, marker='.')
pyplot.xlabel("False Positive Rate")
pyplot.ylabel("True Positive Rate")
pyplot.show()
AUC: 0.794
In [ ]: