import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
cols = ['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age']
df = pd.read_csv('http://bit.ly/kaggletrain')
X = df[cols]
y = df['Survived']
df_new = pd.read_csv('http://bit.ly/kaggletest')
X_new = df_new[cols]
imp_constant = SimpleImputer(strategy='constant', fill_value='missing')
ohe = OneHotEncoder()
imp_ohe = make_pipeline(imp_constant, ohe)
vect = CountVectorizer()
imp = SimpleImputer()
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
logreg = LogisticRegression(solver='liblinear', random_state=1)
pipe = make_pipeline(ct, logreg)
pipe.fit(X, y)
pipe.predict(X_new)
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
Why did you create the imp_ohe pipeline? Why didn't you instead add imp_constant to the ColumnTransformer?
Here's a reminder of what imp_ohe contains:
imp_ohe = make_pipeline(imp_constant, ohe)
Here's how I used it in the ColumnTransformer:
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
Many people suggested that I use something like this instead (which will not work):
ct_suggestion = make_column_transformer(
    (imp_constant, ['Embarked', 'Sex']),
    (ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
I'll create a 10-row dataset to help me explain why:
df_tiny = df.head(10).copy()
df_tiny
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
X_tiny = df_tiny[cols]
X_tiny
| | Parch | Fare | Embarked | Sex | Name | Age |
|---|---|---|---|---|---|---|
0 | 0 | 7.2500 | S | male | Braund, Mr. Owen Harris | 22.0 |
1 | 0 | 71.2833 | C | female | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 |
2 | 0 | 7.9250 | S | female | Heikkinen, Miss. Laina | 26.0 |
3 | 0 | 53.1000 | S | female | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 |
4 | 0 | 8.0500 | S | male | Allen, Mr. William Henry | 35.0 |
5 | 0 | 8.4583 | Q | male | Moran, Mr. James | NaN |
6 | 0 | 51.8625 | S | male | McCarthy, Mr. Timothy J | 54.0 |
7 | 1 | 21.0750 | S | male | Palsson, Master. Gosta Leonard | 2.0 |
8 | 2 | 11.1333 | S | female | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | 27.0 |
9 | 0 | 30.0708 | C | female | Nasser, Mrs. Nicholas (Adele Achem) | 14.0 |
Here's a smaller ColumnTransformer that uses imp_ohe, but only on "Embarked":
make_column_transformer(
    (imp_ohe, ['Embarked']),
    remainder='drop').fit_transform(X_tiny)
array([[0., 0., 1.], [1., 0., 0.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 1., 0.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [1., 0., 0.]])
Try splitting imp_ohe into two separate transformers:
make_column_transformer(
    (imp_constant, ['Embarked']),
    (ohe, ['Embarked']),
    remainder='drop').fit_transform(X_tiny)
array([['S', 0.0, 0.0, 1.0], ['C', 1.0, 0.0, 0.0], ['S', 0.0, 0.0, 1.0], ['S', 0.0, 0.0, 1.0], ['S', 0.0, 0.0, 1.0], ['Q', 0.0, 1.0, 0.0], ['S', 0.0, 0.0, 1.0], ['S', 0.0, 0.0, 1.0], ['S', 0.0, 0.0, 1.0], ['C', 1.0, 0.0, 0.0]], dtype=object)
Try reversing the order of the transformers:
make_column_transformer(
    (ohe, ['Embarked']),
    (imp_constant, ['Embarked']),
    remainder='drop').fit_transform(X_tiny)
array([[0.0, 0.0, 1.0, 'S'], [1.0, 0.0, 0.0, 'C'], [0.0, 0.0, 1.0, 'S'], [0.0, 0.0, 1.0, 'S'], [0.0, 0.0, 1.0, 'S'], [0.0, 1.0, 0.0, 'Q'], [0.0, 0.0, 1.0, 'S'], [0.0, 0.0, 1.0, 'S'], [0.0, 0.0, 1.0, 'S'], [1.0, 0.0, 0.0, 'C']], dtype=object)
Key ideas: a ColumnTransformer applies its transformers in parallel (each one receives the original columns) and stacks their outputs side-by-side, whereas a Pipeline applies its steps in sequence (each step receives the previous step's output). That's why splitting imp_ohe into two transformers output both the imputed column and the one-hot columns, rather than one-hot encoding the imputed values, and why reversing the order of the transformers merely reordered the output columns.
Take one more look at the correct ColumnTransformer:
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
ct.fit_transform(X)
<891x1518 sparse matrix of type '<class 'numpy.float64'>' with 7328 stored elements in Compressed Sparse Row format>
Compare that to the ColumnTransformer many people were suggesting:
ct_suggestion = make_column_transformer(
    (imp_constant, ['Embarked', 'Sex']),
    (ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
It will raise an error during fit_transform, because ohe receives the original (unimputed) columns, and OneHotEncoder doesn't know how to handle the missing values they contain:
# ct_suggestion.fit_transform(X)
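If you'd like to see the failure for yourself, here's a minimal sketch (not from the original lesson) that catches and prints the exception. The exact exception type and message vary by scikit-learn version, and very recent versions of OneHotEncoder can treat NaN as its own category, in which case this may not fail at all:
try:
    ct_suggestion.fit_transform(X)
except Exception as e:
    # ohe receives the raw 'Embarked' column, which contains NaN
    print(type(e).__name__, e)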
Cross-validate our pipeline:
from sklearn.model_selection import cross_val_score
%time cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
CPU times: user 112 ms, sys: 2.32 ms, total: 114 ms
Wall time: 114 ms
0.8114619295712762
Remove "Name" from the ColumnTransformer:
ct_no_name = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    ('drop', 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
Cross-validate the pipeline that doesn't include "Name":
no_name = make_pipeline(ct_no_name, logreg)
%time cross_val_score(no_name, X, y, cv=5, scoring='accuracy').mean()
CPU times: user 65.8 ms, sys: 1.23 ms, total: 67 ms
Wall time: 68.1 ms
0.7833908731404181
Create a pipeline that only includes "Name" and cross-validate it:
only_name = make_pipeline(vect, logreg)
%time cross_val_score(only_name, X['Name'], y, cv=5, scoring='accuracy').mean()
CPU times: user 44 ms, sys: 1.4 ms, total: 45.4 ms
Wall time: 45.6 ms
0.7945954428472788
What were the results?
What are the benefits of including "Name"?
What are the costs of including "Name"?
Other thoughts:
Here's a reminder of what our ColumnTransformer and Pipeline look like:
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
pipe = make_pipeline(ct, logreg)
Here's a reminder of how we cross-validate the Pipeline:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
0.8114619295712762
What happens "under the hood" when we run cross_val_score?
Roughly: (1) it splits the rows into folds, (2) it fits all of the transformations on the training folds only, and (3) it applies those fitted transformations to the testing fold before fitting the model and scoring its predictions. The key point is that cross_val_score does the transformations (steps 2 and 3) after splitting the data (step 1), which prevents data leakage from the testing fold:
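Here's a rough sketch of that logic (illustrative only, not how cross_val_score is literally implemented; it uses StratifiedKFold, which is confirmed just below):
from sklearn.model_selection import StratifiedKFold
scores = []
for train_idx, test_idx in StratifiedKFold(5).split(X, y):
    # fitting on the training fold learns the imputation values, one-hot
    # categories, and CountVectorizer vocabulary from training data only
    pipe.fit(X.iloc[train_idx], y.iloc[train_idx])
    # scoring on the testing fold applies those fitted transformations
    scores.append(pipe.score(X.iloc[test_idx], y.iloc[test_idx]))
sum(scores) / len(scores)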
When using cross_val_score, I've been passing an integer that specifies the number of folds:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
array([0.79888268, 0.8258427 , 0.80337079, 0.78651685, 0.84269663])
Here's what happens "under the hood" when you specify 5 folds for a classification problem:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(5)
cross_val_score(pipe, X, y, cv=kf, scoring='accuracy')
array([0.79888268, 0.8258427 , 0.80337079, 0.78651685, 0.84269663])
StratifiedKFold is a cross-validation splitter, meaning that its role is to split datasets:
You can examine the data used in each fold. Here are the training and testing indices used by the first split:
list(kf.split(X, y))[0]
(array([168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 185, 188, 189, 191, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 
888, 889, 890]), array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 172, 183, 184, 186, 187, 190, 192, 193, 194, 195, 198]))
By default, StratifiedKFold does not shuffle the rows. If the order of your rows is not arbitrary, then you can (and should) shuffle them by modifying the splitter:
kf = StratifiedKFold(5, shuffle=True, random_state=0)
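Then pass the shuffled splitter to cross_val_score just like before (a quick sketch; the scores will differ slightly from the unshuffled results above because the fold membership changes):
cross_val_score(pipe, X, y, cv=kf, scoring='accuracy').mean()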
When we use grid search, we're choosing hyperparameters that maximize the cross-validation score on this dataset:
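Here's a minimal sketch of that idea (the parameter names follow the automatic step naming of make_pipeline and make_column_transformer, but the candidate values searched here are illustrative assumptions, not from the original lesson):
from sklearn.model_selection import GridSearchCV
params = {'columntransformer__countvectorizer__min_df': [1, 2],
          'logisticregression__C': [0.1, 1, 10]}
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)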
If you want a more realistic performance estimate and you have enough data to spare:
If you want a more realistic performance estimate and you don't have enough data to spare:
Bottom line:
Recommended resources:
As a reminder, you can add a "missing indicator" to SimpleImputer (new in version 0.21):
SimpleImputer(add_indicator=True).fit_transform(X_tiny[['Age']])
array([[22. , 0. ], [38. , 0. ], [26. , 0. ], [35. , 0. ], [35. , 0. ], [28.11111111, 1. ], [54. , 0. ], [ 2. , 0. ], [27. , 0. ], [14. , 0. ]])
How could we create the same output without using the "add_indicator" parameter?
We can create the left column using the SimpleImputer:
imp.fit_transform(X_tiny[['Age']])
array([[22. ], [38. ], [26. ], [35. ], [35. ], [28.11111111], [54. ], [ 2. ], [27. ], [14. ]])
We can create the right column using the MissingIndicator class:
from sklearn.impute import MissingIndicator
indicator = MissingIndicator()
indicator.fit_transform(X_tiny[['Age']])
array([[False], [False], [False], [False], [False], [ True], [False], [False], [False], [False]])
We can use FeatureUnion to stack these two columns side-by-side:
from sklearn.pipeline import make_union
imp_indicator = make_union(imp, indicator)
imp_indicator.fit_transform(X_tiny[['Age']])
array([[22. , 0. ], [38. , 0. ], [26. , 0. ], [35. , 0. ], [35. , 0. ], [28.11111111, 1. ], [54. , 0. ], [ 2. , 0. ], [27. , 0. ], [14. , 0. ]])
Comparing FeatureUnion and ColumnTransformer: a FeatureUnion applies multiple transformers to the same input columns and stacks their outputs side-by-side, whereas a ColumnTransformer also chooses which columns to pass to each transformer. A FeatureUnion is itself a transformer.
Thus we could include our FeatureUnion in a ColumnTransformer:
make_column_transformer(
    (imp_indicator, ['Age']),
    remainder='drop').fit_transform(X_tiny)
array([[22. , 0. ], [38. , 0. ], [26. , 0. ], [35. , 0. ], [35. , 0. ], [28.11111111, 1. ], [54. , 0. ], [ 2. , 0. ], [27. , 0. ], [14. , 0. ]])
Or we could achieve the same results without the FeatureUnion, by passing the "Age" column to the ColumnTransformer twice:
make_column_transformer(
    (imp, ['Age']),
    (indicator, ['Age']),
    remainder='drop').fit_transform(X_tiny)
array([[22. , 0. ], [38. , 0. ], [26. , 0. ], [35. , 0. ], [35. , 0. ], [28.11111111, 1. ], [54. , 0. ], [ 2. , 0. ], [27. , 0. ], [14. , 0. ]])
Conclusion:
IterativeImputer (new in version 0.21) is experimental, meaning the API and predictions may change:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
Pass it "Parch" (no missing values), "Fare" (no missing values), and "Age" (1 missing value):
imp_iterative = IterativeImputer()
imp_iterative.fit_transform(X_tiny[['Parch', 'Fare', 'Age']])
array([[ 0. , 7.25 , 22. ], [ 0. , 71.2833 , 38. ], [ 0. , 7.925 , 26. ], [ 0. , 53.1 , 35. ], [ 0. , 8.05 , 35. ], [ 0. , 8.4583 , 24.23702669], [ 0. , 51.8625 , 54. ], [ 1. , 21.075 , 2. ], [ 2. , 11.1333 , 27. ], [ 0. , 30.0708 , 14. ]])
How it works: IterativeImputer models each feature that has missing values ("Age", in this case) as a function of the other features. It fits a regressor (BayesianRidge by default) on the rows where the feature is known, predicts the missing entries, and repeats for a number of iterations.
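Here's a rough sketch of that idea (a one-shot approximation, not the iterative procedure itself, so its prediction should closely match but may not exactly reproduce the value above):
from sklearn.linear_model import BayesianRidge
known = X_tiny['Age'].notna()
br = BayesianRidge()
# fit a regressor on the rows where 'Age' is known...
br.fit(X_tiny.loc[known, ['Parch', 'Fare']], X_tiny.loc[known, 'Age'])
# ...and predict 'Age' for the row where it's missing
br.predict(X_tiny.loc[~known, ['Parch', 'Fare']])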
Notes:
KNNImputer (new in version 0.22) is another option:
from sklearn.impute import KNNImputer
imp_knn = KNNImputer(n_neighbors=2)
imp_knn.fit_transform(X_tiny[['Parch', 'Fare', 'Age']])
array([[ 0. , 7.25 , 22. ], [ 0. , 71.2833, 38. ], [ 0. , 7.925 , 26. ], [ 0. , 53.1 , 35. ], [ 0. , 8.05 , 35. ], [ 0. , 8.4583, 30.5 ], [ 0. , 51.8625, 54. ], [ 1. , 21.075 , 2. ], [ 2. , 11.1333, 27. ], [ 0. , 30.0708, 14. ]])
How it works: for each missing value, KNNImputer finds the n_neighbors most similar rows (based on the features that are present) and imputes the mean of their values. Here, the two rows nearest to the row with the missing "Age" have ages 35 and 26, so it imputes 30.5.
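Here's a rough sketch of that idea (KNNImputer actually uses a NaN-aware Euclidean distance over all of the columns, but the neighbor ranking works out the same on this data):
from sklearn.neighbors import NearestNeighbors
known = X_tiny['Age'].notna()
nn = NearestNeighbors(n_neighbors=2).fit(X_tiny.loc[known, ['Parch', 'Fare']])
# find the 2 rows most similar to the row with the missing 'Age'...
dist, idx = nn.kneighbors(X_tiny.loc[~known, ['Parch', 'Fare']])
# ...and impute the mean of their 'Age' values (35 and 26, so 30.5)
X_tiny.loc[known, 'Age'].iloc[idx[0]].mean()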
The intuition behind both of these imputers is that it can be useful to take other features into account when deciding what value to impute:
It can be useful to add feature selection to your workflow:
Cross-validate our pipeline (without any hyperparameter tuning) to generate a "baseline" accuracy:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
0.8114619295712762
SelectPercentile selects features based on statistical tests:
Create a feature selection object:
from sklearn.feature_selection import SelectPercentile, chi2
selection = SelectPercentile(chi2, percentile=50)
Add feature selection to the Pipeline after the ColumnTransformer but before the model:
pipe_selection = make_pipeline(ct, selection, logreg)
Cross-validate the updated Pipeline, and the score has improved:
cross_val_score(pipe_selection, X, y, cv=5, scoring='accuracy').mean()
0.8193019898311469
SelectFromModel scores features using a model:
We'll try using logistic regression for feature selection:
logreg_selection = LogisticRegression(solver='liblinear', penalty='l1', random_state=1)
Create a feature selection object:
from sklearn.feature_selection import SelectFromModel
selection = SelectFromModel(logreg_selection, threshold='mean')
Update the Pipeline to use the new feature selection object:
pipe_selection = make_pipeline(ct, selection, logreg)
Cross-validate the updated Pipeline, and the score has improved again:
cross_val_score(pipe_selection, X, y, cv=5, scoring='accuracy').mean()
0.8260121775155358
Both of these approaches should be further tuned, for example by searching over the percentile (for SelectPercentile) or the threshold (for SelectFromModel), as sketched below:
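Here's a minimal sketch of tuning the percentile (the step name 'selectpercentile' comes from make_pipeline's automatic naming; the candidate values are illustrative assumptions):
from sklearn.model_selection import GridSearchCV
pipe_sp = make_pipeline(ct, SelectPercentile(chi2), logreg)
params = {'selectpercentile__percentile': [10, 25, 50, 75, 100]}
grid = GridSearchCV(pipe_sp, params, cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)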
Some (but not all) Machine Learning models benefit from feature standardization, and this is often done with StandardScaler:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Here's a reminder of our existing ColumnTransformer:
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp, ['Age', 'Fare']),
    remainder='passthrough')
If we wanted to scale "Age" and "Fare", we could make a Pipeline of imputation and scaling:
imp_scaler = make_pipeline(imp, scaler)
Then replace imp with imp_scaler in our ColumnTransformer:
ct_scaler = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp_scaler, ['Age', 'Fare']),
    remainder='passthrough')
Update the Pipeline:
pipe_scaler = make_pipeline(ct_scaler, logreg)
Cross-validated accuracy has decreased slightly:
cross_val_score(pipe_scaler, X, y, cv=5, scoring='accuracy').mean()
0.8092210156299039
An alternative way to include scaling is to scale all columns:
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
Update the Pipeline:
pipe_scaler = make_pipeline(ct, scaler, logreg)
Cross-validated accuracy is basically the same as our baseline (which did not include scaling):
cross_val_score(pipe_scaler, X, y, cv=5, scoring='accuracy').mean()
0.8114556525014123
We tried both approaches above:
I suggest trying both approaches to see which one works better:
Here are two approaches you can try:
Feel free to email me if you have other suggestions for outlier handling!
All that is required is switching out the final step of the Pipeline to use a different model:
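For example, here's a hedged sketch that swaps in a random forest (the lesson doesn't prescribe a particular alternative model, so RandomForestClassifier is just an illustration):
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=1)
pipe_rf = make_pipeline(ct, rf)  # same ColumnTransformer, different model
cross_val_score(pipe_rf, X, y, cv=5, scoring='accuracy').mean()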
Here's a reminder of what's in our 10 row dataset, so that we can plan out a few custom transformations:
df_tiny
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
Let's pretend we believe that "Age" and "Fare" might be better features if we floor them (meaning round them down):
import numpy as np
np.floor(df_tiny[['Age', 'Fare']])
| | Age | Fare |
|---|---|---|
0 | 22.0 | 7.0 |
1 | 38.0 | 71.0 |
2 | 26.0 | 7.0 |
3 | 35.0 | 53.0 |
4 | 35.0 | 8.0 |
5 | NaN | 8.0 |
6 | 54.0 | 51.0 |
7 | 2.0 | 21.0 |
8 | 27.0 | 11.0 |
9 | 14.0 | 30.0 |
In order to do this transformation in scikit-learn, we need to convert the floor function into a scikit-learn transformer using FunctionTransformer:
from sklearn.preprocessing import FunctionTransformer
Pass the floor function to FunctionTransformer and it returns a scikit-learn transformer:
get_floor = FunctionTransformer(np.floor)
Because get_floor is a transformer, you can use the fit_transform method to perform transformations:
get_floor.fit_transform(df_tiny[['Age', 'Fare']])
| | Age | Fare |
|---|---|---|
0 | 22.0 | 7.0 |
1 | 38.0 | 71.0 |
2 | 26.0 | 7.0 |
3 | 35.0 | 53.0 |
4 | 35.0 | 8.0 |
5 | NaN | 8.0 |
6 | 54.0 | 51.0 |
7 | 2.0 | 21.0 |
8 | 27.0 | 11.0 |
9 | 14.0 | 30.0 |
get_floor can be included in a ColumnTransformer:
make_column_transformer(
    (get_floor, ['Age', 'Fare']),
    remainder='drop').fit_transform(df_tiny)
array([[22., 7.], [38., 71.], [26., 7.], [35., 53.], [35., 8.], [nan, 8.], [54., 51.], [ 2., 21.], [27., 11.], [14., 30.]])
Let's plan out a second custom transformation:
df_tiny
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
Use the pandas string slice method to extract the first letter:
df_tiny[['Cabin']].apply(lambda x: x.str.slice(0, 1))
| | Cabin |
|---|---|
0 | NaN |
1 | C |
2 | NaN |
3 | C |
4 | NaN |
5 | NaN |
6 | E |
7 | NaN |
8 | NaN |
9 | NaN |
Convert this operation into a custom function:
def first_letter(df):
    return pd.DataFrame(df).apply(lambda x: x.str.slice(0, 1))
Convert the function to a transformer:
get_first_letter = FunctionTransformer(first_letter)
Add this transformer to the ColumnTransformer:
make_column_transformer(
    (get_floor, ['Age', 'Fare']),
    (get_first_letter, ['Cabin']),
    remainder='drop').fit_transform(df_tiny)
array([[22.0, 7.0, nan], [38.0, 71.0, 'C'], [26.0, 7.0, nan], [35.0, 53.0, 'C'], [35.0, 8.0, nan], [nan, 8.0, nan], [54.0, 51.0, 'E'], [2.0, 21.0, nan], [27.0, 11.0, nan], [14.0, 30.0, nan]], dtype=object)
Two shape considerations to keep in mind when writing functions that will be used in a ColumnTransformer: the function may receive its input as either a DataFrame or a NumPy array (which is why first_letter converts its input with pd.DataFrame), and the function must return a 2D object (which is why sum_cols, defined below, reshapes its output).
Let's plan out a third custom transformation:
df_tiny
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
Use the sum method over the columns axis:
df_tiny[['SibSp', 'Parch']].sum(axis=1)
0    1
1    1
2    0
3    1
4    0
5    0
6    0
7    4
8    2
9    1
dtype: int64
Convert this operation into a custom function:
def sum_cols(df):
    return np.array(df).sum(axis=1).reshape(-1, 1)
Confirm that sum_cols returns 2D output:
sum_cols(df_tiny[['SibSp', 'Parch']])
array([[1], [1], [0], [1], [0], [0], [0], [4], [2], [1]])
Convert the function to a transformer and add it to the ColumnTransformer:
get_sum = FunctionTransformer(sum_cols)
make_column_transformer(
    (get_floor, ['Age', 'Fare']),
    (get_first_letter, ['Cabin']),
    (get_sum, ['SibSp', 'Parch']),
    remainder='drop').fit_transform(df_tiny)
array([[22.0, 7.0, nan, 1], [38.0, 71.0, 'C', 1], [26.0, 7.0, nan, 0], [35.0, 53.0, 'C', 1], [35.0, 8.0, nan, 0], [nan, 8.0, nan, 0], [54.0, 51.0, 'E', 0], [2.0, 21.0, nan, 4], [27.0, 11.0, nan, 2], [14.0, 30.0, nan, 1]], dtype=object)
Let's use these custom transformers on our entire dataset!
First, add "Cabin" and "SibSp" to the list of columns:
cols = ['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age', 'Cabin', 'SibSp']
Update X and X_new to include these columns:
X = df[cols]
X_new = df_new[cols]
Before we can add the custom transformers to our ColumnTransformer, we need to create two new Pipelines.
First, we need to account for the fact that "Age" and "Fare" have missing values, since np.floor simply passes NaN values through (as seen above). We'll impute before flooring:
imp_floor = make_pipeline(imp, get_floor)
Second, there are multiple complications with using get_first_letter on "Cabin", which you can see by examining the value_counts of the first letter:
X['Cabin'].str.slice(0, 1).value_counts(dropna=False)
NaN    687
C       59
B       47
D       33
E       32
A       15
F       13
G        4
T        1
Name: Cabin, dtype: int64
Create a Pipeline to get the first letter, then impute a constant value (since most "Cabin" values are missing), then one-hot encode the results (ignoring categories that weren't seen during fit, since rare letters such as "T" may not appear in every subset of the data):
ohe_ignore = OneHotEncoder(handle_unknown='ignore')
letter_imp_ohe = make_pipeline(get_first_letter, imp_constant, ohe_ignore)
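As a quick check (illustrative, not from the original lesson), fitting this Pipeline on the tiny dataset produces one column for each first letter found ("C" and "E"), plus one for the "missing" constant:
letter_imp_ohe.fit_transform(df_tiny[['Cabin']]).toarray()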
Add the three custom transformations to our primary ColumnTransformer:
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    (imp_floor, ['Age', 'Fare']),
    (letter_imp_ohe, ['Cabin']),
    (get_sum, ['SibSp', 'Parch']),
    remainder='drop')
Update the Pipeline, fit on X, and make predictions for X_new:
pipe = make_pipeline(ct, logreg)
pipe.fit(X, y)
pipe.predict(X_new)
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
Cross-validate the Pipeline:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
0.8271420500910175
Even though it's more work to do these custom transformations in scikit-learn, there are some considerable benefits:
Read the what's new page for all major releases:
Otherwise, just keep an eye out for tutorials that teach new scikit-learn features: