#!/usr/bin/env python
# coding: utf-8

# In[1]:


# imports used throughout this notebook
import numpy as np
import pandas as pd

np.set_printoptions(precision=3, linewidth=100)


# # Introducing the `ColumnTransformer`: applying different transformations to different features in a scikit-learn pipeline
# 
# *This work is supported by the Université Paris-Saclay Center for Data Science*

# Short summary: the `ColumnTransformer`, which allows you to apply different transformers to different features, has landed in scikit-learn (the PR has been merged in master, and this will be included in the upcoming release 0.20).

# ---
# 
# Real-world data often contains heterogeneous data types. When processing the data before applying the final prediction model, we typically want to use different preprocessing steps and transformations for those different types of columns.
# A simple example: we may want to scale the numerical features and one-hot encode the categorical features.
# 
# Up to now, scikit-learn did not provide a good solution to do this out of the box. You can do the preprocessing beforehand using e.g. pandas, or you can select subsets of columns and apply different transformers on them manually. But that does not easily allow you to put those preprocessing steps in a scikit-learn `Pipeline`, which can be important to avoid data leakage or to do a grid search over preprocessing parameters.
# 
# There are third-party projects that try to address this. For example, the [`sklearn_pandas`](https://github.com/scikit-learn-contrib/sklearn-pandas) package has a `DataFrameMapper` that maps subsets of a DataFrame's columns to a specific transformation. Many thanks to the authors of this library, as such "contrib" packages are essential for extending the functionality of scikit-learn and for exploring things that would take a long time to land in scikit-learn itself.
# The `ColumnTransformer` aims to bring this functionality into the core scikit-learn library, with support for numpy arrays and sparse matrices, and good integration with the rest of scikit-learn.
# 
# ### Basic example
# 
# To illustrate the basic usage of the `ColumnTransformer`, let's load the titanic survival dataset:

# In[2]:


titanic = pd.read_csv("https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/master/notebooks/datasets/titanic3.csv")

# there is still a small problem with using the OneHotEncoder and missing values,
# so for now I am going to assume there are no missing values by dropping them
titanic2 = titanic.dropna(subset=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'])


# Selecting some of the features and target:

# In[3]:


target = titanic2.survived.values
features = titanic2[['pclass', 'sex', 'age', 'fare', 'embarked']]


# In[4]:


features.head()


# This dataset contains some categorical variables ("pclass", "sex" and "embarked") and some numerical variables ("age" and "fare"). Note that "pclass", although categorical, is already encoded as integers in the dataset.
# 
# So let's use the `ColumnTransformer` to **combine transformers** for those two types of features:

# In[5]:


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer


# In[6]:


preprocess = make_column_transformer(
    (['age', 'fare'], StandardScaler()),
    (['pclass', 'sex', 'embarked'], OneHotEncoder())
)


# The above creates a simple preprocessing pipeline (that will be combined in a full prediction pipeline below) to scale the numerical features and one-hot encode the categorical features.
# 
# We can check this is indeed working as expected by transforming the input data:

# In[7]:


preprocess.fit_transform(features).toarray()[:5]


# In the above, we specified the subsets of columns as lists. We can also use boolean masks (e.g. to make a selection of the columns based on the data types), integer positions and slices. Further, the `ColumnTransformer` allows you to specify whether to drop or pass through other columns that were not specified. See the [development docs](http://scikit-learn.org/dev/modules/generated/sklearn.compose.ColumnTransformer.html) for more details.
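# As a minimal sketch of those alternatives: given the column order ['pclass', 'sex', 'age', 'fare', 'embarked'] of our `features` DataFrame, the selections by integer position and by slice below are equivalent to the list-of-names version above, and `remainder='passthrough'` keeps the unselected columns instead of dropping them (the default). The variable names here are just illustrative; boolean masks are shown further below in the full pipeline example.

# In[ ]:


# integer positions: columns 2 and 3 are 'age' and 'fare',
# columns 0, 1 and 4 are 'pclass', 'sex' and 'embarked'
preprocess_positional = make_column_transformer(
    ([2, 3], StandardScaler()),
    ([0, 1, 4], OneHotEncoder())
)

# a slice works for contiguous columns; the columns not selected are dropped
# by default, but remainder='passthrough' passes them through unchanged
preprocess_slice = make_column_transformer(
    (slice(2, 4), StandardScaler()),
    remainder='passthrough'
)

preprocess_positional.fit_transform(features).toarray()[:5]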
# **This is new functionality in scikit-learn, so you are very welcome to try out the development version, experiment with it in your use cases, and provide feedback!** I am sure there are ways to further improve this functionality (see the [PR](https://github.com/scikit-learn/scikit-learn/pull/9012) for the discussion).
# 
# The rest of the post shows a more complete example of using the `ColumnTransformer` in a scikit-learn pipeline.
# 
# ---
# 
# ### Integrating in a full pipeline
# 
# Now let's show a full example where we integrate the `ColumnTransformer` in a prediction pipeline.

# In[8]:


from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer


# Redefining `target` and `features` to take the full dataset this time, including the missing values:

# In[9]:


target = titanic.survived.values
features = titanic[['pclass', 'sex', 'age', 'fare', 'embarked']].copy()
# I still fill the missing values for the embarked column,
# because we cannot (yet) easily handle categorical missing values
features['embarked'].fillna(features['embarked'].value_counts().index[0], inplace=True)


# Then, let's split the data into training and test sets:

# In[10]:


X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)


# Here we will again create a `ColumnTransformer`, but now using more advanced features: we use a mask to select the column subsets based on the dtypes, and we use another pipeline to combine imputation and scaling for the numerical features:

# In[11]:


numerical_features = features.dtypes == 'float'
categorical_features = ~numerical_features


# In[12]:


preprocess = make_column_transformer(
    (numerical_features, make_pipeline(SimpleImputer(), StandardScaler())),
    (categorical_features, OneHotEncoder()))


# Now we can combine this preprocessing step based on the `ColumnTransformer` with a classifier in a `Pipeline` to predict whether passengers of the Titanic survived or not:

# In[13]:


model = make_pipeline(
    preprocess,
    LogisticRegression())


# In[14]:


model.fit(X_train, y_train)
print("logistic regression score: %f" % model.score(X_test, y_test))


# ### Using our pipeline in a grid search
# 
# The `ColumnTransformer` integrates well with the rest of scikit-learn. For example, we can now also do a grid search on the parameters of the different preprocessing steps.

# In[15]:


from sklearn.model_selection import GridSearchCV


# Defining a simple search grid, where I search both for the imputation strategy of the numerical preprocessing step and for the regularization parameter of the logistic regression step:

# In[16]:


param_grid = {
    # parameter names are built from the auto-generated step names, joined with '__'
    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
    'logisticregression__C': [0.1, 1.0, 10.0],
}


# Performing the grid search:

# In[17]:


grid_clf = GridSearchCV(model, param_grid, cv=10, iid=False)
grid_clf.fit(X_train, y_train);


# In[18]:


grid_clf.best_params_


# In[19]:


print("best logistic regression from grid search: %f"
      % grid_clf.best_estimator_.score(X_test, y_test))


# ---
# 
# *This post was written in a Jupyter notebook. You can [download](http://jorisvandenbossche.github.io/downloads/notebooks/scikit-learn-columntransformer.ipynb) this notebook, or see a static view [on nbviewer](http://nbviewer.jupyter.org/url/jorisvandenbossche.github.io/downloads/notebooks/scikit-learn-columntransformer.ipynb).*