Note
Click here to download the full example code
Column Transformer with Mixed Types¶
This example illustrates how to apply different preprocessing and
feature extraction pipelines to different subsets of features,
using sklearn.compose.ColumnTransformer
.
This is particularly handy for the case of datasets that contain
heterogeneous data types, since we may want to scale the
numeric features and one-hot encode the categorical ones.
In this example, the numeric data is standard-scaled after
mean-imputation, while the categorical data is one-hot
encoded after imputing missing values with a new category
('missing'
).
Finally, the preprocessing pipeline is integrated in a
full prediction pipeline using sklearn.pipeline.Pipeline
,
together with a simple classification model.
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(0)
# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression(solver='lbfgs'))])
X = data.drop('survived', axis=1)
y = data['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
Traceback (most recent call last):
File "/build/scikit-learn-qZGLk4/scikit-learn-0.20.2+dfsg/examples/compose/plot_column_transformer_mixed_types.py", line 44, in <module>
data = pd.read_csv(titanic_url)
File "/usr/lib/python3/dist-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/lib/python3/dist-packages/pandas/io/parsers.py", line 424, in _read
filepath_or_buffer, encoding, compression)
File "/usr/lib/python3/dist-packages/pandas/io/common.py", line 195, in get_filepath_or_buffer
req = _urlopen(filepath_or_buffer)
File "/usr/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/lib/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/usr/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/usr/lib/python3.7/urllib/request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib/python3.7/urllib/request.py", line 1319, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 111] Connection refused>
Using the prediction pipeline in a grid search¶
Grid search can also be performed on the different preprocessing steps defined in theColumnTransformer
object, together with the classifier’s hyperparameters as part of thePipeline
. We will search for both the imputer strategy of the numeric preprocessing and the regularization parameter of the logistic regression usingsklearn.model_selection.GridSearchCV
.
param_grid = {
'preprocessor__num__imputer__strategy': ['mean', 'median'],
'classifier__C': [0.1, 1.0, 10, 100],
}
grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
grid_search.fit(X_train, y_train)
print(("best logistic regression from grid search: %.3f"
% grid_search.score(X_test, y_test)))
Total running time of the script: ( 0 minutes 0.000 seconds)