sklearn Pipelines

Custom pipelines

Let’s start with the Titanic data as an example.

import pandas as pd
from typing import List

# Load the Titanic training set from a path relative to the working directory.
df = pd.read_csv("data/titanic/train.csv")
# Preview the first rows of the loaded frame.
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Select columns

A TransformerMixin that selects a subset of a DataFrame's columns.

from sklearn.base import TransformerMixin

class SelectColumns(TransformerMixin):
    """Transformer that keeps only a fixed subset of DataFrame columns.

    Parameters
    ----------
    cols:
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, cols: List[str]) -> None:
        self.cols = cols

    def fit(self, x: pd.DataFrame, y=None, **fit_params) -> "SelectColumns":
        """Return ``self`` unchanged; column selection learns nothing.

        ``y`` and ``**fit_params`` are accepted and ignored so that the step
        also works when ``fit``/``fit_transform`` is called with a target,
        as happens inside a supervised pipeline.
        """
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return the columns of ``x`` listed in ``self.cols``."""
        return x[self.cols]
# Keep only the passenger name and the survival label.
transformer = SelectColumns(cols=['Name', 'Survived'])
# No fit call is needed: transform only reads the `cols` given to the constructor.
transformer.transform(df)
Name Survived
0 Braund, Mr. Owen Harris 0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1
2 Heikkinen, Miss. Laina 1
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1
4 Allen, Mr. William Henry 0
... ... ...
886 Montvila, Rev. Juozas 0
887 Graham, Miss. Margaret Edith 1
888 Johnston, Miss. Catherine Helen "Carrie" 0
889 Behr, Mr. Karl Howell 1
890 Dooley, Mr. Patrick 0

891 rows × 2 columns

Value manipulation

Extracting name

Create a custom pipeline step that reduces the Name column to the part before the first comma (the surname).

from sklearn.pipeline import Pipeline

class DataframeFunctionTransformer(TransformerMixin):
    """Pipeline step that delegates ``transform`` to a user-supplied function."""

    def __init__(self, func):
        # Callable invoked with the incoming frame on every ``transform`` call.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and hand back this instance.

        ``X``, ``y`` and ``**fit_params`` are accepted but not used.
        """
        return self

    def transform(self, input_df, **transform_params):
        """Call the wrapped function on ``input_df`` and return its result.

        ``**transform_params`` are accepted but not forwarded to the function.
        """
        transformed = self.func(input_df)
        return transformed
def process_name(input_df):
    """Return a copy of ``input_df`` whose ``Name`` holds only the surname.

    The surname is the text before the first comma, e.g.
    ``"Braund, Mr. Owen Harris"`` becomes ``"Braund"``. A name without a
    comma is kept as is, and a missing name stays missing.

    The input frame is left untouched: a transformation step should not
    rewrite the caller's data as a side effect.
    """
    # The vectorised ``.str`` accessor propagates missing values instead of
    # raising on them, unlike a per-element ``name.split`` call.
    surnames = input_df["Name"].str.split(",", n=1).str[0]
    # ``assign`` builds a new frame, so ``input_df`` itself is not modified.
    return input_df.assign(Name=surnames)
# Single-step pipeline: the "name" step applies process_name to the frame.
pipeline = Pipeline([
    ("name", DataframeFunctionTransformer(process_name))
])
# Run the pipeline and show only the Name and Survived columns of the result.
pipeline.fit_transform(df)[["Name", "Survived"]]
Name Survived
0 Braund 0
1 Cumings 1
2 Heikkinen 1
3 Futrelle 1
4 Allen 0
... ... ...
886 Montvila 0
887 Graham 1
888 Johnston 0
889 Behr 1
890 Dooley 0

891 rows × 2 columns

Encoding labels

import numpy as np

class LabelEncoder(TransformerMixin):
    """Encode every column of a DataFrame as integer codes.

    During ``fit`` each column's distinct non-missing values are numbered
    0, 1, 2, ... in order of first appearance. During ``transform`` any value
    without a learned code (unseen or missing) becomes ``unknown_code``.
    """

    # Code emitted for values that have no learned label.
    unknown_code = -2

    def fit(self, x: pd.DataFrame, y=None, **fit_params) -> "LabelEncoder":
        """Learn one ``{value: code}`` mapping per column of ``x``.

        ``y`` and ``**fit_params`` are accepted and ignored so that the step
        also works when ``fit``/``fit_transform`` is called with a target.
        The mappings are stored in ``self.encoders_``, keyed by column label.
        """
        # Missing values are dropped before numbering so that every remaining
        # value is paired with its own code; they get no entry and therefore
        # fall back to ``unknown_code`` in ``transform``.
        self.encoders_ = {
            c: {value: code for code, value in enumerate(x[c].dropna().unique())}
            for c in x
        }
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return an integer-coded copy of ``x``; ``x`` itself is not modified.

        Raises ``KeyError`` if ``x`` has a column that was not present
        during ``fit``.
        """
        encoded = x.copy()
        for c in x:
            # ``Series.map`` with a dict yields NaN for values that are not
            # keys, so unseen values need no separate masking step. Assigning
            # the whole column replaces it, avoiding dtype clashes between the
            # original values and the integer codes.
            encoded[c] = x[c].map(self.encoders_[c])

        # Replace the NaN placeholders so the result can be cast to int.
        return encoded.fillna(self.unknown_code).astype(int)
# Fit the encoder on two columns and encode them in a single call.
le = LabelEncoder()
le.fit_transform(df[['Pclass', 'Sex']])
Pclass Sex
0 0 0
1 1 1
2 0 1
3 1 1
4 0 0
... ... ...
886 2 0
887 1 1
888 0 1
889 1 0
890 0 0

891 rows × 2 columns