import sklearn
import pandas as pd
import numpy as np
import scipy
# Show the versions of the libraries this walkthrough was run against.
tuple(lib.__version__ for lib in (sklearn, pd, np, scipy))
('0.23.1', '1.0.5', '1.19.0', '1.5.1')
from sklearn.pipeline import Pipeline
class DataframeFunctionTransformer():
    """Pipeline step that applies a user-supplied function to its input.

    The wrapped callable is stored on ``self.func`` and is invoked with the
    input object as its only argument every time ``transform`` runs.
    """

    def __init__(self, func):
        # Callable taking the input object and returning the transformed one.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the arguments are ignored."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``self.func(input_df)``; extra keyword arguments are ignored."""
        result = self.func(input_df)
        return result
def process_dataframe(input_df):
    """Return a copy of ``input_df`` with the ``"text"`` column upper-cased.

    The caller's frame is left untouched: the work is done on a copy so that
    running the transformation does not silently rewrite the original data.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``"text"`` column whose values expose ``.upper()``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; ``"text"`` holds the upper-cased values.
    """
    output_df = input_df.copy()
    output_df["text"] = output_df["text"].map(lambda t: t.upper())
    return output_df
# Small demo frame: mixed-case strings to exercise the text transformation.
df = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4],
        text=["foo", "Bar", "BAz", "quux"],
    )
)
df
id | text | |
---|---|---|
0 | 1 | foo |
1 | 2 | Bar |
2 | 3 | BAz |
3 | 4 | quux |
# Single-step pipeline wrapping the plain function in a transformer.
# The step is named "uppercase" because process_dataframe upper-cases the
# "text" column (the previous name, "lowercase", described the opposite).
pipeline = Pipeline([
    ("uppercase", DataframeFunctionTransformer(process_dataframe)),
])
pipeline.fit_transform(df)
id | text | |
---|---|---|
0 | 1 | FOO |
1 | 2 | BAR |
2 | 3 | BAZ |
3 | 4 | QUUX |
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
# Toy design matrix (8 samples x 6 features), stored in CSR sparse format.
data = scipy.sparse.csr_matrix(
    [
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    ]
)
# One class label per row of `data`.
target = np.array([1, 1, 1, 0, 0, 0, 1, 1])
class ToDenseTransformer():
    """Pipeline step that turns a sparse matrix into a dense NumPy array.

    Useful in front of estimators that cannot consume sparse input.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Parameters
        ----------
        X : scipy sparse matrix or array-like
            Data to densify. Input that is already dense is passed through
            ``numpy.asarray`` instead of failing.
        y, **fit_params
            Ignored; accepted for compatibility with existing callers.
        """
        if scipy.sparse.issparse(X):
            # toarray() gives a plain ndarray; todense() would return the
            # legacy numpy.matrix type, which downstream code may not accept.
            return X.toarray()
        return np.asarray(X)
# The sparse input is converted to a dense matrix first, because PCA in this
# setup does not work on sparse data.
pipeline = Pipeline(
    steps=[
        ("to_dense", ToDenseTransformer()),
        ("pca", PCA()),
        ("clf", DecisionTreeClassifier()),
    ]
)
pipeline.fit(data, target)
pipeline.predict(data)
array([1, 1, 1, 0, 0, 1, 1, 1])
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Demo frame with one missing age, to be filled in by the imputer.
df = pd.DataFrame(
    dict(
        name=["alice", "bob", "charlie", "david", "edward"],
        age=[24, 32, np.nan, 38, 20],
    )
)
df.head()
name | age | |
---|---|---|
0 | alice | 24.0 |
1 | bob | 32.0 |
2 | charlie | NaN |
3 | david | 38.0 |
4 | edward | 20.0 |
# Mean-impute the 'age' column; all remaining columns are passed through as-is.
transformer_step = ColumnTransformer(
    transformers=[("impute_mean", SimpleImputer(strategy="mean"), ["age"])],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("transformer", transformer_step)])
pipe.fit(df)
# The transformed output is labelled ['age', 'name'] (imputed column first),
# then reordered back to the original column order for display.
pd.DataFrame(pipe.transform(df), columns=["age", "name"])[["name", "age"]]
name | age | |
---|---|---|
0 | alice | 24 |
1 | bob | 32 |
2 | charlie | 28.5 |
3 | david | 38 |
4 | edward | 20 |
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
# Demo frame: one categorical feature, one numeric feature with a gap, and a label.
df = pd.DataFrame(
    dict(
        favorite_color=["blue", "green", "red", "green", "blue"],
        age=[10, 15, 10, np.nan, 10],
        target=[1, 0, 1, 0, 1],
    )
)
df
favorite_color | age | target | |
---|---|---|---|
0 | blue | 10.0 | 1 |
1 | green | 15.0 | 0 |
2 | red | 10.0 | 1 |
3 | green | NaN | 0 |
4 | blue | 10.0 | 1 |
# One preprocessing sub-pipeline per kind of column.
categorical_preprocessing = Pipeline(steps=[("ohe", OneHotEncoder())])
numerical_preprocessing = Pipeline(steps=[("imputation", SimpleImputer())])

# Route each sub-pipeline to the columns it is responsible for.
preprocess = ColumnTransformer(
    transformers=[
        ("categorical_preprocessing", categorical_preprocessing, ["favorite_color"]),
        ("numerical_preprocessing", numerical_preprocessing, ["age"]),
    ]
)

# Full model: preprocessing followed by the classifier.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", DecisionTreeClassifier()),
    ]
)

# Fit on the feature columns of the dataframe, with 'target' as the label.
df_features = df[["favorite_color", "age"]]
df_target = df["target"]
pipeline.fit(df_features, df_target)
Pipeline(steps=[('preprocess', ColumnTransformer(transformers=[('categorical_preprocessing', Pipeline(steps=[('ohe', OneHotEncoder())]), ['favorite_color']), ('numerical_preprocessing', Pipeline(steps=[('imputation', SimpleImputer())]), ['age'])])), ('clf', DecisionTreeClassifier())])
import pandas as pd
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
class SelectColumnsTransfomer():
    """Pipeline step that keeps only the requested columns of a DataFrame.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to keep. When left as ``None`` every column is kept.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``.

        A copy is returned so later steps cannot modify the caller's frame.
        """
        if self.columns is None:
            # With the default of None, X[None] would fail on a missing
            # label; treat it as "no selection" and keep every column.
            return X.copy()
        return X[self.columns].copy()
# Demo frame used to show column selection.
df = pd.DataFrame(
    data={
        "name": ["alice", "bob", "charlie", "david", "edward"],
        "age": [24, 32, np.nan, 38, 20],
    }
)
df.head()
name | age | |
---|---|---|
0 | alice | 24.0 |
1 | bob | 32.0 |
2 | charlie | NaN |
3 | david | 38.0 |
4 | edward | 20.0 |
# Single-step pipeline that keeps only the 'name' column.
pipe = Pipeline(steps=[("selector", SelectColumnsTransfomer(["name"]))])
pipe.fit_transform(df)
name | |
---|---|
0 | alice |
1 | bob |
2 | charlie |
3 | david |
4 | edward |
import pandas as pd
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
def stem_str(input_series, stemmer):
    """Stem every whitespace-separated token of each string in a Series.

    Parameters
    ----------
    input_series : pandas.Series
        Series of strings.
    stemmer : object
        Any object exposing ``stem(token) -> str``.

    Returns
    -------
    pandas.Series
        Series of the same length in which each string is the stemmed
        tokens joined by single spaces.
    """
    def stem(input_str):
        # split() without an argument never yields empty tokens, unlike
        # split(" "), which produces "" for repeated, leading or trailing
        # spaces and for empty strings. Stemmers that index into the token
        # (for example to inspect its last character) fail on "".
        return " ".join(stemmer.stem(token) for token in input_str.split())

    return input_series.apply(stem)
# Text classification pipeline: stem the raw strings, vectorise with TF-IDF,
# then classify with logistic regression.
stemming_step = FunctionTransformer(func=stem_str, kw_args={"stemmer": RSLPStemmer()})
pipeline = Pipeline(
    steps=[
        ("stemmer", stemming_step),
        ("vect", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
)
# Four short sample sentences with alternating binary labels.
df = pd.DataFrame(
    dict(
        text=[
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            "Sed accumsan congue enim non pretium.",
            "In hac habitasse platea dictumst.",
            "Sed tincidunt ipsum nec urna vulputate luctus.",
        ],
        target=[0, 1, 0, 1],
    )
)
df
text | target | |
---|---|---|
0 | Lorem ipsum dolor sit amet, consectetur adipis... | 0 |
1 | Sed accumsan congue enim non pretium. | 1 |
2 | In hac habitasse platea dictumst. | 0 |
3 | Sed tincidunt ipsum nec urna vulputate luctus. | 1 |
# Train on the raw text column; the pipeline's own steps do the preprocessing.
pipeline.fit(df["text"], df["target"])
Pipeline(steps=[('stemmer', FunctionTransformer(func=<function stem_str at 0x7f2a259ffe18>, kw_args={'stemmer': <nltk.stem.rslp.RSLPStemmer object at 0x7f2a25429630>})), ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])
# Predict labels for the same texts the pipeline was fitted on.
pipeline.predict(df["text"])
array([0, 1, 0, 1])