import sklearn
import pandas as pd
import numpy as np
import scipy
# Tuple of the installed library versions. As a bare expression it is only
# shown by an interactive session or notebook; in a plain script it is a no-op.
sklearn.__version__, pd.__version__, np.__version__, scipy.__version__
from sklearn.pipeline import Pipeline
class DataframeFunctionTransformer:
    """Pipeline step that delegates the transformation to a plain callable."""

    def __init__(self, func):
        """Remember *func*, the callable applied by :meth:`transform`."""
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return the transformer itself (there is no state to learn)."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``self.func(input_df)``; extra keyword arguments are ignored."""
        apply_func = self.func
        return apply_func(input_df)
def process_dataframe(input_df):
    """Return a copy of *input_df* whose "text" column is upper-cased.

    The caller's frame is left untouched: a transformer that edits its input
    in place silently changes the data seen by every later use of that frame.

    Args:
        input_df: DataFrame containing a string column named "text".

    Returns:
        A new DataFrame with the same columns, "text" converted to upper case
        (missing values in "text" are passed through unchanged).
    """
    # assign() builds a new frame instead of writing into input_df.
    return input_df.assign(text=input_df["text"].str.upper())
# Sample frame with mixed-case strings in "text".
df = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "text": ["foo", "Bar", "BAz", "quux"],
})
df
# The step is named after what process_dataframe does to "text" (upper-casing);
# the previous "lowercase" label contradicted the function it wrapped.
pipeline = Pipeline([
    ("uppercase", DataframeFunctionTransformer(process_dataframe)),
])
pipeline.fit_transform(df)
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
# Toy feature matrix: 8 samples x 6 binary features, stored in CSR format.
data = scipy.sparse.csr_matrix(
    np.array(
        [
            [1, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0],
            [0, 0, 0, 1, 0, 0],
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
        ],
        dtype=float,
    )
)
# One class label per row of `data`.
target = np.asarray([1, 1, 1, 0, 0, 0, 1, 1])
class ToDenseTransformer:
    """Pipeline step that turns a sparse matrix into a dense NumPy array."""

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return the transformer itself (there is no state to learn)."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return *X* as a dense ``numpy.ndarray``.

        Args:
            X: A SciPy sparse matrix, or data that is already dense.
            y: Ignored; accepted for pipeline compatibility.
            **fit_params: Ignored.

        Returns:
            ``X.toarray()`` for objects exposing ``toarray``, otherwise
            ``np.asarray(X)``.
        """
        # toarray() gives a plain ndarray. todense() would give np.matrix,
        # which scikit-learn's input validation no longer accepts.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already-dense input is passed through as an ndarray.
        return np.asarray(X)
# PCA needs dense input here, so the first step converts the sparse matrix.
pipeline = Pipeline(
    steps=[
        ("to_dense", ToDenseTransformer()),
        ("pca", PCA()),
        ("clf", DecisionTreeClassifier()),
    ]
)
# Train on the toy data, then predict on the same rows.
pipeline.fit(data, target)
pipeline.predict(data)
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Small frame with one missing age to demonstrate imputation.
df = pd.DataFrame(
    {
        "name": ["alice", "bob", "charlie", "david", "edward"],
        "age": [24, 32, np.nan, 38, 20],
    }
)
df.head()
# Mean-impute "age"; all remaining columns are passed through unchanged.
transformer_step = ColumnTransformer(
    transformers=[("impute_mean", SimpleImputer(strategy="mean"), ["age"])],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("transformer", transformer_step)])
pipe.fit(df)
# The transformed output is labelled age-then-name (transformed columns come
# before passthrough ones), then reordered back to the original column order.
pd.DataFrame(pipe.transform(df), columns=["age", "name"])[["name", "age"]]
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
# Toy data: one categorical feature, one numeric feature with a gap, one label.
df = pd.DataFrame(
    {
        "favorite_color": ["blue", "green", "red", "green", "blue"],
        "age": [10, 15, 10, np.nan, 10],
        "target": [1, 0, 1, 0, 1],
    }
)
df
# Each kind of column gets its own small preprocessing pipeline.
categorical_preprocessing = Pipeline(steps=[("ohe", OneHotEncoder())])
numerical_preprocessing = Pipeline(steps=[("imputation", SimpleImputer())])
# Route each column to the preprocessing pipeline that handles it.
preprocess = ColumnTransformer(
    transformers=[
        ("categorical_preprocessing", categorical_preprocessing, ["favorite_color"]),
        ("numerical_preprocessing", numerical_preprocessing, ["age"]),
    ]
)
# Full model: preprocessing followed by the classifier as the last step.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", DecisionTreeClassifier()),
    ]
)
# Split features from the label and fit the whole pipeline on the dataframe.
df_features = df.loc[:, ["favorite_color", "age"]]
df_target = df["target"]
pipeline.fit(df_features, df_target)
import pandas as pd
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
class SelectColumnsTransfomer:
    """Pipeline step that keeps only a chosen subset of DataFrame columns."""

    def __init__(self, columns=None):
        """Store the column labels to keep; ``None`` means keep every column."""
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return the transformer itself (there is no state to learn)."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of *X* restricted to ``self.columns``.

        Args:
            X: DataFrame to select from.
            **transform_params: Ignored.

        Returns:
            A new DataFrame holding the selected columns, or a copy of all
            columns when ``columns`` is ``None``.
        """
        # With the default columns=None, X[None] would raise KeyError, so the
        # default is treated as "no restriction".
        if self.columns is None:
            return X.copy()
        # Copy so later edits to the result never touch the caller's frame.
        return X[self.columns].copy()
# Two-column sample frame; only "name" is kept by the selector below.
df = pd.DataFrame(
    {
        "name": ["alice", "bob", "charlie", "david", "edward"],
        "age": [24, 32, np.nan, 38, 20],
    }
)
df.head()
pipe = Pipeline(
    steps=[("selector", SelectColumnsTransfomer(columns=["name"]))]
)
pipe.fit_transform(df)
import pandas as pd
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
def stem_str(input_series, stemmer):
    """Stem every whitespace-separated token of each string in a Series.

    Args:
        input_series: pandas Series of strings.
        stemmer: Object exposing ``stem(token)`` that returns the stemmed
            token as a string.

    Returns:
        The result of ``input_series.apply``: each string replaced by its
        stemmed tokens joined with single spaces.
    """
    def stem(input_str):
        # split() without an argument discards the empty tokens that
        # split(" ") produces for repeated, leading or trailing spaces, so
        # the stemmer is never called with an empty string.
        return " ".join(stemmer.stem(token) for token in input_str.split())

    return input_series.apply(stem)
# Text classification pipeline: stem each document, build TF-IDF features,
# then fit a logistic regression on them.
pipeline = Pipeline(
    steps=[
        (
            "stemmer",
            FunctionTransformer(func=stem_str, kw_args={"stemmer": RSLPStemmer()}),
        ),
        ("vect", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
)
# Four short documents with a binary label each.
df = pd.DataFrame(
    {
        "text": [
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            "Sed accumsan congue enim non pretium.",
            "In hac habitasse platea dictumst.",
            "Sed tincidunt ipsum nec urna vulputate luctus.",
        ],
        "target": [0, 1, 0, 1],
    }
)
df
# Train on the raw text column, then predict on the same documents.
pipeline.fit(df["text"], df["target"])
pipeline.predict(df["text"])