Created by Data School. Watch all 10 videos on YouTube. Download the notebooks from GitHub.
Note: This notebook uses Python 3.9.1 and scikit-learn 0.23.2. The original notebook (shown in the video) used Python 3.7 and scikit-learn 0.20.2.
# Load the Titanic training set (Kaggle) from a shortened URL.
import pandas as pd
df = pd.read_csv('http://bit.ly/kaggletrain')
# 891 rows, 12 columns.
df.shape
(891, 12)
# List the 12 columns present in the raw training data.
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Count missing values per column: Age (177), Cabin (687), Embarked (2).
df.isna().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
# Drop the 2 rows with missing Embarked and keep only the target plus the
# three features used in this lesson (Age and Cabin, with many NaNs, are excluded).
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'Embarked']]
# 889 rows remain.
df.shape
(889, 4)
# Confirm no missing values remain in the reduced frame.
df.isna().sum()
Survived 0 Pclass 0 Sex 0 Embarked 0 dtype: int64
# Preview the reduced frame.
df.head()
Survived | Pclass | Sex | Embarked | |
---|---|---|---|---|
0 | 0 | 3 | male | S |
1 | 1 | 1 | female | C |
2 | 1 | 3 | female | S |
3 | 1 | 1 | female | S |
4 | 0 | 3 | male | S |
X = df.loc[:, ['Pclass']]
y = df.Survived
X.shape
(889, 1)
y.shape
(889,)
# Baseline model: logistic regression on Pclass alone.
from sklearn.linear_model import LogisticRegression
# Pass solver='lbfgs' explicitly so this cell matches the consolidated
# script at the end of the notebook; lbfgs is already the default in the
# pinned scikit-learn 0.23.2, so results are unchanged.
logreg = LogisticRegression(solver='lbfgs')
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy; the mean is the baseline score (~0.678).
cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean()
0.6783406335301212
# Null accuracy: always predicting the majority class (died) would score
# ~0.618, so the 0.678 baseline only barely beats it.
y.value_counts(normalize=True)
0 0.617548 1 0.382452 Name: Survived, dtype: float64
# Look at the data again before encoding the categorical columns.
df.head()
Survived | Pclass | Sex | Embarked | |
---|---|---|---|---|
0 | 0 | 3 | male | S |
1 | 1 | 1 | female | C |
2 | 1 | 3 | female | S |
3 | 1 | 1 | female | S |
4 | 0 | 3 | male | S |
# dummy encoding of categorical features
from sklearn.preprocessing import OneHotEncoder
# sparse=False returns a dense NumPy array instead of a sparse matrix.
# NOTE(review): 'sparse' was renamed 'sparse_output' in scikit-learn 1.2 and
# removed in 1.4 — fine for the pinned 0.23.2, but this line will break on
# newer versions; confirm the target sklearn version before upgrading.
ohe = OneHotEncoder(sparse=False)
# Fit-transform 'Sex' into two indicator columns.
ohe.fit_transform(df[['Sex']])
array([[0., 1.], [1., 0.], [1., 0.], ..., [1., 0.], [0., 1.], [0., 1.]])
# Learned category order for Sex: column 0 = female, column 1 = male.
ohe.categories_
[array(['female', 'male'], dtype=object)]
# Re-fit the same encoder on Embarked: three indicator columns this time.
ohe.fit_transform(df[['Embarked']])
array([[0., 0., 1.], [1., 0., 0.], [0., 0., 1.], ..., [0., 0., 1.], [1., 0., 0.], [0., 1., 0.]])
# Category order for Embarked: C, Q, S.
ohe.categories_
[array(['C', 'Q', 'S'], dtype=object)]
# Use every column except the target as the feature matrix.
X = df.drop(columns='Survived')
# Preview the features: Pclass plus the two categorical columns.
X.head()
Pclass | Sex | Embarked | |
---|---|---|---|
0 | 3 | male | S |
1 | 1 | female | C |
2 | 3 | female | S |
3 | 1 | female | S |
4 | 3 | male | S |
# use when different features need different preprocessing
from sklearn.compose import make_column_transformer
# One-hot encode the two categorical columns; remainder='passthrough' keeps
# Pclass (already numeric) untouched as the final column.
column_trans = make_column_transformer(
(OneHotEncoder(), ['Sex', 'Embarked']),
remainder='passthrough')
# Result: 2 Sex columns + 3 Embarked columns + Pclass = 6 columns.
column_trans.fit_transform(X)
array([[0., 1., 0., 0., 1., 3.], [1., 0., 1., 0., 0., 1.], [1., 0., 0., 0., 1., 3.], ..., [1., 0., 0., 0., 1., 3.], [0., 1., 1., 0., 0., 1.], [0., 1., 0., 1., 0., 3.]])
# chain sequential steps together
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(column_trans, logreg)
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
# Accuracy improves from ~0.678 (Pclass only) to ~0.773 with all features.
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
0.7727924839713071
# added empty cell so that the cell numbering matches the video
# Simulate "new" unseen passengers by sampling 5 rows from X
# (random_state fixes which rows are drawn).
X_new = X.sample(5, random_state=99)
X_new
Pclass | Sex | Embarked | |
---|---|---|---|
599 | 1 | male | C |
512 | 1 | male | S |
273 | 1 | male | C |
215 | 1 | female | C |
790 | 3 | male | Q |
# Fit the whole pipeline (encoder + model) on all of the training data.
pipe.fit(X, y)
Pipeline(steps=[('columntransformer', ColumnTransformer(remainder='passthrough', transformers=[('onehotencoder', OneHotEncoder(), ['Sex', 'Embarked'])])), ('logisticregression', LogisticRegression())])
# The pipeline applies the fitted encoding to X_new, then predicts survival.
pipe.predict(X_new)
array([1, 0, 1, 1, 0])
# Recap: the entire workflow from the notebook, consolidated in one place.
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# Load, drop the 2 rows missing Embarked, keep target plus three features.
df = pd.read_csv('http://bit.ly/kaggletrain')
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'Embarked']]
X = df.drop('Survived', axis='columns')
y = df.Survived
# One-hot encode the categorical columns; pass Pclass through unchanged.
column_trans = make_column_transformer(
(OneHotEncoder(), ['Sex', 'Embarked']),
remainder='passthrough')
logreg = LogisticRegression(solver='lbfgs')
# The pipeline ensures encoding happens inside each CV fold (no leakage).
pipe = make_pipeline(column_trans, logreg)
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
0.7727924839713071
# Refit on all data and predict for the same 5 sampled "new" passengers.
X_new = X.sample(5, random_state=99)
pipe.fit(X, y)
pipe.predict(X_new)
array([1, 0, 1, 1, 0])
© 2021 Data School. All rights reserved.