import pandas as pd
import sklearn.preprocessing as pre
import numpy as np
df = pd.read_csv("https://raw.githubusercontent.com/ianmcloughlin/datasets/master/iris.csv")
df
sepal_length | sepal_width | petal_length | petal_width | class | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
x = df.iloc[:, 0:4]
x
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 |
146 | 6.3 | 2.5 | 5.0 | 1.9 |
147 | 6.5 | 3.0 | 5.2 | 2.0 |
148 | 6.2 | 3.4 | 5.4 | 2.3 |
149 | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 4 columns
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
xscale = pd.DataFrame(pre.scale(x), columns=x.columns)
xscale
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | -0.900681 | 1.019004 | -1.340227 | -1.315444 |
1 | -1.143017 | -0.131979 | -1.340227 | -1.315444 |
2 | -1.385353 | 0.328414 | -1.397064 | -1.315444 |
3 | -1.506521 | 0.098217 | -1.283389 | -1.315444 |
4 | -1.021849 | 1.249201 | -1.340227 | -1.315444 |
... | ... | ... | ... | ... |
145 | 1.038005 | -0.131979 | 0.819596 | 1.448832 |
146 | 0.553333 | -1.282963 | 0.705921 | 0.922303 |
147 | 0.795669 | -0.131979 | 0.819596 | 1.053935 |
148 | 0.432165 | 0.788808 | 0.933271 | 1.448832 |
149 | 0.068662 | -0.131979 | 0.762758 | 0.790671 |
150 rows × 4 columns
scaler = pre.StandardScaler()
scaler.fit(x)
scaler.mean_, scaler.scale_
(array([5.84333333, 3.05733333, 3.758 , 1.19933333]), array([0.82530129, 0.43441097, 1.75940407, 0.75969263]))
xscale = pd.DataFrame(scaler.transform(x), columns=x.columns)
xscale
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | -0.900681 | 1.019004 | -1.340227 | -1.315444 |
1 | -1.143017 | -0.131979 | -1.340227 | -1.315444 |
2 | -1.385353 | 0.328414 | -1.397064 | -1.315444 |
3 | -1.506521 | 0.098217 | -1.283389 | -1.315444 |
4 | -1.021849 | 1.249201 | -1.340227 | -1.315444 |
... | ... | ... | ... | ... |
145 | 1.038005 | -0.131979 | 0.819596 | 1.448832 |
146 | 0.553333 | -1.282963 | 0.705921 | 0.922303 |
147 | 0.795669 | -0.131979 | 0.819596 | 1.053935 |
148 | 0.432165 | 0.788808 | 0.933271 | 1.448832 |
149 | 0.068662 | -0.131979 | 0.762758 | 0.790671 |
150 rows × 4 columns
scaler.transform(np.array([[1.0, 0.5, 10.0, 4.1]]))
array([[-5.86856386, -5.88689864, 3.54779219, 3.81821089]])
y = df[['class']]
y
class | |
---|---|
0 | setosa |
1 | setosa |
2 | setosa |
3 | setosa |
4 | setosa |
... | ... |
145 | virginica |
146 | virginica |
147 | virginica |
148 | virginica |
149 | virginica |
150 rows × 1 columns
encoder = pre.OneHotEncoder()
encoder.fit(y)
yencoded = encoder.transform(y)
yencoded.toarray()
array([[1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 1., 0.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.], [0., 0., 1.]])
encoder.inverse_transform(yencoded)
array([['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['setosa'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['versicolor'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica'], ['virginica']], dtype=object)
x.corr()
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
sepal_length | 1.000000 | -0.117570 | 0.871754 | 0.817941 |
sepal_width | -0.117570 | 1.000000 | -0.428440 | -0.366126 |
petal_length | 0.871754 | -0.428440 | 1.000000 | 0.962865 |
petal_width | 0.817941 | -0.366126 | 0.962865 | 1.000000 |
import sklearn.decomposition as dec
pca = dec.PCA(n_components=4, whiten=True)
pca.fit(x)
xwhite = pd.DataFrame(pca.transform(x), columns=x.columns)
xwhite
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | -1.305338 | 0.648369 | -0.099817 | -0.014654 |
1 | -1.319935 | -0.359309 | -0.752573 | -0.641421 |
2 | -1.404967 | -0.294244 | 0.064007 | -0.129341 |
3 | -1.335109 | -0.646140 | 0.112849 | 0.489524 |
4 | -1.327023 | 0.663304 | 0.322103 | 0.396788 |
... | ... | ... | ... | ... |
145 | 0.945455 | 0.380686 | 0.635863 | -2.760583 |
146 | 0.742688 | -0.761885 | -0.435880 | -1.647605 |
147 | 0.858033 | 0.160082 | 0.466573 | -0.887393 |
148 | 0.924462 | 0.236752 | 2.586185 | -0.288856 |
149 | 0.676073 | -0.573795 | 1.297683 | 1.004226 |
150 rows × 4 columns
xwhite.corr().round()
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
sepal_length | 1.0 | 0.0 | -0.0 | 0.0 |
sepal_width | 0.0 | 1.0 | -0.0 | 0.0 |
petal_length | -0.0 | -0.0 | 1.0 | -0.0 |
petal_width | 0.0 | 0.0 | -0.0 | 1.0 |
xwhite.mean().round()
sepal_length -0.0 sepal_width -0.0 petal_length -0.0 petal_width -0.0 dtype: float64
xwhite.std().round()
sepal_length 1.0 sepal_width 1.0 petal_length 1.0 petal_width 1.0 dtype: float64