In [1]:
import pandas as pd
import sklearn.preprocessing as pre
import numpy as np
In [2]:
# Load the Iris dataset straight from its public GitHub mirror.
# The frame has four numeric measurement columns followed by a `class` label.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/ianmcloughlin/datasets/master/iris.csv",
)
df
Out[2]:
sepal_length sepal_width petal_length petal_width class
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

In [3]:
# Feature matrix: the first four columns (the numeric measurements).
# The trailing `class` label column is left out.
x = df.iloc[:, :4]
x
Out[3]:
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
... ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns

One-off scaling

In [4]:
# Reference: https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
# `pre.scale` standardises every column in a single call; its result is wrapped
# back into a DataFrame so the original column labels are kept for display.
xscale = pd.DataFrame(
    data=pre.scale(X=x),
    columns=x.columns,
)
xscale
Out[4]:
sepal_length sepal_width petal_length petal_width
0 -0.900681 1.019004 -1.340227 -1.315444
1 -1.143017 -0.131979 -1.340227 -1.315444
2 -1.385353 0.328414 -1.397064 -1.315444
3 -1.506521 0.098217 -1.283389 -1.315444
4 -1.021849 1.249201 -1.340227 -1.315444
... ... ... ... ...
145 1.038005 -0.131979 0.819596 1.448832
146 0.553333 -1.282963 0.705921 0.922303
147 0.795669 -0.131979 0.819596 1.053935
148 0.432165 0.788808 0.933271 1.448832
149 0.068662 -0.131979 0.762758 0.790671

150 rows × 4 columns

Fitting and transforming

In [5]:
# Learn the per-column scaling statistics once so that exactly the same
# transformation can be re-applied to other data later on.
scaler = pre.StandardScaler().fit(x)

# Show the fitted statistics: per-feature mean and per-feature scale.
(scaler.mean_, scaler.scale_)
Out[5]:
(array([5.84333333, 3.05733333, 3.758     , 1.19933333]),
 array([0.82530129, 0.43441097, 1.75940407, 0.75969263]))
In [6]:
# Apply the fitted scaler to the training features; the values match the
# one-off `pre.scale` result, but the scaler object remains reusable.
xscale = pd.DataFrame(
    data=scaler.transform(X=x),
    columns=x.columns,
)
xscale
Out[6]:
sepal_length sepal_width petal_length petal_width
0 -0.900681 1.019004 -1.340227 -1.315444
1 -1.143017 -0.131979 -1.340227 -1.315444
2 -1.385353 0.328414 -1.397064 -1.315444
3 -1.506521 0.098217 -1.283389 -1.315444
4 -1.021849 1.249201 -1.340227 -1.315444
... ... ... ... ...
145 1.038005 -0.131979 0.819596 1.448832
146 0.553333 -1.282963 0.705921 0.922303
147 0.795669 -0.131979 0.819596 1.053935
148 0.432165 0.788808 0.933271 1.448832
149 0.068662 -0.131979 0.762758 0.790671

150 rows × 4 columns

In [7]:
# Scale a single new, unseen sample using the statistics learned during `fit`.
# The scaler was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# carrying the same column names: passing a bare ndarray makes recent
# scikit-learn versions warn that X has no valid feature names. The numeric
# result is unchanged.
scaler.transform(
    pd.DataFrame([[1.0, 0.5, 10.0, 4.1]], columns=x.columns)
)
Out[7]:
array([[-5.86856386, -5.88689864,  3.54779219,  3.81821089]])

Output values

In [8]:
# Target labels, kept as a one-column DataFrame (2-D) rather than a Series,
# which is the shape handed to the encoder below.
y = df.loc[:, ['class']]
y
Out[8]:
class
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
... ...
145 virginica
146 virginica
147 virginica
148 virginica
149 virginica

150 rows × 1 columns

In [9]:
# One-hot encode the class labels: one indicator column per distinct class.
encoder = pre.OneHotEncoder()

# Fit on the labels and encode them in a single step; `encoder` stays fitted
# so it can decode the result again afterwards.
yencoded = encoder.fit_transform(y)

# Convert the encoded result to a dense array for display.
yencoded.toarray()
Out[9]:
array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])
In [10]:
# Round-trip check: decode the one-hot rows back into the original class labels.
encoder.inverse_transform(X=yencoded)
Out[10]:
array([['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['virginica']], dtype=object)

Whitening

In [11]:
# Pairwise Pearson correlations between the raw features, shown as the
# "before" picture for the whitening step that follows.
x.corr(method="pearson")
Out[11]:
sepal_length sepal_width petal_length petal_width
sepal_length 1.000000 -0.117570 0.871754 0.817941
sepal_width -0.117570 1.000000 -0.428440 -0.366126
petal_length 0.871754 -0.428440 1.000000 0.962865
petal_width 0.817941 -0.366126 0.962865 1.000000
In [12]:
import sklearn.decomposition as dec
In [13]:
# Whiten the features with a full-rank PCA (all four components are kept).
pca = dec.PCA(n_components=4, whiten=True).fit(x)

# NOTE(review): each column of the transformed data is a principal component
# (a linear combination of all four measurements), not the original feature
# whose name it is given here. The original labels are reused for display only.
xwhite = pd.DataFrame(
    data=pca.transform(X=x),
    columns=x.columns,
)
xwhite
Out[13]:
sepal_length sepal_width petal_length petal_width
0 -1.305338 0.648369 -0.099817 -0.014654
1 -1.319935 -0.359309 -0.752573 -0.641421
2 -1.404967 -0.294244 0.064007 -0.129341
3 -1.335109 -0.646140 0.112849 0.489524
4 -1.327023 0.663304 0.322103 0.396788
... ... ... ... ...
145 0.945455 0.380686 0.635863 -2.760583
146 0.742688 -0.761885 -0.435880 -1.647605
147 0.858033 0.160082 0.466573 -0.887393
148 0.924462 0.236752 2.586185 -0.288856
149 0.676073 -0.573795 1.297683 1.004226

150 rows × 4 columns

In [14]:
# After whitening the columns should be uncorrelated: rounded to whole
# numbers, the correlation matrix should look like the identity.
xwhite.corr(method="pearson").round(decimals=0)
Out[14]:
sepal_length sepal_width petal_length petal_width
sepal_length 1.0 0.0 -0.0 0.0
sepal_width 0.0 1.0 -0.0 0.0
petal_length -0.0 -0.0 1.0 -0.0
petal_width 0.0 0.0 -0.0 1.0
In [15]:
# Whitened columns should be centred: each column mean rounds to zero.
xwhite.mean(axis=0).round(decimals=0)
Out[15]:
sepal_length   -0.0
sepal_width    -0.0
petal_length   -0.0
petal_width    -0.0
dtype: float64
In [16]:
# Whitened columns should have unit spread: each sample standard deviation
# (ddof=1, the pandas default) rounds to one.
xwhite.std(axis=0, ddof=1).round(decimals=0)
Out[16]:
sepal_length    1.0
sepal_width     1.0
petal_length    1.0
petal_width     1.0
dtype: float64

End