import numpy as np
from sklearn.preprocessing import *
# Draw a reproducible 4x3 sample from U(0, 5) and standardise it with sklearn's scale().
rg = np.random.RandomState(2017)
X_train = rg.uniform(0, 5, (4, 3))
X_scaled = scale(X_train)
# `np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20 and
# removed in 1.24, so the builtin is used directly. The integer dtype keeps
# floating-point round-off out of the printed column means.
print('Mean: {}, \nStd: {}'.format(X_scaled.mean(axis=0, dtype=int), X_scaled.std(axis=0)))
Mean: [0 0 0], Std: [1. 1. 1.]
def f(array):
    """Standardise a 1-D array: subtract its mean and divide by its standard deviation.

    Args:
        array: 1-D NumPy array of numbers (one column when used with
            ``np.apply_along_axis(..., axis=0, ...)``).

    Returns:
        Array of the same shape with mean 0 and population standard deviation 1.
        A constant input has zero standard deviation, so the division yields NaN.
    """
    # ddof=0 is already NumPy's default (population standard deviation); it is
    # spelled out to make the choice of estimator explicit.
    result = (array - np.mean(array)) / np.std(array, ddof=0)
    return result
# Standardise every column of X_train by hand with f and confirm that the
# result agrees with what sklearn's scale() produced for X_scaled.
scale_result = np.apply_along_axis(f, 0, X_train)
assert np.allclose(X_scaled, scale_result)
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
# Load the iris data into a DataFrame and keep 5 randomly chosen rows.
# sample() is called without random_state, so reproducibility presumably relies on
# the global NumPy seed set just before it -- TODO confirm for the installed pandas.
dataset = load_iris()
np.random.seed(2017)
iris = pd.DataFrame(dataset.data, columns=dataset.feature_names).sample(5)
iris
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
---|---|---|---|---|
143 | 6.8 | 3.2 | 5.9 | 2.3 |
115 | 6.4 | 3.2 | 5.3 | 2.3 |
102 | 7.1 | 3.0 | 5.9 | 2.1 |
51 | 6.4 | 3.2 | 4.5 | 1.5 |
76 | 6.8 | 2.8 | 4.8 | 1.4 |
# Fit a StandardScaler on the sampled iris rows and transform them in one call.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris)
iris_scaled
array([[ 0.372678 , 0.75 , 1.0932857 , 0.96958969], [-1.11803399, 0.75 , 0.03526728, 0.96958969], [ 1.49071198, -0.5 , 1.0932857 , 0.45927933], [-1.11803399, 0.75 , -1.37542395, -1.07165176], [ 0.372678 , -1.75 , -0.84641474, -1.32680694]])
# Column means and standard deviations of the scaled data. `np.int` was removed in
# NumPy 1.24 (it was an alias of the builtin `int`), so the builtin is used instead.
iris_scaled.mean(axis=0, dtype=int), iris_scaled.std(axis=0)
(array([0, 0, 0, 0]), array([1., 1., 1., 1.]))
# Equivalent to calling scale() on each column separately
np.allclose(scaler.fit_transform(iris), scale(iris))
True
# MinMaxScaler transformation, where (min, max) = feature_range:
#   X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#   X_scaled = X_std * (max - min) + min
# Fit a MinMaxScaler (constructed with no arguments) on a small 4x2 data set.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
scaler.fit(data)
MinMaxScaler(copy=True, feature_range=(0, 1))
# Rescale the data with the scaler fitted above.
scaler.transform(data)
array([[0. , 0. ], [0.25, 0.25], [0.5 , 0.5 ], [1. , 1. ]])
# Or call fit_transform directly
scaler.fit_transform(data)
array([[0. , 0. ], [0.25, 0.25], [0.5 , 0.5 ], [1. , 1. ]])
# 3x3 training matrix for the MaxAbsScaler example: fit the scaler on it,
# keep the transformed copy, and show it.
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
max_abs_scaler = MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs
array([[ 0.5, -1. , 1. ], [ 1. , 0. , 0. ], [ 0. , 1. , -0.5]])
# Per-column scale learned by fit; the values shown below equal each column's
# maximum absolute value in X_train.
max_abs_scaler.scale_
array([2., 1., 2.])
# Apply the already-fitted scaler to new data. The scale comes from X_train, so
# transformed test values can fall outside [-1, 1] (see the output below).
X_test = np.array([[ -3., -1., 4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs
array([[-1.5, -1. , 2. ]])
# RobustScaler example on a reproducible 4x3 standard-normal sample.
# NOTE(review): the names `max_abs_scaler` / `X_train_maxabs` are reused from the
# MaxAbsScaler example above, but the object created here is a RobustScaler.
np.random.seed(2018)
X_train = np.random.randn(4,3)
max_abs_scaler = RobustScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs
array([[-0.12669367, 0.61018033, 1.22235048], [-2.02310438, -0.03205827, 0.34615926], [ 0.12669367, -3.19747007, -0.70069397], [ 1.2167336 , 0.03205827, -0.34615926]])
# Median of each column
max_abs_scaler.center_
array([-0.20977884, 0.50624895, 0.34544916])
# IQR (interquartile range) of each column
max_abs_scaler.scale_
array([0.52874591, 0.12390117, 1.47498622])
# Verify that max_abs_scaler.scale_ really holds the IQR values:
# IQR = 75th percentile - 25th percentile, computed per column.
IQR = np.percentile(X_train, 75, axis=0) - np.percentile(X_train, 25, axis=0)
np.allclose(max_abs_scaler.scale_ ,IQR)
True
# L1-normalise x with sklearn's normalize() and show the result as a DataFrame;
# in the output below the absolute values of every row sum to 1.
# The printed label is Chinese for "L1 regularization"; the operation shown is
# L1 normalization.
x = [[1,-1,2],[2, 0,0],[0, 1, -1]]
df = pd.DataFrame(x, columns=list('ABC'))
x_norm1 = normalize(x, norm='l1')
df_norm1 = pd.DataFrame(x_norm1)
print('L1正则化:')
df_norm1
L1正则化:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 0.25 | -0.25 | 0.5 |
1 | 1.00 | 0.00 | 0.0 |
2 | 0.00 | 0.50 | -0.5 |
# Reproduce normalize(x, norm='l1') by hand: divide every row of df by its L1 norm
# (the sum of the absolute values in that row). The division is vectorised, so the
# result is float from the start instead of writing float rows into an integer
# copy of df, and no positional/label index mixing is needed.
l1_norms = df.abs().sum(axis=1)
df_norm1 = df.div(l1_norms, axis=0)
df_norm1
| | A | B | C |
---|---|---|---|
0 | 0.25 | -0.25 | 0.5 |
1 | 1.00 | 0.00 | 0.0 |
2 | 0.00 | 0.50 | -0.5 |
# Rebuild the 3x3 example matrix as a DataFrame with columns A, B, C and show it.
x = [[1,-1,2],[2, 0,0],[0, 1, -1]]
df = pd.DataFrame(x, columns=list('ABC'))
df
| | A | B | C |
---|---|---|---|
0 | 1 | -1 | 2 |
1 | 2 | 0 | 0 |
2 | 0 | 1 | -1 |
# L2-normalise x with sklearn's normalize() and show the result as a DataFrame.
x_norm2 = normalize(x, norm='l2')
df_norm2 = pd.DataFrame(x_norm2)
df_norm2
| | 0 | 1 | 2 |
---|---|---|---|
0 | 0.408248 | -0.408248 | 0.816497 |
1 | 1.000000 | 0.000000 | 0.000000 |
2 | 0.000000 | 0.707107 | -0.707107 |
# Reproduce normalize(x, norm='l2') by hand: divide every row of df by its L2 norm
# (the square root of the sum of squares in that row). The division is vectorised,
# so the result is float from the start instead of writing float rows into an
# integer copy of df, and no positional/label index mixing is needed.
l2_norms = np.sqrt(np.square(df).sum(axis=1))
df_norm2 = df.div(l2_norms, axis=0)
df_norm2
| | A | B | C |
---|---|---|---|
0 | 0.408248 | -0.408248 | 0.816497 |
1 | 1.000000 | 0.000000 | 0.000000 |
2 | 0.000000 | 0.707107 | -0.707107 |