## 随机梯度下降和独热编码

• 随机梯度下降
• 在线学习
• 独热编码
• 哈希技巧

### 随机梯度下降

#### 实验例子

In [1]:
import warnings
import seaborn as sns
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.datasets import fetch_20newsgroups, load_files
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')


In [2]:
# Load the height/weight demo dataset from its CSV file.
data_demo = pd.read_csv('../../data/weights_heights.csv')
data_demo.head()

Out[2]:
Index Height Weight
0 1 65.78331 112.9925
1 2 71.51521 136.4873
2 3 69.39874 153.0269
3 4 68.21660 142.3354
4 5 67.78781 144.2971

In [3]:
# Weight goes on the x-axis and height on the y-axis.
plt.scatter(x=data_demo['Weight'], y=data_demo['Height'])
plt.xlabel('Weight in lb')
plt.ylabel('Height in inches')

Out[3]:
Text(0, 0.5, 'Height in inches')

$$y_i = w_0 + w_1 x_i$$

$$SE(w_0, w_1) = \frac{1}{2}\sum_{i=1}^{n}(y_i - (w_0 + w_1x_{i}))^2 \rightarrow \min_{w_0, w_1}$$

$$w_0^{(t+1)} = w_0^{(t)} -\eta \frac{\partial SE}{\partial w_0} |_{t}$$
$$w_1^{(t+1)} = w_1^{(t)} -\eta \frac{\partial SE}{\partial w_1} |_{t}$$

$$w_0^{(t+1)} = w_0^{(t)} + \eta \sum_{i=1}^{n}(y_i - w_0^{(t)} - w_1^{(t)}x_i)$$
$$w_1^{(t+1)} = w_1^{(t)} + \eta \sum_{i=1}^{n}(y_i - w_0^{(t)} - w_1^{(t)}x_i)x_i$$

$$w_0^{(t+1)} = w_0^{(t)} + \eta (y_i - w_0^{(t)} - w_1^{(t)}x_i)$$
$$w_1^{(t+1)} = w_1^{(t)} + \eta (y_i - w_0^{(t)} - w_1^{(t)}x_i)x_i$$

#### 在线学习方法

$$w_0^{(t+1)} = w_0^{(t)} + \eta (y_i - w_0^{(t)} - w_1^{(t)}x_i)$$
$$w_1^{(t+1)} = w_1^{(t)} + \eta (y_i - w_0^{(t)} - w_1^{(t)}x_i)x_i$$

### 类别型特征处理

In [4]:
# Feature table, plus the target values kept in a separate file that has
# no header row (hence header=None).
df = pd.read_csv('../../data/bank_train.csv')
labels = pd.read_csv('../../data/bank_train_target.csv', header=None)
df.head()

Out[4]:
age job marital education default housing loan contact month day_of_week duration campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
0 26 student single high.school no no no telephone jun mon 901 1 999 0 nonexistent 1.4 94.465 -41.8 4.961 5228.1
1 46 admin. married university.degree no yes no cellular aug tue 208 2 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1
2 49 blue-collar married basic.4y unknown yes yes telephone jun tue 131 5 999 0 nonexistent 1.4 94.465 -41.8 4.864 5228.1
3 31 technician married university.degree no no no cellular jul tue 404 1 999 0 nonexistent -2.9 92.469 -33.6 1.044 5076.2
4 42 housemaid married university.degree no yes no telephone nov mon 85 1 999 0 nonexistent -0.1 93.200 -42.0 4.191 5195.8

In [5]:
# Horizontal bar chart of how many rows carry each 'education' value.
df['education'].value_counts().plot.barh()

Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1cfb4f23c8>

In [6]:
# Single encoder instance; the following cells refit it for each column they encode.
label_encoder = LabelEncoder()


In [7]:
# Integer codes for the 'education' strings, wrapped in a Series.
mapped_education = pd.Series(label_encoder.fit_transform(df['education']))
mapped_education.value_counts().plot.barh()
# Lookup table: integer code -> original category name.
{code: name for code, name in enumerate(label_encoder.classes_)}

Out[7]:
{0: 'basic.4y',
1: 'basic.6y',
2: 'basic.9y',
3: 'high.school',
4: 'illiterate',
5: 'professional.course',
6: 'university.degree',
7: 'unknown'}

In [8]:
# Overwrite the string column with the integer codes computed above.
df['education'] = mapped_education
df.head()

Out[8]:
age job marital education default housing loan contact month day_of_week duration campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
0 26 student single 3 no no no telephone jun mon 901 1 999 0 nonexistent 1.4 94.465 -41.8 4.961 5228.1
1 46 admin. married 6 no yes no cellular aug tue 208 2 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1
2 49 blue-collar married 0 unknown yes yes telephone jun tue 131 5 999 0 nonexistent 1.4 94.465 -41.8 4.864 5228.1
3 31 technician married 6 no no no cellular jul tue 404 1 999 0 nonexistent -2.9 92.469 -33.6 1.044 5076.2
4 42 housemaid married 6 no yes no telephone nov mon 85 1 999 0 nonexistent -0.1 93.200 -42.0 4.191 5195.8

In [9]:
# All object-dtype columns, plus 'education', which is added by name because
# it was already replaced by integer codes and so no longer has object dtype.
categorical_columns = df.columns[df.dtypes == 'object'].union(['education'])
for column in categorical_columns:
    # fit_transform refits the shared encoder, so every column gets its own
    # independent string -> integer mapping.
    df[column] = label_encoder.fit_transform(df[column])
df.head()

Out[9]:
age job marital education default housing loan contact month day_of_week duration campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
0 26 8 2 3 0 0 0 1 4 1 901 1 999 0 1 1.4 94.465 -41.8 4.961 5228.1
1 46 0 1 6 0 2 0 0 1 3 208 2 999 0 1 1.4 93.444 -36.1 4.963 5228.1
2 49 1 1 0 1 2 2 1 4 3 131 5 999 0 1 1.4 94.465 -41.8 4.864 5228.1
3 31 9 1 6 0 0 0 0 3 3 404 1 999 0 1 -2.9 92.469 -33.6 1.044 5076.2
4 42 3 1 6 0 2 0 1 7 1 85 1 999 0 1 -0.1 93.200 -42.0 4.191 5195.8

In [10]:
# Subtracts the encoded 'job' values of rows 1 and 2. Both operands are
# LabelEncoder codes, not measured quantities.
# NOTE(review): presumably shown to illustrate that arithmetic on such codes
# carries no domain meaning — confirm against the accompanying text.
df.loc[1].job - df.loc[2].job

Out[10]:
-1.0

In [11]:
def logistic_regression_accuracy_on(dataframe, labels, random_state=None):
    """Fit a logistic regression on a random split and report test metrics.

    Despite the name, the return value is the full text classification
    report (precision / recall / f1 / support), not a single accuracy.

    Parameters
    ----------
    dataframe : object exposing ``.values``
        Feature table; its ``.values`` array is used as the design matrix.
    labels : array-like
        Target values; converted with ``np.array`` and flattened with
        ``ravel`` so a one-column frame is accepted.
    random_state : optional
        Forwarded to ``train_test_split``. Pass the same value on several
        calls to score different encodings on the same train/test split.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    str
        Output of ``classification_report`` on the held-out part.
    """
    features = dataframe.values
    targets = np.array(labels).ravel()
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, targets, random_state=random_state)

    logit = LogisticRegression(max_iter=1000, solver='lbfgs')
    logit.fit(train_features, train_labels)
    return classification_report(test_labels, logit.predict(test_features))

# Report for a model trained on the integer-coded categorical columns only.
print(logistic_regression_accuracy_on(df[categorical_columns], labels))

              precision    recall  f1-score   support

0       0.89      1.00      0.94      6128
1       0.62      0.01      0.02       771

micro avg       0.89      0.89      0.89      6899
macro avg       0.75      0.50      0.48      6899
weighted avg       0.86      0.89      0.84      6899



#### 独热编码

In [12]:
# One row with ten zero-filled columns labelled 0..9, then switch column 6 on.
one_hot_example = pd.DataFrame([dict.fromkeys(range(10), 0)])
one_hot_example.loc[0, 6] = 1
one_hot_example

Out[12]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 0 1 0 0 0

In [13]:
# Ask for a dense array rather than a scipy sparse matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:  # older scikit-learn that does not know `sparse_output`
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')

In [14]:
# One-hot encode the integer-coded categorical columns and wrap the
# resulting array in a DataFrame with default integer column labels.
encoded_categorical_columns = pd.DataFrame(
    data=onehot_encoder.fit_transform(df[categorical_columns]),
)
encoded_categorical_columns.head(n=5)

Out[14]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
3 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0

In [15]:
# Report for a model trained on the one-hot encoded columns.
print(logistic_regression_accuracy_on(encoded_categorical_columns, labels))

              precision    recall  f1-score   support

0       0.90      0.99      0.94      6099
1       0.61      0.17      0.26       800

micro avg       0.89      0.89      0.89      6899
macro avg       0.76      0.58      0.60      6899
weighted avg       0.87      0.89      0.86      6899



#### 哈希技巧

In [16]:
# Built-in hash of a few category strings.
# NOTE: Python salts str hashes per interpreter process unless PYTHONHASHSEED
# is fixed, so the printed numbers differ from one session to the next.
for s in ('university.degree', 'high.school', 'illiterate'):
    print(s, '→', hash(s))

university.degree → -5370095693728667446
high.school → -7042998680499890429
illiterate → -7750457402342120656


In [17]:
# Fold every hash into one of `hash_space` buckets. With a positive modulus
# Python's % never returns a negative number, so negative hashes still land
# in the range 0..hash_space-1.
hash_space = 25
for s in ('university.degree', 'high.school', 'illiterate'):
    print(s, '→', hash(s) % hash_space)

university.degree → 4
high.school → 21
illiterate → 19


In [18]:
# A single row with one zero-filled column per hash bucket.
hashing_example = pd.DataFrame([{i: 0.0 for i in range(hash_space)}])
for s in ('job=student', 'marital=single', 'day_of_week=mon'):
    # Hash the "feature=value" string once and reuse the bucket index for
    # both the printout and the assignment.
    bucket = hash(s) % hash_space
    print(s, '→', bucket)
    # Strings that fall into the same bucket end up sharing this column.
    hashing_example.loc[0, bucket] = 1
hashing_example

job=student → 14
marital=single → 21
day_of_week=mon → 1

Out[18]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0

In [19]:
# Within one session the same string always produces the same hash.
assert hash('no') == hash('no')
# Prefixing the value with its feature name gives the shared value 'no'
# a different hash for each feature.
assert hash('housing=no') != hash('loan=no')