Some materials are taken from the machine learning course of Victor Kitov
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-talk')
plt.rcParams['figure.figsize'] = (12,8)
# Для кириллицы на графиках
font = {'family': 'Verdana',
'weight': 'normal'}
plt.rc('font', **font)
try:
from ipywidgets import interact, IntSlider, fixed, FloatSlider
except ImportError:
print u'Так надо'
(lat, lon)
(lat, lon)
$$ \rho(x_i, x_j) = \sqrt{\sum\limits_{d=1}^{D}(x^d_i - x^d_j)^2} \text{: euclidean distance} $$
$$ \rho(x_i, x_j) = \sum\limits_{d=1}^{D}|x^d_i - x^d_j| \text{: manhattan distance} $$
$$ \rho(x_i, x_j) = 1 - \frac{\langle x_i,x_j \rangle}{||x_i||_2\cdot||x_j||_2} \text{: cosine distance} $$
\begin{equation} D(i,j) = \begin{cases} 0, & i=0,\ j=0\\ i, & j=0,\ i>0\\ j, & i=0,\ j>0\\ \min\left\{ D(i,j-1)+1,\; D(i-1,j)+1,\; D(i-1,j-1)+{\rm m}(S_{1}[i],S_{2}[j]) \right\}, & i>0,\ j>0 \end{cases} \end{equation} where $m(a,b) = 0$, if $a = b$ and $1$ otherwise
Consider training sample $\left(x_{1},y_{1}\right),...\left(x_{N},y_{N}\right)$ with
Training: Calculate centroids for each class $c=1,2,...C:$ $$ \mu_{c}=\frac{1}{N_{c}}\sum_{n=1}^{N}x_{n}\mathbb{I}[y_{n}=c], \quad N_{c}=\sum_{n=1}^{N}\mathbb{I}[y_{n}=c] $$
Classification:
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestCentroid

def plot_centroid_class():
    """Fit a nearest-centroid classifier on 4 Gaussian blobs and plot its decision regions.

    Draws the data, the decision surface, and the per-class centroids (stars).
    """
    X_blobs, y_blobs = make_blobs(n_samples=300, n_features=2, centers=4, cluster_std=1.4, random_state=123)
    # Per-class centroids computed directly as sample means
    X_centers = np.zeros((4, 2))
    for i in range(4):
        X_centers[i,:] = X_blobs[y_blobs==i,:].mean(axis=0)
    plt.scatter(X_blobs[:,0], X_blobs[:,1], c=y_blobs)
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')
    # renamed from `knn`: this is a NearestCentroid model, not k-NN
    clf = NearestCentroid()
    clf.fit(X_blobs, y_blobs)
    # Range of feature values covered by the data
    x_range = np.linspace(X_blobs.min(), X_blobs.max(), 500)
    xx1, xx2 = np.meshgrid(x_range, x_range)
    Y = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    Y = Y.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Y, alpha=0.3)
    # plt.cm.spectral was removed in matplotlib 2.2 -- the valid name is plt.cm.Spectral
    plt.scatter(X_blobs[:,0], X_blobs[:,1], c=y_blobs, cmap=plt.cm.Spectral)
    plt.scatter(X_centers[:, 0], X_centers[:, 1], marker='*', s=400)
    plt.axis('equal')
    plt.show()
interact(plot_centroid_class)
Classification:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_moons

# Two interleaving noisy half-moons: a non-linearly-separable toy dataset
# reused by the k-NN classification demos below.
X_moons, y_moons = make_moons(noise=0.3, random_state=1234)
# plt.cm.spectral was removed in matplotlib 2.2 -- the valid name is plt.cm.Spectral
plt.scatter(X_moons[:,0], X_moons[:,1], c=y_moons, cmap=plt.cm.Spectral)
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
def plot_knn_class(k=1, prob=False):
    """Fit a k-NN classifier (Euclidean metric) on the moons data and plot its decision regions.

    k: number of neighbours.
    prob: if True, shade by P(class 1) instead of the hard label.
    """
    plt.scatter(X_moons[:,0], X_moons[:,1], c=y_moons)
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')
    # minkowski with p=2 is the Euclidean metric
    knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    knn.fit(X_moons, y_moons)
    # Range of feature values covered by the data
    x_range = np.linspace(X_moons.min(), X_moons.max(), 500)
    # All pairwise combinations of feature values
    xx1, xx2 = np.meshgrid(x_range, x_range)
    if prob:
        Y = knn.predict_proba(np.c_[xx1.ravel(), xx2.ravel()])[:,1]
    else:
        Y = knn.predict(np.c_[xx1.ravel(), xx2.ravel()])
    Y = Y.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Y, alpha=0.3)
    # plt.cm.spectral was removed in matplotlib 2.2 -- the valid name is plt.cm.Spectral
    plt.scatter(X_moons[:,0], X_moons[:,1], c=y_moons, cmap=plt.cm.Spectral)
    plt.show()
interact(plot_knn_class, k=IntSlider(min=1, max=10, value=1))
Regression:
# Ground-truth curve f(x) = sin(x) + x/3 and a noisy sample from it:
# both inputs and targets are jittered uniformly in [-0.5, 0.5).
# NOTE(review): no np.random.seed call here -- the sample differs on every run.
x_true = np.arange(-5, 5, 0.2)
x = x_true + np.random.rand(x_true.shape[0]) - 0.5
y_true = np.sin(x_true)+x_true/3
y = y_true + np.random.rand(x_true.shape[0]) - 0.5
plt.plot(x_true, y_true, c='g', label='$f(x)$')
plt.scatter(x, y, label='actual data')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc=2)
def plot_linreg():
    """Fit ordinary least squares on the noisy sample and draw the fit over the true curve."""
    model = LinearRegression().fit(x.reshape(-1, 1), y)
    prediction = model.predict(x_true.reshape(-1, 1))
    plt.plot(x_true, y_true, c='g', label='$f(x)$')
    plt.scatter(x, y, label='actual data')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.plot(x_true, prediction, c='r', label='linear regression')
    plt.legend(loc=2)
plot_linreg()
def plot_knn(k=1):
    """Fit a k-NN regressor with k neighbours and draw its prediction over the true curve."""
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(x.reshape(-1, 1), y)
    prediction = model.predict(x_true.reshape(-1, 1))
    plt.plot(x_true, y_true, c='g', label='$f(x)$')
    plt.scatter(x, y, label='actual data')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.plot(x_true, prediction, c='r', label='knn, $k=%d$' % k)
    plt.legend(loc=2)
    plt.show()
interact(plot_knn, k=IntSlider(min=1, max=10, value=1))
When several classes get the same rank, we can assign to class:
* None
Advantages:
where $\mu_{j},\,\sigma_{j},\,L_{j},\,U_{j}$ are mean value, standard deviation, minimum and maximum value of the $j$-th feature.
| $D=2$ | $D=2 \dots 100$ |
|---|---|
$$ \lim_{D \rightarrow \infty} \frac{\text{dist}_{max} - \text{dist}_{min}}{\text{dist}_{min}} = 0$$
Consider for object $x$:
Classification: $$\begin{align*} g_{c}(x) & =\sum_{k=1}^{K}\mathbb{I}[y_{i_{k}}=c],\quad c=1,2,...C.\\ \widehat{y}(x) & =\arg\max_{c}g_{c}(x) \end{align*} $$
Regression: $$ \widehat{y}(x)=\frac{1}{K}\sum_{k=1}^{K}y_{i_{k}} $$
Weighted classification: $$\begin{align*} g_{c}(x) & =\sum_{k=1}^{K}w(k,\,\rho(x,x_{i_{k}}))\mathbb{I}[y_{i_{k}}=c],\quad c=1,2,...C.\\ \widehat{y}(x) & =\arg\max_{c}g_{c}(x) \end{align*} $$
Weighted regression: $$ \widehat{y}(x)=\frac{\sum_{k=1}^{K}w(k,\,\rho(x,x_{i_{k}}))y_{i_{k}}}{\sum_{k=1}^{K}w(k,\,\rho(x,x_{i_{k}}))} $$
Index dependent weights: $$ w_{k}=\alpha^{k},\quad\alpha\in(0,1) $$ $$ w_{k}=\frac{K+1-k}{K} $$
Distance dependent weights:
$$ w_{k}=\begin{cases} \frac{\rho(z_{K},x)-\rho(z_{k},x)}{\rho(z_{K},x)-\rho(z_{1},x)}, & \rho(z_{K},x)\ne\rho(z_{1},x)\\ 1 & \rho(z_{K},x)=\rho(z_{1},x) \end{cases} $$ $$ w_{k}=\frac{1}{\rho(z_{k},x)} $$
def plot_knn_class_kernel(k=1, h=0.5, prob=False, use_all=False):
    """Decision regions of a distance-weighted k-NN classifier with a Gaussian kernel.

    k: number of neighbours (ignored when use_all=True, which takes 70 neighbours).
    h: Gaussian kernel bandwidth.
    prob: if True, shade by P(class 1) instead of the hard label.
    """
    def gauss_kernel(d, h=h):
        # h is bound as a default argument to freeze the current bandwidth
        return np.exp(-(d**2)/(2*h**2))
    plt.scatter(X_moons[:,0], X_moons[:,1], c=y_moons)
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')
    # minkowski with p=2 is the Euclidean metric
    n_neighbors = 70 if use_all else k
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='minkowski', p=2, weights=gauss_kernel)
    knn.fit(X_moons, y_moons)
    # Range of feature values covered by the data
    x_range = np.linspace(X_moons.min(), X_moons.max(), 500)
    # All pairwise combinations of feature values
    xx1, xx2 = np.meshgrid(x_range, x_range)
    grid = np.c_[xx1.ravel(), xx2.ravel()]
    if prob:
        Y = knn.predict_proba(grid)[:,1]
    else:
        Y = knn.predict(grid)
    Y = Y.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Y, alpha=0.3)
    plt.scatter(X_moons[:,0], X_moons[:,1], c=y_moons)
    plt.show()
interact(plot_knn_class_kernel, k=IntSlider(min=1, max=10, value=1),
         h=FloatSlider(min=0.05, max=5, value=1, step=0.05))
Important hyperparameters of K-NN:
Output depends on feature scaling.