t-Distributed Stochastic Neighbor Embedding (t-SNE)
A good basic introductory video on t-SNE: https://www.youtube.com/watch?v=NEaUSP4YerM
https://towardsdatascience.com/an-introduction-to-t-sne-with-python-example-5a3a293108d1
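For reference, t-SNE looks for low-dimensional points $y_i$ whose pairwise similarities match those of the high-dimensional points $x_i$. In the standard formulation, similarities in the embedding use a Student-t kernel with one degree of freedom,

$$q_{ij} = \frac{(1 + \lVert y_i - y_j \rVert^2)^{-1}}{\sum_{k \neq l} (1 + \lVert y_k - y_l \rVert^2)^{-1}},$$

and the embedding minimizes the Kullback-Leibler divergence between the high-dimensional similarities $p_{ij}$ (Gaussian kernels whose bandwidths are set by the perplexity) and the $q_{ij}$. The notebook below implements a simplified variant: the same Student-t density is applied to the distances in both spaces, and each row of the resulting matrix is normalized independently, without perplexity calibration.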
To test the method, we will create a dataset made up of three groups of points generated with different locations and variances.
import numpy as np
import pandas as pd
C1 = np.random.normal(loc=0., scale=1., size=(10,3))   # cluster 1: 10 points around 0
I1 = np.ones(10, dtype=int)                            # class label 1
#C2 = np.random.normal(loc=2., scale=0.1, size=(10,3))
C2 = np.random.normal(loc=5., scale=1., size=(10,3))   # cluster 2: 10 points around 5
I2 = np.ones(10, dtype=int)*2                          # class label 2
#C3 = np.random.normal(loc=5, scale=2, size=(10,3))
C3 = np.random.normal(loc=10, scale=1., size=(10,3))   # cluster 3: 10 points around 10
I3 = np.ones(10, dtype=int)*3                          # class label 3
df1 = pd.DataFrame(data=C1, columns=["x", "y", "z"])
df1I = pd.DataFrame(data=I1, columns=["class"])
df2 = pd.DataFrame(data=C2, columns=["x", "y", "z"])
df2I = pd.DataFrame(data=I2, columns=["class"])
df3 = pd.DataFrame(data=C3, columns=["x", "y", "z"])
df3I = pd.DataFrame(data=I3, columns=["class"])
# Attach the class column to each cluster, then stack the three clusters
result1 = pd.concat([df1, df1I], axis=1, ignore_index=True, sort=False)
result2 = pd.concat([df2, df2I], axis=1, ignore_index=True, sort=False)
result3 = pd.concat([df3, df3I], axis=1, ignore_index=True, sort=False)
df = pd.concat([result1, result2, result3])
df.columns = ["x","y", "z","class"]
df = df.reset_index(drop=True)
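As a side note, the same DataFrame can be built more compactly; a sketch equivalent to the steps above (df_alt is just an illustrative name):
points = np.vstack([C1, C2, C3])    # stack the three clusters: shape (30, 3)
labels = np.repeat([1, 2, 3], 10)   # 10 labels per cluster
df_alt = pd.DataFrame(points, columns=["x", "y", "z"])
df_alt["class"] = labels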
Let's look at the points in tabular form.
df
 | x | y | z | class |
---|---|---|---|---|
0 | 0.166524 | 0.476437 | -0.748905 | 1 |
1 | -0.260890 | -1.210117 | 1.111706 | 1 |
2 | 1.483352 | 0.980338 | 2.013879 | 1 |
3 | -0.008549 | 0.257644 | -1.536449 | 1 |
4 | -1.467212 | -0.370246 | 0.176544 | 1 |
5 | -0.035987 | 0.426323 | -1.165189 | 1 |
6 | -1.098647 | 0.331815 | 1.964918 | 1 |
7 | -0.272549 | -0.040916 | -0.370012 | 1 |
8 | 2.678083 | 0.933871 | -0.728749 | 1 |
9 | 1.314680 | 0.550044 | -1.232873 | 1 |
10 | 5.063146 | 4.676758 | 6.091572 | 2 |
11 | 4.005999 | 4.868296 | 5.885338 | 2 |
12 | 4.395932 | 5.603112 | 5.080207 | 2 |
13 | 5.908465 | 5.186550 | 6.903519 | 2 |
14 | 3.197914 | 5.841706 | 4.253857 | 2 |
15 | 5.017807 | 4.312943 | 3.940999 | 2 |
16 | 3.890963 | 6.252370 | 5.864753 | 2 |
17 | 4.683445 | 5.941964 | 4.850334 | 2 |
18 | 4.606564 | 6.254284 | 4.996100 | 2 |
19 | 3.944945 | 4.605639 | 5.809857 | 2 |
20 | 10.027336 | 11.041746 | 10.127516 | 3 |
21 | 8.923284 | 9.785343 | 11.444244 | 3 |
22 | 8.983430 | 9.807272 | 9.374686 | 3 |
23 | 8.087111 | 8.450916 | 8.720797 | 3 |
24 | 9.362836 | 9.986216 | 8.472149 | 3 |
25 | 10.091924 | 9.338093 | 9.719824 | 3 |
26 | 8.936871 | 11.359872 | 10.348214 | 3 |
27 | 10.474326 | 11.126252 | 10.260523 | 3 |
28 | 9.603450 | 9.553276 | 10.102165 | 3 |
29 | 8.888365 | 10.522096 | 8.616896 | 3 |
Let's visualize the three-dimensional point set:
import plotly.express as px
fig = px.scatter_3d(df, x="x", y="y", z="z", color="class")
fig.show()
Q_points will be our set of three-dimensional points, and P_points will be the set of two-dimensional points with which we will try to visualize the first set.
Q_points = df.values[:,0:3] # Take the coordinates of the points, not the class
P_points = np.random.uniform(0, 10, size=(30,2)) # Generate random points
If we visualize the two-dimensional point set, we will see that it is completely disordered (it was generated at random).
import matplotlib.pyplot as plt
P_index = np.concatenate((np.ones(10, dtype=int), np.ones(10, dtype=int)*2, np.ones(10, dtype=int)*3))  # class labels for coloring
plt.scatter(P_points[:,0], P_points[:,1], c=P_index)
Before continuing, let's take a look at the distance matrices.
PM = np.zeros((30,30))
QM = np.zeros((30,30))
def distance(p1, p2):
return np.sqrt(np.sum((p1 - p2)**2))
for i, p in enumerate(P_points):
PM[i] = [distance(p, j) for j in P_points]
for i, q in enumerate(Q_points):
QM[i] = [distance(q, j) for j in Q_points]
plt.matshow(PM, cmap=plt.cm.Blues) # Plot the distance matrix of P
plt.matshow(QM, cmap=plt.cm.Blues) # Plot the distance matrix of Q
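As an aside, the same distance matrices can be computed without explicit Python loops; a sketch using SciPy's cdist (assuming SciPy is available):
from scipy.spatial.distance import cdist
PM_fast = cdist(P_points, P_points)  # pairwise Euclidean distances, shape (30, 30)
QM_fast = cdist(Q_points, Q_points)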
Now we need to define a function to compute the distance between points, another to obtain the probability density function, and a last one to compute the Kullback-Leibler divergence.
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
distT_torch = torch.distributions.studentT.StudentT(df=1, loc=0, scale=1)  # Student-t with 1 degree of freedom

def distance(p1, p2):
    # Euclidean distance; the small epsilon keeps the gradient of sqrt finite at 0
    return torch.sqrt(torch.sum((p1 - p2)**2) + 1e-6)

def pdf(x):
    # Density of the Student-t distribution evaluated at x
    return torch.exp(distT_torch.log_prob(x))

def KL(p, q):
    # Kullback-Leibler divergence, ignoring terms where q is (numerically) zero
    return torch.where(q > 1e-9, p * torch.log(p/q), torch.tensor([0.], dtype=torch.double, device=device)).sum()
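For reference, with one degree of freedom the Student-t density used by pdf is the Cauchy density, and KL implements the usual discrete divergence, which the training loop below accumulates row by row:

$$f(x) = \frac{1}{\pi\,(1 + x^2)}, \qquad \mathrm{KL}(p \,\Vert\, q) = \sum_j p_j \log\frac{p_j}{q_j}$$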
We convert the points created in NumPy into PyTorch tensors P and Q. The points P are the ones we have to "move", which is why they require gradients.
device = 'cpu'  # override: force CPU for this small example
P = torch.tensor(P_points, requires_grad=True, dtype=torch.float, device=device)  # embedding points: optimized
Q = torch.tensor(Q_points, dtype=torch.float, device=device)                      # data points: fixed
optimizer = torch.optim.RMSprop([P], lr=0.1)
epochs = 500

# Q never changes, so its distance matrix and row-normalized distribution
# can be computed once, outside the training loop.
QM = torch.zeros((len(Q_points), len(Q_points)), dtype=torch.double, device=device)
for i, q_row in enumerate(Q):
    for j, q_col in enumerate(Q):
        QM[i, j] = distance(q_row, q_col)
QD = pdf(QM)
QD = torch.div(QD.t(), torch.sum(QD, dim=1)).t()  # normalize each row so it sums to 1

for k in range(epochs):
    PM = torch.zeros((len(P_points), len(P_points)), dtype=torch.double, device=device)
    for i, p_row in enumerate(P):
        for j, p_col in enumerate(P):
            PM[i, j] = distance(p_row, p_col)
    PD = pdf(PM)
    PD = torch.div(PD.t(), torch.sum(PD, dim=1)).t()
    loss = torch.tensor([0.], dtype=torch.double, device=device)
    for pd_row, qd_row in zip(PD, QD):  # renamed to avoid shadowing the pandas alias pd
        loss += KL(pd_row, qd_row)
    if k % 10 == 0:
        print("Epoch:", k, "loss:", loss.item())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
Epoch: 0 loss: 39.479583740234375
Epoch: 10 loss: 15.956965446472168
Epoch: 20 loss: 13.391416549682617
Epoch: 30 loss: 12.188273429870605
Epoch: 40 loss: 10.87851333618164
Epoch: 50 loss: 10.16821575164795
Epoch: 60 loss: 9.989543914794922
Epoch: 70 loss: 9.744491577148438
Epoch: 80 loss: 9.489737510681152
Epoch: 90 loss: 8.615865707397461
Epoch: 100 loss: 8.053683280944824
Epoch: 110 loss: 7.864839553833008
Epoch: 120 loss: 7.308864593505859
Epoch: 130 loss: 6.854788780212402
Epoch: 140 loss: 5.771241188049316
Epoch: 150 loss: 4.689131736755371
Epoch: 160 loss: 4.214322566986084
Epoch: 170 loss: 4.072487831115723
Epoch: 180 loss: 4.051445007324219
Epoch: 190 loss: 4.03515625
Epoch: 200 loss: 4.021659851074219
Epoch: 210 loss: 4.013899803161621
Epoch: 220 loss: 3.9985787868499756
Epoch: 230 loss: 3.986757516860962
Epoch: 240 loss: 3.977696180343628
Epoch: 250 loss: 3.969468593597412
Epoch: 260 loss: 3.959566116333008
Epoch: 270 loss: 3.9487476348876953
Epoch: 280 loss: 3.9359965324401855
Epoch: 290 loss: 3.919861078262329
Epoch: 300 loss: 3.9039435386657715
Epoch: 310 loss: 3.8863189220428467
Epoch: 320 loss: 3.8603057861328125
Epoch: 330 loss: 3.8084962368011475
Epoch: 340 loss: 3.7537732124328613
Epoch: 350 loss: 3.695063591003418
Epoch: 360 loss: 3.590381383895874
Epoch: 370 loss: 3.4406182765960693
Epoch: 380 loss: 3.263678789138794
Epoch: 390 loss: 2.823730707168579
Epoch: 400 loss: 2.2120273113250732
Epoch: 410 loss: 0.6396467685699463
Epoch: 420 loss: 0.5637612342834473
Epoch: 430 loss: 0.5009985566139221
Epoch: 440 loss: 0.4680554270744324
Epoch: 450 loss: 0.4548220932483673
Epoch: 460 loss: 0.43580907583236694
Epoch: 470 loss: 0.436868280172348
Epoch: 480 loss: 0.4374592900276184
Epoch: 490 loss: 0.4361301362514496
Let's visualize the result:
import matplotlib.pyplot as plt
final_points = P.cpu().detach().numpy()
P_index = np.concatenate((np.ones(10, dtype=int), np.ones(10, dtype=int)*2, np.ones(10, dtype=int)*3))
plt.scatter(P_points[:,0], P_points[:,1], c=P_index)  # initial random layout
plt.show()
plt.scatter(final_points[:,0], final_points[:,1], c=P_index)  # layout after optimization
Create the previous three-dimensional dataset with three classes that share the same center and dispersion, so that the classes are not separable. See what happens to its t-SNE visualization.
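A minimal sketch of the setup for this exercise; only the cluster generation changes, the rest of the pipeline stays the same:
C1 = np.random.normal(loc=0., scale=1., size=(10,3))  # all three clusters share loc=0 and scale=1,
C2 = np.random.normal(loc=0., scale=1., size=(10,3))  # so the classes overlap completely
C3 = np.random.normal(loc=0., scale=1., size=(10,3))
With overlapping clusters, the 2-D embedding should show the three colors mixed together rather than three separated groups.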
Compute the t-SNE visualization using the IRIS dataset.
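A possible starting point using scikit-learn's built-in implementation (sklearn.manifold.TSNE; the perplexity value here is just an assumption you may want to tune):
from sklearn.datasets import load_iris
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

iris = load_iris()
embedding = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(iris.data)
plt.scatter(embedding[:, 0], embedding[:, 1], c=iris.target)  # color by species
plt.show()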