In [1]:
import numpy as np

X = np.array([[6.6, 6.2, 1],
              [9.7, 9.9, 2],
              [8.0, 8.3, 2],
              [6.3, 5.4, 1],
              [1.3, 2.7, 0],
              [2.3, 3.1, 0],
              [6.6, 6.0, 1],
              [6.5, 6.4, 1],
              [6.3, 5.8, 1],
              [9.5, 9.9, 2],
              [8.9, 8.9, 2],
              [8.7, 9.5, 2],
              [2.5, 3.8, 0],
              [2.0, 3.1, 0],
              [1.3, 1.3, 0]])
In [2]:
import pandas as pd

df = pd.DataFrame(X, columns=['weight', 'length', 'label'])
df
Out[2]:
weight length label
0 6.6 6.2 1.0
1 9.7 9.9 2.0
2 8.0 8.3 2.0
3 6.3 5.4 1.0
4 1.3 2.7 0.0
5 2.3 3.1 0.0
6 6.6 6.0 1.0
7 6.5 6.4 1.0
8 6.3 5.8 1.0
9 9.5 9.9 2.0
10 8.9 8.9 2.0
11 8.7 9.5 2.0
12 2.5 3.8 0.0
13 2.0 3.1 0.0
14 1.3 1.3 0.0
In [3]:
%matplotlib inline

ax = df[df['label'] == 0].plot.scatter(x='weight', y='length', c='blue', label='young')
ax = df[df['label'] == 1].plot.scatter(x='weight', y='length', c='orange', label='mid', ax=ax)
ax = df[df['label'] == 2].plot.scatter(x='weight', y='length', c='red', label='adult', ax=ax)
ax
Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x108e750b8>
In [4]:
df2 = pd.DataFrame([df.iloc[0], df.iloc[1], df.iloc[4]], columns=['weight', 'length', 'label'])
df3 = pd.DataFrame([df.iloc[14]], columns=['weight', 'length', 'label'])

ax = df2[df2['label'] == 0].plot.scatter(x='weight', y='length', c='blue', label='x4(young)')
ax = df2[df2['label'] == 1].plot.scatter(x='weight', y='length', c='orange', label='x0(mid)', ax=ax)
ax = df2[df2['label'] == 2].plot.scatter(x='weight', y='length', c='red', label='x1(adult)', ax=ax)
ax = df3.plot.scatter(x='weight', y='length', c='gray', label='x14(?)', ax=ax)
ax
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x108f4e0b8>
In [5]:
def euclidean_distance(x, y):   
    return np.sqrt(np.sum((x - y) ** 2))

$\sqrt{\sum^n_{i=1} (x_i - y_i)^2}$

In [6]:
x0 = X[0][:-1]
x1 = X[1][:-1]
x4 = X[4][:-1]
x14 = X[14][:-1]
print(" x0:", x0, "\n x1:", x1, "\n x4:", x4, "\nx14:", x14)
 x0: [6.6 6.2] 
 x1: [9.7 9.9] 
 x4: [1.3 2.7] 
x14: [1.3 1.3]
In [7]:
print(" x14 and x0:", euclidean_distance(x14, x0), "\n",
      "x14 and x1:", euclidean_distance(x14, x1), "\n",
      "x14 and x4:", euclidean_distance(x14, x4))
 x14 and x0: 7.218032973047436 
 x14 and x1: 12.021647141718974 
 x14 and x4: 1.4000000000000001
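
The plot above treats x14 as unlabelled, so a minimal nearest-neighbor sketch (reusing the X array, x14, and euclidean_distance defined above) would simply assign it the label of the closest labelled point:

# Hypothetical nearest-neighbor step: the first 14 rows act as labelled data,
# the last column of each row is the label (0 = young, 1 = mid, 2 = adult).
labelled = X[:14]
dists = [euclidean_distance(row[:-1], x14) for row in labelled]
nearest = np.argmin(dists)
print("nearest point:", nearest, "predicted label:", labelled[nearest][-1])
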
In [8]:
def cosine_similarity(x, y):
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

$\frac{x \cdot y}{\sqrt{x \cdot x}\,\sqrt{y \cdot y}}$

In [9]:
print(" x14 and x0:", cosine_similarity(x14, x0), "\n",
      "x14 and x1:", cosine_similarity(x14, x1), "\n",
      "x14 and x4:", cosine_similarity(x14, x4))
 x14 and x0: 0.9995120760870786 
 x14 and x1: 0.9999479424242859 
 x14 and x4: 0.9438583563660174

While cosine similarity looks only at the angle between two vectors (ignoring their magnitudes), Euclidean distance is like laying a ruler between their endpoints and measuring the straight-line distance. That is why the results above disagree: by Euclidean distance x14 is closest to x4, but by cosine similarity it is most similar to x1, whose direction it matches almost exactly despite the large difference in magnitude.
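
A quick way to see this property (a small sketch reusing the two helper functions above): scaling a vector changes its Euclidean distance to another vector, but leaves the cosine similarity unchanged.

a = np.array([1.0, 2.0])
b = 2 * a                          # same direction as a, twice the magnitude
print(euclidean_distance(a, b))    # non-zero: the endpoints are far apart
print(cosine_similarity(a, b))     # ~1.0: the angle between a and b is zero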

Cosine Similarity in Action

In [10]:
import wikipedia

q1 = wikipedia.page('Machine Learning')
q2 = wikipedia.page('Artificial Intelligence')
q3 = wikipedia.page('Soccer')
q4 = wikipedia.page('Tennis')
In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = np.array(cv.fit_transform([q1.content, q2.content, q3.content, q4.content]).todense())
In [12]:
print("ML \t", len(q1.content.split()), "\n"
      "AI \t", len(q2.content.split()), "\n"
      "soccer \t", len(q3.content.split()), "\n"
      "tennis \t", len(q4.content.split()))
ML 	 4048 
AI 	 13742 
soccer 	 6470 
tennis 	 9736
In [13]:
q1.content[:100]
Out[13]:
'Machine learning is a field of computer science that often uses statistical techniques to give compu'
In [14]:
q1.content.split()[:10]
Out[14]:
['Machine',
 'learning',
 'is',
 'a',
 'field',
 'of',
 'computer',
 'science',
 'that',
 'often']
In [15]:
X[0][:20]
Out[15]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)
In [16]:
X[0].shape
Out[16]:
(5484,)
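
Each of those 5484 dimensions corresponds to one token in the vocabulary that CountVectorizer built from the four articles. A small sketch to inspect it (get_feature_names_out is the newer scikit-learn name; older versions use get_feature_names):

vocab = cv.get_feature_names_out()   # or cv.get_feature_names() on older scikit-learn
print(len(vocab))                    # one entry per column of X
print(vocab[:10])                    # tokens are stored in alphabetical order
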
In [17]:
print("ML - AI \t", euclidean_distance(X[0], X[1]), "\n"
      "ML - soccer \t", euclidean_distance(X[0], X[2]), "\n"
      "ML - tennis \t", euclidean_distance(X[0], X[3]))
ML - AI 	 846.53411035823 
ML - soccer 	 479.75827246645787 
ML - tennis 	 789.7069076562519
In [18]:
print("ML - AI \t", cosine_similarity(X[0], X[1]), "\n"
      "ML - soccer \t", cosine_similarity(X[0], X[2]), "\n"
      "ML - tennis \t", cosine_similarity(X[0], X[3]))
ML - AI 	 0.8887965704386804 
ML - soccer 	 0.7839297821715802 
ML - tennis 	 0.7935675914311315
In [19]:
def l1_normalize(v):
    norm = np.sum(v)
    return v / norm

def l2_normalize(v):
    norm = np.sqrt(np.sum(np.square(v)))
    return v / norm
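
For L2-normalized vectors $\hat{x}$ and $\hat{y}$, Euclidean distance and cosine similarity are tied together by $\|\hat{x} - \hat{y}\| = \sqrt{2 - 2\cos(x, y)}$, so ranking documents by one is equivalent to ranking them by the other. A quick numerical check of that identity on the ML and AI vectors (a sketch reusing the functions above):

lhs = euclidean_distance(l2_normalize(X[0]), l2_normalize(X[1]))
rhs = np.sqrt(2 - 2 * cosine_similarity(X[0], X[1]))
print(lhs, rhs)   # the two values should agree up to floating-point error
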
In [20]:
print("ML - AI \t", 1 - euclidean_distance(l1_normalize(X[0]), l1_normalize(X[1])), "\n"
      "ML - soccer \t", 1 - euclidean_distance(l1_normalize(X[0]), l1_normalize(X[2])), "\n"
      "ML - tennis \t", 1 - euclidean_distance(l1_normalize(X[0]), l1_normalize(X[3])))
ML - AI 	 0.9556356337470292 
ML - soccer 	 0.9291904899197152 
ML - tennis 	 0.9314819689984162
In [21]:
print("ML - AI \t", 1 - euclidean_distance(l2_normalize(X[0]), l2_normalize(X[1])), "\n"
      "ML - soccer \t", 1 - euclidean_distance(l2_normalize(X[0]), l2_normalize(X[2])), "\n"
      "ML - tennis \t", 1 - euclidean_distance(l2_normalize(X[0]), l2_normalize(X[3])))
ML - AI 	 0.5283996828641448 
ML - soccer 	 0.3426261066509869 
ML - tennis 	 0.3574544240773757

Categorize a Tweet

In [22]:
ml_tweet = "New research release: overcoming many of Reinforcement Learning's limitations with Evolution Strategies."
x = np.array(cv.transform([ml_tweet]).todense())[0]
In [23]:
print("tweet - ML \t", euclidean_distance(x, X[0]), "\n"
      "tweet - AI \t", euclidean_distance(x, X[1]), "\n"
      "tweet - soccer \t", euclidean_distance(x, X[2]), "\n"
      "tweet - tennis \t", euclidean_distance(x, X[3]))
tweet - ML 	 373.09114167988497 
tweet - AI 	 1160.7269274036853 
tweet - soccer 	 712.600168397398 
tweet - tennis 	 1052.5796881946753
In [24]:
print("tweet - ML \t", cosine_similarity(x, X[0]), "\n"
      "tweet - AI \t", cosine_similarity(x, X[1]), "\n"
      "tweet - soccer \t", cosine_similarity(x, X[2]), "\n"
      "tweet - tennis \t", cosine_similarity(x, X[3]))
tweet - ML 	 0.2613347291026786 
tweet - AI 	 0.19333084671126158 
tweet - soccer 	 0.1197543563241326 
tweet - tennis 	 0.11622680287651725
In [25]:
print("tweet - ML \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[0])), "\n"
      "tweet - AI \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[1])), "\n"
      "tweet - soccer \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[2])), "\n"
      "tweet - tennis \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[3])))
tweet - ML 	 -0.2154548703241279 
tweet - AI 	 -0.2701725499228351 
tweet - soccer 	 -0.32683506410998 
tweet - tennis 	 -0.3294910282687
In [26]:
so_tweet = "#LegendsDownUnder The Reds are out for the warm up at the @nibStadium. Not long now until kick-off in Perth."
x2 = np.array(cv.transform([so_tweet]).todense())[0]
In [27]:
print("tweet - ML \t", euclidean_distance(x2, X[0]), "\n"
      "tweet - AI \t", euclidean_distance(x2, X[1]), "\n"
      "tweet - soccer \t", euclidean_distance(x2, X[2]), "\n"
      "tweet - tennis \t", euclidean_distance(x2, X[3]))
tweet - ML 	 371.8669116767449 
tweet - AI 	 1159.1397672412072 
tweet - soccer 	 710.1035135809426 
tweet - tennis 	 1050.1485609188826
In [28]:
print("tweet - ML \t", cosine_similarity(x2, X[0]), "\n"
      "tweet - AI \t", cosine_similarity(x2, X[1]), "\n"
      "tweet - soccer \t", cosine_similarity(x2, X[2]), "\n"
      "tweet - tennis \t", cosine_similarity(x2, X[3]))
tweet - ML 	 0.4396242958582417 
tweet - AI 	 0.46942065152331963 
tweet - soccer 	 0.6136116162795926 
tweet - tennis 	 0.5971160690477066
In [29]:
print("tweet - ML \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[0])), "\n"
      "tweet - AI \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[1])), "\n"
      "tweet - soccer \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[2])), "\n"
      "tweet - tennis \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[3])))
tweet - ML 	 -0.0586554719470902 
tweet - AI 	 -0.030125573390623384 
tweet - soccer 	 0.12092277504145588 
tweet - tennis 	 0.10235426703816686
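
Putting the pieces together, a minimal sketch (reusing cv, X, cosine_similarity, and the article order above) that assigns a tweet to the most similar article by cosine similarity:

topics = ['ML', 'AI', 'soccer', 'tennis']

def categorize(tweet):
    # Hypothetical helper: vectorize the tweet with the already-fitted CountVectorizer,
    # then return the topic whose article vector is most similar to it.
    v = np.array(cv.transform([tweet]).todense())[0]
    sims = [cosine_similarity(v, doc) for doc in X]
    return topics[int(np.argmax(sims))]

categorize(so_tweet)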