#!/usr/bin/env python
# coding: utf-8

# # Euclidean vs. Cosine Distance
#
# Notebook export following https://cmry.github.io/notes/euclidean-v-cosine
# Compares Euclidean distance with cosine similarity, first on a small 2-D
# toy dataset, then on Wikipedia article bag-of-words vectors.

# In[1]:

import numpy as np

# Toy dataset: columns are [weight, length, label] where the label encodes
# an age class (0 = young, 1 = mid, 2 = adult).
X = np.array([[6.6, 6.2, 1],
              [9.7, 9.9, 2],
              [8.0, 8.3, 2],
              [6.3, 5.4, 1],
              [1.3, 2.7, 0],
              [2.3, 3.1, 0],
              [6.6, 6.0, 1],
              [6.5, 6.4, 1],
              [6.3, 5.8, 1],
              [9.5, 9.9, 2],
              [8.9, 8.9, 2],
              [8.7, 9.5, 2],
              [2.5, 3.8, 0],
              [2.0, 3.1, 0],
              [1.3, 1.3, 0]])

# In[2]:

import pandas as pd

df = pd.DataFrame(X, columns=['weight', 'length', 'label'])
df  # notebook display; no effect when run as a script

# In[3]:

get_ipython().run_line_magic('matplotlib', 'inline')

# Scatter-plot the full dataset, one colour per age class, sharing one Axes.
ax = df[df['label'] == 0].plot.scatter(x='weight', y='length', c='blue', label='young')
ax = df[df['label'] == 1].plot.scatter(x='weight', y='length', c='orange', label='mid', ax=ax)
ax = df[df['label'] == 2].plot.scatter(x='weight', y='length', c='red', label='adult', ax=ax)
ax  # notebook display

# In[4]:

# Zoom in on three labelled reference points (x0, x1, x4) plus the
# unlabelled query point x14 that we want to categorize.
df2 = pd.DataFrame([df.iloc[0], df.iloc[1], df.iloc[4]], columns=['weight', 'length', 'label'])
df3 = pd.DataFrame([df.iloc[14]], columns=['weight', 'length', 'label'])

ax = df2[df2['label'] == 0].plot.scatter(x='weight', y='length', c='blue', label='x4(young)')
ax = df2[df2['label'] == 1].plot.scatter(x='weight', y='length', c='orange', label='x0(mid)', ax=ax)
ax = df2[df2['label'] == 2].plot.scatter(x='weight', y='length', c='red', label='x1(adult)', ax=ax)
ax = df3.plot.scatter(x='weight', y='length', c='gray', label='x14(?)', ax=ax)
ax  # notebook display

# In[5]:

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between vectors x and y.

    Implements sqrt(sum_i (x_i - y_i)^2).
    """
    return np.sqrt(np.sum((x - y) ** 2))

# $\sqrt{\sum^n_{i=1} (x_i - y_i)^2}$

# In[6]:

# Strip the trailing label column so only the [weight, length] features remain.
x0 = X[0][:-1]
x1 = X[1][:-1]
x4 = X[4][:-1]
x14 = X[14][:-1]
print(" x0:", x0, "\n x1:", x1, "\n x4:", x4, "\nx14:", x14)

# In[7]:

print(" x14 and x0:", euclidean_distance(x14, x0), "\n",
      "x14 and x1:", euclidean_distance(x14, x1), "\n",
      "x14 and x4:", euclidean_distance(x14, x4))

# In[8]:

def cosine_similarity(x, y):
    """Return the cosine similarity between vectors x and y.

    Implements (x . y) / (||x|| * ||y||); 1.0 means identical direction.
    """
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

# $\frac{x \bullet y}{ \sqrt{x \bullet x} \sqrt{y \bullet y}}$

# In[9]:

print(" x14 and x0:", cosine_similarity(x14, x0), "\n",
      "x14 and x1:", cosine_similarity(x14, x1), "\n",
      "x14 and x4:", cosine_similarity(x14, x4))

# While cosine looks at the angle between vectors (thus not taking into
# regard their weight or magnitude), euclidean distance is similar to using
# a ruler to actually measure the distance.

# # Cosine Similarity in Action

# In[10]:

import wikipedia

# Fetch four reference articles (network I/O).
# BUGFIX: the query was misspelled 'Artifical Intelligence' and only resolved
# via the library's auto-suggest fallback, which can silently pick a wrong page.
q1 = wikipedia.page('Machine Learning')
q2 = wikipedia.page('Artificial Intelligence')
q3 = wikipedia.page('Soccer')
q4 = wikipedia.page('Tennis')

# In[11]:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# NOTE: X is deliberately rebound here — from the toy dataset above to the
# dense document-term count matrix (one row per article).
X = np.array(cv.fit_transform([q1.content, q2.content, q3.content, q4.content]).todense())

# In[12]:

# Compare article lengths (word counts); the "\n" literals concatenate with
# the following label, so each article prints on its own line.
print("ML \t", len(q1.content.split()), "\n"
      "AI \t", len(q2.content.split()), "\n"
      "soccer \t", len(q3.content.split()), "\n"
      "tennis \t", len(q4.content.split()))

# In[13]:

q1.content[:100]  # notebook display: first 100 characters of the ML article

# In[14]:

q1.content.split()[:10]  # notebook display: first 10 tokens

# In[15]:

X[0][:20]  # notebook display: first 20 term counts of the ML row

# In[16]:

X[0].shape  # notebook display: vocabulary size

# In[17]:

# Raw Euclidean distance is dominated by document length.
print("ML - AI \t", euclidean_distance(X[0], X[1]), "\n"
      "ML - soccer \t", euclidean_distance(X[0], X[2]), "\n"
      "ML - tennis \t", euclidean_distance(X[0], X[3]))

# In[18]:

# Cosine similarity ignores magnitude, so topical closeness dominates.
print("ML - AI \t", cosine_similarity(X[0], X[1]), "\n"
      "ML - soccer \t", cosine_similarity(X[0], X[2]), "\n"
      "ML - tennis \t", cosine_similarity(X[0], X[3]))

# In[19]:

def l1_normalize(v):
    """Return v scaled so its components sum to 1 (L1 normalization)."""
    norm = np.sum(v)
    return v / norm


def l2_normalize(v):
    """Return v scaled to unit Euclidean length (L2 normalization)."""
    norm = np.sqrt(np.sum(np.square(v)))
    return v / norm

# In[20]:

# 1 - Euclidean distance on L1-normalized vectors, as a similarity proxy.
print("ML - AI \t", 1 - euclidean_distance(l1_normalize(X[0]), l1_normalize(X[1])), "\n"
      "ML - soccer \t", 1 - euclidean_distance(l1_normalize(X[0]), l1_normalize(X[2])), "\n"
      "ML - tennis \t", 1 - euclidean_distance(l1_normalize(X[0]), l1_normalize(X[3])))

# In[21]:

# After L2 normalization, Euclidean distance ranks the same way cosine does.
print("ML - AI \t", 1 - euclidean_distance(l2_normalize(X[0]), l2_normalize(X[1])), "\n"
      "ML - soccer \t", 1 - euclidean_distance(l2_normalize(X[0]), l2_normalize(X[2])), "\n"
      "ML - tennis \t", 1 - euclidean_distance(l2_normalize(X[0]), l2_normalize(X[3])))

# # Categorize a Tweet

# In[22]:

ml_tweet = "New research release: overcoming many of Reinforcement Learning's limitations with Evolution Strategies."
# Project the tweet into the same vocabulary space (transform, not fit).
x = np.array(cv.transform([ml_tweet]).todense())[0]

# In[23]:

print("tweet - ML \t", euclidean_distance(x, X[0]), "\n"
      "tweet - AI \t", euclidean_distance(x, X[1]), "\n"
      "tweet - soccer \t", euclidean_distance(x, X[2]), "\n"
      "tweet - tennis \t", euclidean_distance(x, X[3]))

# In[24]:

print("tweet - ML \t", cosine_similarity(x, X[0]), "\n"
      "tweet - AI \t", cosine_similarity(x, X[1]), "\n"
      "tweet - soccer \t", cosine_similarity(x, X[2]), "\n"
      "tweet - tennis \t", cosine_similarity(x, X[3]))

# In[25]:

print("tweet - ML \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[0])), "\n"
      "tweet - AI \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[1])), "\n"
      "tweet - soccer \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[2])), "\n"
      "tweet - tennis \t", 1 - euclidean_distance(l2_normalize(x), l2_normalize(X[3])))

# In[26]:

so_tweet = "#LegendsDownUnder The Reds are out for the warm up at the @nibStadium. Not long now until kick-off in Perth."
x2 = np.array(cv.transform([so_tweet]).todense())[0]

# In[27]:

print("tweet - ML \t", euclidean_distance(x2, X[0]), "\n"
      "tweet - AI \t", euclidean_distance(x2, X[1]), "\n"
      "tweet - soccer \t", euclidean_distance(x2, X[2]), "\n"
      "tweet - tennis \t", euclidean_distance(x2, X[3]))

# In[28]:

print("tweet - ML \t", cosine_similarity(x2, X[0]), "\n"
      "tweet - AI \t", cosine_similarity(x2, X[1]), "\n"
      "tweet - soccer \t", cosine_similarity(x2, X[2]), "\n"
      "tweet - tennis \t", cosine_similarity(x2, X[3]))

# In[29]:

print("tweet - ML \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[0])), "\n"
      "tweet - AI \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[1])), "\n"
      "tweet - soccer \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[2])), "\n"
      "tweet - tennis \t", 1 - euclidean_distance(l2_normalize(x2), l2_normalize(X[3])))