#!/usr/bin/env python
# coding: utf-8

# There are many talks tomorrow at the [CSV Conf](https://csvconf.com/speakers/). I want to cluster the talks:
# 
# - Get HTML
# - Get talk titles
# - Match titles with abstracts (to get more text)
# - Model with TF-IDF
# - Find clusters

# ### Get HTML

# In[1]:


from bs4 import BeautifulSoup
import requests
import pandas as pd


# In[2]:


website_to_parse = "https://csvconf.com/speakers/"

# Download the speakers page. The timeout keeps the script from hanging
# forever on an unresponsive server, and raise_for_status() stops an HTTP
# error page from being silently parsed as if it were the speaker list.
response = requests.get(website_to_parse, timeout=30)
response.raise_for_status()
html_data = response.text

# Parse the downloaded HTML into a BeautifulSoup tree
soup = BeautifulSoup(html_data, "html5lib")


# In[3]:


# Select the second table whose CSS class is "speakers".
# NOTE: `attrs` must be a dict ({"class": "speakers"}). A set literal such as
# {"class", "speakers"} is treated by Beautiful Soup as a list of candidate
# class names, which would also match tables with class "class".
# NOTE(review): index 1 is presumably the non-keynote table (see the plot
# title further down) -- confirm against the page layout.
doc = soup.find_all("table", attrs={"class": "speakers"})[1]

# Speaker names: text of every <span class="name">, whitespace-trimmed
names = [
    tag.get_text().strip()
    for tag in doc.find_all("span", attrs={"class": "name"})
]

# Talk titles: text of every <p class="title">, whitespace-trimmed
titles = [
    tag.get_text().strip()
    for tag in doc.find_all("p", attrs={"class": "title"})
]

# Talk abstracts: text of every <p class="abstract">, whitespace-trimmed
abstracts = [
    tag.get_text().strip()
    for tag in doc.find_all("p", attrs={"class": "abstract"})
]


# In[4]:


# Show how many names, titles and abstracts were scraped (in that order)
print(*map(len, (names, titles, abstracts)))


# ### Document representation

# In[5]:


# One row per talk: speaker name, talk title and abstract
df = pd.DataFrame(
    {
        "names": names,
        "titles": titles,
        "abstracts": abstracts,
    }
)

# The text to model is the title followed by the abstract, space-separated
df["document"] = (df["titles"] + " ") + df["abstracts"]

# Keep the row index available as a regular column
df["index"] = df.index


# Preprocess text

# In[6]:


import sys
# Extend the module search path so that `sim_doc` can be imported below.
# NOTE(review): this is an absolute path on the original author's machine;
# on any other machine the import will likely fail unless the path is adjusted.
sys.path.append("/Users/csiu/repo/kick/src/python")
import sim_doc as sim_doc

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd


# In[7]:


## Preprocess
# The return value is discarded. The call is expected to add a
# 'doc_processed' column to `df`, which is read below -- TODO confirm in sim_doc.
_ = sim_doc.preprocess_data(df)

## TF-IDF
# Fit the vectorizer on the processed documents and build the
# document-term matrix in a single step.
vectorizer = TfidfVectorizer()
processed_docs = df["doc_processed"]
X = vectorizer.fit_transform(processed_docs)


# ### Cluster the talks
# 
# I refer to [Jörn Hees (2015)](https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/) to generate the hierarchical clustering and dendrogram using [`scipy.cluster.hierarchy.dendrogram`](https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html).
# 
# 

# In[8]:


import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage


# In[9]:


# Build the linkage matrix with Ward's method on the dense TF-IDF matrix
Z = linkage(X.toarray(), method='ward')

# Draw the full dendrogram, one leaf per talk title
plt.figure(figsize=(25, 4))
plt.title('Hierarchical Clustering of CSV,Conf,V3 Non-Keynote talks')
plt.xlabel('')
plt.ylabel('Distance')
dn = dendrogram(
    Z,
    labels=df["titles"].tolist(),     # leaf labels are the talk titles
    leaf_rotation=270,                # rotate the x-axis labels
    leaf_font_size=12,                # font size of the x-axis labels
    color_threshold=1.45,             # distance at which to cut into clusters
    above_threshold_color='#bcbddc',  # colour of links above the cut
)
plt.show()


# In[ ]: