There are many talks tomorrow at csv,conf. I want to cluster the talks by the text of their titles and abstracts:
from bs4 import BeautifulSoup
import requests
import pandas as pd
website_to_parse = "https://csvconf.com/speakers/"

# Download the speakers page. A timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() stops us from silently parsing
# an HTTP error page as if it were the speaker listing.
response = requests.get(website_to_parse, timeout=30)
response.raise_for_status()
html_data = response.text

# Save HTML to soup
soup = BeautifulSoup(html_data, "html5lib")

# `attrs` is meant to be a dict mapping attribute name -> value. The previous
# set literal {"class", "speakers"} appears to have worked only because bs4
# falls back to treating a non-dict `attrs` as a CSS-class filter.
speaker_tables = soup.find_all("table", attrs={"class": "speakers"})

# Use the second "speakers" table on the page.
# NOTE(review): presumably the first table holds the keynotes -- confirm
# against the live page, since its layout may change between conferences.
doc = speaker_tables[1]


def _stripped_texts(container, tag, css_class):
    """Return the stripped text of every `tag` with class `css_class` in `container`."""
    matches = container.find_all(tag, attrs={"class": css_class})
    return [node.getText().strip() for node in matches]


names = _stripped_texts(doc, "span", "name")
titles = _stripped_texts(doc, "p", "title")
abstracts = _stripped_texts(doc, "p", "abstract")

# Sanity check: the three lists should have the same length so that they can
# be zipped into one row per talk.
print(len(names), len(titles), len(abstracts))
45 45 45
# One row per talk: speaker name, talk title and abstract.
talk_columns = {
    'names': names,
    'titles': titles,
    'abstracts': abstracts,
}
df = pd.DataFrame(talk_columns)

# The text to cluster on is the title followed by the abstract,
# separated by a single space.
title_with_space = df['titles'] + " "
df['document'] = title_with_space + df['abstracts']

# Keep the row position as an explicit column as well.
df['index'] = df.index
Preprocess text
import sys
sys.path.append("/Users/csiu/repo/kick/src/python")
import sim_doc as sim_doc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
## Preprocess
# NOTE(review): preprocess_data is a project-local helper; it presumably adds
# the 'doc_processed' column to df in place (that column is read just below)
# -- confirm against sim_doc.
_ = sim_doc.preprocess_data(df)

## TF-IDF
# Fit the vocabulary on the processed documents and vectorise them in one
# step; X has one row per talk.
processed_docs = df['doc_processed']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_docs)
I follow Jörn Hees (2015) to generate the hierarchical clustering and draw the dendrogram using `scipy.cluster.hierarchy.dendrogram`.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Build the linkage matrix with Ward's method from the dense form of the
# TF-IDF matrix.
dense_tfidf = X.toarray()
Z = linkage(dense_tfidf, method='ward')

# Set up a wide, short figure so all talk titles fit along the x axis.
plt.figure(figsize=(25, 4))
plt.title('Hierarchical Clustering of CSV,Conf,V3 Non-Keynote talks')
plt.xlabel('')
plt.ylabel('Distance')

# Draw the full dendrogram, one leaf per talk labelled with its title.
dendrogram_options = dict(
    labels=df["titles"].tolist(),
    leaf_rotation=270,                # rotate the x axis labels
    leaf_font_size=12,                # font size for the x axis labels
    color_threshold=1.45,             # distance at which to cut for clusters
    above_threshold_color='#bcbddc',  # colour for links above the cut
)
dn = dendrogram(Z, **dendrogram_options)
plt.show()