"""Scrape Wikipedia articles for the people listed in a genealogy CSV,
vectorize them with TF-IDF, and visualize document similarity via t-SNE.

Converted from a Jupyter notebook (Python 2) to a runnable Python 3 script.
The notebook-only magics (`%matplotlib inline`, `%config InlineBackend...`)
have been dropped; bare cell-output expressions are now explicit prints.
"""

import pickle
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from bs4 import BeautifulSoup
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import pairwise_distances

CSV_PATH = "data/genealogy.csv"
PICKLE_PATH = "data/wiki_entries.pkl"


def extract_article(url):
    """Fetch *url* and return the text of its "mw-body-content" div.

    Raises AttributeError if the page has no such div (``find`` returns
    ``None``).  Uses ``urllib.request`` (the Python 2 ``urllib.urlopen``
    was removed in Python 3) and closes the connection via ``with`` —
    the original leaked the handle.
    """
    with urllib.request.urlopen(url) as site:
        # Explicit parser: without it bs4 picks whatever is installed,
        # which makes results machine-dependent (and warns).
        soup = BeautifulSoup(site, "html.parser")
    return soup.find("div", "mw-body-content").get_text()


def plot_embedding(pos, labels):
    """Scatter-plot a 2-D embedding *pos* (shape (n, 2)), annotating each
    point with the corresponding entry of *labels*."""
    plt.figure(figsize=(10, 10))
    ax = plt.axes(frameon=False)
    plt.setp(ax, xticks=(), yticks=())
    plt.scatter(pos[:, 0], pos[:, 1], s=5, color="r")
    for i, txt in enumerate(labels):
        plt.annotate(txt, (pos[i, 0], pos[i, 1]), fontsize=6)


def main():
    df = pd.read_csv(CSV_PATH)
    print(df.head())

    # Quick sanity check on the first article.
    pythagoras = extract_article(df.URL[0])
    print(pythagoras[760:1400])

    # The last URL path component is the article (person) name.
    # (Python 3 `map` is lazy, so a list comprehension replaces the
    # original `map(lambda ...)` that was then sliced.)
    names = [url.split("/")[-1] for url in df.URL]
    print(names[:25])

    # Scrape every article, then cache to disk so reruns can reload.
    wiki_entries = [extract_article(url) for url in df.URL]
    print(len(wiki_entries))

    # pickle requires binary modes ('wb'/'rb'); the original's text
    # modes ('w'/'r') fail on Python 3.  `with` closes the files.
    with open(PICKLE_PATH, "wb") as fh:
        pickle.dump(wiki_entries, fh)
    with open(PICKLE_PATH, "rb") as fh:
        pkl_entries = pickle.load(fh)

    # Two-step counts -> tf-idf pipeline (kept from the notebook for
    # comparison with the one-shot TfidfVectorizer below).
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(pkl_entries)
    print(X_train_counts.shape)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    print(X_train_tfidf.shape)

    vectors = TfidfVectorizer().fit_transform(pkl_entries)
    print(vectors.shape)

    # t-SNE on a 50-component SVD reduction of the default tf-idf vectors.
    X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(vectors)
    X_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_reduced)
    plot_embedding(X_embedded, names)

    # Richer vectorizer: bigrams, English stop words, accent stripping.
    vectorizer = TfidfVectorizer(
        min_df=2,
        stop_words="english",
        strip_accents="unicode",
        lowercase=True,
        ngram_range=(1, 2),
        norm="l2",
        smooth_idf=True,
        sublinear_tf=False,
        use_idf=True,
    )
    X = vectorizer.fit_transform(pkl_entries)

    # Negated dot products of the l2-normalized tf-idf rows, used as a
    # (pseudo-)distance matrix: similar documents get more-negative values.
    D = -(X * X.T).todense()
    ak_embed = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(D)
    plot_embedding(ak_embed, names)

    # Alternative: explicit cosine distances on unit-normalized counts.
    vecs = X_train_counts
    norm = np.sqrt(vecs.multiply(vecs).sum(1))  # per-row l2 norms
    vecs = vecs.multiply(1.0 / norm)
    distance_matrix = pairwise_distances(vecs, metric="cosine")

    model = TSNE(early_exaggeration=4)
    rk_embed = model.fit_transform(distance_matrix)
    plot_embedding(rk_embed, names)

    plt.figure(figsize=(8, 8))
    plt.imshow(distance_matrix, cmap="coolwarm")

    plt.figure(figsize=(8, 8))
    # -D flips back to (positive) similarities; LogNorm needs positives.
    plt.imshow(-D, norm=LogNorm(), cmap="coolwarm")

    # Feed D to t-SNE as a precomputed distance matrix.
    # NOTE(review): recent scikit-learn versions reject negative entries
    # (and require init='random') with metric='precomputed' — D here is
    # negative, so this step may need `-D` instead; confirm against the
    # installed sklearn version.
    ak_embed_pre = TSNE(
        n_components=2, perplexity=40, verbose=2, metric="precomputed"
    ).fit_transform(D)
    plot_embedding(ak_embed_pre, names)

    plt.show()


if __name__ == "__main__":
    main()