topic modeling, document clustering, and dimensionality reduction

using TED Talks (2013)

alexander johnson

http://metasyn.pw

general assembly, data science sf 11

In [94]:
from __future__ import division # python 2, so old school
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import codecs

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist

from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA as ICA
from sklearn.decomposition import PCA

pd.set_option('display.max_rows', 10)

%matplotlib inline
In [2]:
# From OPUS: http://opus.lingfil.uu.se/TED2013.php

infile = open('./ted.xml', 'r')
raw = infile.read()
infile.close()
# strip anything that isn't plain ASCII, then hand UTF-8 bytes to BeautifulSoup
raw = raw.decode('ascii', 'ignore')
raw = raw.encode('utf8', 'ignore')
In [3]:
soup = BeautifulSoup(raw)
In [4]:
text = soup.get_text()
In [5]:
number_of_talks = len(text.split('http://www.ted.com/talks/')); number_of_talks
Out[5]:
1170
In [6]:
url_split = text.split('http://www.ted.com/talks/')
In [7]:
url_split = url_split[1:]  # drop the chunk before the first talk URL
In [8]:
url_split[0].split('\n')[:10]
Out[8]:
[u'stephen_palumbi_following_the_mercury_trail.html',
 u"There's a tight and surprising link between the ocean'shealth and ours, says marine biologist Stephen Palumbi. He showshow toxins at the bottom of the ocean food chain find their wayinto our bodies, with a shocking story of toxic contamination froma Japanese fish market. His work points a way forward for savingthe oceans' health -- and humanity's.",
 u'fish,health,mission blue,oceans,science',
 u'899',
 u'Stephen Palumbi: Following the mercury trail',
 u'',
 u'It can be a very complicated thing, the ocean.',
 u'And it can be a very complicated thing, what human healthis.',
 u'And bringing those two together might seem a very dauntingtask,',
 u"but what I'm going to try to say is that even in thatcomplexity, there's some simple themes that I think, if weunderstand, we can really move forward."]
In [9]:
# Word list from : http://www.keithv.com/software/wlist/

wordfile = open('./wlist_match7.txt', 'r')
wordlist = wordfile.readlines()
wordfile.close()

dictionary = {word.strip(): '' for word in wordlist}

# Stop list file from http://www.ranks.nl/stopwords + nltk 
stopword_file = open('./stopword.txt', 'r')
stopwords_raw = stopword_file.read()
stopword_file.close()
stopwords_list = [w for w in stopwords_raw.split()]
stopwords_list = stopwords_list + nltk.corpus.stopwords.words('english')
stopwords = list(set(stopwords_list))
stopwords.append('ha')  # lemmatizer artifact: 'has' -> 'ha'
stopwords.append('wa')  # lemmatizer artifact: 'was' -> 'wa'
stopwords[-10:]
Out[9]:
['serious',
 'e',
 'together',
 'hello',
 "we're",
 "ain't",
 'having',
 'once',
 'ha',
 'wa']
In [10]:
def getRealWords(word, dictionary):
    """Fix words that got glued together in the XML extraction (e.g. 'dauntingtask')
    by splitting them into two dictionary words when possible."""
    if word in dictionary:
        return str(word)
    else:
        wordlength = len(word)
        for i in range(1, wordlength):
            part = word[:i]
            if part in dictionary:
                if word[i:] in dictionary:
                    return str(part) + ' ' + str(word[i:])
        return str(word)

def processText(text, dictionary):
    """Drop stopwords and run every remaining token through getRealWords."""
    string = u''
    words = text.split()
    for word in words:
        if word in stopwords:
            pass
        else:
            string += ' ' + getRealWords(word, dictionary)
    return string
In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()
In [12]:
def scrub(text):
    """Split a talk into its metadata lines, then tokenize, lowercase,
    lemmatize, and clean the transcript itself."""
    lines = text.splitlines()
    url = lines[0]
    topics = lines[2]
    author = lines[4]  # really "Speaker: talk title"
    tokens = [t for t in nltk.tokenize.word_tokenize(' '.join(lines[5:]))]
    clean_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens \
                    if re.search(ur'^[a-zA-Z]+', token)]
    clean = processText(' '.join(clean_tokens), dictionary).split()

    clean = [w for w in clean if w not in stopwords if w in dictionary]
    return author, topics, url, clean
In [13]:
a = scrub(url_split[0]); a[3][:10]
Out[13]:
[u'complicated',
 u'thing',
 u'ocean',
 u'complicated',
 u'thing',
 u'human',
 u'heal',
 u'bringing',
 u'daunting',
 u'task']
In [14]:
scrubbed = []
total = len(url_split)
for talk in url_split:
    scrubbed.append(scrub(talk))
In [15]:
df = pd.DataFrame(scrubbed, columns=['author', 'topics', 'url', 'text'])
In [16]:
df.head()
Out[16]:
author topics url text
0 Stephen Palumbi: Following the mercury trail fish,health,mission blue,oceans,science stephen_palumbi_following_the_mercury_trail.html [complicated, thing, ocean, complicated, thing...
1 Jessa Gamble: Our natural sleep cycle evolution,humanity,personal growth,science,self jessa_gamble_how_to_sleep.html [start, day, night, life, evolved, condition, ...
2 Handspring Puppet Co.: The genius puppetry beh... animals,arts,design,entertainment,theater handpring_puppet_co_the_genius_puppetry_behind... [adrian, kohler, today, talk, evolution, puppe...
3 Katherine Fulton: You are the future ofphilant... activism,bottom-up,community,globalissues,phil... katherine_fulton_you_are_the_future_of_philant... [philanthropy, wha, tit, relationship, offer, ...
4 Chris Gerdes: The future race car -- 150mph, a... cars,future,technology chris_gerdes_the_future_race_car_150mph_and_no... [wheel, car, driving, road, long, day, wanted,...
In [17]:
df['text'] = df['text'].map(
    lambda x: ' '.join(x))
In [18]:
df.head()
Out[18]:
author topics url text
0 Stephen Palumbi: Following the mercury trail fish,health,mission blue,oceans,science stephen_palumbi_following_the_mercury_trail.html complicated thing ocean complicated thing huma...
1 Jessa Gamble: Our natural sleep cycle evolution,humanity,personal growth,science,self jessa_gamble_how_to_sleep.html start day night life evolved condition light d...
2 Handspring Puppet Co.: The genius puppetry beh... animals,arts,design,entertainment,theater handpring_puppet_co_the_genius_puppetry_behind... adrian kohler today talk evolution puppet hors...
3 Katherine Fulton: You are the future ofphilant... activism,bottom-up,community,globalissues,phil... katherine_fulton_you_are_the_future_of_philant... philanthropy wha tit relationship offer vision...
4 Chris Gerdes: The future race car -- 150mph, a... cars,future,technology chris_gerdes_the_future_race_car_150mph_and_no... wheel car driving road long day wanted tired f...
In [19]:
import cPickle
cPickle.dump(df, open('df.pkl', 'w')) # never have to do this again !
In [20]:
! ls
10kdocterm.pkl   UN.en-es.en      stopword.txt     un.ipynb
LICENSE          data-projector   ted.ipynb        wlist_match7.txt
README.md        data.json        ted.xml
TED2013          df.pkl           ted_old.ipynb
In [21]:
# What does the text actually look like now?
In [22]:
df.text
Out[22]:
0    complicated thing ocean complicated thing huma...
1    start day night life evolved condition light d...
2    adrian kohler today talk evolution puppet hors...
...
1166    explain thing assume explain achieve thing def...
1167    park big parking lot remember parked car probl...
1168    gon talk bit security security start kind sens...
Name: text, Length: 1169, dtype: object
In [23]:
topic_words = []
for topics in df.topics:
    for topic in topics.split(','):
        topic_words.append(topic)
clean_topics = processText(' '.join(topic_words), dictionary).split()
tpx = pd.DataFrame(clean_topics, columns=['topics'])
tpx.topics.value_counts()[:10]
Out[23]:
technology       410
science          321
culture          317
design           285
global           278
issues           278
entertainment    207
business         199
arts             138
health           110
dtype: int64
In [24]:
tpx.topics.value_counts().plot(rot=90, figsize=(12,8), fontsize=20)
Out[24]:
<matplotlib.axes.AxesSubplot at 0x1227efa90>
In [25]:
tpx.topics.value_counts()[:35].plot(rot=90, xticks=range(35), figsize=(12,8), fontsize=20)
Out[25]:
<matplotlib.axes.AxesSubplot at 0x1229bbfd0>
In [26]:
tpx.topics.value_counts()[:20].plot(rot=90, xticks=range(20), figsize=(12,8), fontsize=20)
Out[26]:
<matplotlib.axes.AxesSubplot at 0x122c5fe50>
In [27]:
tpx.topics.value_counts()[:10].plot(rot=90, xticks=range(10), figsize=(12,8), fontsize=20)
Out[27]:
<matplotlib.axes.AxesSubplot at 0x123359390>

Document Clustering

How are we going to represent the talks numerically so that we can cluster them?

TF-IDF: Term Frequency, Inverse Document Frequency

It's one way to do it !

http://en.wikipedia.org/wiki/Tf%E2%80%93idf

tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining. The tf-idf value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general. Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields including text summarization and classification.
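
To make the weighting concrete, here's a minimal sketch on a made-up three-document corpus, using the TfidfVectorizer imported at the top (toy_docs, toy_vec, and toy_tfidf are just illustration names, not part of the TED data): a word that shows up in every document gets a low weight everywhere, while a word concentrated in one document gets a high weight there.

toy_docs = ['the ocean is deep and the ocean is blue',
            'the robots are coming',
            'the future of robots']
toy_vec = TfidfVectorizer()
toy_tfidf = toy_vec.fit_transform(toy_docs)
print toy_vec.get_feature_names()
print toy_tfidf.toarray().round(2)  # 'ocean' dominates doc 1; 'the' is down-weighted everywhere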

In [28]:
# Vectorize: we've already used a ton of stopword lists up above, but why not do it again.
# smooth_idf adds one to the document frequencies to avoid division by zero
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
tfidf = vectorizer.fit_transform(df.text)
In [29]:
# Cosine Similarity
similarity_matrix = tfidf.dot(tfidf.T)
similarity_matrix = Normalizer(copy=False).fit_transform(similarity_matrix)
In [30]:
# Estimating K - http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn
k_range = range(5, 50, 5)
k_euclid = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_euclid]
In [31]:
k_cosine = [cdist(similarity_matrix.toarray(), cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
In [32]:
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
In [33]:
# Total sum of squares
tss = sum(pdist(similarity_matrix.toarray()**2/similarity_matrix.toarray().shape[0]))
In [34]:
# Between cluster sum of squares
bss = tss - wcss
In [35]:
plt.plot(k_range, bss/tss*100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")
Out[35]:
<matplotlib.text.Text at 0x123ba3190>
In [36]:
# So, we can see that even with just 5 clusters we have over 73% variance explained.
In [37]:
from sklearn.metrics import silhouette_score, silhouette_samples
In [38]:
silhouette_scores = [silhouette_score(tfidf, k.labels_) for k in k_euclid]
In [39]:
plt.plot(k_range, silhouette_scores)
Out[39]:
[<matplotlib.lines.Line2D at 0x123a27790>]
In [40]:
tfmat = pd.DataFrame(tfidf.todense(), index=df.author, columns=vectorizer.get_feature_names())
In [41]:
# Reduce the data to the top 10,000 most important words
some = tfmat.sum(axis=0)
sorter = some.argsort()
srtd = pd.DataFrame(sorter)
sorted_index = srtd.sort(columns=0).index
reduced = tfmat[sorted_index][:10000]
In [42]:
#cPickle.dump(reduced, open('10kdocterm.pkl', 'w'))
! ls
10kdocterm.pkl   UN.en-es.en      stopword.txt     un.ipynb
LICENSE          data-projector   ted.ipynb        wlist_match7.txt
README.md        data.json        ted.xml
TED2013          df.pkl           ted_old.ipynb
In [43]:
reduced.head(1)
Out[43]:
sel breather fraser fractured sei aceh crawler accepted pla crept ... ipl disintegrated assent app nietzsche bleep charging alluded nerd intercollegiate
author
Stephen Palumbi: Following the mercury trail 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 35071 columns

In [44]:
similarity_matrix = reduced.dot(reduced.T)
In [45]:
similarity_matrix.describe()
Out[45]:
Stephen Palumbi: Following the mercury trail Jessa Gamble: Our natural sleep cycle Handspring Puppet Co.: The genius puppetry behind WarHorse Katherine Fulton: You are the future ofphilanthropy Chris Gerdes: The future race car -- 150mph, and nodriver Stefana Broadbent: How the Internet enablesintimacy Majora Carter: 3 stories of localeco-entrepreneurship Britta Riley: A garden in my apartment Nicholas Negroponte on One Laptop per Child, two yearson Rodney Brooks says robots will invade our lives ... Craig Venter on DNA and the sea Paul Romer's radical idea: Charter cities Philip Zimbardo prescribes a healthy take on time Carolyn Porco flies us to Saturn Kirk Citron: And now, the real news Lalitesh Katragadda: Making maps to fight disaster, buildeconomies Julia Bacha: Pay attention to nonviolence Simon Sinek: How great leaders inspire action Neil Burgess: How your brain tells you where youare James Stavridis: How NATO's Supreme Commander thinks aboutglobal security
count 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 ... 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000
mean 0.041158 0.039579 0.024480 0.040698 0.037514 0.059773 0.076774 0.053765 0.074385 0.034230 ... 0.050592 0.057745 0.037777 0.032613 0.044480 0.029542 0.029258 0.064016 0.036052 0.048827
std 0.037136 0.033824 0.030999 0.034068 0.039601 0.038727 0.044644 0.038444 0.043496 0.056849 ... 0.044985 0.043135 0.033596 0.037049 0.037689 0.035060 0.032516 0.039824 0.043407 0.038045
min 0.000880 0.000000 0.000967 0.000000 0.001010 0.000441 0.000653 0.000000 0.001494 0.000711 ... 0.000309 0.001801 0.000000 0.000000 0.000000 0.000000 0.000000 0.004357 0.000000 0.000000
25% 0.026108 0.025549 0.015786 0.026687 0.022243 0.038263 0.050517 0.034564 0.049623 0.019186 ... 0.030418 0.035169 0.024366 0.018994 0.026349 0.015775 0.017751 0.041783 0.018024 0.031943
50% 0.036715 0.036446 0.021726 0.037505 0.031550 0.056583 0.071622 0.049844 0.070002 0.026857 ... 0.042873 0.049978 0.034817 0.026776 0.041076 0.024674 0.025713 0.060156 0.026972 0.044455
75% 0.048296 0.048669 0.029211 0.050845 0.043238 0.076600 0.096492 0.066155 0.092490 0.034833 ... 0.058156 0.072267 0.047740 0.037061 0.056010 0.037590 0.036140 0.081782 0.041046 0.059218
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 1169 columns

In [46]:
n = Normalizer(copy=False)
normal = n.fit_transform(similarity_matrix)
normalized = pd.DataFrame(normal)
normalized.describe()
Out[46]:
0 1 2 3 4 5 6 7 8 9 ... 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168
count 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 ... 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000 1169.000000
mean 0.019521 0.018878 0.011925 0.019041 0.017776 0.027939 0.035744 0.025148 0.034669 0.016225 ... 0.023905 0.026738 0.017889 0.015709 0.020838 0.013901 0.013969 0.029911 0.017143 0.022997
std 0.018245 0.017694 0.021978 0.016936 0.019386 0.014810 0.014737 0.015766 0.014404 0.026438 ... 0.019598 0.016756 0.018183 0.020270 0.017689 0.020287 0.020226 0.014522 0.021096 0.016771
min 0.000853 0.000000 0.000800 0.000000 0.000979 0.000407 0.000633 0.000000 0.001359 0.000690 ... 0.000300 0.001667 0.000000 0.000000 0.000000 0.000000 0.000000 0.003677 0.000000 0.000000
25% 0.014099 0.013528 0.008192 0.014548 0.011560 0.021010 0.027281 0.018725 0.027139 0.010477 ... 0.016031 0.019049 0.013040 0.009763 0.014633 0.008395 0.009466 0.022712 0.009182 0.016917
50% 0.017207 0.017242 0.010218 0.017983 0.014834 0.027451 0.034363 0.023530 0.034019 0.012788 ... 0.019946 0.024279 0.017016 0.012770 0.018916 0.011841 0.012077 0.029065 0.012784 0.020862
75% 0.020987 0.022067 0.012941 0.022207 0.019354 0.033798 0.042632 0.029272 0.040632 0.015307 ... 0.026324 0.031517 0.020693 0.016981 0.024320 0.016390 0.016003 0.035802 0.018514 0.025942
max 0.527706 0.561876 0.740655 0.551155 0.536301 0.410707 0.329363 0.442571 0.339461 0.440889 ... 0.432106 0.405845 0.578646 0.592703 0.501759 0.638106 0.668809 0.387987 0.518463 0.472586

8 rows × 1169 columns

In [47]:
similarity_matrix = normalized
In [48]:
# Estimating K

k_range = range(5, 100, 5)
k_variance = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_variance]
In [49]:
k_cosine = [cdist(similarity_matrix, cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]

# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix**2/similarity_matrix.shape[0]))
# Between cluster sum of squares
bss = tss - wcss
In [50]:
plt.plot(k_range, bss/tss*100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")
Out[50]:
<matplotlib.text.Text at 0x123ad4a90>

Estimating K

So at k = 10 we get around 75% ! Pushing k up toward 100 only buys roughly another 10%. And if we look at the topic distributions we plotted earlier, there isn't much reason to go beyond about 10-20 clusters.

So... now what...?

Now that we've clustered the documents in this huge vector space, we can look through the clusters and see whether their members correspond to concepts we find useful or interesting. I'm going to do this with k=10.

In [51]:
ten = KMeans(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author','topics', 'text'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf.text = df.text
tendf.head()
Out[51]:
cluster_id author topics text
0 8 Stephen Palumbi: Following the mercury trail fish,health,mission blue,oceans,science complicated thing ocean complicated thing huma...
1 4 Jessa Gamble: Our natural sleep cycle evolution,humanity,personal growth,science,self start day night life evolved condition light d...
2 7 Handspring Puppet Co.: The genius puppetry beh... animals,arts,design,entertainment,theater adrian kohler today talk evolution puppet hors...
3 1 Katherine Fulton: You are the future ofphilant... activism,bottom-up,community,globalissues,phil... philanthropy wha tit relationship offer vision...
4 5 Chris Gerdes: The future race car -- 150mph, a... cars,future,technology wheel car driving road long day wanted tired f...
In [52]:
tendf.cluster_id.hist()
Out[52]:
<matplotlib.axes.AxesSubplot at 0x1103a4f10>
In [53]:
tendf['length'] = [len(t) for t in tendf.text]
In [54]:
tendf.text[0].split()[:5]
Out[54]:
[u'complicated', u'thing', u'ocean', u'complicated', u'thing']
In [55]:
cluster_topics = []
cluster_text = []

for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id==cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(), columns=['text']))
    cluster_topics.append(clean_df)
    

Looking Deeper: Top Terms per Cluster

So this is really just one way of looking at our clusters. Our clusters are made of text, so let's look at the most frequently occurring terms per cluster.

Topic 1: Everything

technology       160
design           145
culture          116
science           96
business          73
entertainment     66
arts              52
art               49
creativity        42
education         40

Well, that seems pretty general. We can't take too much from that - except that it talks a lot about technology, design, and culture... This is the most general category; it covers the T, E, and D of TED. Technology is number one, with design a close second, and then some of our other topics start to show up.

Topic 2: Everything Pt.2

culture          119
entertainment     78
issues            71
global            71
arts              49
storytelling      46
technology        45
design            40
education         38
business          35

Much more focused here on culture, entertainment, and global issues.

Topic 3: Green Tech?

technology     81
science        53
design         51
business       42
global         42
issues         42
environment    36
green          35
energy         30
invention      25

Hmm, once again technology, design, and business, but shifting toward green energy further down the list.

Topic 4 : Politics / Global Issues

issues        106
global        106
politics       49
culture        44
business       34
economics      31
health         29
Africa         28
technology     28
war            22

Here we actually see a departure from the earlier topics; we're talking more about global issues, business, war, and politics.

Topic 5 : Art?

music            38
entertainment    37
performance      19
talk             16
arts             16
short            16
technology       16
design           13
live             12
culture          11

This is obviously more about art ! Music, performance, live performances.

Topic 6 : Health

science       53
technology    37
medicine      25
health        22
brain         19
biology       15
cancer        10
care          10
research       9
medical        9

Topic 7: Oceans

science        33
oceans         28
technology     15
issues         12
mission        12
global         12
fish           12
blue           12
environment    10
exploration    10

Topic 8: Space

science        27
physics        17
universe       17
technology     16
astronomy      12
cosmos          6
space           6
exploration     6
education       4
change          4

Topic 9: Robots

robots           12
technology       12
design            8
science           5
entertainment     3
engineering       3
evolution         3
animals           2
demo              2
AI                2

Topic 10: ?

animals         3
issues          2
oceans          2
global          2
science         2
biodiversity    1
storytelling    1
culture         1
photography     1
creativity      1
In [61]:
for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70
design           154
technology       149
culture          107
entertainment     72
science           72
arts              67
business          60
art               59
education         50
creativity        46
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
global        143
issues        143
culture        93
business       78
politics       67
technology     58
economics      52
health         49
science        39
Africa         39
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
music            70
entertainment    69
culture          33
technology       33
design           30
arts             29
performance      27
talk             22
short            22
live             18
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
culture          53
issues           39
global           39
women            30
storytelling     27
entertainment    26
arts             20
education        19
politics         19
children         16
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
science       78
technology    57
medicine      33
health        30
brain         28
biology       25
design        14
care          13
cancer        10
business      10
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
science        45
oceans         29
technology     27
issues         21
global         21
exploration    17
animals        15
mission        13
design         13
blue           13
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
food           21
science        20
design         19
issues         16
environment    16
global         16
technology     15
biology        13
green          12
business       10
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
technology        37
energy            27
design            18
green             17
business          17
environment       14
science           12
transportation    11
culture            9
sustainability     9
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
science        27
physics        16
universe       15
technology     14
astronomy      13
cosmos          6
space           6
change          4
exploration     4
education       4
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
robots           12
technology       12
design            8
science           5
entertainment     3
engineering       3
evolution         3
animals           2
demo              2
AI                2
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
In [59]:
for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-'*70
thing     3388
people    2888
time      2106
kind      1684
year      1571
world     1538
work      1523
lot       1136
life      1091
idea      1047
dtype: int64
----------------------------------------------------------------------
people     3856
world      2048
thing      1683
year       1594
time       1378
country    1027
life        879
lot         798
good        795
problem     772
dtype: int64
----------------------------------------------------------------------
thing     390
people    366
time      311
music     310
world     264
year      240
good      233
life      218
sound     206
kind      189
dtype: int64
----------------------------------------------------------------------
woman     927
people    632
year      492
time      472
story     463
child     450
thing     428
girl      425
world     413
life      384
dtype: int64
----------------------------------------------------------------------
brain      975
cell       774
people     563
thing      544
cancer     520
time       476
year       465
patient    357
life       332
body       329
dtype: int64
----------------------------------------------------------------------
year      595
water     490
ocean     471
thing     449
time      433
life      353
people    334
world     302
planet    271
earth     259
dtype: int64
----------------------------------------------------------------------
food      450
people    307
year      276
thing     269
plant     210
world     205
time      195
lot       166
kind      159
tree      152
dtype: int64
----------------------------------------------------------------------
energy        426
car           424
people        410
thing         406
year          385
time          282
world         262
technology    241
oil           225
city          218
dtype: int64
----------------------------------------------------------------------
universe    462
galaxy      222
thing       205
star        199
year        199
space       198
planet      167
time        165
earth       153
life        140
dtype: int64
----------------------------------------------------------------------
robot     344
thing      60
foot       59
animal     58
time       54
leg        49
doe        45
people     42
work       41
kind       37
dtype: int64
----------------------------------------------------------------------
In [71]:
# Agglomerative Clustering ? 
from sklearn.cluster import AgglomerativeClustering
ten = AgglomerativeClustering(n_clusters=10).fit(similarity_matrix)
tendf = pd.DataFrame(columns=['cluster_id', 'author','topics'])
tendf.topics = df.topics
tendf.author = df.author
tendf.cluster_id = ten.labels_
tendf['text'] = df.text
tendf.head()
Out[71]:
cluster_id author topics text
0 1 Stephen Palumbi: Following the mercury trail fish,health,mission blue,oceans,science complicated thing ocean complicated thing huma...
1 1 Jessa Gamble: Our natural sleep cycle evolution,humanity,personal growth,science,self start day night life evolved condition light d...
2 0 Handspring Puppet Co.: The genius puppetry beh... animals,arts,design,entertainment,theater adrian kohler today talk evolution puppet hors...
3 5 Katherine Fulton: You are the future ofphilant... activism,bottom-up,community,globalissues,phil... philanthropy wha tit relationship offer vision...
4 1 Chris Gerdes: The future race car -- 150mph, a... cars,future,technology wheel car driving road long day wanted tired f...
In [72]:
tendf.cluster_id.hist()

tendf['length'] = [len(t) for t in tendf.text]

tendf.text[0].split()[:5]
Out[72]:
[u'complicated', u'thing', u'ocean', u'complicated', u'thing']
In [73]:
tendf.head(1)
Out[73]:
cluster_id author topics text length
0 1 Stephen Palumbi: Following the mercury trail fish,health,mission blue,oceans,science complicated thing ocean complicated thing huma... 6680
In [74]:
cluster_topics = []
cluster_text = []

for cluster_id in tendf.cluster_id.value_counts().index:
    cluster_df = tendf[tendf.cluster_id==cluster_id]
    topic_words = []
    for topics in cluster_df.topics:
        for topic in topics.split(','):
            topic_words.append(topic)
    clean_topics = processText(' '.join(topic_words), dictionary).split()
    clean_df = pd.DataFrame(clean_topics, columns=['topics'])
    
    cluster_text.append(pd.DataFrame(' '.join([text for text in cluster_df.text]).split(), columns=['text']))
    cluster_topics.append(clean_df)
    
In [75]:
for i in cluster_topics:
    print i.topics.value_counts()[:10]
    print '$' * 70
design           140
culture          139
technology       114
entertainment     72
science           72
arts              67
art               61
business          61
education         55
global            49
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
global        105
issues        105
technology     83
business       76
culture        66
economics      43
politics       43
science        38
design         37
Africa         37
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
technology     78
science        77
design         48
issues         36
global         36
environment    32
oceans         30
energy         27
biology        24
business       23
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
global           67
issues           67
culture          61
war              37
politics         34
entertainment    33
women            27
storytelling     27
arts             25
technology       16
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
music            68
entertainment    61
technology       25
arts             25
performance      22
design           22
culture          21
live             20
short            19
talk             19
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
science       50
technology    39
health        39
medicine      27
biology       17
care          16
business      14
design        13
culture       13
issues        12
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
science        28
physics        17
technology     16
universe       16
astronomy      13
cosmos          6
design          5
space           5
education       4
exploration     4
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
science          25
brain            21
technology       15
neurology         7
neuroscience      6
computers         5
mind              5
biology           5
design            4
consciousness     4
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
technology    12
science       11
cancer         9
medicine       9
health         8
biology        5
medical        5
care           3
business       3
research       2
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
robots           12
technology       12
design            8
science           5
entertainment     3
engineering       3
evolution         3
animals           2
demo              2
AI                2
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
In [76]:
for i in cluster_text:
    print i.text.value_counts()[:10]
    print '-'*70
people    3078
thing     3043
time      2123
year      1666
world     1619
kind      1556
work      1329
life      1194
lot       1058
good       987
dtype: int64
----------------------------------------------------------------------
people     3291
thing      1825
world      1716
year       1386
time       1242
country     835
work        802
lot         796
kind        719
problem     706
dtype: int64
----------------------------------------------------------------------
year      1147
thing     1044
people     957
time       871
water      671
world      670
lot        568
life       567
food       525
ocean      461
dtype: int64
----------------------------------------------------------------------
people    1005
woman      835
world      696
year       577
story      576
thing      573
time       573
life       432
child      405
girl       394
dtype: int64
----------------------------------------------------------------------
music     333
play      312
thing     270
people    245
sound     241
time      228
year      173
good      167
world     164
yeah      152
dtype: int64
----------------------------------------------------------------------
thing         539
people        525
year          490
life          415
time          370
technology    333
cell          328
world         297
work          278
patient       249
dtype: int64
----------------------------------------------------------------------
universe    471
thing       245
galaxy      222
year        207
star        205
space       201
time        189
planet      168
earth       150
life        138
dtype: int64
----------------------------------------------------------------------
brain      719
people     173
thing      158
cell       154
time       146
neuron     139
human       97
kind        89
pattern     89
called      84
dtype: int64
----------------------------------------------------------------------
cancer     430
cell       320
disease    131
tumor      118
body       118
patient    111
stem        98
drug        97
woman       93
year        90
dtype: int64
----------------------------------------------------------------------
robot     344
thing      60
foot       59
animal     58
time       54
leg        49
doe        45
people     42
work       41
kind       37
dtype: int64
----------------------------------------------------------------------
In [77]:
import gensim
In [78]:
all_text = [doc.split() for doc in df.text]
In [79]:
gensim_d = gensim.corpora.Dictionary(all_text)
In [80]:
corpus = [gensim_d.doc2bow(text) for text in all_text]
In [81]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=gensim_d, num_topics=10, update_every=1, chunksize=100, passes=1)
In [86]:
lda_topics = lda.print_topics(10)
lda_tops = [topic.split('+') for topic in lda_topics]
for topic in lda_tops:
    for pair in topic:
        print pair.split('*')[0] + '\t' + pair.split('*')[1]
    print '%' * 70
0.015	cell 
 0.011	patient 
 0.011	food 
 0.008	disease 
 0.008	cancer 
 0.007	body 
 0.007	brain 
 0.006	heart 
 0.006	people 
 0.006	year
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.014	space 
 0.009	universe 
 0.008	particle 
 0.007	thing 
 0.007	earth 
 0.007	light 
 0.006	planet 
 0.006	tree 
 0.006	theory 
 0.006	time
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.017	brain 
 0.014	human 
 0.009	thing 
 0.008	people 
 0.006	life 
 0.006	time 
 0.006	year 
 0.006	gene 
 0.004	evolution 
 0.004	genome
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.014	people 
 0.013	world 
 0.013	country 
 0.009	africa 
 0.009	year 
 0.007	woman 
 0.006	government 
 0.005	war 
 0.005	aid 
 0.005	india
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.026	people 
 0.012	world 
 0.012	thing 
 0.006	time 
 0.005	kind 
 0.005	idea 
 0.005	good 
 0.005	year 
 0.005	work 
 0.004	lot
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.010	life 
 0.008	music 
 0.008	compassion 
 0.007	people 
 0.006	time 
 0.006	sound 
 0.005	thing 
 0.005	world 
 0.005	god 
 0.004	year
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.013	year 
 0.009	technology 
 0.008	thing 
 0.008	people 
 0.007	energy 
 0.007	time 
 0.006	water 
 0.006	percent 
 0.005	world 
 0.005	system
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.011	thing 
 0.010	kind 
 0.007	time 
 0.006	water 
 0.006	animal 
 0.006	data 
 0.005	ocean 
 0.005	lot 
 0.005	robot 
 0.005	design
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.013	thing 
 0.010	people 
 0.010	time 
 0.007	work 
 0.007	year 
 0.006	day 
 0.006	life 
 0.005	kid 
 0.005	story 
 0.005	school
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0.009	people 
 0.007	language 
 0.007	baby 
 0.007	child 
 0.006	love 
 0.006	time 
 0.005	year 
 0.005	thing 
 0.004	learning 
 0.004	english
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In [88]:
# Nice husl from seaborn
colors = sns.husl_palette(n_colors=10)
sns.palplot(colors)
In [89]:
colors[9]=[0,0,0]
colors[8]=[1,1,1]
In [90]:
colors.reverse()
sns.palplot(colors)

Dimensionality Reduction

Google:

In machine learning and statistics, dimensionality reduction or dimension reduction is the process of reducing the number of random variables under consideration, and can be divided into feature selection and feature extraction.

Visualizing data is important

However, it's quite hard to visualize thousands of dimensions. So below, I plot our clusters in 3 dimensions; yes - we are throwing away tons of information ! But the reduction lets us actually see the data. To do it, I compare the following dimensionality reduction algorithms.

Principal Component Analysis

Simply put, PCA is a way of finding the most important parts of a data set. More exactly, it's an orthogonal transformation of the observations into some number of linearly uncorrelated variables - here, trying to summarize thousands of dimensions in three. The first principal component is the one that accounts for the highest variance in the data (it explains the most), and each subsequent component explains the next-largest share of variance while being orthogonal to (i.e. uncorrelated with) the previous components.
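
As a quick sanity check on that intuition, here's a minimal sketch run on the similarity_matrix from above (pca_check and pca_coords are just illustration names): project down to three components and print how much variance each one explains - the ratios come out in decreasing order.

pca_check = PCA(n_components=3)
pca_coords = pca_check.fit_transform(similarity_matrix)
print pca_check.explained_variance_ratio_  # first component explains the most, then less and less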

Singular Value Decomposition

Very similar to PCA. As a gross simplification: SVD factorizes a large matrix into three parts. Multiplied back together, those parts re-create the matrix exactly, so if we keep only the largest pieces (the biggest singular values) we get a smaller, approximate copy of the original.
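
Here's a minimal sketch of that idea on a small random matrix (not the TED data; M, M_approx, and k are just illustration names): factor it with numpy, keep the two largest singular values, and the product of the truncated pieces is a low-rank approximation of the original.

M = np.random.rand(6, 4)
U, s, Vt = np.linalg.svd(M, full_matrices=False)
k = 2
M_approx = U[:, :k].dot(np.diag(s[:k])).dot(Vt[:k, :])
print np.abs(M - M_approx).max()  # reconstruction error from keeping just 2 of the 4 components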

t-Distributed Stochastic Neighbor Embedding

This is a fascinating algorithm with a few main parts. First, it builds a probability distribution that represents the similarity between points in the high-dimensional space. Then it builds a similar distribution over the low-dimensional space and minimizes the Kullback-Leibler divergence between the two.
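
In practice t-SNE gets expensive in very high dimensions, so it's common to run it on an already-reduced matrix - which is what the plotting function below does when it chains TruncatedSVD with TSNE. A minimal sketch of that two-step pattern (svd_50 and tsne_3d are just illustration names):

svd_50 = TruncatedSVD(n_components=50).fit_transform(similarity_matrix)
tsne_3d = TSNE(n_components=3).fit_transform(svd_50)
print tsne_3d.shape  # (1169, 3): one 3-d point per talk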

Independent Component Analysis

Wikipedia:

ICA finds the independent components (also called factors, latent variables or sources) by maximizing the statistical independence of the estimated components.

Basically:

Typical algorithms for ICA use centering (subtract the mean to create a zero mean signal), whitening (usually with the eigenvalue decomposition), and dimensionality reduction as preprocessing steps in order to simplify and reduce the complexity of the problem for the actual iterative algorithm. Whitening and dimension reduction can be achieved with principal component analysis or singular value decomposition.
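
With its default settings, scikit-learn's FastICA (imported above as ICA) handles the whitening for us, so using it looks just like the other reducers - a minimal sketch on similarity_matrix (ica_3d is just an illustration name):

ica_3d = ICA(n_components=3).fit_transform(similarity_matrix)
print ica_3d.shape  # (1169, 3)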

In [91]:
def plot_reduction_kmeans(first_reduction, first_num, second_reduction, second_num, matrix=similarity_matrix):
    """Chain two dimensionality reductions, k-means the result, and scatter it in 3D."""

    # Reduction #1: full matrix -> first_num dimensions
    f = first_reduction(n_components=first_num)
    f_matrix = f.fit_transform(matrix)

    # Reduction #2: first_num dimensions -> second_num (3, so we can plot it)
    s = second_reduction(n_components=second_num)
    s_matrix = s.fit_transform(f_matrix)

    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
    kmeans.fit(s_matrix)

    # color each point by its cluster label, using the husl palette from above
    d = {i: colors[i] for i in range(10)}
    kcolors = [d[i] for i in kmeans.labels_]

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(s_matrix[:, 0], s_matrix[:, 1], s_matrix[:, 2], c=kcolors, alpha=.6)
In [92]:
plot_reduction_kmeans(TruncatedSVD, 100, TSNE, 3)
In [95]:
plot_reduction_kmeans(TruncatedSVD, 100, PCA, 3)
In [116]:
plot_reduction_kmeans(TruncatedSVD, 500, PCA, 3)
In [96]:
plot_reduction_kmeans(TruncatedSVD, 100, ICA, 3)
In [117]:
plot_reduction_kmeans(TruncatedSVD, 500, ICA, 3)
In [97]:
plot_reduction_kmeans(PCA, 100, TSNE, 3)
In [118]:
plot_reduction_kmeans(PCA, 500, TSNE, 3)
In [98]:
plot_reduction_kmeans(PCA, 100, TruncatedSVD, 3)
In [119]:
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)