from __future__ import division # python 2, so old school
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA as ICA
from sklearn.decomposition import PCA
pd.set_option('display.max_rows', 10)
%matplotlib inline
url_split[0].split('\n')[:10]
[u'stephen_palumbi_following_the_mercury_trail.html', u"There's a tight and surprising link between the ocean'shealth and ours, says marine biologist Stephen Palumbi. He showshow toxins at the bottom of the ocean food chain find their wayinto our bodies, with a shocking story of toxic contamination froma Japanese fish market. His work points a way forward for savingthe oceans' health -- and humanity's.", u'fish,health,mission blue,oceans,science', u'899', u'Stephen Palumbi: Following the mercury trail', u'', u'It can be a very complicated thing, the ocean.', u'And it can be a very complicated thing, what human healthis.', u'And bringing those two together might seem a very dauntingtask,', u"but what I'm going to try to say is that even in thatcomplexity, there's some simple themes that I think, if weunderstand, we can really move forward."]
def getRealWords(word, dictionary):
if word in dictionary:
return str(word)
else:
wordlength = len(word)
for i in range(wordlength):
part = word[:i]
if part in dictionary:
if word[i:] in dictionary:
return str(part) + ' ' + str(word[i:])
return str(word)
def processText(text, dictionary):
string = u''
words = text.split()
for word in words:
if word in stopwords:
pass
else:
string += ' ' + getRealWords(word, dictionary)
return string
df.head()
author | topics | url | text | |
---|---|---|---|---|
0 | Stephen Palumbi: Following the mercury trail | fish,health,mission blue,oceans,science | stephen_palumbi_following_the_mercury_trail.html | complicated thing ocean complicated thing huma... |
1 | Jessa Gamble: Our natural sleep cycle | evolution,humanity,personal growth,science,self | jessa_gamble_how_to_sleep.html | start day night life evolved condition light d... |
2 | Handspring Puppet Co.: The genius puppetry beh... | animals,arts,design,entertainment,theater | handpring_puppet_co_the_genius_puppetry_behind... | adrian kohler today talk evolution puppet hors... |
3 | Katherine Fulton: You are the future ofphilant... | activism,bottom-up,community,globalissues,phil... | katherine_fulton_you_are_the_future_of_philant... | philanthropy wha tit relationship offer vision... |
4 | Chris Gerdes: The future race car -- 150mph, a... | cars,future,technology | chris_gerdes_the_future_race_car_150mph_and_no... | wheel car driving road long day wanted tired f... |
topic_words = []
for topics in df.topics:
for topic in topics.split(','):
topic_words.append(topic)
clean_topics = processText(' '.join(topic_words), dictionary).split()
tpx = pd.DataFrame(clean_topics, columns=['topics'])
tpx.topics.value_counts()[:10]
technology 410 science 321 culture 317 design 285 global 278 issues 278 entertainment 207 business 199 arts 138 health 110 dtype: int64
tpx.topics.value_counts().plot(rot=90, figsize=(12,8), fontsize=20)
<matplotlib.axes.AxesSubplot at 0x1227efa90>
tpx.topics.value_counts()[:35].plot(rot=90, xticks=range(35), figsize=(12,8), fontsize=20)
<matplotlib.axes.AxesSubplot at 0x1229bbfd0>
tpx.topics.value_counts()[:20].plot(rot=90, xticks=range(20), figsize=(12,8), fontsize=20)
<matplotlib.axes.AxesSubplot at 0x122c5fe50>
tpx.topics.value_counts()[:10].plot(rot=90, xticks=range(10), figsize=(12,8), fontsize=20)
<matplotlib.axes.AxesSubplot at 0x123359390>
# Estimating K
k_range = range(5, 100, 5)
k_variance = [KMeans(n_clusters=k).fit(similarity_matrix) for k in k_range]
k_centroids = [X.cluster_centers_ for X in k_variance]
k_cosine = [cdist(similarity_matrix, cent, 'cosine') for cent in k_centroids]
distances = [np.min(ke, axis=1) for ke in k_cosine]
# Within-cluster sum of squares
wcss = [sum(d**2) for d in distances]
# Total sum of squares
tss = sum(pdist(similarity_matrix**2/similarity_matrix.shape[0]))
# Between cluster sum of squares
bss = tss - wcss
plt.plot(k_range, bss/tss*100)
plt.xlabel("Number of Clusters")
plt.ylabel("% Variance Explained")
<matplotlib.text.Text at 0x123ad4a90>
So when k is 10, we get around 75% ! Whereas we get another 10% by increasing to nearly 100. If we look at the topic distributions we plotted earlier, we'll see that there isn't much reason to focus much more than 10~20 topics.
Now that we've clustered the documents in this huge vector space, we could peruse through them and see if the cluster members correlate to concepts we think are useful or interesting. I'm going to do this with k=10.
tendf.cluster_id.hist()
<matplotlib.axes.AxesSubplot at 0x1103a4f10>
So this is really just one way of looking at our clusters. Our clusters are made of text, so lets look at the most frequently occuring terms, per cluster.
technology 160
design 145
culture 116
science 96
business 73
entertainment 66
arts 52
art 49
creativity 42
education 40
We'll that seems pretty general. Can't take too much from that - except that it talks alot about technology and design and culture... This is the most general category; it covers the T. E. and D. of TED. Since we can see that technology is number one, with a close second of design, then we move to seeing some of our other topics show up.
culture 119
entertainment 78
issues 71
global 71
arts 49
storytelling 46
technology 45
design 40
education 38
business 35
Much more focused here on culture, entertainment, and global issues.
technology 81
science 53
design 51
business 42
global 42
issues 42
environment 36
green 35
energy 30
invention 25
Hmm, once again, technology, design, business, but focusing more towards green energy at the end there.
issues 106
global 106
politics 49
culture 44
business 34
economics 31
health 29
Africa 28
technology 28
war 22
Here we actually see a departure from the earlier topics; were talking more about global issues, business, war, politics.
music 38
entertainment 37
performance 19
talk 16
arts 16
short 16
technology 16
design 13
live 12
culture 11
This is obviously more about art ! Music, performance, live performances.
science 53
technology 37
medicine 25
health 22
brain 19
biology 15
cancer 10
care 10
research 9
medical 9
science 33
oceans 28
technology 15
issues 12
mission 12
global 12
fish 12
blue 12
environment 10
exploration 10
science 27
physics 17
universe 17
technology 16
astronomy 12
cosmos 6
space 6
exploration 6
education 4
change 4
robots 12
technology 12
design 8
science 5
entertainment 3
engineering 3
evolution 3
animals 2
demo 2
AI 2
animals 3
issues 2
oceans 2
global 2
science 2
biodiversity 1
storytelling 1
culture 1
photography 1
creativity 1
for i in cluster_topics:
print i.topics.value_counts()[:10]
print '$' * 70
design 154 technology 149 culture 107 entertainment 72 science 72 arts 67 business 60 art 59 education 50 creativity 46 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ global 143 issues 143 culture 93 business 78 politics 67 technology 58 economics 52 health 49 science 39 Africa 39 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ music 70 entertainment 69 culture 33 technology 33 design 30 arts 29 performance 27 talk 22 short 22 live 18 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ culture 53 issues 39 global 39 women 30 storytelling 27 entertainment 26 arts 20 education 19 politics 19 children 16 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 78 technology 57 medicine 33 health 30 brain 28 biology 25 design 14 care 13 cancer 10 business 10 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 45 oceans 29 technology 27 issues 21 global 21 exploration 17 animals 15 mission 13 design 13 blue 13 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ food 21 science 20 design 19 issues 16 environment 16 global 16 technology 15 biology 13 green 12 business 10 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ technology 37 energy 27 design 18 green 17 business 17 environment 14 science 12 transportation 11 culture 9 sustainability 9 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ science 27 physics 16 universe 15 technology 14 astronomy 13 cosmos 6 space 6 change 4 exploration 4 education 4 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ robots 12 technology 12 design 8 science 5 entertainment 3 engineering 3 evolution 3 animals 2 demo 2 AI 2 dtype: int64 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
for i in cluster_text:
print i.text.value_counts()[:10]
print '-'*70
thing 3388 people 2888 time 2106 kind 1684 year 1571 world 1538 work 1523 lot 1136 life 1091 idea 1047 dtype: int64 ---------------------------------------------------------------------- people 3856 world 2048 thing 1683 year 1594 time 1378 country 1027 life 879 lot 798 good 795 problem 772 dtype: int64 ---------------------------------------------------------------------- thing 390 people 366 time 311 music 310 world 264 year 240 good 233 life 218 sound 206 kind 189 dtype: int64 ---------------------------------------------------------------------- woman 927 people 632 year 492 time 472 story 463 child 450 thing 428 girl 425 world 413 life 384 dtype: int64 ---------------------------------------------------------------------- brain 975 cell 774 people 563 thing 544 cancer 520 time 476 year 465 patient 357 life 332 body 329 dtype: int64 ---------------------------------------------------------------------- year 595 water 490 ocean 471 thing 449 time 433 life 353 people 334 world 302 planet 271 earth 259 dtype: int64 ---------------------------------------------------------------------- food 450 people 307 year 276 thing 269 plant 210 world 205 time 195 lot 166 kind 159 tree 152 dtype: int64 ---------------------------------------------------------------------- energy 426 car 424 people 410 thing 406 year 385 time 282 world 262 technology 241 oil 225 city 218 dtype: int64 ---------------------------------------------------------------------- universe 462 galaxy 222 thing 205 star 199 year 199 space 198 planet 167 time 165 earth 153 life 140 dtype: int64 ---------------------------------------------------------------------- robot 344 thing 60 foot 59 animal 58 time 54 leg 49 doe 45 people 42 work 41 kind 37 dtype: int64 ----------------------------------------------------------------------
import gensim
all_text = [doc.split() for doc in df.text]
gensim_d = gensim.corpora.Dictionary(all_text)
corpus = [gensim_d.doc2bow(text) for text in all_text]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=gensim_d, num_topics=10, update_every=1, chunksize=100, passes=1)
lda_topics = lda.print_topics(10)
lda_tops = [topic.split('+') for topic in lda_topics]
for topic in lda_tops:
for pair in topic:
print pair.split('*')[0] + '\t' + pair.split('*')[1]
print '%' * 70
0.015 cell 0.011 patient 0.011 food 0.008 disease 0.008 cancer 0.007 body 0.007 brain 0.006 heart 0.006 people 0.006 year %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.014 space 0.009 universe 0.008 particle 0.007 thing 0.007 earth 0.007 light 0.006 planet 0.006 tree 0.006 theory 0.006 time %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.017 brain 0.014 human 0.009 thing 0.008 people 0.006 life 0.006 time 0.006 year 0.006 gene 0.004 evolution 0.004 genome %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.014 people 0.013 world 0.013 country 0.009 africa 0.009 year 0.007 woman 0.006 government 0.005 war 0.005 aid 0.005 india %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.026 people 0.012 world 0.012 thing 0.006 time 0.005 kind 0.005 idea 0.005 good 0.005 year 0.005 work 0.004 lot %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.010 life 0.008 music 0.008 compassion 0.007 people 0.006 time 0.006 sound 0.005 thing 0.005 world 0.005 god 0.004 year %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.013 year 0.009 technology 0.008 thing 0.008 people 0.007 energy 0.007 time 0.006 water 0.006 percent 0.005 world 0.005 system %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.011 thing 0.010 kind 0.007 time 0.006 water 0.006 animal 0.006 data 0.005 ocean 0.005 lot 0.005 robot 0.005 design %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.013 thing 0.010 people 0.010 time 0.007 work 0.007 year 0.006 day 0.006 life 0.005 kid 0.005 story 0.005 school %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 0.009 people 0.007 language 0.007 baby 0.007 child 0.006 love 0.006 time 0.005 year 0.005 thing 0.004 learning 0.004 english %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Nice husl from seaborn
colors = sns.husl_palette(n_colors=10)
sns.palplot(colors)
colors.reverse()
sns.palplot(colors)
Google:
In machine learning and statistics, dimensionality reduction or dimension reduction is the process of reducing the number of random variables under consideration, and can be divided into feature selection and feature extraction.
However, it's quite hard to visualize 1000's of dimensions. So below, I go about plotting our clusters in 3 dimensions; yes - we are losing tons of data ! But, this reduction allows us to actually see the data. In order to do these reductions, I compare the following dimensionality reduction algorithms.
Simply put, PCA is a way of finding the most important parts some data set. More exactly, it's an orthogonal transformation of observations into some number of linearally uncorrelated variables, in this case, trying to summarize 1000's of dimensions into three. Basically, the first principle component is the component which accounts for this highest variance of the data (it explains the most), and the subsequent component(s) is the next in terms of variance explanation and also orthogonal (i.e. uncorrelated) with the previous component(s).
Very similar to PCA. A gross simplification; SVD is a way of factorizing a large matrix into 3 sub parts. These 3 parts can re-create the matrix, so we take some of one of the components to make a smaller, approximated copy of the original.
This is a fascinating algorithm. It has a few main parts. Firstly, it creates a probability distribution that represents similarity between points (in the high dimensional space). Then, it creates a similar probability distribution over the low dimensional space and then minimizes the distance between the two (Kullback-Leibler divergence).
Wikipedia:
ICA finds the independent components (also called factors, latent variables or sources) by maximizing the statistical independence of the estimated components.
Basically:
Typical algorithms for ICA use centering (subtract the mean to create a zero mean signal), whitening (usually with the eigenvalue decomposition), and dimensionality reduction as preprocessing steps in order to simplify and reduce the complexity of the problem for the actual iterative algorithm. Whitening and dimension reduction can be achieved with principal component analysis or singular value decomposition.
def plot_reduction_kmeans(first_reduction, first_num, second_reduction, second_num, matrix=similarity_matrix):
# Reduction #1
f = first_reduction(n_components=first_num)
f_matrix = f.fit_transform(matrix)
# Reduction #2 1000 dimensions ->3 dimensions
s = second_reduction(n_components=second_num)
s_matrix = s.fit_transform(f_matrix)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
kmeans.fit(s_matrix)
d = {i:colors[i] for i in range(10)}
kcolors = [d[i] for i in kmeans.labels_]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(s_matrix[:,0],s_matrix[:,1],s_matrix[:,2], c=kcolors, alpha=.6)
plot_reduction_kmeans(TruncatedSVD, 100, TSNE, 3)
plot_reduction_kmeans(TruncatedSVD, 100, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, PCA, 3)
plot_reduction_kmeans(TruncatedSVD, 100, ICA, 3)
plot_reduction_kmeans(TruncatedSVD, 500, ICA, 3)
plot_reduction_kmeans(PCA, 100, TSNE, 3)
plot_reduction_kmeans(PCA, 500, TSNE, 3)
plot_reduction_kmeans(PCA, 100, TruncatedSVD, 3)
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)
plot_reduction_agg(PCA, 100, PCA, 3)
## My favorite - for now
plot_reduction_kmeans(PCA, 500, TruncatedSVD, 3)
Interactive Visualization! https://github.com/datacratic/data-projector http://opensource.datacratic.com/data-projector/
# Reduction #1
f = PCA(n_components=500)
f_matrix = f.fit_transform(similarity_matrix)
# Reduction #2 1000 dimensions ->3 dimensions
s = TruncatedSVD(n_components=3)
s_matrix = s.fit_transform(f_matrix)
kmeans = KMeans(init='k-means++', n_clusters=10, n_init=100)
kmeans.fit(s_matrix)
data_matrix = s_matrix.copy()
data_matrix = pd.DataFrame(data_matrix*200) # Gotta make everything a bit larger
data_matrix['cid'] = kmeans.labels_
data_matrix = data_matrix[[1,0,2,'cid']]
data_matrix.columns=['y','x','z','cid']
data_matrix.cid = data_matrix.cid.astype(int)
data_matrix = data_matrix.astype(str)
data_matrix
y | x | z | cid | |
---|---|---|---|---|
0 | -13.0348188266 | -10.0552407715 | 34.6405097522 | 4 |
1 | -15.0734196879 | -16.0528409209 | 0.0902980978558 | 7 |
2 | 16.0918851228 | -29.1836998758 | -11.0857616069 | 3 |
3 | -0.935772800338 | 21.9657836093 | 5.74032885375 | 0 |
4 | -11.3979443438 | -18.4993158635 | -7.80118007466 | 7 |
... | ... | ... | ... | ... |
1164 | 3.52751145427 | -0.971211989561 | 22.1971858237 | 2 |
1165 | 33.7620250944 | 9.99174149862 | 13.9537345578 | 5 |
1166 | -11.1691445003 | 28.9441680889 | -11.2355379091 | 6 |
1167 | -22.3281534736 | -69.6905826011 | -0.572318236642 | 7 |
1168 | 14.2762038784 | 28.56306616 | 16.2803658902 | 8 |
1169 rows × 4 columns