Run all imports (lots and a bit ugly, i know) and define some helper functions.
import sys
import pandas as pd
from gensim.models import Phrases, Word2Vec
from gensim.models.phrases import Phraser
from gensim.models.keyedvectors import KeyedVectors
import nltk
from nltk.corpus import stopwords
import string
import multiprocessing
import itertools
from collections import defaultdict, Mapping, Container
import random
from tqdm import tqdm
from sys import getsizeof
import pickle
import re
import networkx as nx
from itertools import chain
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF
import plotly.graph_objs as go
import plotly.tools
import numpy as np
from jinja2 import Template
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# --- notebook configuration ---
# Let pandas print up to 500 characters per cell so long text columns are readable.
pd.set_option('display.max_colwidth', 500)

# BigQuery settings used by the data pull below (placeholders: fill in your own).
project_id, private_key = (
    "MY_BQ_PROJECT",
    "C:/Users/Andrew/Documents/PATH_TO_YOUR_KEY/MY_KEY.json",
)

# Store the plotly credentials (placeholder api key: fill in your own).
plotly.tools.set_credentials_file(
    username='andrewm4894',
    api_key='YOUR_KEY_HERE',
)
# Pattern for one html tag: "<", then one or more characters that are not ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed."""
    stripped = TAG_RE.sub('', text)
    return stripped
def print_dim(df):
    """Print the ``shape`` of ``df`` inside a '### df SHAPE = ... ###' banner."""
    print(f"### df SHAPE = {df.shape} ###")
Our data happens to be stored in Google BigQuery - it's awesome and I'd recommend it to everyone!
So this part might be a bit specific to Bigquery. We also use jinja templating here to pull from Bigquery one year at a time. This is because the pandas read_gbq module can only comfortably pull a certain amount of data in one go. So templating by year is essentially a natural way to shard our data pulls.
# Pull the posts from Google BigQuery one year at a time. The query is a jinja2
# template rendered once per year, so no single pd.read_gbq call has to return
# the whole table.
qry_template = '''
select pid, text from
(
select
-- hash the id so is still an id but more anonamous
sha1(post_id_domain) as pid,
post_content as text
from
hollywoodlife.post_content
where
post_content is not null
and
post_content<>''
and
post_date like '{{ post_year }}%'
group by 1,2
--limit 250 #uncomment when pulling in smaller sample
)
'''
template_qry = Template(qry_template)

# collect one dataframe per year and union them once after the loop
yearly_frames = []
for year in range(2012, 2018):
    print(year)
    # render the template query for the year of the loop
    qry_rendered = template_qry.render(post_year=year)
    # pull data from google bigquery
    df_tmp = pd.read_gbq(qry_rendered, project_id, private_key=private_key)
    yearly_frames.append(df_tmp)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. The earlier
# `df.reset_index(drop=True)` threw its result away, so the per-year indexes
# stayed duplicated in `df`.
df = pd.concat(yearly_frames, ignore_index=True)

print_dim(df)
print(df.head(10))
2012 Requesting query... ok. Query running... Query done. Processed: 134.1 MB Retrieving results... Got page: 1; 100% done. Elapsed 9.84 s. Got 17214 rows. Total time taken 9.96 s. Finished at 2017-08-21 14:44:53. 2013 Requesting query... ok. Query running... Query done. Processed: 134.1 MB Retrieving results... Got page: 1; 100% done. Elapsed 10.27 s. Got 17881 rows. Total time taken 10.37 s. Finished at 2017-08-21 14:45:04. 2014 Requesting query... ok. Query running... Query done. Processed: 134.1 MB Retrieving results... Got page: 1; 100% done. Elapsed 13.45 s. Got 18760 rows. Total time taken 13.55 s. Finished at 2017-08-21 14:45:18. 2015 Requesting query... ok. Query running... Query done. Processed: 134.1 MB Retrieving results... Got page: 1; 90% done. Elapsed 16.21 s. Got page: 2; 100% done. Elapsed 17.94 s. Got 23819 rows. Total time taken 18.07 s. Finished at 2017-08-21 14:45:37. 2016 Requesting query... ok. Query running... Query done. Processed: 134.1 MB Retrieving results... Got page: 1; 21% done. Elapsed 12.72 s. Got page: 2; 43% done. Elapsed 17.02 s. Got page: 3; 64% done. Elapsed 21.03 s. Got page: 4; 85% done. Elapsed 25.68 s. Got page: 5; 100% done. Elapsed 30.06 s. Got 19279 rows. Total time taken 30.21 s. Finished at 2017-08-21 14:46:08. 2017 Requesting query... ok. Query running... Query done. Processed: 134.1 MB Retrieving results... Got page: 1; 20% done. Elapsed 13.76 s. Got page: 2; 40% done. Elapsed 20.51 s. Got page: 3; 60% done. Elapsed 28.0 s. Got page: 4; 80% done. Elapsed 35.85 s. Got page: 5; 100% done. Elapsed 43.53 s. Got page: 6; 100% done. Elapsed 45.17 s. Got 21071 rows. Total time taken 45.32 s. Finished at 2017-08-21 14:46:54. 
### df SHAPE = (118024, 2) ### pid \ 0 I8bC0jze55Ow0LZoJSyRYGr1K0M= 1 Oj/mjv0XIpoJPaLX+XCQep65ToU= 2 dOlSCJ9+2t1xLe/zB2Re+H+KEg4= 3 JSUPPhMmW+EG7bk3SwjuGsRIOQs= 4 VUu+U6+AoDr3dM7MjPldJzrom5g= 5 Pf0w8g3yHVzCPGANWb1yFBih/g8= 6 h/CArboWvz7qHyz9tAoqqvKVtGY= 7 2TLuF1x7XFeXoMbgjPYwElsA6tc= 8 gATed2Df6aFdRtMNLbaYC9ZEcnU= 9 srPxWwm5B1Gw2mqvGTYZfzLfLNc= text 0 'Bachelorette' Ashley Hebert: How I'm Getting In Shape For My Wedding . <h3>Find out how you can get as fit as Ashley with all of her secret diet and workout tips.</h3>\n<p><em><a href=\"http://www.hollywoodlife.com/tag/The-Bachelorette/\">Bachelorette</a></em> <strong><a href=\"http://www.hollywoodlife.com/tag/Ashley-Hebert/\">Ashley Hebert</a></strong> is trying hard to stay fit for her wedding to <strong><a href=\"http://www.hollywoodlife.com/tag/J.P.-Rosenbaum/\">J.P. Rose... 1 'Teen Mom 2' Star Leah Messer's Dream Wedding — The Details Revealed . <h3>Leah’s fiance Jeremy Calvert gushes about the type of ceremony they can’t wait to have, the guest list and even the honeymoon destination.</h3>\n<p><em><a href=\"http://www.hollywoodlife.com/tag/Teen-Mom-2/\">Teen Mom 2</a></em> star <strong><a href=\"http://www.hollywoodlife.com/tag/Leah-Messer/\">Leah Messer</a></strong> is happily engaged to her new fiance <strong><a href=\"http://w... 2 Are 'Teen Mom' Leah Messer & Jeremy Calvert Already Fighting? . <h3>Leah nags her fiance Jeremy on Twitter about being late — and he responds angrily. Is there trouble in paradise?</h3>\n<p><em><a href=\"http://www.hollywoodlife.com/tag/Teen-Mom-2/\">Teen Mom 2</a></em> star <strong><a href=\"http://www.hollywoodlife.com/tag/Leah-Messer/\">Leah Messer</a></strong>, 19, has been engaged to her fiance <strong><a href=\"http://www.hollywoodlife.com/tag/jeremy-calvert/\">Jere... 3 Christian Siriano Designs Wedding Dresses For Nordstrom . <h3>The designer is branching out, into the wedding business! 
Now you can say ‘I Do’ in one of his coveted designs — get all the details on his latest fashion venture here.</h3>\n<p>Former <a href=\"http://www.hollywoodlife.com/2012/01/13/project-runway-all-stars-episode-2-recap-kara-janx-couture-challenge/\" target=\"_blank\">Project Runway</a> winner <strong>Christian Siriano</strong>, 26, has designed countless dr... 4 Tips & Tricks For A Flawless Valentine’s Day Look Just Like Reese Witherspoon . <p>Do you want to create a sexy makeup look for a Valentine’s Day date, but you’re sick of the classic smokey eye? We have the perfect look for you! Celebrity makeup artist Jillian Dempsey gives you a how-to for a twist on the over-played smokey eye. Keep reading to see her beauty tricks and shop […]</p>\n 5 Courtney Robertson Tries On Wedding Dresses — Engaged To 'Bachelor' Ben? . <p>This is more proof that Courtney is engaged to ‘Bachelor’ Ben Flajnik. Check out 24 pics of Courtney trying on wedding gowns. Courtney Robertson seems like she’s getting ready to walk down the aisle with Bachelor Ben Flajnik. The 28-year-old model, who is rumored to be Ben’s fiancee, was spotted trying on wedding gowns at […]</p>\n 6 Courtney Robertson Shocks Nicki Sterling By Shopping For A Wedding Dress . <h3>The thought that Ben Flajnik may actually be engaged to Courtney is quite unsettling for Nicki.</h3>\n<p><strong><a href=\"http://www.hollywoodlife.com/tag/Nicki-Sterling/\">Nicki Sterling</a></strong>, who was the third eliminated contestant on <em><a href=\"http://www.hollywoodlife.com/tag/The-Bachelor/\">The Bachelor</a></em>, can’t believe that <strong><a href=\"http://www.hollywoodlife.com/tag/Courtney-... 7 Blue Ivy, Suri Cruise & More: Hollywood's Cutest Babies -- PICS . [gallery] 8 Jonathan Cheban -- SEE PICS . [gallery] 9 Will Smith . <span class="celeb_profile_label">Bio:</span>\nWill Smith is an American actor, producer, and rapper. In the late 1980s, he achieved some fame as a rapper under the name The Fresh Prince. 
In 1990, his popularity increased when he starred in the TV sitcom The Fresh Prince of Bel-Air. The show ran for six years and ended in 1996 on NBC with syndications on various networks. During his TV career, he also went into films, and then blockbuster movies. His most successful films have b...
Now save the data to csv so we don't need to rerun the pull from BigQuery each time.
# Persist the pulled posts so later runs can start from the csv instead of BigQuery.
df.to_csv("input_data.csv", index=False, encoding="utf-8")
# Keep the raw posts under the name `df_orig` (same object, no data is copied)
# and release the name `df`, which is re-created for the cleaned text below.
df_orig = df
del df
Load in saved data from file so no need to pull from BQ each time.
# Re-create df_orig from the csv written above, so the notebook can be run
# from this cell without pulling from BigQuery again.
df_orig = pd.read_csv("input_data.csv", encoding = "utf-8")
# show the first rows as the cell output
df_orig.head()
pid | text | |
---|---|---|
0 | I8bC0jze55Ow0LZoJSyRYGr1K0M= | 'Bachelorette' Ashley Hebert: How I'm Getting In Shape For My Wedding . <h3>Find out how you can get as fit as Ashley with all of her secret diet and workout tips.</h3>\n<p><em><a href=\"http://www.hollywoodlife.com/tag/The-Bachelorette/\">Bachelorette</a></em> <strong><a href=\"http://www.hollywoodlife.com/tag/Ashley-Hebert/\">Ashley Hebert</a></strong> is trying hard to stay fit for her wedding to <strong><a href=\"http://www.hollywoodlife.com/tag/J.P.-Rosenbaum/\">J.P. Rose... |
1 | Oj/mjv0XIpoJPaLX+XCQep65ToU= | 'Teen Mom 2' Star Leah Messer's Dream Wedding — The Details Revealed . <h3>Leah’s fiance Jeremy Calvert gushes about the type of ceremony they can’t wait to have, the guest list and even the honeymoon destination.</h3>\n<p><em><a href=\"http://www.hollywoodlife.com/tag/Teen-Mom-2/\">Teen Mom 2</a></em> star <strong><a href=\"http://www.hollywoodlife.com/tag/Leah-Messer/\">Leah Messer</a></strong> is happily engaged to her new fiance <strong><a href=\"http://w... |
2 | dOlSCJ9+2t1xLe/zB2Re+H+KEg4= | Are 'Teen Mom' Leah Messer & Jeremy Calvert Already Fighting? . <h3>Leah nags her fiance Jeremy on Twitter about being late — and he responds angrily. Is there trouble in paradise?</h3>\n<p><em><a href=\"http://www.hollywoodlife.com/tag/Teen-Mom-2/\">Teen Mom 2</a></em> star <strong><a href=\"http://www.hollywoodlife.com/tag/Leah-Messer/\">Leah Messer</a></strong>, 19, has been engaged to her fiance <strong><a href=\"http://www.hollywoodlife.com/tag/jeremy-calvert/\">Jere... |
3 | JSUPPhMmW+EG7bk3SwjuGsRIOQs= | Christian Siriano Designs Wedding Dresses For Nordstrom . <h3>The designer is branching out, into the wedding business! Now you can say ‘I Do’ in one of his coveted designs — get all the details on his latest fashion venture here.</h3>\n<p>Former <a href=\"http://www.hollywoodlife.com/2012/01/13/project-runway-all-stars-episode-2-recap-kara-janx-couture-challenge/\" target=\"_blank\">Project Runway</a> winner <strong>Christian Siriano</strong>, 26, has designed countless dr... |
4 | VUu+U6+AoDr3dM7MjPldJzrom5g= | Tips & Tricks For A Flawless Valentine’s Day Look Just Like Reese Witherspoon . <p>Do you want to create a sexy makeup look for a Valentine’s Day date, but you’re sick of the classic smokey eye? We have the perfect look for you! Celebrity makeup artist Jillian Dempsey gives you a how-to for a twist on the over-played smokey eye. Keep reading to see her beauty tricks and shop […]</p>\n |
We now have a data frame where our blob of text is often full of embedded html and has not been cleansed in any particular way.
Here we will:
p.s. I'm using tqdm everywhere in this notebook to get progress bars on loops. I find it really satisfying for some reason :)
# set up list for collecting cleaned content into
results_all = []
# number of posts dropped because their text could not be processed
n_skipped = 0
# pull the df into a list of (pid, text) pairs to loop through
# (seems quicker than looping through the pd df itself)
raw_data = list(zip(df_orig.pid, df_orig.text))
# strip the html tags out of each post
for pid, text in tqdm(raw_data):
    try:
        cleaned = remove_tags(text)
    except TypeError:
        # text is not a string (e.g. a missing value); skip the post rather
        # than fail - fine as we have lots of data - but keep count so the
        # loss is visible instead of silent
        n_skipped += 1
        continue
    results_all.append([pid, cleaned])
print("### skipped " + str(n_skipped) + " posts that could not be cleaned ###")
100%|██████████████████████████████| 118024/118024 [00:01<00:00, 117183.60it/s]
Now do some kinda ugly and brute force cleaning...
(I'm sure there are much more elegant and generalized ways to do this but meh)
def _apply_replacements(series, rules):
    """Apply (pattern, replacement, is_regex) rules to a string Series, in order.

    ``regex`` is passed explicitly for every rule because the default of
    ``Series.str.replace`` differs between pandas versions; relying on it would
    silently turn the regex rules into literal matches (or vice versa).
    """
    for pattern, replacement, is_regex in rules:
        series = series.str.replace(pattern, replacement, regex=is_regex)
    return series


# Rules applied BEFORE lower-casing: drop messy artifacts and typographic quotes/dashes.
_PRE_LOWER_RULES = [
    ("Bio:\n", " ", False),
    (']["', "", False),
    ("’s", " ", False),
    ("‘s", " ", False),
    ("’", " ", False),
    ("‘", " ", False),
    ("“", " ", False),
    ("”", " ", False),
    ("—", " ", False),
    ("'", " ", False),
    ("–", " ", False),
    ("…", " ", False),
]

# Rules applied AFTER lower-casing. Order matters (e.g. ", " is padded before
# "," is dropped). Several quote rules repeat characters already removed by the
# rules above; they are kept as harmless safety nets.
_POST_LOWER_RULES = [
    ("'s ", " ", False),    # remove 's from end of words
    ("'s,", ",", False),    # remove 's from end of words
    (r' \d+ ', ' ', True),  # drop words that are numbers - they wont be useful for us
    ("' ", " ", False),     # remove '
    (" '", " ", False),
    (", ", " , ", False),
    (",", "", False),
    ("!", ".", False),
    ("’s ", " ", False),
    ("’s,", ",", False),
    ("’s", "", False),
    ('"', '', False),
    (' -- ', ' ', False),
    (' ‘', ' ', False),
    ('’ ', ' ', False),
    (': ', ' ', False),
    ('—', ' ', False),
    # remove anything inside [] as is usually leftover html junk
    # NOTE(review): greedy - removes from the first "[" to the last "]" on a line
    (r'(\[.*\])', ' ', True),
    ('\n\n', '\n', False),
    ('\n', ' ', False),     # newlines inside an article would introduce dirty data later
    ('❤', 'love', False),
    ('(', '( ', False),
    (')', ' )', False),
    ('. ', ' . ', False),
    ('“', ' ', False),
    ('”', ' ', False),
    ('\xa0', ' ', False),
    ('  ', ' ', False),     # double space -> single (was a no-op ' ' -> ' ')
    (r'(https://www.instagram.com.*? )', 'instagram ', True),
    # was '.*/? ' (greedy), which swallowed everything up to the LAST space;
    # now non-greedy like the sibling url rules
    (r'(https://instagram.com.*? )', 'instagram ', True),
    (r'(https://www.twitter.com.*? )', 'twitter ', True),
    (r'(https://twitter.com.*? )', 'twitter ', True),
    (r'(https://www.youtube.com.*? )', 'youtube ', True),
    ('?', ' ?', False),
    ('\\n', ' ', False),    # a literal backslash followed by "n"
    ('&', 'and', False),
    ('\\ ', ' ', False),    # a literal backslash followed by a space
    ('’ ', ' ’ ', False),
    (' ‘', ' ‘ ', False),
    (' pic ', ' ', False),
    (' pics ', ' ', False),
    # collapse any run of spaces we might be left with into a single space
    (r' {2,}', ' ', True),
]

# create a df with outputs
df = pd.DataFrame(results_all, columns=['pid', 'text'])
# remove some messy artifacts, lower-case, then do the rest of the cleaning
df['text'] = _apply_replacements(df['text'], _PRE_LOWER_RULES)
df['text'] = df['text'].str.lower()
df['text'] = _apply_replacements(df['text'], _POST_LOWER_RULES)
print_dim(df)
df.head(5)
### df SHAPE = (118024, 2) ###
pid | text | |
---|---|---|
0 | I8bC0jze55Ow0LZoJSyRYGr1K0M= | bachelorette ashley hebert how i m getting in shape for my wedding . find out how you can get as fit as ashley with all of her secret diet and workout tips. bachelorette ashley hebert is trying hard to stay fit for her wedding to j.p . rosenbaum that will most likely take place later this year . ashley shares her wedding workout and diet tips with in touch magazine. |
1 | Oj/mjv0XIpoJPaLX+XCQep65ToU= | teen mom star leah messer s dream wedding the details revealed . leah fiance jeremy calvert gushes about the type of ceremony they can t wait to have the guest list and even the honeymoon destination. teen mom star leah messer is happily engaged to her new fiance jeremy calvert . and while she is pregnant reportedly with twins leah 19 and jeremy 22 are already planning their dream wedding. |
2 | dOlSCJ9+2t1xLe/zB2Re+H+KEg4= | are teen mom leah messer and jeremy calvert already fighting ? . leah nags her fiance jeremy on twitter about being late and he responds angrily . is there trouble in paradise ? teen mom star leah messer 19 has been engaged to her fiance jeremy calvert 22 for less than a month now and it seems that they are already starting to fight. |
3 | JSUPPhMmW+EG7bk3SwjuGsRIOQs= | christian siriano designs wedding dresses for nordstrom . the designer is branching out into the wedding business . now you can say i do in one of his coveted designs get all the details on his latest fashion venture here. former project runway winner christian siriano 26 has designed countless dresses for a-listers like heidi klum fergie taylor swift and sarah hyland to name a few . aside from his namesake collection the designer plans to branch into bridal for nordstrom where we re sure to... |
4 | VUu+U6+AoDr3dM7MjPldJzrom5g= | tips and tricks for a flawless valentine day look just like reese witherspoon . do you want to create a sexy makeup look for a valentine day date but you re sick of the classic smokey eye ? we have the perfect look for you . celebrity makeup artist jillian dempsey gives you a how-to for a twist on the over-played smokey eye . keep reading to see her beauty tricks and shop |
Pull a random sample of clean and original text to just eyeball if the cleaning is generally working as expected and not leaving anything else obvious worth dealing with.
# pick one random post id (swap in the commented line to inspect a specific post)
samp_ind = df['pid'].sample(1).tolist()
#samp_ind = list(['I8bC0jze55Ow0LZoJSyRYGr1K0M='])
print(samp_ind)
# print the cleaned text and then the original text for the sampled id
for banner, frame in (
    ('\n## CLEANED ##############################\n', df),
    ('\n## ORIGINAL ##############################\n', df_orig),
):
    print(banner)
    print(frame.loc[frame['pid'].isin(samp_ind), 'text'])
['CM/I7grvh6tnicUrOoy4J7zvZ8w='] ## CLEANED ############################## 93192 perrie edwards has a really hot dad & the internet is freaking out about it . so sexy . now we see where perrie edwards got her good looks from . the little mix singer shared a of her dad and let just say the internet isn't freaking out over nothing . he a stud muffin . omg. dads may not be everyone thing but we have to say perrie edwards' dad is incredibly hot . the singer shared a of her dreamy papa on instagram during her 23rd birthday weekend on july 10 and her fans went insane . see the... Name: text, dtype: object ## ORIGINAL ############################## 93192 Perrie Edwards Has A Really Hot Dad & The Internet Is Freaking Out About It . <h3>So sexy! Now we see where Perrie Edwards got her good looks from. The Little Mix singer shared a pic of her dad, and let's just say, the Internet isn't freaking out over nothing! He's a stud muffin.</h3>\nOMG! Dads may not be everyone's thing, but we have to say <a href="http://hollywoodlife.com/tag/perrie-edwards">Perrie Edwards</a>' dad is incredibly hot! The singer shared a pic of her dreamy papa on Instagra... Name: text, dtype: object
Create sentences with bigram phrases flagged.
# get list of documents
documents = list(df['text'])
# Each whole document is used as one big "sentence". To use only the first
# sentence of each document instead, swap in the commented line below.
#sentences = [s.split('.',1)[0].strip() for s in documents]
sentences = documents

# Build the stopword set and the punctuation table ONCE. The earlier version
# called stopwords.words('english') and scanned the resulting list for every
# single token, which dominated the run time of this cell.
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

# create sentence stream: split on single spaces, drop stopwords, then strip
# punctuation from the remaining tokens
# NOTE: a token made up only of punctuation becomes '' and is kept, as before.
sentence_stream = [
    [
        str(token).translate(punct_table)
        for token in sentence.split(" ")
        if token not in stop_words
    ]
    for sentence in tqdm(sentences)
]
# remove small sentences as not much to be learned from them
# (only needed when passing sentences as opposed to the full doc)
#sentence_stream = [sentence for sentence in sentence_stream if len(sentence) > 3 ]

# create bigram phrases
phrases = Phrases(sentence_stream, min_count=250)
bigram = Phraser(phrases)
# create list of sentences to feed into the word2vec model
sentences = list(bigram[sentence_stream])
words = [word for sentence in sentences for word in sentence]
# save sentences object
with open('sentences.pickle', 'wb') as handle:
    pickle.dump(sentences, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("### The number of sentences is " + str(len(sentences)))
print("### The number of words is " + str(len(words)))
100%|████████████████████████████████| 118024/118024 [2:44:23<00:00, 11.97it/s]
### The number of sentences is 118024 ### The number of words is 11568437
# if loading in saved sentences object
# NOTE: unpickling can execute arbitrary code - only load a file you created yourself.
# `with` closes the file handle; the bare open() passed to pickle.load never did.
with open("sentences.pickle", "rb") as handle:
    sentences = pickle.load(handle)
words = [word for sentence in sentences for word in sentence]
print("### The number of sentences is " + str(len(sentences)))
print("### The number of words is " + str(len(words)))
### The number of sentences is 41448 ### The number of words is 5863883
Lets pull a sample of some sentences to see what we now have and if it is looking ok for sending into the actual model building stage.
# draw a few random sentence positions and show those sentences as plain text
n_samples = 5
sample = random.sample(range(len(sentences)), n_samples)
list(map(' '.join, (sentences[position] for position in sample)))
['kourtney_kardashian forces khloe move divorce kuwtk kuwtk returns khloe_kardashian dealing heavy material literally kourtney forces khloe pack house shared lamar_odom divorce time move keeping_kardashians returns e sunday june 98c brand_new episodes cameras follow ', 'jared fogle exsubway spokesman plead guilty child porn charges report whoa jared fogle man known face subway expected plead guilty possession child pornography charges according multiple outlets ongoing federal investigation jared fogle 37 reportedly accept deal aug plead guilty charges possession child pornography comes couple weeks alleged ', 'miley_cyrus refuses release album year patrick way sorry miley fans bangerz hitmaker refusing release album sad pdafilled relationship patrick getting way music hollywoodlifecom_learned exclusive_details miley dropping new_album year bring tissues miley_cyrus 22 ', 'kim_kardashian robbery planned someone inside entourage shocking report interesting details come light parisian officials get closer finding behind kim_kardashian october robbery wont believe update individual limo company kardashian family used suspected kim_kardashian robbery may inside job according jan report french newspaper le monde knew people arrested important detail emerged one suspects works car company kim used last person driven day attack whoa kim_kardashian robbery suspects french police say individual could given robbers inside information kim staying according newspaper m6 tv also reports suspects may touch kardashian family scary multiple french outlets reporting extent driver involvement attack unclear french police seem think long story short gave robbers least inside info keeping_kardashians star whereabouts terrifying police get closer jean veil kim french attorney said happy reassured developments case were_glad hear source previously reiterated hollywoodlifecom_exclusively kim relieved paris police making headway arrests nice surprise veil also reportedly told france 
lexpress magazine one hand perhaps mean jewels recovered hand puts end disgraceful speculation people thought clever pretend robbery setup publicity stunt madame kardashian course criminals nabbed kim might face court means nightmare far kim absolutely terrified possibility going court starting entire process source reveals feel hollywoodlifers_think kim case closed soon hope so', 'kylie_jenner flaunts boobs curvaceous figure waist trainer selfie – got flaunt kylie_jenner put body display another sizzling selfie youngest keeping_kardashians star posed revealing white shirt waist trainer showing family trademark curves kylie_jenner 18 gunning crown kardashian waist trainer ']
Get some counts etc that might be useful.
# flatten every sentence into one stream of words and join it into a big string
words_string = ' '.join(itertools.chain.from_iterable(sentences))
# tally how often each whitespace-separated word occurs
word_counts = defaultdict(int)
for token in words_string.split():
    word_counts[token] += 1
# print top 20 words (highest count first)
most_frequent = sorted(word_counts, key=word_counts.get, reverse=True)[:20]
for w in most_frequent:
    print(w, word_counts[w])
new 49995 one 47279 see 46184 get 36628 like 33597 love 33249 look 31692 time 31661 show 30224 back 28398 also 25550 two 25377 first 25118 know 25030 going 24923 even 24220 think 23700 may 23439 said 23426 watch 22827
# print bottom 20 words (lowest count first)
least_frequent = sorted(word_counts, key=lambda token: word_counts[token])[:20]
for w in least_frequent:
    print(w, word_counts[w])
syndications 1 zampino 1 follieri 1 tuohy 1 bardo 1 jordangreen 1 popsynthpop 1 brotherlyact 1 javadd 1 mullingar 1 constantinova 1 dobreva 1 bulgariancanadian 1 apr6 1 planetsomerhalder 1 penns 1 tappahannock 1 safarti 1 b’day 1 am…sasha 1
Now we are ready to pass our sentences to gensim.Word2Vec to build our own model.
There are a few key hyperparameters we need to build the model. We are not doing anything fancy like cross validation here. Instead I did a few manual trial and error builds on a smaller sample of sentences until I found a parameter set that generally made sense to me given the size of the dataset and our focus.
# hyperparameters for the word2vec build, picked by manual trial and error
w2v_params = dict(
    size=100,
    min_count=250,
    window=10,  # use a largish window since passing full document as sentence
    iter=10,
    workers=multiprocessing.cpu_count(),
)
# train model
model = Word2Vec(sentences=sentences, **w2v_params)
Once the model is built we save it to disk so can be loaded back in later for exploration without needing to rebuild each time.
# save the full trained model to disk so it can be reloaded without rebuilding
model.save("celeb_word2vec_model")
# If you're finished training a model (= no more updates, only querying), then
# switch to the gensim.models.KeyedVectors instance in wv
word_vectors = model.wv
# (optional) the full model could be deleted here once only the vectors are needed
#del model
# save just the word vectors to disk as well
word_vectors.save("word_vectors")
Load in the saved model. We can run from here if exploring an already trained and saved model.
# load the saved model written by model.save("celeb_word2vec_model") above
model = Word2Vec.load('celeb_word2vec_model')
# load the saved word vectors written by word_vectors.save("word_vectors") above
word_vectors = KeyedVectors.load('word_vectors')
We will save the raw vectors out to a nice and easy, human readable, text file. Then read back in the wide matrix of vectors into a pandas dataframe in order to transform it into a long format later for exploration in a Tableau Public workbook.
# save raw vectors to a text file for exploration later in Tableau
model.wv.save_word2vec_format('celeb_word2vec_wide.txt', binary=False)
# read the wide file back into a pandas df: one row per word, column 0 is the
# word and the remaining columns are its vector elements
celeb_word2vec_wide = pd.read_csv(
    "celeb_word2vec_wide.txt",
    sep=' ',
    skiprows=1,  # the first line of the file is not a vector row
    header=None,
    # keep words exactly as written: with the default NA handling, tokens such
    # as '' / "null" / "nan" / "na" are converted to NaN in the word column
    keep_default_na=False,
)
# rename cols
celeb_word2vec_wide.rename(columns={0: 'word'}, inplace=True)
# print dims of the wide df
print(celeb_word2vec_wide.shape)
# look at the df
celeb_word2vec_wide.head()
(6040, 101)
word | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 0.423812 | -0.873024 | -1.076316 | -0.523255 | -0.069986 | -0.812739 | 0.632879 | 0.161528 | 0.282792 | ... | -0.371600 | -0.553812 | -0.671738 | -0.336277 | -0.364511 | 0.176831 | -0.294965 | 0.629585 | -0.376467 | 0.779368 |
1 | new | 2.319383 | -1.351134 | -0.583713 | 0.406562 | 4.404606 | -3.165166 | 2.232082 | -0.446304 | 2.595640 | ... | 2.760589 | 2.519782 | -0.046284 | 3.112629 | -0.785255 | 1.850350 | -0.543355 | 0.321961 | -2.899933 | 2.350732 |
2 | one | -0.025912 | -1.830144 | -0.779826 | -1.594398 | -0.478641 | -1.684386 | -0.148587 | -1.644406 | 1.716436 | ... | -0.068255 | -0.313197 | 0.801451 | -1.983233 | -0.122030 | -0.016098 | 0.392162 | -0.247242 | -1.243877 | -1.139465 |
3 | see | -0.932137 | -1.764545 | -5.519857 | -0.801152 | -0.065875 | -0.366409 | -0.401208 | -1.666864 | 0.933654 | ... | -0.520679 | -0.760519 | 0.147650 | 0.923039 | 0.284766 | -1.203566 | 1.997292 | -0.284277 | -4.810210 | -1.662077 |
4 | get | -1.038089 | 0.575331 | -2.349574 | 1.134312 | 1.008488 | -1.161399 | 0.614903 | -4.204766 | -0.234601 | ... | -0.178508 | -1.393333 | 0.106876 | 0.354701 | 1.145287 | 1.471832 | 1.583619 | -0.773887 | -2.289845 | -0.899614 |
5 rows × 101 columns
Now we go from wide format to long format as tools like Tableau prefer this.
# reshape wide -> long with melt(), naming the two generated columns directly
celeb_word2vec_long = celeb_word2vec_wide.melt(
    id_vars=['word'],
    var_name='vector_element',
    value_name='vector_element_magnitude',
)
# look at what we have
for preview in (celeb_word2vec_long.shape, celeb_word2vec_long.head()):
    print(preview)
# save the long format back out to a space-separated text file
celeb_word2vec_long.to_csv("celeb_word2vec_long.txt", index=False, sep=' ')
(604000, 3) word vector_element vector_element_magnitude 0 NaN 1 0.423812 1 new 1 2.319383 2 one 1 -0.025912 3 see 1 -0.932137 4 get 1 -1.038089
Lets just look at a specific vector to see what we actually have - basically an array of positive and negative numbers, all on a similar scale.
# look up the trained vector for a single token (displayed as the cell output)
model.wv['justin_bieber']
array([-0.19621158, -1.70824993, 4.07912397, -4.13144636, 0.70407969, -0.06511977, -1.10553288, -3.4984827 , -2.40557313, -0.71576977, 3.52038622, 0.21764068, -0.54872227, -0.64682132, 0.73169208, -4.50421715, -1.66858566, -0.32605866, -7.28290606, 0.88032979, 4.97944689, -2.33671474, -2.03136921, 2.16170073, -0.51272494, -0.39437258, -1.62846637, -1.97175133, 3.41039515, 1.69589567, -1.25033355, 1.37241948, 0.20805676, 1.65922272, 2.03981185, 2.12722635, 1.26723588, 0.37178808, 1.61489332, -1.76117992, 0.75137532, 1.37545943, -0.70764965, 1.47992682, 1.53792179, 1.55739236, -1.84939837, 0.14028606, 3.16268826, -0.42398441, -4.79281712, 1.7875241 , 1.3778615 , 1.32047188, -3.06647325, -0.60717714, -1.01766086, -1.81914115, -1.82842767, 3.33063555, 0.33426681, -5.13528776, 1.3301748 , 2.58258796, 0.77017248, -0.89552003, -0.81426936, 4.43586302, -3.69974875, -3.14295745, 2.82621956, -0.70075619, 0.80255145, -2.20000005, 1.47386432, 3.01426864, 4.57765579, 2.4331708 , 0.68835354, -2.53468132, -2.43935299, -0.49032855, -3.15479589, 1.21918011, 0.33573633, -0.26256818, -2.80820608, 0.97537279, -1.99493766, -2.91135526, 1.83211803, -4.98132086, 0.98330897, 1.1170578 , -0.02211688, 1.62615228, -2.11096215, -0.23426078, -1.66096401, -0.82593608], dtype=float32)
Now lets do the obligatory bit of vector arithmetic to help sense check some of our results.
Great blog post here that goes into this in more detail.
What we ask the vectors below is basically:
"kim_kardashian" - "kanye_west" + "brad_pitt" = ?
Another way to think of this is "kanye_west" is to "kim_kardashian" as "brad_pitt" is to ?
(Spoiler alert - ideally we'd like ? to be "angelina_jolie" to show the model has in some way understood the similar marriage relationship between the two pairs.)
# some arithmetic e.g. model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
# here: kim_kardashian - kanye_west + brad_pitt; [0:1] keeps only the best match
model.wv.most_similar(positive=['kim_kardashian', 'brad_pitt'], negative=['kanye_west'])[0:1]
[('angelina_jolie', 0.7405589818954468)]
We can also use the vectors to pick odd one out.
# ask the vectors which of the four space-separated words is the odd one out
model.wv.doesnt_match("kim chloe kylie drake".split())
'drake'
We can also compute the similarity between two sets of words.
# similarity between two sets of words; queried through model.wv like every
# other vector lookup in this notebook rather than through the model object
model.wv.n_similarity(['kim_kardashian', 'khloe_kardashian'], ['kourtney_kardashian', 'kylie_jenner'])
0.77569593611111409
And finally we can easily look at the similarity between two vectors.
# similarity score between the vectors of two individual tokens
model.wv.similarity('khloe_kardashian', 'kourtney_kardashian')
0.6208901907836144
Next we will build a graph of relationships between words based on a seed word.
So the idea here is to take a person, find their N nearest neighbours, and for each of them in turn find their own neighbours, and so on for S steps.
At the end of this the idea is that we will have something representing some notion of a network graph with the original seed word at the center.
def _collect_neighbours(vectors, word, topn, max_step, step=1, edges=None):
    """Walk the nearest-neighbour graph depth-first from ``word``.

    Appends ``[source, neighbour, step]`` for each of the ``topn`` most similar
    words of ``word``, then recurses into each neighbour until ``max_step``
    levels have been recorded. Edges come out in the same order as the
    equivalent hand-nested ``for`` loops.

    Returns the list of edges (``edges`` itself when one is passed in).
    """
    if edges is None:
        edges = []
    for neighbour, _score in vectors.most_similar(word, topn=topn):
        edges.append([word, neighbour, step])
        if step < max_step:
            _collect_neighbours(vectors, neighbour, topn, max_step, step + 1, edges)
    return edges


seed_word = 'justin_bieber'
topn = 10
# how many steps away from the seed word to explore (raise for a bigger graph)
n_steps = 3
# query through model.wv, consistent with the other similarity lookups above
relationships = _collect_neighbours(model.wv, seed_word, topn, n_steps)
relationships[0:20]
[['justin_bieber', 'justin', 1], ['justin', 'biebs', 2], ['biebs', 'justin', 3], ['biebs', 'jb', 3], ['biebs', 'justin_bieber', 3], ['biebs', 'sel', 3], ['biebs', 'beliebers', 3], ['biebs', 'selena', 3], ['biebs', 'bieber', 3], ['biebs', 'sofia', 3], ['biebs', 'pop_star', 3], ['biebs', 'selena_gomez', 3], ['justin', 'jb', 2], ['jb', 'justin', 3], ['jb', 'biebs', 3], ['jb', 'justin_bieber', 3], ['jb', 'sofia', 3], ['jb', 'sofia_richie', 3], ['jb', 'beliebers', 3], ['jb', 'bieber', 3]]
As we are mostly interested in people type relationships, we will do some additional post processing to try clean things up a bit.
Mainly we will restrict ourselves to words that have a "_" in them and thus are going to be the phrases we identified earlier that are more likely to be people.
# put relationships into a df
df_edges = pd.DataFrame(relationships, columns=["src", "dst", "step"])
# keep only edges where BOTH ends contain "_" (i.e. are bigram phrases);
# anything else is probably junk. na=False treats missing values as "no match".
is_phrase_edge = (
    df_edges["src"].str.contains("_", regex=False, na=False)
    & df_edges["dst"].str.contains("_", regex=False, na=False)
)
# .copy() so the new column below is added to an independent frame rather than
# to a filtered view of the original (avoids pandas' SettingWithCopy warning)
df_edges = df_edges[is_phrase_edge].copy()
# add a weight to each edge; something more fancy could be calculated here
df_edges['weight'] = 1
# make a final list of (src, dst) pairs from the clean df
relationships_final = list(zip(df_edges['src'].tolist(), df_edges['dst'].tolist()))
relationships_final[0:20]
[('justin_bieber', 'selena_gomez'), ('justin_bieber', 'sofia_richie'), ('justin_bieber', 'austin_mahone'), ('selena_gomez', 'justin_bieber'), ('selena_gomez', 'zayn_malik'), ('justin_bieber', 'selena_gomez'), ('justin_bieber', 'sofia_richie'), ('justin_bieber', 'austin_mahone'), ('pop_star', 'justin_bieber'), ('selena_gomez', 'justin_bieber'), ('selena_gomez', 'zayn_malik'), ('justin_bieber', 'selena_gomez'), ('selena_gomez', 'justin_bieber'), ('justin_bieber', 'selena_gomez'), ('justin_bieber', 'sofia_richie'), ('justin_bieber', 'austin_mahone'), ('selena_gomez', 'zayn_malik'), ('zayn_malik', 'perrie_edwards'), ('zayn_malik', 'harry_styles'), ('zayn_malik', 'one_direction')]
Now make a .gml network file for R to read and do some network graphs on.
# make a networkx graph from the edge list, carrying 'step' and 'weight' as edge attributes.
# networkx 2.x provides from_pandas_edgelist in place of from_pandas_dataframe;
# use whichever this installation has.
_edges_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
G = _edges_to_graph(df_edges, 'src', 'dst', ['step', 'weight'])
# save the graph as a gml file
nx.write_gml(G, "edges.gml")
I've used R to do most of the network plots. I tend to find R easier for plots; that might just be because I'm generally more familiar with it.
# Use R to run the make_network_graph.R script.
import subprocess

# Raw strings keep the Windows backslashes literal (in a normal string "\b" in
# "\bin" is a backspace character). Passing the command as a list hands each
# path to the process as one argument, so the space in "Program Files" needs no
# quoting. The earlier single string also glued the two paths together with no
# space between them.
rscript_exe = r"C:\Program Files\R\R-3.3.1\bin\Rscript.exe"
r_script = r"C:\Users\Andrew\Documents\pmc-analytical-data-mart\celeb_vectors\make_network_graph.R"
# the return code is shown as the cell output (0 means the script ran successfully)
subprocess.call([rscript_exe, r_script])
1
Next we will do some clustering of the vectors in our Justin Bieber network.
# get all the unique words in the network; sorted so that the label order (and
# therefore the row order of data_array) is the same on every run
labels = sorted(set(chain.from_iterable(relationships_final)))
# get the vectors relating to the words
data_array = model.wv[labels]
# use the data to get distance matrix
data_dist = pdist(data_array)  # condensed pairwise distances between the word vectors
data_link = linkage(data_dist)  # computing the linkage (the heatmap itself does not use it)
# create a dendrogram purely to get a clustered ordering of the words for the heatmap
dendro = FF.create_dendrogram(
    data_array,
    orientation='right',
    # the linkage is computed straight from the vectors, so the argument that
    # create_dendrogram passes in is ignored
    linkagefun=lambda x: linkage(data_array, method='ward', metric='euclidean'),
)
# no labels were given to the dendrogram, so its tick text is expected to hold
# the row indices of data_array in leaf order
dendro_leaves = [int(leaf) for leaf in dendro['layout']['yaxis']['ticktext']]
# create heatmap: reorder the rows and columns of the square distance matrix
# into dendrogram leaf order
heat_data = squareform(data_dist)
heat_data = heat_data[dendro_leaves, :]
heat_data = heat_data[:, dendro_leaves]
# reorder the labels the same way, otherwise the axis labels would not
# correspond to the reordered rows/columns of heat_data
heat_labels = [labels[leaf] for leaf in dendro_leaves]
heatmap = Data([
    go.Heatmap(
        x=heat_labels,
        y=heat_labels,
        z=heat_data,
        colorscale='Pairs',
        showscale=False
    )
])
layout = go.Layout(
    title='Heatmap of ' + seed_word + ' neighbour vectors',
    margin=go.Margin(
        l=120,
        r=120)
)
fig = go.Figure(data=heatmap, layout=layout)
py.iplot(fig, filename='celeb-vecs-heatmap')
Now let's create a dendrogram.
# build a dendrogram of the network words, clustered with complete linkage
def _complete_linkage(_distances):
    # cluster directly on the word vectors; the argument supplied by
    # create_dendrogram is not used
    return linkage(data_array, method='complete', metric='euclidean')


dendro = FF.create_dendrogram(
    data_array,
    orientation='left',
    labels=labels,
    linkagefun=_complete_linkage,
)
# size, title and left margin for the figure
dendro_layout = {
    'width': 800,
    'height': 800,
    'title': 'Dendrogram of ' + seed_word + ' neighbour vectors',
    'margin': go.Margin(l=130),
}
dendro['layout'].update(dendro_layout)
py.iplot(dendro, filename='celeb-vecs-dendrogram')
Let's do another dendrogram, but this time just pull a random sample of words.
# sample up to 1000 words from the vocab and keep only the phrases (words with "_").
# random.sample needs a sequence (sampling from a set is rejected from Python 3.11)
# and the sample size is capped so a vocab smaller than 1000 does not raise ValueError.
vocab_words = list(word_vectors.vocab)
sample_n_words = [
    word
    for word in random.sample(vocab_words, min(1000, len(vocab_words)))
    if "_" in word
]
#sample_n_words = [word for word in random.sample( vocab_words , min( 100 , len( vocab_words ) ) ) ]
# get the vectors relating to the sampled words
sample_data_array = model.wv[sample_n_words]
dendro = FF.create_dendrogram(
    sample_data_array,
    orientation='left',
    labels=sample_n_words,
    # the linkage is computed straight from the vectors, so the argument that
    # create_dendrogram passes in is ignored
    linkagefun=lambda x: linkage(sample_data_array, method='complete', metric='euclidean'),
)
dendro['layout'].update({'width': 800,
                         'height': 1800,
                         'title': 'Dendrogram of a random sample of word vectors',
                         'margin': go.Margin(l=150)})
py.iplot(dendro, filename='sample-vecs-dendrogram')
Here we will take a sample of words from our trained model and create a t-SNE model to project the vectors into a 2-d space. We can then look at them on a scatter plot, where the distance between the points is representative of their distance in the higher dimensional space of our Word2Vec model.
Note: We first do a PCA on the vectors as t-SNE works best with dozens of features as opposed to hundreds. So the PCA gets us from word vectors of length 100 to vectors of the top 50 principal components.
Also - i've plotted the text labels which makes the graph look very messy. Best way to use it then is to zoom in and out on different sections.
# sample some words from the trained model
sample_pct = 0.2
# random.sample needs a sequence (sampling from a set is rejected from Python 3.11)
vocab_words = list(word_vectors.vocab)
sample_words = random.sample(vocab_words, round(len(vocab_words) * sample_pct))
#sample_words = list(set([i[0] for i in relationships_final])) # uncomment this if you want to use our network for the t-SNE
sample_vectors = model.wv[sample_words]
# do PCA down to (at most) 50 components; PCA cannot return more components
# than there are sampled words or vector dimensions, so cap it for small samples
pca_n = PCA(n_components=min(50, *sample_vectors.shape))
pca_result = pca_n.fit_transform(sample_vectors)
# print how much of the variation the top components explain
print('Explained variation (PCA): {}'.format(np.sum(pca_n.explained_variance_ratio_)))
# do t-SNE on the PCA-reduced vectors to get 2-d coordinates
X = pca_result
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
# plot the t-SNE, drawing each point as its word rather than a marker
trace = go.Scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    mode='text',
    text=sample_words
)
data = [trace]
py.iplot(data, filename='celeb-vecs-tsne')
Explained variation (PCA): 0.7729213864172766