Run all the imports (lots and a bit ugly, I know) and define some helper functions.
import sys
import pandas as pd
from gensim.models import Phrases, Word2Vec
from gensim.models.phrases import Phraser
from gensim.models.keyedvectors import KeyedVectors
import nltk
from nltk.corpus import stopwords
import string
import multiprocessing
import itertools
from collections import defaultdict
import random
from tqdm import tqdm
from sys import getsizeof
import pickle
import re
import networkx as nx
from itertools import chain
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF
import plotly.graph_objs as go
import plotly.tools
import numpy as np
from jinja2 import Template
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# set plotly creds
plotly.tools.set_credentials_file(username='andrewm4894', api_key='YOUR_KEY_HERE')
# config vars for bq
project_id = "MY_BQ_PROJECT"
private_key = "C:/Users/Andrew/Documents/PATH_TO_YOUR_KEY/MY_KEY.json"
# set wider prints for pd
pd.options.display.max_colwidth = 500
# function to strip html
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
# function to print shape of df
def print_dim(df):
    print("### df SHAPE = " + str(df.shape) + " ###")
Our data happens to be stored in Google BigQuery - it's awesome and I'd recommend it to everyone!
So this part might be a bit specific to BigQuery. We also use Jinja templating here to pull from BigQuery one year at a time, because the pandas read_gbq module can only comfortably pull a certain amount of data in one go. So templating by year is essentially a natural way to shard our data pulls.
# use jinja2 template to run query for each year to avoid pd gbq crapping out.
# query to pull from data stored in Google Bigquery
qry_template = '''
select pid, text from
(
select
-- hash the id so it is still an id but more anonymous
sha1(post_id_domain) as pid,
post_content as text
from
hollywoodlife.post_content
where
post_content is not null
and
post_content<>''
and
post_date like '{{ post_year }}%'
group by 1,2
--limit 250 #uncomment when pulling in smaller sample
)
'''
template_qry = Template( qry_template )
loop_num = 0
# loop through each year
for year in range(2012, 2018):
    print(year)
    # just track if first loop or not to handle the append
    loop_num += 1
    # render the template query for the year of the loop
    qry_rendered = template_qry.render(post_year=year)
    # pull data from google bigquery
    df_tmp = pd.read_gbq(qry_rendered, project_id, private_key=private_key)
    # if first loop then obviously nothing to append as we only have results for the first year
    if loop_num == 1:
        df = df_tmp
    # if not the first year then append this year to all the others
    else:
        # union the df's
        frames = [df, df_tmp]
        df = pd.concat(frames)
# reset index (note the result needs to be assigned back)
df = df.reset_index(drop=True)
print_dim(df)
print(df.head(10))
Now save the data to csv so we don't need to rerun the pull from BigQuery each time.
# save data to csv to be read in easy later.
df.to_csv("input_data.csv", encoding = "utf-8", index = False)
# copy df to another dataframe
df_orig = df
del df
Load in saved data from file so no need to pull from BQ each time.
# load in saved data
df_orig = pd.read_csv("input_data.csv", encoding = "utf-8")
df_orig.head()
We now have a data frame where our blob of text is often full of embedded HTML and has not been cleansed in any particular way.
Here we will loop through each post and strip out the HTML tags using the remove_tags() helper defined earlier.
p.s. I'm using tqdm everywhere in this notebook to get progress bars on loops. Find it really satisfying for some reason :)
# set up list for collecting cleaned content into
results_all = []
# pull the df into a list to loop through (seems quicker than looping through the pd df itself)
raw_data = list(zip(df_orig.pid, df_orig.text))
# loop through each row of the df and strip out the html tags
for pid, text in tqdm(raw_data):
    # use a try block to ignore errors in cleaning, should be fine as we have lots of data
    try:
        result = [pid, remove_tags(text)]
        results_all.append(result)
    except Exception as e:
        # do nothing on error, we are ok to ignore any posts we can't clean up for whatever reason
        #print(e)
        pass
Now do some kinda ugly and brute force cleaning...
(I'm sure there are much more elegant and generalized ways to do this but meh)
# create a df with outputs
df = pd.DataFrame(results_all,columns=['pid','text'])
# remove some messy artifacts
df.text = df.text.str.replace("Bio:\n"," ")
df.text = df.text.str.replace(r'\]\["', "", regex=True) # remove leftover "][" junk
df.text = df.text.str.replace("’s"," ")
df.text = df.text.str.replace("‘s"," ")
df.text = df.text.str.replace("’"," ")
df.text = df.text.str.replace("‘"," ")
df.text = df.text.str.replace("“"," ")
df.text = df.text.str.replace("”"," ")
df.text = df.text.str.replace("—"," ")
df.text = df.text.str.replace("'"," ")
df.text = df.text.str.replace("–"," ")
df.text = df.text.str.replace("…"," ")
# do some string cleaning directly on the df
df.text = df.text.str.lower() # set all to lower
df.text = df.text.str.replace("'s "," ") # remove 's from end of words
df.text = df.text.str.replace("'s,",",") # remove 's from end of words
df.text = df.text.str.replace(r' \d+ ', ' ', regex=True) # remove words that are just numbers - they won't be useful for us
df.text = df.text.str.replace("' "," ") # remove '
df.text = df.text.str.replace(" '"," ")
df.text = df.text.str.replace(", "," , ")
df.text = df.text.str.replace(",","")
df.text = df.text.str.replace("!",".")
df.text = df.text.str.replace("’s "," ")
df.text = df.text.str.replace("’s,",",")
df.text = df.text.str.replace("’s","")
df.text = df.text.str.replace('"','')
df.text = df.text.str.replace(' -- ',' ')
df.text = df.text.str.replace(' ‘',' ')
df.text = df.text.str.replace('’ ',' ')
df.text = df.text.str.replace(': ',' ')
df.text = df.text.str.replace('—',' ')
df.text = df.text.str.replace(r'(\[.*\])', ' ', regex=True) # remove anything inside [] as it is usually leftover html junk
df.text = df.text.str.replace('\n\n','\n')
df.text = df.text.str.replace('\n',' ') # remove newlines within each article as will introduce dirty data later if left in
df.text = df.text.str.replace('❤','love')
df.text = df.text.str.replace('(', '( ', regex=False)
df.text = df.text.str.replace(')', ' )', regex=False)
df.text = df.text.str.replace(r'\. ', ' . ', regex=True)
df.text = df.text.str.replace('“',' ')
df.text = df.text.str.replace('”',' ')
df.text = df.text.str.replace('\xa0',' ')
df.text = df.text.str.replace(' ',' ')
df.text = df.text.str.replace(r'(https://www.instagram.com.*? )', 'instagram ', regex=True)
df.text = df.text.str.replace(r'(https://instagram.com.*/? )', 'instagram ', regex=True)
df.text = df.text.str.replace(r'(https://www.twitter.com.*? )', 'twitter ', regex=True)
df.text = df.text.str.replace(r'(https://twitter.com.*? )', 'twitter ', regex=True)
df.text = df.text.str.replace(r'(https://www.youtube.com.*? )', 'youtube ', regex=True)
df.text = df.text.str.replace('?', ' ?', regex=False)
df.text = df.text.str.replace(r'\\n', ' ', regex=True) # strip any literal \n sequences left in the text
df.text = df.text.str.replace('&','and')
df.text = df.text.str.replace(r'\\ ', ' ', regex=True) # strip stray backslashes
df.text = df.text.str.replace('’ ',' ’ ')
df.text = df.text.str.replace(' ‘',' ‘ ')
df.text = df.text.str.replace(' pic ',' ')
df.text = df.text.str.replace(' pics ',' ')
# replace any double white spaces we might be left with
df.text = df.text.str.replace(' ',' ')
print_dim(df)
df.head(5)
Pull a random sample of cleaned and original text to eyeball whether the cleaning is generally working as expected and isn't leaving anything else obvious worth dealing with.
# pull a random sample article to look at cleaning results
samp_ind = list(df.pid.sample(1))
#samp_ind = list(['I8bC0jze55Ow0LZoJSyRYGr1K0M='])
print(samp_ind)
print('\n## CLEANED ##############################\n')
print(str(df.loc[df['pid'].isin(samp_ind)]['text']))
print('\n## ORIGINAL ##############################\n')
print(str(df_orig.loc[df_orig['pid'].isin(samp_ind)]['text']))
Create sentences with bigram phrases flagged.
# get list of documents
documents = list(df['text'])
# get a list of single sentences
#sentences = [s.split('.',1)[0].strip() for s in documents] # uncomment to use just the first sentence of each document
sentences = documents # use the whole document as one big sentence
# create sentence stream, removing stopwords and punctuation
# nltk.download('stopwords') # run this once if the nltk stopwords corpus is not already downloaded
stop_words = set(stopwords.words('english')) # build the stopword set once rather than on every word
sentence_stream = [[str(i).translate(str.maketrans('','',string.punctuation)) for i in sentence.split(" ") if i not in stop_words] for sentence in tqdm(sentences)]
# remove small sentences as not much to be learned from them
#sentence_stream = [sentence for sentence in sentence_stream if len(sentence) > 3 ] # only need this if passing sentences as opposed to the full doc
# create bigram phrases
phrases = Phrases(sentence_stream, min_count=250)
bigram = Phraser(phrases)
# create list of sentences to feed into the word2vec model
sentences = list(bigram[sentence_stream])
words = [i for j in sentences for i in j]
# save sentences object
with open('sentences.pickle', 'wb') as handle:
    pickle.dump(sentences, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("### The number of sentences is " + str(len(sentences)))
print("### The number of words is " + str(len(words)))
# if loading in saved sentences object
sentences = pickle.load( open( "sentences.pickle", "rb" ) )
words = [i for j in sentences for i in j]
print("### The number of sentences is " + str(len(sentences)))
print("### The number of words is " + str(len(words)))
Let's pull a sample of some sentences to see what we now have and whether it looks ok for sending into the actual model building stage.
# pull some random sentences to see if looking ok
n_samples = 5
sample = random.sample(range(0, len(sentences)), n_samples)
[' '.join(sentences[i]) for i in sample]
Get some counts etc that might be useful.
# get a big list of all words
words_long = list(itertools.chain(*sentences))
# create a big string
words_string = ' '.join(words_long)
# clean up
del words_long
# get word counts into a dict
word_counts = defaultdict(int)
for word in words_string.split():
    word_counts[word] += 1
# print top 20 words
for w in sorted(word_counts, key=word_counts.get, reverse=True)[:20]:
    print(w, word_counts[w])
# print bottom 20 words
for w in sorted(word_counts, key=word_counts.get, reverse=False)[:20]:
    print(w, word_counts[w])
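As an aside, the same counts can be had a bit more concisely with collections.Counter - a minimal equivalent sketch:
# equivalent word counts using collections.Counter
from collections import Counter
word_counts = Counter(words)
print(word_counts.most_common(20)) # top 20 words
print(word_counts.most_common()[-20:]) # bottom 20 words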
Now we are ready to pass our sentences to gensim.Word2Vec to build our own model.
There are a few key hyperparameters we need to build the model. We are not doing anything fancy like cross validation here. Instead I did a few manual trial and error builds on a smaller sample of sentences until I found a parameter set that generally made sense to me given the size of the dataset and our focus.
# train model
model = Word2Vec(
    sentences = sentences,
    size = 100,
    min_count = 250,
    window = 10, # use a largish window since we are passing the full document as one sentence
    iter = 10,
    workers = multiprocessing.cpu_count()
)
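One quick thing worth checking after training (using the gensim 3.x vocab attribute) is how many words and phrases actually survived the min_count cutoff:
# how many words/phrases made it into the model (gensim 3.x attribute)
print("### vocab size = " + str(len(model.wv.vocab)) + " ###")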
Once the model is built we save it to disk so it can be loaded back in later for exploration without needing to rebuild each time.
# save model to disk
model.save("celeb_word2vec_model")
#If you’re finished training a model (=no more updates, only querying), then switch to the gensim.models.KeyedVectors instance in wv
word_vectors = model.wv
#del model
# save word vectors to disk
word_vectors.save("word_vectors")
Load in the saved model. We can run from here if exploring an already trained and saved model.
# load saved model
model = Word2Vec.load('celeb_word2vec_model')
# load saved word vectors
word_vectors = KeyedVectors.load('word_vectors')
We will save the raw vectors out to a nice and easy, human-readable text file, then read the wide matrix of vectors back into a pandas dataframe in order to transform it into a long format later for exploration in a Tableau Public workbook.
# save raw vectors to a text file for exploration later in Tableau
model.wv.save_word2vec_format('celeb_word2vec_wide.txt', binary=False)
# read back in the wide file into a pandas df
celeb_word2vec_wide = pd.read_csv("celeb_word2vec_wide.txt",sep=' ', skiprows=1, header=None)
# rename cols
celeb_word2vec_wide.rename(columns = {0:'word'}, inplace = True)
# print dims of the wide df
print(celeb_word2vec_wide.shape)
# look at the df
celeb_word2vec_wide.head()
Now we go from wide format to long format as tools like Tableau prefer this.
# go from wide to long format using the melt() function
celeb_word2vec_long = celeb_word2vec_wide.melt(id_vars=['word'])
# rename cols
celeb_word2vec_long.rename(columns = {'variable':'vector_element', 'value':'vector_element_magnitude'}, inplace = True)
# look at what we have
print(celeb_word2vec_long.shape)
print(celeb_word2vec_long.head())
# save the long format back out to a text file
celeb_word2vec_long.to_csv("celeb_word2vec_long.txt",sep=' ',index=False)
Let's just look at a specific vector to see what we actually have - basically an array of positive and negative numbers, all on a similar scale.
# get a vector
model.wv['justin_bieber']
Now let's do the obligatory bit of vector arithmetic to help sense-check some of our results.
Great blog post here that goes into this in more detail.
What we ask the vectors below is basically:
"kim_kardashian" - "kanye_west" + "brad_pitt" = ?
Another way to think of this is "kanye_west" is to "kim_kardashian" as "brad_pitt" is to ?
(Spoiler alert - ideally we'd like ? to be "angelina_jolie" to show the model has in some way understood the similar marriage relationship between the two pairs.)
# some arithmetic e.g. model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
model.wv.most_similar(positive=['kim_kardashian', 'brad_pitt'], negative=['kanye_west'])[0:1]
We can also use the vectors to pick the odd one out.
model.wv.doesnt_match("kim chloe kylie drake".split())
We can also compute the similarity between two sets of words.
model.wv.n_similarity(['kim_kardashian', 'khloe_kardashian'], ['kourtney_kardashian', 'kylie_jenner'])
And finally we can easily look at the similarity between two vectors.
model.wv.similarity('khloe_kardashian', 'kourtney_kardashian')
Next we will build a graph of relationships between words based on a seed word.
So the idea here is to take a person, find their N nearest neighbours, and for each of them in turn find their own neighbours, and so on for S steps.
At the end of this we should have something representing some notion of a network graph with the original seed word at the center.
relationships = []
seed_word = 'justin_bieber'
topn = 10
for a in model.most_similar(seed_word, topn=topn):
    to_node1 = a[0]
    relationships.append([seed_word, to_node1, 1])
    for b in model.most_similar(to_node1, topn=topn):
        to_node2 = b[0]
        relationships.append([to_node1, to_node2, 2])
        for c in model.most_similar(to_node2, topn=topn):
            to_node3 = c[0]
            relationships.append([to_node2, to_node3, 3])
            # uncomment to go deeper than 3 steps
            #for d in model.most_similar(to_node3, topn=topn):
            #    to_node4 = d[0]
            #    relationships.append([to_node3, to_node4, 4])
            #    for e in model.most_similar(to_node4, topn=topn):
            #        to_node5 = e[0]
            #        relationships.append([to_node4, to_node5, 5])
relationships[0:20]
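If we wanted to go deeper than three steps without piling on more nested loops (like the commented-out levels above), one option is a small breadth-first helper along the lines of the sketch below. Note that expand_neighbours() is just a hypothetical illustration and isn't used elsewhere in this notebook.
# hypothetical generalised version of the nested loops above:
# breadth-first expansion of the seed word out to max_steps steps
def expand_neighbours(model, seed_word, topn=10, max_steps=3):
    edges = []
    frontier = [seed_word]
    for step in range(1, max_steps + 1):
        next_frontier = []
        for word in frontier:
            for neighbour, _score in model.most_similar(word, topn=topn):
                edges.append([word, neighbour, step])
                next_frontier.append(neighbour)
        frontier = next_frontier
    return edges
# e.g. relationships = expand_neighbours(model, 'justin_bieber', topn=10, max_steps=3)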
As we are mostly interested in people-type relationships, we will do some additional post-processing to try to clean things up a bit.
Mainly we will restrict ourselves to words that have a "_" in them and thus are going to be the phrases we identified earlier that are more likely to be people.
# put relationships into a df
df_edges = pd.DataFrame(relationships,columns=["src","dst","step"])
# do some cleaning of things that are probably junk
df_edges = df_edges[df_edges["dst"].str.contains("_")]
df_edges = df_edges[df_edges["src"].str.contains("_")]
# add a weight to each edge - if we so wished we could calculate something fancier to put here
df_edges['weight'] = 1
# make a final list from the clean df
relationships_final = list(zip(df_edges['src'].tolist(),df_edges['dst'].tolist()))
relationships_final[0:20]
Now make a .gml network file for R to read and do some network graphs on.
# make a networkx graph from the edges df
# (note: on networkx >= 2.0 this call was renamed - use nx.from_pandas_edgelist(df_edges, 'src', 'dst', ['step', 'weight']))
G = nx.from_pandas_dataframe(df_edges, 'src', 'dst', ['step','weight'])
# save the graph as a gml file
nx.write_gml(G, "edges.gml")
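Before handing the .gml file over to R, it's worth a quick sanity check on the size of the graph we just built:
# quick summary of the graph
print("nodes: " + str(G.number_of_nodes()))
print("edges: " + str(G.number_of_edges()))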
I've used R to do most of the network plots. I tend to find R easier for plots, might be just because I'm generally more familiar with it.
# Use R to run the make_network_graph.R script.
# i don't think this works fully - i tend to just run the R script manually in RStudio
import subprocess
subprocess.call([r"C:\Program Files\R\R-3.3.1\bin\Rscript.exe",
                 r"C:\Users\Andrew\Documents\pmc-analytical-data-mart\celeb_vectors\make_network_graph.R"])
Next we will do some clustering of the vectors in our Justin Bieber network.
# get a set of all the unique words in the network
labels = list(set(list(chain.from_iterable(relationships_final))))
# get the vectors relating to the words
data_array = model.wv[labels]
# use the data to get distance matrix
data_dist = pdist(data_array) # computing the distance
data_link = linkage(data_dist) # computing the linkage
# create a dendrogram for input into a heatmap
dendro = FF.create_dendrogram(data_array, orientation='right',
                              linkagefun=lambda x: linkage(data_array, method='ward', metric='euclidean'))
# create heatmap
dendro_leaves = dendro['layout']['yaxis']['ticktext']
dendro_leaves = list(map(int, dendro_leaves))
heat_data = squareform(data_dist)
heat_data = heat_data[dendro_leaves,:]
heat_data = heat_data[:,dendro_leaves]
heatmap = Data([
    go.Heatmap(
        x = labels,
        y = labels,
        z = heat_data,
        colorscale = 'Pairs',
        showscale = False
    )
])
layout = go.Layout(
    title = 'Heatmap of ' + seed_word + ' neighbour vectors',
    margin = go.Margin(
        l = 120,
        r = 120
    )
)
fig = go.Figure(data=heatmap, layout=layout)
py.iplot(fig, filename = 'celeb-vecs-heatmap')
Now let's create a dendrogram.
# build a dendrogram
dendro = FF.create_dendrogram(data_array,
                              orientation='left',
                              labels=labels,
                              linkagefun=lambda x: linkage(data_array, method='complete', metric='euclidean'))
dendro['layout'].update({'width': 800,
                         'height': 800,
                         'title': 'Dendrogram of ' + seed_word + ' neighbour vectors',
                         'margin': go.Margin(l=130)})
py.iplot(dendro, filename='celeb-vecs-dendrogram')
Let's do another dendrogram but this time just pull a random sample of words.
sample_n_words = [word for word in random.sample( set( word_vectors.vocab ) , 1000 ) if "_" in word]
#sample_n_words = [word for word in random.sample( set( word_vectors.vocab ) , 100 ) ]
# get the vectors relating to the sampled words
sample_data_array = model.wv[sample_n_words]
dendro = FF.create_dendrogram(sample_data_array,
                              orientation='left',
                              labels=sample_n_words,
                              linkagefun=lambda x: linkage(sample_data_array, method='complete', metric='euclidean'))
dendro['layout'].update({'width': 800,
                         'height': 1800,
                         'title': 'Dendrogram of a random sample of word vectors',
                         'margin': go.Margin(l=150)})
py.iplot(dendro, filename='sample-vecs-dendrogram')
Here we will take a sample of words from our trained model and create a t-SNE model to project the vectors into a 2-d space where we can look at them on a scatter plot, whereby the distance between the points is representative of their distance in the higher dimensional space of our Word2Vec model.
Note: We first do a PCA on the vectors as t-SNE works best with dozens of features as opposed to hundreds. So the PCA gets us from the 100-dimensional word vectors down to the top 50 principal components (as set by n_components below).
Also - I've plotted the text labels, which makes the graph look very messy, so the best way to use it is to zoom in and out on different sections.
# sample some words from the trained model
sample_pct = 0.2
sample_words = random.sample( set( word_vectors.vocab ) , round( len( word_vectors.vocab ) * sample_pct ) )
#sample_words = list(set([i[0] for i in relationships_final])) # uncomment this if you want to use our network for the t-SNE
sample_vectors = model.wv[ sample_words ]
# do PCA
pca_n = PCA( n_components = 50 )
pca_result = pca_n.fit_transform( sample_vectors )
# print how much of the variation the top components explain
print( 'Explained variation (PCA): {}'.format( np.sum( pca_n.explained_variance_ratio_ ) ) )
# do t-SNE
X = pca_result
tsne = TSNE( n_components = 2 )
X_tsne = tsne.fit_transform( X )
# plot the t-SNE
trace = go.Scatter(
    x = X_tsne[:, 0],
    y = X_tsne[:, 1],
    mode = 'text',
    text = sample_words
)
data = [ trace ]
py.iplot( data, filename='celeb-vecs-tsne' )