#!/usr/bin/env python # coding: utf-8 # ## Setup # # Run all imports (lots and a bit ugly, i know) and define some helper functions. # In[6]: import sys import pandas as pd from gensim.models import Phrases, Word2Vec from gensim.models.phrases import Phraser from gensim.models.keyedvectors import KeyedVectors import nltk from nltk.corpus import stopwords import string import multiprocessing import itertools from collections import defaultdict, Mapping, Container import random from tqdm import tqdm from sys import getsizeof import pickle import re import networkx as nx from itertools import chain import matplotlib.pyplot as plt from scipy.spatial.distance import pdist, squareform from scipy.cluster.hierarchy import linkage, dendrogram import plotly.plotly as py from plotly.graph_objs import * import plotly.figure_factory as FF import plotly.graph_objs as go import plotly.tools import numpy as np from jinja2 import Template from sklearn.manifold import TSNE from sklearn.decomposition import PCA # set plotly creds plotly.tools.set_credentials_file(username='andrewm4894', api_key='YOUR_KEY_HERE') # config vars for bq project_id = "MY_BQ_PROJECT" private_key = "C:/Users/Andrew/Documents/PATH_TO_YOUR_KEY/MY_KEY.json" # set wider prints for pd pd.options.display.max_colwidth = 500 # function to strip html TAG_RE = re.compile(r'<[^>]+>') def remove_tags(text): return TAG_RE.sub('', text) # function to print shape of df def print_dim(df): print("### df SHAPE = "+str(df.shape)+" ###") # ## Get Data # # Our data happens to be stored in [Google Bigquery](https://cloud.google.com/bigquery/) - its awesome and i'd reccomend it to everyone! # # So this part might be a bit specific to Bigquery. We also use [jinja templating](http://jinja.pocoo.org/) here to pull from Bigquery one year at a time. This is because the pandas [read_gbq](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_gbq.html) module can only comfortably pull a certain amount of data in one go. So templating by year is essentially a natural way to shard our data pulls. # In[87]: # use jinja2 template to run query for each year to avoid pd gbq crapping out. # query to pull from data stored in Google Bigquery qry_template = ''' select pid, text from ( select -- hash the id so is still an id but more anonamous sha1(post_id_domain) as pid, post_content as text from hollywoodlife.post_content where post_content is not null and post_content<>'' and post_date like '{{ post_year }}%' group by 1,2 --limit 250 #uncomment when pulling in smaller sample ) ''' template_qry = Template( qry_template ) loop_num = 0 # loop through each year for year in range(2012 , 2018): print(year) # just track if first loop or not to handle the append loop_num += 1 # render the template query for the year of the loop qry_rendered = template_qry.render( post_year = year ) # pull data from google bigquery df_tmp = pd.read_gbq( qry_rendered, project_id, private_key = private_key ) # if first loop then obviously nothing to append as only have results for the first year if loop_num == 1: df = df_tmp # if not the first year then append this year to all others else: # union df's frames = [df, df_tmp] df = pd.concat(frames) # reset index df.reset_index(drop=True) print_dim(df) print(df.head(10)) # Now save the data to csv so we don't need to rerun the pull from BigQuery each time. # In[88]: # save data to csv to be read in easy later. df.to_csv("input_data.csv", encoding = "utf-8", index = False) # copy df to another dataframe df_orig = df del df # Load in saved data from file so no need to pull from BQ each time. # In[7]: # load in saved data df_orig = pd.read_csv("input_data.csv", encoding = "utf-8") df_orig.head() # ## Text Preprocessing: Simple Cleaning # # We now have a data frame where our blob of text is often full of embedded html and has not been cleansed in any particular way. # # Here we will: # # * Strip out all html and essentially render the text as it would read. # * Lowercase everything. # * Replace or remove various specific characters that might make things harder on the model. # # p.s. Using [tqdm](https://github.com/tqdm/tqdm) everywhere in this notebook to get progress bar's on loops. Find it really satisfying for some reason :) # In[90]: # set up list for collecting cleaned content into results_all = [] # pull the df into a list to loop through (seems quicker then looping through the pd df itself) raw_data = list(zip(df_orig.pid,df_orig.text)) # loop through each row of df and clean strip out the html tags for pid, text in tqdm(raw_data): # use try block to ignore errors in cleaning, should be fine as have lots of data try: result = [pid, remove_tags(text)] results_all.append(result) except Exception as e: # do nothing on error, we are ok to ignore any posts we can't clean up for whatever reason #print(e) pass # Now do some kinda ugly and brute force cleaning... # # (I'm sure there is much more elegant and generalized ways to do this but meh) # In[112]: # create a df with outputs df = pd.DataFrame(results_all,columns=['pid','text']) # remove some messy artifacts df.text = df.text.str.replace("Bio:\n"," ") df.text = df.text.str.replace('\]\[\"',"") df.text = df.text.str.replace("’s"," ") df.text = df.text.str.replace("‘s"," ") df.text = df.text.str.replace("’"," ") df.text = df.text.str.replace("‘"," ") df.text = df.text.str.replace("“"," ") df.text = df.text.str.replace("”"," ") df.text = df.text.str.replace("—"," ") df.text = df.text.str.replace("'"," ") df.text = df.text.str.replace("–"," ") df.text = df.text.str.replace("…"," ") # do some string cleaning directly on the df df.text = df.text.str.lower() # set all to lower df.text = df.text.str.replace("'s "," ") # remove 's from end of words df.text = df.text.str.replace("'s,",",") # remove 's from end of words df.text = df.text.str.replace(' \d+ ', ' ') # replace all words that are numbers - they wont be useful for us df.text = df.text.str.replace("' "," ") # remove ' df.text = df.text.str.replace(" '"," ") df.text = df.text.str.replace(", "," , ") df.text = df.text.str.replace(",","") df.text = df.text.str.replace("!",".") df.text = df.text.str.replace("’s "," ") df.text = df.text.str.replace("’s,",",") df.text = df.text.str.replace("’s","") df.text = df.text.str.replace('"','') df.text = df.text.str.replace(' -- ',' ') df.text = df.text.str.replace(' ‘',' ') df.text = df.text.str.replace('’ ',' ') df.text = df.text.str.replace(': ',' ') df.text = df.text.str.replace('—',' ') df.text = df.text.str.replace(r'(\[.*\])', ' ') # remove anything inside [] as is usually leftover html junk df.text = df.text.str.replace('\n\n','\n') df.text = df.text.str.replace('\n',' ') # remove newlines within each article as will introduce dirty data later if left in df.text = df.text.str.replace('❤','love') df.text = df.text.str.replace('(','( ') df.text = df.text.str.replace(')',' )') df.text = df.text.str.replace('\. ',' . ') df.text = df.text.str.replace('“',' ') df.text = df.text.str.replace('”',' ') df.text = df.text.str.replace('\xa0',' ') df.text = df.text.str.replace(' ',' ') df.text = df.text.str.replace(r'(https://www.instagram.com.*? )', 'instagram ') df.text = df.text.str.replace(r'(https://instagram.com.*/? )', 'instagram ') df.text = df.text.str.replace(r'(https://www.twitter.com.*? )', 'twitter ') df.text = df.text.str.replace(r'(https://twitter.com.*? )', 'twitter ') df.text = df.text.str.replace(r'(https://www.youtube.com.*? )', 'youtube ') df.text = df.text.str.replace('?',' ?') df.text = df.text.str.replace('\\\\n',' ') df.text = df.text.str.replace('&','and') df.text = df.text.str.replace('\\\ ',' ') df.text = df.text.str.replace('’ ',' ’ ') df.text = df.text.str.replace(' ‘',' ‘ ') df.text = df.text.str.replace(' pic ',' ') df.text = df.text.str.replace(' pics ',' ') # replace any double white spaces we might be left with df.text = df.text.str.replace(' ',' ') print_dim(df) df.head(5) # Pull a random sample of clean and original text to just eyeball if the cleaning is generally working as expected and not leaving anything else obvious worth dealing with. # In[158]: # pull a random sample article to look at cleaning results samp_ind = list(df.pid.sample(1)) #samp_ind = list(['I8bC0jze55Ow0LZoJSyRYGr1K0M=']) print(samp_ind) print('\n## CLEANED ##############################\n') print(str(df.loc[df['pid'].isin(samp_ind)]['text'])) print('\n## ORIGINAL ##############################\n') print(str(df_orig.loc[df_orig['pid'].isin(samp_ind)]['text'])) # ## Text Preprocessing: Phrase Creation # Create sentences with bigram phrases flagged. # In[113]: # get list of documents documents = list(df['text']) # get a list of single sentences #sentences = [s.split('.',1)[0].strip() for s in documents] sentences = documents # uncomment this to just use the whole dosument as one big sentence # create sentence stream iterator, removing stopwords and punctuation, also remove small sentences sentence_stream = [[str(i).translate(str.maketrans('','',string.punctuation)) for i in sentence.split(" ") if i not in stopwords.words('english')] for sentence in tqdm(sentences)] # remove small sentences as not much to be learned from them #sentence_stream = [sentence for sentence in sentence_stream if len(sentence) > 3 ] # only need this if passing sentences as opposed to the full doc # create bigram phrases phrases = Phrases(sentence_stream, min_count=250) bigram = Phraser(phrases) # create list of sentences to feed into the word2vec model sentences = list(bigram[sentence_stream]) words = [i for j in sentences for i in j] # save sentences object with open('sentences.pickle', 'wb') as handle: pickle.dump(sentences, handle, protocol=pickle.HIGHEST_PROTOCOL) print("### The number of sentences is " + str(len(sentences))) print("### The number of words is " + str(len(words))) # In[77]: # if loading in saved sentences object sentences = pickle.load( open( "sentences.pickle", "rb" ) ) words = [i for j in sentences for i in j] print("### The number of sentences is " + str(len(sentences))) print("### The number of words is " + str(len(words))) # Lets pull a sample of some sentences to see what we now have and if it is looking ok for sending into the actual model building stage. # In[116]: # pull some random sentences to see if looking ok n_samples = 5 sample = random.sample(range(0, len(sentences)), n_samples) [' '.join(sentences[i]) for i in sample] # Get some counts etc that might be useful. # In[117]: # get a big list of all words words_long = list(itertools.chain(*sentences)) # create a big string words_string = ' '.join(words_long) # clean up del words_long # get word counts into a dict word_counts = defaultdict(int) for word in words_string.split(): word_counts[word] += 1 # In[118]: # print top 20 words for w in sorted(word_counts, key=word_counts.get, reverse=True)[:20]: print(w, word_counts[w]) # In[119]: # print bottom 20 words for w in sorted(word_counts, key=word_counts.get, reverse=False)[:20]: print(w, word_counts[w]) # ## Build Model # # Now we are ready to pass our sentences to [gensim.Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html) to build our own model. # # There are a few key hyper parameters we need to build the model. We are not doing anything fancy like cross validation here. Instead i did a few manual trial and error builds on a smaller sample of sentences until i found a paramter set that generlally made sense to me given the size of the datasrt and our focus. # In[120]: # train model model = Word2Vec( sentences = sentences, size = 100, min_count = 250, window = 10, # use a largish window since passing full document as sentence iter = 10, workers = multiprocessing.cpu_count() ) # ## Save Model # # Once the model is built we save it to disk so can be loaded back in later for exploration without needing to rebuild each time. # In[121]: # save model to disk model.save("celeb_word2vec_model") #If you’re finished training a model (=no more updates, only querying), then switch to the gensim.models.KeyedVectors instance in wv word_vectors = model.wv #del model # save word vectors to disk word_vectors.save("word_vectors") # ## Load Model # # Load in the saved model. We can run from here if exploring an already trained and saved model. # In[9]: # load saved model model = Word2Vec.load('celeb_word2vec_model') # load saved word vectors word_vectors = KeyedVectors.load('word_vectors') # ## Save Vectors # # We will save the raw vectors out to a nice and easy, human readable, text file. Then read back in the wide matrix of vectors into a pandas dataframe in order to transform it into a long format later for exploration in a [Tableau Public workbook](https://public.tableau.com/profile/andrew5416#!/vizhome/word_vector_explorer/WordVectorExplorer). # In[123]: # save raw vectors to a text file for exploration later in Tableau model.wv.save_word2vec_format('celeb_word2vec_wide.txt', binary=False) # read back in the wide file into a pandas df celeb_word2vec_wide = pd.read_csv("celeb_word2vec_wide.txt",sep=' ', skiprows=1, header=None) # rename cols celeb_word2vec_wide.rename(columns = {0:'word'}, inplace = True) # print dims of the wide df print(celeb_word2vec_wide.shape) # looks at the df celeb_word2vec_wide.head() # Now we go from wide format to long format as tools like Tableau prefer this. # In[124]: # go from wide to long format using the melt() function celeb_word2vec_long = celeb_word2vec_wide.melt(id_vars=['word']) # rename cols celeb_word2vec_long.rename(columns = {'variable':'vector_element', 'value':'vector_element_magnitude'}, inplace = True) # look at what we have print(celeb_word2vec_long.shape) print(celeb_word2vec_long.head()) # save the long format back out to a text file celeb_word2vec_long.to_csv("celeb_word2vec_long.txt",sep=' ',index=False) # ## Explore Model # Lets just look at a specific vector to see what we actually have - basically an array of positive and negative numbers, all on a similar scale. # In[159]: # get a vector model.wv['justin_bieber'] # Now lets do the obligatory bit of vector arithmetic to help sense check some of our results. # # Great blog post [here](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/) that goes into this in more detail. # # What we ask the vectors below is basically: # # "kim_kardashian" - "kanye_west" + "brad_pitt" = ? # # Another way to think of this is "kanye_west" is to "kim_kardashian" as "brad_pitt" is to ? # # (Spoiler alert - ideally we'd like ? to be "angelina_jolie" to show the model has in some way understood the similar marriage relationship between the two pairs.) # In[5]: # some arithmetic e.g. model.wv.most_similar(positive=['woman', 'king'], negative=['man']) model.wv.most_similar(positive=['kim_kardashian', 'brad_pitt'], negative=['kanye_west'])[0:1] # We can also use the vectors to pick odd one out. # In[127]: model.wv.doesnt_match("kim chloe kylie drake".split()) # We can also compute the similarity between two sets of words. # In[154]: model.n_similarity(['kim_kardashian', 'khloe_kardashian'], ['kourtney_kardashian', 'kylie_jenner']) # And finally we can easily look at the similarity between two vectors. # In[155]: model.wv.similarity('khloe_kardashian', 'kourtney_kardashian') # ## Get Graph # # Next we will build a graph of relationships between words based on a seed word. # # So the idea here is to take a person, find their N nearest neighbours, and for each of them in turn find thier own neighbours and on for S steps. # # At the end of this the idea is that we will have something representing some notion of a network graph with the original seed word at the center. # In[12]: relationships = [] seed_word = 'justin_bieber' topn = 10 for a in model.most_similar(seed_word , topn = topn): to_node1 = a[0] relationships.append([seed_word , to_node1 , 1]) for b in model.most_similar(to_node1 , topn = topn): to_node2 = b[0] relationships.append([to_node1 , to_node2, 2]) for c in model.most_similar(to_node2 , topn = topn): to_node3 = c[0] relationships.append([to_node2 , to_node3 , 3]) # for d in model.most_similar(to_node3 , topn = topn): # to_node4 = d[0] # relationships.append([to_node3 , to_node4 , 4]) # for e in model.most_similar(to_node4 , topn = topn): # to_node5 = e[0] # relationships.append([to_node4 , to_node5 , 5]) relationships[0:20] # As we are mostly interested in people type relationships, we will do some additional post processing to try clean things up a bit. # # Mainly we will restrict ourselves to words that have a "_" in them and thus a going to be the phrases we identifed earlier that are more likley to be people. # In[13]: # put relationships into a df df_edges = pd.DataFrame(relationships,columns=["src","dst","step"]) # do some cleaning of things that are probably junk df_edges = df_edges[df_edges["dst"].str.contains("_") == True] df_edges = df_edges[df_edges["src"].str.contains("_") == True] # add a weight to each edge if we so wished we could calculate something more fancy to put here df_edges['weight'] = 1 # make a final list from the clean df relationships_final = list(zip(df_edges['src'].tolist(),df_edges['dst'].tolist())) relationships_final[0:20] # Now make a [.gml](https://gephi.org/users/supported-graph-formats/gml-format/) network file for R to read and do some network graphs on. # In[146]: # make a networkx graph and save edges file G = nx.from_pandas_dataframe(df_edges, 'src', 'dst', ['step','weight']) # save the graph as a gml file nx.write_gml(G, "edges.gml") # ## Plot Graph Using R # # I've used R to do most of the network plots. I tend to find R easier for plots, might be just because i'm generlaly more familiar with it. # In[133]: # Use R to run the make_network_graph.R script. # i dont think this works fully - tend to just run the rscript in rstudio manually import subprocess subprocess.call("cmd /C ""C:\Program Files\R\R-3.3.1\bin\Rscript.exe" "C:\\Users\\Andrew\\Documents\\pmc-analytical-data-mart\\celeb_vectors\\make_network_graph.R""") # ## Cluster Word Vectors # # Next we will do some clustering of the vectors in our Justin Bieber network. # In[147]: # get a set of all the unique words in the network labels = list(set(list(chain.from_iterable(relationships_final)))) # get the vectors relating to the words data_array = model.wv[labels] # In[148]: # use the data to get distance matrix data_dist = pdist(data_array) # computing the distance data_link = linkage(data_dist) # computing the linkage # In[149]: # create a dendrogram for input into a heatmap dendro = FF.create_dendrogram(data_array, orientation='right', linkagefun=lambda x: linkage(data_array, method = 'ward', metric = 'euclidean') ) # create heatmap dendro_leaves = dendro['layout']['yaxis']['ticktext'] dendro_leaves = list(map(int, dendro_leaves)) heat_data = squareform(data_dist) heat_data = heat_data[dendro_leaves,:] heat_data = heat_data[:,dendro_leaves] heatmap = Data([ go.Heatmap( x = labels, y = labels, z = heat_data, colorscale='Pairs', showscale = False ) ]) layout = go.Layout( title = 'Heatmap of ' + seed_word + ' neighbour vectors', margin=go.Margin( l=120, r=120) ) fig = go.Figure(data=heatmap, layout=layout) py.iplot(fig, filename = 'celeb-vecs-heatmap') # Now lets create a dendrogram. # In[150]: # build a dendrogram dendro = FF.create_dendrogram(data_array, orientation='left', labels=labels, linkagefun=lambda x: linkage(data_array, method = 'complete', metric = 'euclidean')) dendro['layout'].update({'width':800, 'height':800, 'title':'Dendrogram of ' + seed_word + ' neighbour vectors', 'margin':go.Margin(l=130)}) py.iplot(dendro, filename='celeb-vecs-dendrogram') # Lets do another dendrogram but this time just pull a random sample of words. # In[198]: sample_n_words = [word for word in random.sample( set( word_vectors.vocab ) , 1000 ) if "_" in word] #sample_n_words = [word for word in random.sample( set( word_vectors.vocab ) , 100 ) ] # get the vectors relating sample_data_array = model.wv[sample_n_words] dendro = FF.create_dendrogram(sample_data_array, orientation='left', labels=sample_n_words, linkagefun=lambda x: linkage(sample_data_array, method = 'complete', metric = 'euclidean')) dendro['layout'].update({'width':800, 'height':1800, 'title':'Dendrogram of a random sample of word vectors', 'margin':go.Margin(l=150)}) py.iplot(dendro, filename='sample-vecs-dendrogram') # ## t-SNE of Vectors # # Here we will take a sample of words from our trained model and create a [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) model to project the vectors into a 2-d space where we can look at them on a scatter plot whereby the distance between the points is repesentiative of their distance in the higher dimensional space of our Word2Vec model. # # Note: We first do a [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis) on the vectors as t-SNE works best with dozens of features as opposed to hundreds. So the PCA gets us from 100 long word vectors to vectors of the top 20 principle components. # # Also - i've plotted the text labels which makes the graph look very messy. Best way to use it then is to zoom in and out on different sections. # In[151]: # sample some words from the trained model sample_pct = 0.2 sample_words = random.sample( set( word_vectors.vocab ) , round( len( word_vectors.vocab ) * sample_pct ) ) #sample_words = list(set([i[0] for i in relationships_final])) # uncomment this if you want to use our network for the t-SNE sample_vectors = model.wv[ sample_words ] # do PCA pca_n = PCA( n_components = 50 ) pca_result = pca_n.fit_transform( sample_vectors ) # print how much of the variation the top components explain print( 'Explained variation (PCA): {}'.format( np.sum( pca_n.explained_variance_ratio_ ) ) ) # do t-SNE X = pca_result tsne = TSNE( n_components = 2 ) X_tsne = tsne.fit_transform( X ) # plot the t-SNE trace = go.Scatter( x = X_tsne[:, 0], y = X_tsne[:, 1], mode = 'text', text = sample_words ) data = [ trace ] py.iplot( data, filename='celeb-vecs-tsne' )