#!/usr/bin/env python
# coding: utf-8

# # Clustering Instagram users using hashtags: topic analysis and visualization in D3.js

# In[1]:

import pymongo as pm
import unicodedata


# ### Reading the data from MongoDB

# In[2]:

client = pm.MongoClient()
db = client.instagram
tagsDB = db.tags


# ### Extracting the tag data

# In[3]:

rawTags = []
for user in tagsDB.find():
    rawTags.extend(user['tags'])


# In[4]:

len(rawTags)


# In[5]:

rawTags[:10]


# In[6]:

tagsRDD = sc.parallelize(rawTags)


# In[7]:

tagsRDD.count()


# ### Cleaning
# 
# Note: the normalization below transliterates every tag to plain ASCII, so language-specific characters are dropped. If you want to keep language-specific features and words, you have to clean the data in a different way.

# In[8]:

countsRDD = (
    tagsRDD
    .map(lambda tag: (unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore'), 1))
    .reduceByKey(lambda a, b: a + b)
)


# In[9]:

countsRDD.count()


# ### Exploring the data

# In[10]:

# the 500 most frequent tags, ordered by descending count
ordered = countsRDD.takeOrdered(500, lambda (key, value): -value)


# In[11]:

ordered


# In[12]:

for order in ordered:
    print order[0], ",",


# In[13]:

# write each tag repeated proportionally to its frequency
with open('/home/volodymyrmiz/Desktop/rawTags.txt', 'w') as f:
    for tag in ordered:
        if tag[0] != '':
            f.write((tag[0] + ' ') * (tag[1] / 10))


# In[14]:

from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np


# In[15]:

frequentTags = [tag[0] for tag in ordered]


# In[16]:

frequency = [tag[1] for tag in ordered]


# In[17]:

y_pos = np.arange(len(frequentTags))


# In[19]:

#plt.barh(y_pos, frequency, alpha=0.5)
#plt.yticks(y_pos, frequentTags)
#plt.show()


# ### Finding word co-occurrences

# In[20]:

userTags = []
for user in tagsDB.find():
    userTags.append([unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore')
                     for tag in user['tags']
                     if unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore') != ''])


# In[21]:

userTags[0]


# In[22]:

from collections import Counter

search_word = "train"
count_search = Counter()
for tags in userTags:
    if search_word in tags:
        count_search.update(tags)
print("Co-occurrence for %s:" % search_word)
for word in count_search.most_common(21):
    print word[0]


# ### Topic analysis using LDA
# 
# A simple mixture model restricts a document to being associated with a single topic. LDA, on the other hand, involves three levels, and notably the topic node is sampled repeatedly within a document. Under this model, documents can be associated with multiple topics.
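# To make the three-level generative view described above concrete, the next cell is a minimal NumPy sketch of the LDA generative process. The topic count, toy vocabulary and Dirichlet priors are illustrative assumptions only, not values used by the Spark model trained below.

# In[ ]:

import numpy as np

np.random.seed(0)

nTopics = 3
vocab = ['travel', 'train', 'food', 'coffee', 'sunset', 'beach']
alpha = 0.5   # document-topic Dirichlet prior (assumed value)
eta = 0.5     # topic-word Dirichlet prior (assumed value)

# one multinomial distribution over the vocabulary per topic
beta = np.random.dirichlet([eta] * len(vocab), size=nTopics)

def sampleDocument(nWords=8):
    # each document draws its own mixture over topics...
    theta = np.random.dirichlet([alpha] * nTopics)
    words = []
    for _ in range(nWords):
        z = np.random.choice(nTopics, p=theta)        # ...and the topic is re-sampled for every word
        w = np.random.choice(len(vocab), p=beta[z])   # the word is drawn from that topic's distribution
        words.append(vocab[w])
    return theta, words

theta, words = sampleDocument()
print theta
print words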
# In[23]:

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors


# In[24]:

tagsList = []
for user in tagsDB.find():
    tagsList.append((str(user['_id']),
                     [unicodedata.normalize('NFKD', t).encode('ascii', 'ignore')
                      for t in user['tags']
                      if unicodedata.normalize('NFKD', t).encode('ascii', 'ignore') != '']))


# #### Filtering the tag list of each user: remove the most common and rarely used tags

# In[25]:

filteredList = []
for tag in tagsList:
    # keep only tags that appear in the frequentTags list built above
    filteredList.append((tag[0], list(set(tag[1]).intersection(frequentTags[:]))))


# In[26]:

tagsListDF = sc.parallelize(filteredList).toDF(["id", "tokens"])


# #### Vectorizing the tag array of each user

# In[28]:

from pyspark.ml.feature import CountVectorizer


# In[29]:

vectorizer = CountVectorizer(inputCol="tokens", outputCol="features").fit(tagsListDF)


# In[30]:

countVectors = vectorizer.transform(tagsListDF).select("id", "features")


# In[31]:

countVectors.take(1)


# #### Computing TF-IDF weights for each word instead of raw bag-of-words counts

# In[32]:

from pyspark.mllib.feature import IDF


# In[33]:

frequencyVectors = countVectors.map(lambda vector: vector[1])


# In[34]:

frequencyVectors.take(2)


# In[35]:

frequencyVectors.cache()
idf = IDF().fit(frequencyVectors)
tfidf = idf.transform(frequencyVectors)


# In[36]:

tfidf.take(1)


# In[37]:

# just in case document ids are needed later
tfidf_with_ids = countVectors.map(lambda vector: int(vector[0])).zip(tfidf).map(lambda pair: [pair[0], pair[1]])


# In[38]:

# every document gets id 1 here; use tfidf_with_ids above if unique ids are needed
tfidf_with_ids.take(1)


# In[39]:

corpus = tfidf.map(lambda x: [1, x]).cache()


# In[40]:

corpus.take(10)


# #### Building a Latent Dirichlet Allocation model for clustering

# In[42]:

ldaModel = LDA.train(corpus, k=15, maxIterations=100, optimizer="online",
                     docConcentration=2.0, topicConcentration=3.0)


# Note: LDA does not perform well here with the EMLDAOptimizer, which is used by default: with it, the topics are significantly biased towards the most popular hashtags. I used the OnlineLDAOptimizer instead. It implements the online variational Bayes LDA algorithm, which processes a subset of the corpus on each iteration and updates the term-topic distribution adaptively.

# In[43]:

len(ldaModel.topicsMatrix())


# In[44]:

topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)


# In[45]:

topicIndices[0]


# In[46]:

vocablist = vectorizer.vocabulary


# In[47]:

ldaModel.vocabSize()


# In[48]:

# from operator import itemgetter
# for topic in topicIndices:
#     text = itemgetter(*topic[0])(vocablist)
#     print "TOPIC"
#     for tag in text:
#         print tag, topic[1][text.index(tag)]


# ### Visualization

# In[49]:

topicsRDD = sc.parallelize(topicIndices)


# In[50]:

import operator

# map the term indices of each topic to vocabulary words and pair them with their weights
termsRDD = topicsRDD.map(lambda topic: zip(operator.itemgetter(*topic[0])(vocablist), topic[1]))


# In[51]:

termsRDD.take(25)


# In[52]:

indexedTermsRDD = termsRDD.zipWithIndex()


# In[53]:

# flatten to (term, probability, topicId) triples
termsRDD = indexedTermsRDD.flatMap(lambda term: [(t[0], t[1], term[1]) for t in term[0]])


# In[54]:

termDF = termsRDD.toDF(['term', 'probability', 'topicId'])


# In[55]:

termDF.take(10)


# In[56]:

rawJson = termDF.toJSON().collect()


# In[57]:

from IPython.core.display import display, HTML
from IPython.display import Javascript

s = ""
for line in rawJson:
    s += (str(line) + ',')
stringJson = s[:-1]


# In[58]:

stringJson


# In[59]:

html_code = """ """ % stringJson


# In[60]:

display(HTML(html_code))
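# The triple-quoted HTML/D3 template in the cell above is empty in this export, so the original markup is not reproduced here. The next cell is only an illustrative sketch, assuming a layout of my own choosing: it embeds the collected JSON records into the page and uses D3 v3 to draw, for each topic, a row of circles whose area grows with the term probability, labelled with the term. The variable htmlSketch, the CDN URL and all sizes are assumptions, not the original notebook's visualization.

# In[ ]:

htmlSketch = """
<div id="topics"></div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.17/d3.min.js"></script>
<script>
var data = [%s];

var rowHeight = 40,
    colWidth = 150,
    width = 60 + 5 * colWidth,
    nTopics = d3.max(data, function(d) { return d.topicId; }) + 1,
    height = nTopics * rowHeight + 20;

var svg = d3.select("#topics").append("svg")
    .attr("width", width)
    .attr("height", height);

var node = svg.selectAll("g")
    .data(data)
  .enter().append("g")
    .attr("transform", function(d, i) {
      // one row per topic; describeTopics(maxTermsPerTopic=5) yields 5 terms per row
      return "translate(" + (60 + (i %% 5) * colWidth) + "," +
                            (20 + d.topicId * rowHeight) + ")";
    });

node.append("circle")
    .attr("r", function(d) { return Math.sqrt(d.probability) * 40; })
    .style("fill", "steelblue")
    .style("opacity", 0.6);

node.append("text")
    .attr("dy", ".35em")
    .style("font", "10px sans-serif")
    .text(function(d) { return d.term; });
</script>
""" % stringJson

display(HTML(htmlSketch))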