#!/usr/bin/env python
# coding: utf-8

# # Clustering Instagram users using hashtags: topic analysis and visualization in D3.js

# In[1]:

import pymongo as pm
import unicodedata


# ### Reading the data from MongoDB

# In[2]:

client = pm.MongoClient()
db = client.instagram
tagsDB = db.tags


# ### Extracting the tag data

# In[3]:

rawTags = []
for user in tagsDB.find():
    rawTags.extend(user['tags'])


# In[4]:

len(rawTags)


# In[5]:

rawTags[:10]


# In[6]:

tagsRDD = sc.parallelize(rawTags)


# In[7]:

tagsRDD.count()


# ### Cleaning
# 
# Note: the normalization below transliterates every tag to plain ASCII, so language-specific characters are dropped. If you want to keep language-specific features and words, you have to clean the data in a different way.

# In[8]:

countsRDD = (
    tagsRDD
    .map(lambda tag: (unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore'), 1))
    .reduceByKey(lambda a, b: a + b)
)


# In[9]:

countsRDD.count()


# ### Exploring the data

# In[10]:

# the 500 most frequent tags, ordered by descending count
ordered = countsRDD.takeOrdered(500, lambda (key, value): -value)


# In[11]:

ordered


# In[12]:

for order in ordered:
    print order[0], ",",


# In[13]:

# write each tag repeated proportionally to its frequency
with open('/home/volodymyrmiz/Desktop/rawTags.txt', 'w') as f:
    for tag in ordered:
        if tag[0] != '':
            f.write((tag[0] + ' ') * (tag[1] / 10))


# In[14]:

from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np


# In[15]:

frequentTags = [tag[0] for tag in ordered]


# In[16]:

frequency = [tag[1] for tag in ordered]


# In[17]:

y_pos = np.arange(len(frequentTags))


# In[19]:

#plt.barh(y_pos, frequency, alpha=0.5)
#plt.yticks(y_pos, frequentTags)
#plt.show()


# ### Finding word co-occurrences

# In[20]:

userTags = []
for user in tagsDB.find():
    userTags.append([unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore')
                     for tag in user['tags']
                     if unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore') != ''])


# In[21]:

userTags[0]


# In[22]:

from collections import Counter

search_word = "train"
count_search = Counter()
for tags in userTags:
    if search_word in tags:
        count_search.update(tags)
print("Co-occurrence for %s:" % search_word)
for word in count_search.most_common(21):
    print word[0]


# ### Topic analysis using LDA
# 
# A simple mixture model restricts a document to being associated with a single topic. LDA, on the other hand, involves three levels, and notably the topic node is sampled repeatedly within a document. Under this model, documents can be associated with multiple topics.
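# To make the three-level generative view described above concrete, the next cell is a minimal NumPy sketch of the LDA generative process. The topic count, toy vocabulary and Dirichlet priors are illustrative assumptions only, not values used by the Spark model trained below.

# In[ ]:

import numpy as np

np.random.seed(0)

nTopics = 3
vocab = ['travel', 'train', 'food', 'coffee', 'sunset', 'beach']
alpha = 0.5   # document-topic Dirichlet prior (assumed value)
eta = 0.5     # topic-word Dirichlet prior (assumed value)

# one multinomial distribution over the vocabulary per topic
beta = np.random.dirichlet([eta] * len(vocab), size=nTopics)

def sampleDocument(nWords=8):
    # each document draws its own mixture over topics...
    theta = np.random.dirichlet([alpha] * nTopics)
    words = []
    for _ in range(nWords):
        z = np.random.choice(nTopics, p=theta)        # ...and the topic is re-sampled for every word
        w = np.random.choice(len(vocab), p=beta[z])   # the word is drawn from that topic's distribution
        words.append(vocab[w])
    return theta, words

theta, words = sampleDocument()
print theta
print words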
# In[23]:

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors


# In[24]:

tagsList = []
for user in tagsDB.find():
    tagsList.append((str(user['_id']),
                     [unicodedata.normalize('NFKD', t).encode('ascii', 'ignore')
                      for t in user['tags']
                      if unicodedata.normalize('NFKD', t).encode('ascii', 'ignore') != '']))


# #### Filtering the tag list of each user: remove the most common and rarely used tags

# In[25]:

filteredList = []
for tag in tagsList:
    # keep only tags that appear in the frequentTags list built above
    filteredList.append((tag[0], list(set(tag[1]).intersection(frequentTags[:]))))


# In[26]:

tagsListDF = sc.parallelize(filteredList).toDF(["id", "tokens"])


# #### Vectorizing the tag array of each user

# In[28]:

from pyspark.ml.feature import CountVectorizer


# In[29]:

vectorizer = CountVectorizer(inputCol="tokens", outputCol="features").fit(tagsListDF)


# In[30]:

countVectors = vectorizer.transform(tagsListDF).select("id", "features")


# In[31]:

countVectors.take(1)


# #### Computing TF-IDF weights for each word instead of raw bag-of-words counts

# In[32]:

from pyspark.mllib.feature import IDF


# In[33]:

frequencyVectors = countVectors.map(lambda vector: vector[1])


# In[34]:

frequencyVectors.take(2)


# In[35]:

frequencyVectors.cache()
idf = IDF().fit(frequencyVectors)
tfidf = idf.transform(frequencyVectors)


# In[36]:

tfidf.take(1)


# In[37]:

# just in case document ids are needed later
tfidf_with_ids = countVectors.map(lambda vector: int(vector[0])).zip(tfidf).map(lambda pair: [pair[0], pair[1]])


# In[38]:

# every document gets id 1 here; use tfidf_with_ids above if unique ids are needed
tfidf_with_ids.take(1)


# In[39]:

corpus = tfidf.map(lambda x: [1, x]).cache()


# In[40]:

corpus.take(10)


# #### Building a Latent Dirichlet Allocation model for clustering

# In[42]:

ldaModel = LDA.train(corpus, k=15, maxIterations=100, optimizer="online",
                     docConcentration=2.0, topicConcentration=3.0)


# Note: LDA does not perform well here with the EMLDAOptimizer, which is used by default: with it, the topics are significantly biased towards the most popular hashtags. I used the OnlineLDAOptimizer instead. It implements the online variational Bayes LDA algorithm, which processes a subset of the corpus on each iteration and updates the term-topic distribution adaptively.

# In[43]:

len(ldaModel.topicsMatrix())


# In[44]:

topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)


# In[45]:

topicIndices[0]


# In[46]:

vocablist = vectorizer.vocabulary


# In[47]:

ldaModel.vocabSize()


# In[48]:

# from operator import itemgetter
# for topic in topicIndices:
#     text = itemgetter(*topic[0])(vocablist)
#     print "TOPIC"
#     for tag in text:
#         print tag, topic[1][text.index(tag)]


# ### Visualization

# In[49]:

topicsRDD = sc.parallelize(topicIndices)


# In[50]:

import operator

# map the term indices of each topic to vocabulary words and pair them with their weights
termsRDD = topicsRDD.map(lambda topic: zip(operator.itemgetter(*topic[0])(vocablist), topic[1]))


# In[51]:

termsRDD.take(25)


# In[52]:

indexedTermsRDD = termsRDD.zipWithIndex()


# In[53]:

# flatten to (term, probability, topicId) triples
termsRDD = indexedTermsRDD.flatMap(lambda term: [(t[0], t[1], term[1]) for t in term[0]])


# In[54]:

termDF = termsRDD.toDF(['term', 'probability', 'topicId'])


# In[55]:

termDF.take(10)


# In[56]:

rawJson = termDF.toJSON().collect()


# In[57]:

from IPython.core.display import display, HTML
from IPython.display import Javascript

s = ""
for line in rawJson:
    s += (str(line) + ',')
stringJson = s[:-1]


# In[58]:

stringJson


# In[59]:

html_code = """ """ % stringJson


# In[60]:

display(HTML(html_code))
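# The triple-quoted HTML/D3 template in the cell above is empty in this export, so the original markup is not reproduced here. The next cell is only an illustrative sketch, assuming a layout of my own choosing: it embeds the collected JSON records into the page and uses D3 v3 to draw, for each topic, a row of circles whose area grows with the term probability, labelled with the term. The variable htmlSketch, the CDN URL and all sizes are assumptions, not the original notebook's visualization.

# In[ ]:

htmlSketch = """
<div id="topics"></div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.17/d3.min.js"></script>
<script>
var data = [%s];

var rowHeight = 40,
    colWidth = 150,
    width = 60 + 5 * colWidth,
    nTopics = d3.max(data, function(d) { return d.topicId; }) + 1,
    height = nTopics * rowHeight + 20;

var svg = d3.select("#topics").append("svg")
    .attr("width", width)
    .attr("height", height);

var node = svg.selectAll("g")
    .data(data)
  .enter().append("g")
    .attr("transform", function(d, i) {
      // one row per topic; describeTopics(maxTermsPerTopic=5) yields 5 terms per row
      return "translate(" + (60 + (i %% 5) * colWidth) + "," +
                            (20 + d.topicId * rowHeight) + ")";
    });

node.append("circle")
    .attr("r", function(d) { return Math.sqrt(d.probability) * 40; })
    .style("fill", "steelblue")
    .style("opacity", 0.6);

node.append("text")
    .attr("dy", ".35em")
    .style("font", "10px sans-serif")
    .text(function(d) { return d.term; });
</script>
""" % stringJson

display(HTML(htmlSketch))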