from pymongo import MongoClient
from datetime import datetime

client = MongoClient()
langcode = "en"
loansCollection = client.kiva.loans
#print "Number of loan descriptions in '%s': %d" % (langcode, loansCollection.find({"processed_description.texts.%s" % langcode: {'$exists': True}}).count())
startYear = 2015
start = datetime(startYear, 1, 1)
c = loansCollection.find({"$and": [{"posted_date": {"$gte": start}},
                                   {"processed_description.texts.%s" % langcode: {'$exists': True}}]})
print "Number of loans in '%s' since %d: %d" % (langcode, startYear, c.count())

documents = []
for loan in c:
    documents.append(loan["processed_description"]["texts"][langcode])
print documents[0:1]

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from pprint import pprint
import sys

# number of documents to cluster
nrDocs = 2500

# Build a TFIDF weighted document-term matrix
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), tokenizer=word_tokenize, use_idf=True)
docVectors = vectorizer.fit_transform(documents[:nrDocs])

# Inspired by http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
print("number of sample docs: %d, \t number of unigram features: %d" % (docVectors.shape[0], docVectors.shape[1]))

minNrClusters = 5
maxNrClusters = 25
results = {}

def groupDocsByCluster(clusterIndices, docs):
    '''
    clusterIndices is a list of cluster indices
    docs is a list of documents
    Returns a dict with cluster indices as keys and a list of documents as values.
    '''
    assert len(clusterIndices) == len(docs), \
        "number of cluster indices %d and number of documents %d are unequal" % (len(clusterIndices), len(docs))
    result = {}
    for cnt in range(len(clusterIndices)):
        i = clusterIndices[cnt]
        if i in result:
            result[i].append(docs[cnt])
        else:
            result[i] = [docs[cnt]]
    return result

for nrClusters in range(minNrClusters, maxNrClusters+1):
    print >> sys.stderr, "Fitting KMeans model with %s clusters" % nrClusters
    results[nrClusters] = {}
    model = KMeans(n_clusters=nrClusters).fit(docVectors)
    results[nrClusters]['inertia'] = model.inertia_
    results[nrClusters]['predictions'] = model.predict(docVectors)
    results[nrClusters]['cluster_centroids'] = model.cluster_centers_

# Just for inspection: show a few sample documents per cluster
clusteredDocs = groupDocsByCluster(results[minNrClusters]['predictions'], documents[:nrDocs])
for k in sorted(clusteredDocs.keys()):
    print "<<<<<<<<<< SAMPLES FROM CLUSTER %d: >>>>>>>>>" % k
    print "\n-----------------------------------------------------------------\n\n".join(clusteredDocs[k][:3])
    print

%matplotlib inline
from matplotlib import pyplot as plt

#print "KMeans with %d clusters has inertia: %d" % (nrClusters, model.inertia_)
X = sorted(results.keys())
Y = [results[n]['inertia'] for n in X]
plt.figure()
plt.scatter(X, Y)
plt.title("Metric of goodness for KMeans document clustering")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()
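# Optional, hedged sketch (not part of the original notebook): the sklearn.metrics
# import above is never used, and inertia is the only quality measure recorded.
# The silhouette score gives a second, scale-independent view of cluster quality.
# Assumes docVectors and results as built above; sample_size keeps the pairwise
# distance computation affordable.
from sklearn.metrics import silhouette_score

for nrClusters in sorted(results.keys()):
    labels = results[nrClusters]['predictions']
    score = silhouette_score(docVectors, labels, metric='cosine',
                             sample_size=1000, random_state=0)
    print("silhouette score with %d clusters: %.4f" % (nrClusters, score))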
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter

def getIndicesOfClosest(distances, n=1):
    # Return the index of the n-th closest row (1-based); n=1 gives the closest.
    result = sorted(range(len(distances)), key=lambda i: distances[i])
    return result[n-1]

for nrClusters in range(minNrClusters, minNrClusters+3):
    centroids = results[nrClusters]['cluster_centroids']
    groupedPredictions = Counter(results[nrClusters]['predictions'])
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    # Look at the three largest clusters and show the document closest to each centroid
    for (clusterIndex, nrInstancesInCluster) in groupedPredictions.most_common(3):
        centroid = centroids[clusterIndex]
        distances = pairwise_distances(docVectors, Y=[centroid], metric='cosine')[:, 0]
        indexOfClosest = getIndicesOfClosest(distances, 1)
        closestDocument = documents[indexOfClosest]
        print "\n<< central document in cluster %d (%d instances) >>" % (clusterIndex, nrInstancesInCluster)
        print closestDocument[:100] + " ..."

import numpy as np

for nrClusters in range(minNrClusters, minNrClusters+3):
    groupedPredictions = Counter(results[nrClusters]['predictions'])
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    # Concatenate all documents assigned to the same cluster into one large "cluster document"
    clusterSizeDocs = ["" for x in range(nrClusters)]
    for i, clusterIdx in enumerate(results[nrClusters]['predictions']):
        clusterSizeDocs[clusterIdx] += documents[i] + " "
    # Build a TFIDF weighted cluster-term matrix
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), tokenizer=word_tokenize, use_idf=True)
    clusterVectors = vectorizer.fit_transform(clusterSizeDocs)
    for clusterIndex in range(clusterVectors.shape[0]):
        clusterVector = clusterVectors[clusterIndex, 0:].todense()
        sortedIndicesByTFIDF = sorted(range(clusterVector.shape[1]),
                                      key=lambda i: clusterVector.item(0, i), reverse=True)
        topTokens = [str(vectorizer.get_feature_names()[x]) for x in sortedIndicesByTFIDF[0:5]]
        print "cluster %d (size %d) highest TFIDF terms:" % (clusterIndex, groupedPredictions[clusterIndex]), topTokens

from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

# Build a vocabulary of nouns (POS tag 'NN') found in the sample documents
vocabulary = {}
for i, d in enumerate(documents[:nrDocs]):
    for s in sent_tokenize(d):
        tokens = pos_tag(word_tokenize(s))
        nouns = [token[0] for token in tokens if token[1] == 'NN']
        for noun in nouns:
            vocabulary[noun] = vocabulary.get(noun, 0) + 1
    if i % (nrDocs/20) == 0:
        print >> sys.stderr, "Tokenized sentences and words from document #%d out of %d" % (i, nrDocs)

# Nouns sorted by frequency, most frequent first
voc = sorted(vocabulary.keys(), key=lambda x: vocabulary[x], reverse=True)
print "Vocabulary size: %d, most frequent nouns:" % len(voc), voc[0:10]

for nrClusters in range(minNrClusters, minNrClusters+10):
    groupedPredictions = Counter(results[nrClusters]['predictions'])
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    clusterSizeDocs = ["" for x in range(nrClusters)]
    for i, clusterIdx in enumerate(results[nrClusters]['predictions']):
        clusterSizeDocs[clusterIdx] += documents[i] + " "
    # Build a TFIDF weighted cluster-term matrix, restricted to the noun vocabulary
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), vocabulary=voc, use_idf=True)
    clusterVectors = vectorizer.fit_transform(clusterSizeDocs)
    for clusterIndex in range(clusterVectors.shape[0]):
        clusterVector = clusterVectors[clusterIndex, 0:].todense()
        sortedIndicesByTFIDF = sorted(range(clusterVector.shape[1]),
                                      key=lambda i: clusterVector.item(0, i), reverse=True)
        topTokens = [str(vectorizer.get_feature_names()[x]) for x in sortedIndicesByTFIDF[0:5]]
        print "cluster %d (size %d) highest TFIDF terms:" % (clusterIndex, groupedPredictions[clusterIndex]), topTokens
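# Alternative, hedged sketch (not in the original notebook): rank terms directly
# by centroid weight instead of re-vectorizing concatenated cluster documents.
# The name docVectorizer is introduced here only because the loops above rebind
# 'vectorizer'; it is refitted with the same settings on the same documents, so
# its feature ordering matches the centroids stored in results.
docVectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1),
                                tokenizer=word_tokenize, use_idf=True)
docVectorizer.fit(documents[:nrDocs])
featureNames = docVectorizer.get_feature_names()

centroids = results[minNrClusters]['cluster_centroids']
# For each cluster, sort feature indices by descending centroid weight
orderedTermIndices = centroids.argsort()[:, ::-1]
for clusterIndex in range(centroids.shape[0]):
    topTerms = [featureNames[t] for t in orderedTermIndices[clusterIndex, :5]]
    print("cluster %d highest centroid terms: %s" % (clusterIndex, topTerms))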
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans

results2 = {}
for nrClusters in range(minNrClusters, maxNrClusters+1):
    print >> sys.stderr, "Fitting MiniBatchKMeans model with %s clusters" % nrClusters
    results2[nrClusters] = {}
    model = MiniBatchKMeans(n_clusters=nrClusters, batch_size=250).fit(docVectors)
    results2[nrClusters]['inertia'] = model.inertia_
    results2[nrClusters]['predictions'] = model.predict(docVectors)
    results2[nrClusters]['cluster_centroids'] = model.cluster_centers_

for nrClusters in range(minNrClusters, minNrClusters+10):
    groupedPredictions2 = Counter(results2[nrClusters]['predictions'])
    print groupedPredictions2
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    clusterSizeDocs2 = ["" for x in range(nrClusters)]
    for i, clusterIdx in enumerate(results2[nrClusters]['predictions']):
        clusterSizeDocs2[clusterIdx] += documents[i] + " "
    # Build a TFIDF weighted cluster-term matrix, restricted to the noun vocabulary
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), vocabulary=voc, use_idf=True)
    clusterVectors2 = vectorizer.fit_transform(clusterSizeDocs2)
    for clusterIndex in range(clusterVectors2.shape[0]):
        clusterVector2 = clusterVectors2[clusterIndex, 0:].todense()
        sortedIndicesByTFIDF = sorted(range(clusterVector2.shape[1]),
                                      key=lambda i: clusterVector2.item(0, i), reverse=True)
        topTokens = [str(vectorizer.get_feature_names()[x]) for x in sortedIndicesByTFIDF[0:5]]
        # MiniBatchKMeans can leave clusters empty, so check before reporting
        if clusterIndex in groupedPredictions2:
            print "cluster %d (size %d) highest TFIDF terms:" % (clusterIndex, groupedPredictions2[clusterIndex]), topTokens
        else:
            print "cluster %d is empty" % clusterIndex

from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans

results3 = {}
for nrClusters in range(minNrClusters, minNrClusters+5):
    results3[nrClusters] = {}
    for nInit in range(1, 20):
        # print >> sys.stderr, "Fitting random-init MiniBatchKMeans model with %s clusters and n_init = %d" % (nrClusters, nInit)
        results3[nrClusters][nInit] = {}
        model = MiniBatchKMeans(init='random', n_init=nInit, n_clusters=nrClusters, batch_size=250).fit(docVectors)
        if model.inertia_:
            results3[nrClusters][nInit]['inertia'] = model.inertia_
        else:
            raise ValueError("Something's wrong: no inertia recorded")
        # results3[nrClusters][nInit]['predictions'] = model.predict(docVectors)
        # results3[nrClusters][nInit]['cluster_centroids'] = model.cluster_centers_

%matplotlib inline
from matplotlib import pyplot as plt

# Inspect the largest fitted cluster count (nrClusters = minNrClusters + 4 = 9)
X = sorted(results3[9].keys())
Y = [results3[9][n]['inertia'] for n in X]
plt.figure()
plt.scatter(X, Y)
plt.title("Metric of goodness for MiniBatchKMeans document clustering")
plt.xlabel("n_init")
plt.ylabel("Inertia")
plt.show()
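# Optional, hedged sketch (not in the original notebook): overlay the inertia
# curves of KMeans (results) and MiniBatchKMeans (results2) fitted above, since
# both loops cover the same range of cluster counts.
X = sorted(results.keys())
plt.figure()
plt.plot(X, [results[n]['inertia'] for n in X], marker='o', label='KMeans')
plt.plot(X, [results2[n]['inertia'] for n in X], marker='s', label='MiniBatchKMeans')
plt.title("Inertia per number of clusters")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.legend()
plt.show()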