from pymongo import MongoClient
from datetime import datetime

client = MongoClient()
langcode = "en"
loansCollection = client.kiva.loans
#print "Number of loan descriptions in '%s': %d" % (langcode, loansCollection.find({"processed_description.texts.%s" % langcode: {'$exists': True}}).count())
startYear = 2015
start = datetime(startYear, 1, 1)
c = loansCollection.find({"$and": [{"posted_date": {"$gte": start}},
                                   {"processed_description.texts.%s" % langcode: {'$exists': True}}]})
print "Number of loans in '%s' since %d: %d" % (langcode, startYear, c.count())

documents = []
for loan in c:
    documents.append(loan["processed_description"]["texts"][langcode])
print documents[0:1]

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from pprint import pprint
import sys

# number of documents to cluster
nrDocs = 2500

# Build a TFIDF weighted document-term matrix
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), tokenizer=word_tokenize, use_idf=True)
docVectors = vectorizer.fit_transform(documents[:nrDocs])

# Inspired by http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
print("number of sample docs: %d, \t number of unigram features: %d" % (docVectors.shape[0], docVectors.shape[1]))

minNrClusters = 5
maxNrClusters = 25
results = {}

def groupDocsByCluster(clusterIndices, docs):
    '''
    clusterIndices is a list of cluster indices
    docs is a list of documents
    Returns a dict with cluster indices as keys and a list of documents as values.
    '''
    assert len(clusterIndices) == len(docs), \
        "number of cluster indices %d and number of documents %d are unequal" % (len(clusterIndices), len(docs))
    result = {}
    for cnt in range(len(clusterIndices)):
        i = clusterIndices[cnt]
        if i in result:
            result[i].append(docs[cnt])
        else:
            result[i] = [docs[cnt]]
    return result

for nrClusters in range(minNrClusters, maxNrClusters+1):
    print >> sys.stderr, "Fitting KMeans model with %s clusters" % nrClusters
    results[nrClusters] = {}
    model = KMeans(n_clusters=nrClusters).fit(docVectors)
    results[nrClusters]['inertia'] = model.inertia_
    results[nrClusters]['predictions'] = model.predict(docVectors)
    results[nrClusters]['cluster_centroids'] = model.cluster_centers_

# Just for inspection: show a few sample documents per cluster
clusteredDocs = groupDocsByCluster(results[minNrClusters]['predictions'], documents[:nrDocs])
for k in sorted(clusteredDocs.keys()):
    print "<<<<<<<<<< SAMPLES FROM CLUSTER %d: >>>>>>>>>" % k
    print "\n-----------------------------------------------------------------\n\n".join(clusteredDocs[k][:3])
    print

%matplotlib inline
from matplotlib import pyplot as plt

#print "KMeans with %d clusters has inertia: %d" % (nrClusters, model.inertia_)
X = sorted(results.keys())
Y = [results[n]['inertia'] for n in X]
plt.figure()
plt.scatter(X, Y)
plt.title("Metric of goodness for KMeans document clustering")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()
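# Optional, hedged sketch (not part of the original notebook): the sklearn.metrics
# import above is never used, and inertia is the only quality measure recorded.
# The silhouette score gives a second, scale-independent view of cluster quality.
# Assumes docVectors and results as built above; sample_size keeps the pairwise
# distance computation affordable.
from sklearn.metrics import silhouette_score

for nrClusters in sorted(results.keys()):
    labels = results[nrClusters]['predictions']
    score = silhouette_score(docVectors, labels, metric='cosine',
                             sample_size=1000, random_state=0)
    print("silhouette score with %d clusters: %.4f" % (nrClusters, score))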
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter

def getIndicesOfClosest(distances, n=1):
    # Return the index of the n-th closest row (1-based); n=1 gives the closest.
    result = sorted(range(len(distances)), key=lambda i: distances[i])
    return result[n-1]

for nrClusters in range(minNrClusters, minNrClusters+3):
    centroids = results[nrClusters]['cluster_centroids']
    groupedPredictions = Counter(results[nrClusters]['predictions'])
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    # Look at the three largest clusters and show the document closest to each centroid
    for (clusterIndex, nrInstancesInCluster) in groupedPredictions.most_common(3):
        centroid = centroids[clusterIndex]
        distances = pairwise_distances(docVectors, Y=[centroid], metric='cosine')[:, 0]
        indexOfClosest = getIndicesOfClosest(distances, 1)
        closestDocument = documents[indexOfClosest]
        print "\n<< central document in cluster %d (%d instances) >>" % (clusterIndex, nrInstancesInCluster)
        print closestDocument[:100] + " ..."

import numpy as np

for nrClusters in range(minNrClusters, minNrClusters+3):
    groupedPredictions = Counter(results[nrClusters]['predictions'])
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    # Concatenate all documents assigned to the same cluster into one large "cluster document"
    clusterSizeDocs = ["" for x in range(nrClusters)]
    for i, clusterIdx in enumerate(results[nrClusters]['predictions']):
        clusterSizeDocs[clusterIdx] += documents[i] + " "
    # Build a TFIDF weighted cluster-term matrix
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), tokenizer=word_tokenize, use_idf=True)
    clusterVectors = vectorizer.fit_transform(clusterSizeDocs)
    for clusterIndex in range(clusterVectors.shape[0]):
        clusterVector = clusterVectors[clusterIndex, 0:].todense()
        sortedIndicesByTFIDF = sorted(range(clusterVector.shape[1]),
                                      key=lambda i: clusterVector.item(0, i), reverse=True)
        topTokens = [str(vectorizer.get_feature_names()[x]) for x in sortedIndicesByTFIDF[0:5]]
        print "cluster %d (size %d) highest TFIDF terms:" % (clusterIndex, groupedPredictions[clusterIndex]), topTokens

from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

# Build a vocabulary of nouns (POS tag 'NN') found in the sample documents
vocabulary = {}
for i, d in enumerate(documents[:nrDocs]):
    for s in sent_tokenize(d):
        tokens = pos_tag(word_tokenize(s))
        nouns = [token[0] for token in tokens if token[1] == 'NN']
        for noun in nouns:
            vocabulary[noun] = vocabulary.get(noun, 0) + 1
    if i % (nrDocs/20) == 0:
        print >> sys.stderr, "Tokenized sentences and words from document #%d out of %d" % (i, nrDocs)

# Nouns sorted by frequency, most frequent first
voc = sorted(vocabulary.keys(), key=lambda x: vocabulary[x], reverse=True)
print "Vocabulary size: %d, most frequent nouns:" % len(voc), voc[0:10]

for nrClusters in range(minNrClusters, minNrClusters+10):
    groupedPredictions = Counter(results[nrClusters]['predictions'])
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    clusterSizeDocs = ["" for x in range(nrClusters)]
    for i, clusterIdx in enumerate(results[nrClusters]['predictions']):
        clusterSizeDocs[clusterIdx] += documents[i] + " "
    # Build a TFIDF weighted cluster-term matrix, restricted to the noun vocabulary
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), vocabulary=voc, use_idf=True)
    clusterVectors = vectorizer.fit_transform(clusterSizeDocs)
    for clusterIndex in range(clusterVectors.shape[0]):
        clusterVector = clusterVectors[clusterIndex, 0:].todense()
        sortedIndicesByTFIDF = sorted(range(clusterVector.shape[1]),
                                      key=lambda i: clusterVector.item(0, i), reverse=True)
        topTokens = [str(vectorizer.get_feature_names()[x]) for x in sortedIndicesByTFIDF[0:5]]
        print "cluster %d (size %d) highest TFIDF terms:" % (clusterIndex, groupedPredictions[clusterIndex]), topTokens
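# Alternative, hedged sketch (not in the original notebook): rank terms directly
# by centroid weight instead of re-vectorizing concatenated cluster documents.
# The name docVectorizer is introduced here only because the loops above rebind
# 'vectorizer'; it is refitted with the same settings on the same documents, so
# its feature ordering matches the centroids stored in results.
docVectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1),
                                tokenizer=word_tokenize, use_idf=True)
docVectorizer.fit(documents[:nrDocs])
featureNames = docVectorizer.get_feature_names()

centroids = results[minNrClusters]['cluster_centroids']
# For each cluster, sort feature indices by descending centroid weight
orderedTermIndices = centroids.argsort()[:, ::-1]
for clusterIndex in range(centroids.shape[0]):
    topTerms = [featureNames[t] for t in orderedTermIndices[clusterIndex, :5]]
    print("cluster %d highest centroid terms: %s" % (clusterIndex, topTerms))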
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans

results2 = {}
for nrClusters in range(minNrClusters, maxNrClusters+1):
    print >> sys.stderr, "Fitting MiniBatchKMeans model with %s clusters" % nrClusters
    results2[nrClusters] = {}
    model = MiniBatchKMeans(n_clusters=nrClusters, batch_size=250).fit(docVectors)
    results2[nrClusters]['inertia'] = model.inertia_
    results2[nrClusters]['predictions'] = model.predict(docVectors)
    results2[nrClusters]['cluster_centroids'] = model.cluster_centers_

for nrClusters in range(minNrClusters, minNrClusters+10):
    groupedPredictions2 = Counter(results2[nrClusters]['predictions'])
    print groupedPredictions2
    print "<<<<<<<<<< For %d clusters >>>>>>>>>>" % nrClusters
    clusterSizeDocs2 = ["" for x in range(nrClusters)]
    for i, clusterIdx in enumerate(results2[nrClusters]['predictions']):
        clusterSizeDocs2[clusterIdx] += documents[i] + " "
    # Build a TFIDF weighted cluster-term matrix, restricted to the noun vocabulary
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,1), vocabulary=voc, use_idf=True)
    clusterVectors2 = vectorizer.fit_transform(clusterSizeDocs2)
    for clusterIndex in range(clusterVectors2.shape[0]):
        clusterVector2 = clusterVectors2[clusterIndex, 0:].todense()
        sortedIndicesByTFIDF = sorted(range(clusterVector2.shape[1]),
                                      key=lambda i: clusterVector2.item(0, i), reverse=True)
        topTokens = [str(vectorizer.get_feature_names()[x]) for x in sortedIndicesByTFIDF[0:5]]
        # MiniBatchKMeans can leave clusters empty, so check before reporting
        if clusterIndex in groupedPredictions2:
            print "cluster %d (size %d) highest TFIDF terms:" % (clusterIndex, groupedPredictions2[clusterIndex]), topTokens
        else:
            print "cluster %d is empty" % clusterIndex

from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans

results3 = {}
for nrClusters in range(minNrClusters, minNrClusters+5):
    results3[nrClusters] = {}
    for nInit in range(1, 20):
        # print >> sys.stderr, "Fitting random-init MiniBatchKMeans model with %s clusters and n_init = %d" % (nrClusters, nInit)
        results3[nrClusters][nInit] = {}
        model = MiniBatchKMeans(init='random', n_init=nInit, n_clusters=nrClusters, batch_size=250).fit(docVectors)
        if model.inertia_:
            results3[nrClusters][nInit]['inertia'] = model.inertia_
        else:
            raise ValueError("Something's wrong: no inertia recorded")
        # results3[nrClusters][nInit]['predictions'] = model.predict(docVectors)
        # results3[nrClusters][nInit]['cluster_centroids'] = model.cluster_centers_

%matplotlib inline
from matplotlib import pyplot as plt

# Inspect the largest fitted cluster count (nrClusters = minNrClusters + 4 = 9)
X = sorted(results3[9].keys())
Y = [results3[9][n]['inertia'] for n in X]
plt.figure()
plt.scatter(X, Y)
plt.title("Metric of goodness for MiniBatchKMeans document clustering")
plt.xlabel("n_init")
plt.ylabel("Inertia")
plt.show()
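# Optional, hedged sketch (not in the original notebook): overlay the inertia
# curves of KMeans (results) and MiniBatchKMeans (results2) fitted above, since
# both loops cover the same range of cluster counts.
X = sorted(results.keys())
plt.figure()
plt.plot(X, [results[n]['inertia'] for n in X], marker='o', label='KMeans')
plt.plot(X, [results2[n]['inertia'] for n in X], marker='s', label='MiniBatchKMeans')
plt.title("Inertia per number of clusters")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.legend()
plt.show()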