import pymongo as pm
import unicodedata
# Connect to MongoDB with the driver's default connection settings and select
# the "tags" collection of the "instagram" database.
client = pm.MongoClient()
db = client['instagram']
tagsDB = db['tags']
# Flatten the 'tags' list of every document into one long list of raw tags.
rawTags = [tag for user in tagsDB.find() for tag in user['tags']]
len(rawTags)
424113
# Peek at the first ten raw tags.
rawTags[:10]
[u'contiki', u'swissalps', u'newfriends', u'freezingmynutsoff', u'walkabout', u'jungfraujoch', u'yolo', u'travel', u'noregrets', u'goodtimes']
# Distribute the raw tag list as an RDD.
# NOTE(review): `sc` is not defined in this file - presumably the SparkContext
# provided by the PySpark notebook/shell; confirm.
tagsRDD = sc.parallelize(rawTags)
# Number of tags held by the RDD.
tagsRDD.count()
424113
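Note: the cleaning step below folds every tag to plain ASCII (Unicode NFKD normalization followed by dropping all non-ASCII characters). If you want to keep language-specific characters and words, you have to clean the data in a different way.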
def _asciiTagPair(tag):
    """Return ``(ascii_tag, 1)`` for one raw tag.

    The tag is NFKD-normalized and then encoded to ASCII with the 'ignore'
    error handler, so characters without an ASCII form are dropped. A tag
    made only of such characters becomes the empty string.
    """
    decomposed = unicodedata.normalize('NFKD', tag)
    return (decomposed.encode('ascii', 'ignore'), 1)


# Number of occurrences of every ASCII-folded tag.
pairsRDD = tagsRDD.map(_asciiTagPair)
countsRDD = pairsRDD.reduceByKey(lambda left, right: left + right)
# Number of distinct folded tags.
countsRDD.count()
106083
# The 500 most frequent tags as (tag, count) pairs, most frequent first: the
# ordering key is the negated count.
# The key function indexes the pair instead of declaring a tuple parameter
# (`lambda (key, value): ...`), which is Python-2-only syntax.
ordered = countsRDD.takeOrdered(500, lambda pair: -pair[1])
ordered
[('', 9146), ('switzerland', 8973), ('zurich', 3990), ('love', 2605), ('swiss', 2593), ('easter', 2526), ('mountains', 1999), ('travel', 1978), ('spring', 1963), ('snow', 1809), ('sun', 1754), ('nature', 1742), ('lake', 1565), ('geneva', 1516), ('beautiful', 1485), ('schweiz', 1462), ('happy', 1408), ('instagood', 1403), ('photooftheday', 1265), ('picoftheday', 1256), ('suisse', 1238), ('friends', 1199), ('alps', 1165), ('happyeaster', 1096), ('ski', 1085), ('fun', 1044), ('basel', 994), ('landscape', 928), ('sky', 923), ('skiing', 860), ('sunset', 839), ('swissalps', 820), ('family', 805), ('bern', 800), ('nofilter', 799), ('luzern', 770), ('amazing', 755), ('view', 754), ('europe', 737), ('instadaily', 736), ('weekend', 713), ('geneve', 701), ('fashion', 700), ('art', 680), ('holiday', 678), ('sunnyday', 666), ('me', 659), ('mountain', 657), ('food', 648), ('lausanne', 589), ('instalike', 583), ('smile', 577), ('style', 572), ('like4like', 565), ('lucerne', 565), ('followme', 563), ('clouds', 561), ('architecture', 546), ('blue', 544), ('wanderlust', 540), ('zermatt', 537), ('instatravel', 536), ('selfie', 535), ('instamood', 533), ('life', 532), ('winter', 529), ('city', 519), ('ostern', 515), ('trip', 515), ('sunny', 513), ('photography', 512), ('flowers', 501), ('blackandwhite', 497), ('baselworld2016', 496), ('travelgram', 492), ('bluesky', 492), ('sunshine', 491), ('instagram', 485), ('girl', 478), ('foodporn', 476), ('home', 471), ('verbier', 471), ('party', 462), ('igers', 462), ('music', 458), ('vscocam', 451), ('beauty', 443), ('tbt', 436), ('montreux', 435), ('vsco', 429), ('luxury', 426), ('instapic', 424), ('baselworld', 422), ('fitness', 422), ('follow', 422), ('svizzera', 420), ('chocolate', 418), ('visitswitzerland', 407), ('lifestyle', 397), ('snowboarding', 396), ('night', 395), ('relax', 385), ('matterhorn', 380), ('lacleman', 378), ('photo', 372), ('water', 370), ('lugano', 365), ('holidays', 362), ('interlaken', 357), ('myswitzerland', 356), 
('vacation', 354), ('design', 349), ('switzerlandwonderland', 348), ('summer', 348), ('goodtimes', 346), ('stmoritz', 342), ('morning', 341), ('day', 340), ('cute', 340), ('2016', 333), ('traveling', 332), ('enjoy', 328), ('tagsforlikes', 326), ('sunday', 322), ('tree', 322), ('saturday', 320), ('green', 319), ('bestoftheday', 314), ('goodmorning', 314), ('loveit', 309), ('travelling', 309), ('instafood', 297), ('river', 295), ('happiness', 294), ('white', 294), ('hiking', 292), ('nice', 291), ('germany', 287), ('snowboard', 285), ('coffee', 281), ('france', 281), ('konstanz', 279), ('inlovewithswitzerland', 276), ('black', 276), ('ticino', 274), ('follow4follow', 271), ('valais', 269), ('healthy', 265), ('instacool', 264), ('work', 264), ('adventure', 263), ('watch', 257), ('sport', 257), ('trees', 256), ('likeforlike', 256), ('zuri', 255), ('bodensee', 255), ('awesome', 254), ('watches', 253), ('_', 251), ('springtime', 250), ('light', 246), ('italy', 246), ('paques', 246), ('fruhling', 244), ('yummy', 243), ('street', 243), ('breakfast', 242), ('graubunden', 242), ('train', 241), ('naturelovers', 241), ('dinner', 240), ('explore', 240), ('davos', 239), ('best', 236), ('swissmade', 236), ('girls', 234), ('red', 234), ('peace', 230), ('laax', 229), ('travelingram', 229), ('sunrise', 227), ('chill', 226), ('like', 224), ('workout', 223), ('panorama', 223), ('switzerlandpictures', 221), ('cool', 221), ('gopro', 219), ('ootd', 217), ('delicious', 216), ('beautifulday', 215), ('zurichsee', 215), ('sweet', 215), ('model', 214), ('throwback', 213), ('ig_switzerland', 211), ('photographer', 210), ('car', 210), ('dog', 209), ('suiza', 205), ('beautifuldestinations', 205), ('see', 205), ('colorful', 205), ('walk', 204), ('colors', 204), ('lunch', 204), ('new', 202), ('training', 202), ('live', 201), ('gym', 201), ('foodie', 200), ('forest', 200), ('motivation', 198), ('cold', 197), ('world', 195), ('beer', 194), ('ischgl', 194), ('familytime', 192), ('castle', 192), 
('pasqua', 190), ('running', 190), ('fit', 188), ('switzerland_vacations', 187), ('restaurant', 187), ('good', 187), ('pink', 186), ('bunny', 186), ('roadtrip', 186), ('homesweethome', 186), ('time', 185), ('my', 184), ('l4l', 184), ('picture', 183), ('memories', 183), ('lakegeneva', 181), ('nike', 181), ('alpes', 180), ('inspiration', 180), ('nikon', 179), ('tb', 178), ('instalove', 177), ('sonne', 176), ('grindelwald', 173), ('church', 171), ('canon', 171), ('weather', 171), ('travelphotography', 168), ('engelberg', 168), ('repost', 167), ('concert', 166), ('goodlife', 166), ('tattoo', 165), ('neverstopexploring', 164), ('engadin', 164), ('bridge', 164), ('with', 163), ('pretty', 163), ('iloveswitzerland', 163), ('instaphoto', 162), ('lovely', 162), ('watchporn', 162), ('jungfrau', 161), ('passion', 161), ('wallis', 160), ('airport', 159), ('perfect', 158), ('hotel', 158), ('tourism', 157), ('shopping', 156), ('friendship', 156), ('funny', 155), ('monday', 154), ('easterweekend', 153), ('swag', 152), ('instamoment', 152), ('flower', 152), ('berge', 150), ('the', 150), ('froheostern', 150), ('thun', 150), ('iphoneonly', 149), ('mylove', 147), ('pic', 147), ('skyporn', 147), ('bar', 147), ('brunch', 146), ('neuchatel', 145), ('loveswitzerlandcontest', 145), ('powder', 144), ('fresh', 143), ('evening', 143), ('makeup', 143), ('boy', 142), ('hair', 142), ('vegan', 142), ('hot', 142), ('wonderful', 141), ('in', 141), ('color', 140), ('house', 140), ('tourist', 140), ('instafollow', 140), ('eurotrip', 139), ('swizerland', 139), ('top', 139), ('friday', 139), ('lago', 139), ('traveller', 138), ('suica', 138), ('pictureoftheday', 138), ('instago', 138), ('eggs', 138), ('gold', 138), ('potd', 138), ('rhein', 138), ('polymanga', 137), ('wine', 137), ('stgallen', 136), ('rolex', 136), ('mood', 136), ('austria', 136), ('dance', 136), ('swan', 136), ('fribourg', 135), ('goodday', 134), ('alpen', 134), ('igdaily', 134), ('printemps', 134), ('swisslife', 134), ('lamborghini', 
134), ('paris', 133), ('apresski', 133), ('march', 133), ('boat', 132), ('schnee', 132), ('portrait', 132), ('oldtown', 132), ('crazy', 132), ('lac', 132), ('birthday', 132), ('urban', 131), ('f4f', 131), ('arosa', 130), ('tflers', 130), ('latergram', 130), ('swissmountains', 129), ('winterwonderland', 128), ('vaud', 128), ('jetdeau', 128), ('streetart', 128), ('cat', 128), ('bmw', 127), ('ig_europe', 126), ('titlis', 126), ('look', 125), ('fitfam', 125), ('enjoylife', 125), ('and', 124), ('traveltheworld', 124), ('blessed', 124), ('paradise', 124), ('montagne', 123), ('outdoors', 123), ('ig_swiss', 122), ('vevey', 122), ('dessert', 122), ('couple', 121), ('sunglasses', 121), ('bike', 121), ('zug', 120), ('winterthur', 120), ('hiphop', 120), ('cars', 119), ('baby', 119), ('club', 118), ('animal', 118), ('ferrari', 117), ('vintage', 117), ('natur', 116), ('friend', 116), ('museum', 116), ('qualitytime', 116), ('carporn', 116), ('goodvibes', 116), ('loveyou', 115), ('wood', 115), ('igtravel', 115), ('lindt', 115), ('instagramers', 115), ('italia', 114), ('goodtime', 114), ('buonapasqua', 114), ('nature_perfection', 114), ('fly', 113), ('nofilterneeded', 113), ('today', 113), ('audi', 113), ('bw', 112), ('eat', 112), ('shooting', 112), ('watchesofinstagram', 111), ('walking', 111), ('supercar', 110), ('igerssuisse', 110), ('lakezurich', 110), ('garden', 110), ('likes', 110), ('great', 110), ('dj', 109), ('traveler', 109), ('super_switzerland', 109), ('yellow', 109), ('artist', 109), ('porsche', 109), ('landscape_lovers', 108), ('drinks', 108), ('happyday', 108), ('handmade', 108), ('run', 108), ('naturephotography', 108), ('goodnight', 107), ('vierwaldstattersee', 107), ('people', 106), ('blonde', 106), ('visitzurich', 106), ('london', 105), ('cheese', 105), ('easterbunny', 105), ('outdoor', 105), ('fondue', 105), ('ascona', 104), ('followforfollow', 104), ('watchoftheday', 104), ('leman', 103), ('lagomaggiore', 102), ('streetphotography', 102), ('reflection', 102), 
('lights', 101), ('building', 101), ('ice', 101), ('iphone', 100), ('genevalake', 100), ('health', 100), ('freeride', 100), ('bodybuilding', 99), ('igersgeneva', 99), ('champagne', 99), ('waterfall', 99), ('beard', 99), ('chilling', 98), ('cloudporn', 98), ('sister', 98), ('primavera', 97), ('dream', 97), ('starbucks', 97), ('instafashion', 97), ('aviation', 97), ('springbreak', 97), ('rheinfall', 97), ('dogsofinstagram', 97), ('polymanga2016', 96), ('liechtenstein', 96), ('zurichcity', 96), ('igersswitzerland', 95), ('blogger', 95), ('instanature', 95), ('scenery', 95), ('schaffhausen', 95), ('outfit', 95), ('horology', 94), ('liveauthentic', 94), ('shoes', 94), ('nightlife', 94), ('animals', 94), ('adidas', 93), ('interiordesign', 93), ('instatraveling', 93), ('jewelry', 92), ('homemade', 92), ('cake', 92), ('tasty', 92), ('nightout', 92), ('wow', 91), ('zurichairport', 91), ('gstaad', 91), ('mylife', 91), ('rigi', 91), ('video', 91), ('mercedes', 91), ('a', 91), ('all_shots', 90), ('flying', 90), ('moment', 90), ('deutschland', 89), ('zurisee', 89), ('mytravelgram', 89), ('forever', 89), ('beach', 89), ('park', 88)]
for order in ordered:
print order[0],",",
, switzerland , zurich , love , swiss , easter , mountains , travel , spring , snow , sun , nature , lake , geneva , beautiful , schweiz , happy , instagood , photooftheday , picoftheday , suisse , friends , alps , happyeaster , ski , fun , basel , landscape , sky , skiing , sunset , swissalps , family , bern , nofilter , luzern , amazing , view , europe , instadaily , weekend , geneve , fashion , art , holiday , sunnyday , me , mountain , food , lausanne , instalike , smile , style , like4like , lucerne , followme , clouds , architecture , blue , wanderlust , zermatt , instatravel , selfie , instamood , life , winter , city , ostern , trip , sunny , photography , flowers , blackandwhite , baselworld2016 , travelgram , bluesky , sunshine , instagram , girl , foodporn , home , verbier , party , igers , music , vscocam , beauty , tbt , montreux , vsco , luxury , instapic , baselworld , fitness , follow , svizzera , chocolate , visitswitzerland , lifestyle , snowboarding , night , relax , matterhorn , lacleman , photo , water , lugano , holidays , interlaken , myswitzerland , vacation , design , switzerlandwonderland , summer , goodtimes , stmoritz , morning , day , cute , 2016 , traveling , enjoy , tagsforlikes , sunday , tree , saturday , green , bestoftheday , goodmorning , loveit , travelling , instafood , river , happiness , white , hiking , nice , germany , snowboard , coffee , france , konstanz , inlovewithswitzerland , black , ticino , follow4follow , valais , healthy , instacool , work , adventure , watch , sport , trees , likeforlike , zuri , bodensee , awesome , watches , _ , springtime , light , italy , paques , fruhling , yummy , street , breakfast , graubunden , train , naturelovers , dinner , explore , davos , best , swissmade , girls , red , peace , laax , travelingram , sunrise , chill , like , workout , panorama , switzerlandpictures , cool , gopro , ootd , delicious , beautifulday , zurichsee , sweet , model , throwback , ig_switzerland , 
photographer , car , dog , suiza , beautifuldestinations , see , colorful , walk , colors , lunch , new , training , live , gym , foodie , forest , motivation , cold , world , beer , ischgl , familytime , castle , pasqua , running , fit , switzerland_vacations , restaurant , good , pink , bunny , roadtrip , homesweethome , time , my , l4l , picture , memories , lakegeneva , nike , alpes , inspiration , nikon , tb , instalove , sonne , grindelwald , church , canon , weather , travelphotography , engelberg , repost , concert , goodlife , tattoo , neverstopexploring , engadin , bridge , with , pretty , iloveswitzerland , instaphoto , lovely , watchporn , jungfrau , passion , wallis , airport , perfect , hotel , tourism , shopping , friendship , funny , monday , easterweekend , swag , instamoment , flower , berge , the , froheostern , thun , iphoneonly , mylove , pic , skyporn , bar , brunch , neuchatel , loveswitzerlandcontest , powder , fresh , evening , makeup , boy , hair , vegan , hot , wonderful , in , color , house , tourist , instafollow , eurotrip , swizerland , top , friday , lago , traveller , suica , pictureoftheday , instago , eggs , gold , potd , rhein , polymanga , wine , stgallen , rolex , mood , austria , dance , swan , fribourg , goodday , alpen , igdaily , printemps , swisslife , lamborghini , paris , apresski , march , boat , schnee , portrait , oldtown , crazy , lac , birthday , urban , f4f , arosa , tflers , latergram , swissmountains , winterwonderland , vaud , jetdeau , streetart , cat , bmw , ig_europe , titlis , look , fitfam , enjoylife , and , traveltheworld , blessed , paradise , montagne , outdoors , ig_swiss , vevey , dessert , couple , sunglasses , bike , zug , winterthur , hiphop , cars , baby , club , animal , ferrari , vintage , natur , friend , museum , qualitytime , carporn , goodvibes , loveyou , wood , igtravel , lindt , instagramers , italia , goodtime , buonapasqua , nature_perfection , fly , nofilterneeded , today , audi , bw , 
eat , shooting , watchesofinstagram , walking , supercar , igerssuisse , lakezurich , garden , likes , great , dj , traveler , super_switzerland , yellow , artist , porsche , landscape_lovers , drinks , happyday , handmade , run , naturephotography , goodnight , vierwaldstattersee , people , blonde , visitzurich , london , cheese , easterbunny , outdoor , fondue , ascona , followforfollow , watchoftheday , leman , lagomaggiore , streetphotography , reflection , lights , building , ice , iphone , genevalake , health , freeride , bodybuilding , igersgeneva , champagne , waterfall , beard , chilling , cloudporn , sister , primavera , dream , starbucks , instafashion , aviation , springbreak , rheinfall , dogsofinstagram , polymanga2016 , liechtenstein , zurichcity , igersswitzerland , blogger , instanature , scenery , schaffhausen , outfit , horology , liveauthentic , shoes , nightlife , animals , adidas , interiordesign , instatraveling , jewelry , homemade , cake , tasty , nightout , wow , zurichairport , gstaad , mylife , rigi , video , mercedes , a , all_shots , flying , moment , deutschland , zurisee , mytravelgram , forever , beach , park ,
# Dump the frequent tags to a text file, each tag repeated once per 10
# occurrences and separated by spaces (presumably input for a word-cloud
# style tool - confirm with the consumer of this file).
with open('/home/volodymyrmiz/Desktop/rawTags.txt', 'w') as f:
    for tag in ordered:
        # Skip the empty tag produced by ASCII folding.
        if tag[0] != '':
            # Floor division keeps the repeat count an integer regardless of
            # the interpreter's division semantics.
            f.write((tag[0] + ' ') * (tag[1] // 10))
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np
# Split the (tag, count) pairs into two parallel lists.
frequentTags = [name for name, _ in ordered]
frequency = [count for _, count in ordered]
# One integer position per frequent tag.
y_pos = np.arange(len(frequentTags))
#plt.barh(y_pos, frequency, alpha=0.5)
#plt.yticks(y_pos, frequentTags)
#plt.show()
# Per-user tag lists, ASCII-folded (NFKD normalization, then ASCII encoding
# that ignores unencodable characters). Tags that fold to the empty string
# are dropped.
userTags = []
for user in tagsDB.find():
    # Fold every tag exactly once and reuse the result for both the filter
    # and the stored value.
    folded = [unicodedata.normalize('NFKD', tag).encode('ascii', 'ignore')
              for tag in user['tags']]
    userTags.append([tag for tag in folded if tag != ''])
userTags[0]
['contiki', 'swissalps', 'newfriends', 'freezingmynutsoff', 'walkabout', 'jungfraujoch', 'yolo', 'travel', 'noregrets']
from collections import Counter
# Count the tags that share a user's tag list with `search_word`.
search_word = "train"
count_search = Counter()
for tags in userTags:
    if search_word not in tags:
        continue
    count_search.update(tags)
print("Co-occurrence for %s:" % search_word)
# The search word itself is counted too, so it is normally among the results.
for term, _count in count_search.most_common(21):
    print(term)
Co-occurrence for train: train switzerland travel zurich mountains swiss easter snow nature lake photooftheday alps sky beautiful europe spring sbb view clouds love instagood
LDA. In many clustering models, a document is restricted to being associated with a single topic. LDA, on the other hand, involves three levels, and notably the topic node is sampled repeatedly within each document. Under this model, a document can be associated with multiple topics.
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
# (user id, ASCII-folded tags) for every user document; tags that fold to the
# empty string are dropped.
tagsList = []
for user in tagsDB.find():
    # Fold every tag exactly once and reuse the result for both the filter
    # and the stored value.
    folded = [unicodedata.normalize('NFKD', t).encode('ascii', 'ignore')
              for t in user['tags']]
    tagsList.append((str(user['_id']), [t for t in folded if t != '']))

# Keep only the frequent tags, de-duplicated per user. frequentTags is passed
# as-is: intersection() only reads its argument, so no copy is needed.
filteredList = [(userId, list(set(tokens).intersection(frequentTags)))
                for userId, tokens in tagsList]
tagsListDF = sc.parallelize(filteredList).toDF(["id", "tokens"])
from pyspark.ml.feature import CountVectorizer
# Learn the vocabulary from the "tokens" column, then turn every user's tokens
# into a term-count vector stored in the "features" column.
countVectorizer = CountVectorizer(inputCol="tokens", outputCol="features")
vectorizer = countVectorizer.fit(tagsListDF)
vectorizedDF = vectorizer.transform(tagsListDF)
countVectors = vectorizedDF.select("id", "features")
countVectors.take(1)
[Row(id=u'234933728', features=SparseVector(499, {6: 1.0, 30: 1.0}))]
from pyspark.mllib.feature import IDF
# Keep only the count vector (second column) of every row; `.rdd` makes the
# DataFrame-to-RDD step explicit.
frequencyVectors = countVectors.rdd.map(lambda row: row[1])
frequencyVectors.take(2)
[SparseVector(499, {6: 1.0, 30: 1.0}), SparseVector(499, {113: 1.0, 210: 1.0})]
# Cache the count vectors: they are read by both fit() and transform() below.
frequencyVectors.cache()
# Fit the IDF model on the count vectors, then re-weight them to TF-IDF.
idf = IDF().fit(frequencyVectors)
tfidf = idf.transform(frequencyVectors)
tfidf.take(1)
[SparseVector(499, {6: 2.8768, 30: 3.7561})]
# Just in case the ids are needed: pair every user id with its TF-IDF vector.
# zip() pairs elements by position - assumes both RDDs keep the row order and
# partitioning of countVectors; TODO confirm.
idsRDD = countVectors.rdd.map(lambda row: int(row[0]))
tfidf_with_ids = idsRDD.zip(tfidf).map(list)
tfidf_with_ids.take(1)
[[234933728, SparseVector(499, {6: 2.8768, 30: 3.7561})]]
# LDA.train expects an RDD of [document id, term vector] entries. Give every
# document its own id (its position in the RDD) instead of the same constant
# for all of them, so documents stay distinguishable to the optimizer.
corpus = tfidf.zipWithIndex().map(lambda pair: [pair[1], pair[0]]).cache()
corpus.take(10)
[[1, SparseVector(499, {6: 2.8768, 30: 3.7561})], [1, SparseVector(499, {113: 4.6173, 210: 5.1634})], [1, SparseVector(499, {})], [1, SparseVector(499, {22: 3.4672})], [1, SparseVector(499, {2: 2.6026, 8: 2.9656, 13: 3.1635, 16: 3.2196, 17: 3.3231, 18: 3.3302, 20: 3.3766, 23: 3.4764, 35: 3.8386, 43: 3.946, 45: 3.9744, 63: 4.1881, 76: 4.2805, 85: 4.3708, 89: 4.4099, 109: 4.5946, 127: 4.7141, 327: 5.5614, 401: 5.7571})], [1, SparseVector(499, {418: 5.7753})], [1, SparseVector(499, {})], [1, SparseVector(499, {2: 2.6026, 24: 3.5149, 114: 4.6289, 176: 5.0243})], [1, SparseVector(499, {158: 4.9412})], [1, SparseVector(499, {})]]
# Train a 15-topic LDA model on the corpus with the online optimizer, capped
# at 100 iterations.
# NOTE(review): docConcentration=2.0 and topicConcentration=3.0 are set
# explicitly - presumably hand-tuned for this data set; confirm.
ldaModel = LDA.train(corpus, k = 15, maxIterations=100, optimizer="online", docConcentration=2.0, topicConcentration=3.0)
Note: LDA does not perform well here with the EMLDAOptimizer, which is used by default. With the EMLDAOptimizer, the topics show a significant bias towards the most popular hashtags, so I used the OnlineLDAOptimizer instead. This optimizer implements the online variational Bayes LDA algorithm, which processes a subset of the corpus on each iteration and updates the term-topic distribution adaptively.
# Length of the model's topics matrix (presumably one row per vocabulary
# term - confirm against the vocabulary size).
len(ldaModel.topicsMatrix())
499
# For every topic, its top 5 terms as a (term indices, term weights) pair.
topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)
# Inspect the first topic.
topicIndices[0]
([8, 23, 5, 64, 28], [0.06482984277484072, 0.045950294558274096, 0.039156100706073844, 0.031679928472898536, 0.030337389898223453])
# Vocabulary learned by the CountVectorizer; it is indexed by the term indices
# that describeTopics() returns.
vocablist = vectorizer.vocabulary
# vocabSize is a method: it has to be called, otherwise the expression only
# evaluates to the bound method object.
ldaModel.vocabSize()
<bound method LDAModel.vocabSize of <pyspark.mllib.clustering.LDAModel object at 0x7f959069d210>>
# from operator import itemgetter
# for topic in topicIndices:
# text = itemgetter(*topic[0])(vocablist)
# print "TOPIC"
# for tag in text:
# print tag, topic[1][text.index(tag)]
topicsRDD = sc.parallelize(topicIndices)
import operator
# Replace term indices by the terms themselves: every topic becomes a list of
# (term, weight) pairs. Indexing vocablist term by term stays correct for a
# topic with a single term, where operator.itemgetter(*indices) would return
# a bare string instead of a tuple. list() materializes the pairs even where
# zip() is lazy.
termsRDD = topicsRDD.map(
    lambda topic: list(zip([vocablist[index] for index in topic[0]], topic[1])))
termsRDD.take(25)
[[(u'snow', 0.06482984277484072), (u'ski', 0.045950294558274096), (u'mountains', 0.039156100706073844), (u'winter', 0.031679928472898536), (u'skiing', 0.030337389898223453)], [(u'nature', 0.030827512537037274), (u'lake', 0.02478979431897514), (u'spring', 0.024462219076508293), (u'landscape', 0.0224125857946134), (u'flowers', 0.02067768513339612)], [(u'luzern', 0.036862489880800375), (u'switzerland', 0.0338325739317431), (u'verbier', 0.02953757773230965), (u'zurich', 0.02188718968235265), (u'swiss', 0.02123679980747416)], [(u'art', 0.03698756198224295), (u'zurich', 0.03313016298308508), (u'switzerland', 0.03157849816347215), (u'easter', 0.025158555831459168), (u'family', 0.0223036711631496)], [(u'travel', 0.06901255967841895), (u'instatravel', 0.037299454043090645), (u'europe', 0.034330202644613936), (u'travelgram', 0.03368921018190022), (u'trip', 0.03304379062370829)], [(u'switzerland', 0.034456317666373186), (u'goodtimes', 0.03152461878027823), (u'zurich', 0.02950826562264097), (u'weekend', 0.022231218091606136), (u'tb', 0.018888255116828026)], [(u'geneva', 0.05857098842284779), (u'car', 0.031332107870771786), (u'switzerland', 0.0276575550779648), (u'lamborghini', 0.02190996776182064), (u'ferrari', 0.02052769480440934)], [(u'visitswitzerland', 0.03794345847628957), (u'vscocam', 0.034333155243237684), (u'switzerlandwonderland', 0.03226477690795611), (u'vsco', 0.03213684652910808), (u'myswitzerland', 0.028912085943440736)], [(u'fitness', 0.050251802705119475), (u'healthy', 0.02958433988432552), (u'sport', 0.029353669207932892), (u'workout', 0.028909345997833683), (u'motivation', 0.028554499225599026)], [(u'suisse', 0.04139109858365735), (u'montreux', 0.03704523460681258), (u'lacleman', 0.03250671538638928), (u'lausanne', 0.02801144795578456), (u'switzerland', 0.02688108605034323)], [(u'instagood', 0.03645624632489141), (u'picoftheday', 0.03076251964509694), (u'photooftheday', 0.029703059421505945), (u'instadaily', 0.028632883460075267), (u'instalike', 
0.026769381245962745)], [(u'music', 0.040575034449173424), (u'party', 0.03921196577945221), (u'friends', 0.027131120772706), (u'konstanz', 0.024898402612021024), (u'bodensee', 0.02318591615707108)], [(u'baselworld2016', 0.056018990170552174), (u'baselworld', 0.05048503702936801), (u'basel', 0.04590056891456354), (u'luxury', 0.041513618850625066), (u'watches', 0.03597274460743156)], [(u'swiss', 0.02934004343949627), (u'nofilter', 0.02346935547247407), (u'switzerland', 0.021960428794446603), (u'selfie', 0.020904185010864756), (u'love', 0.019373152419980853)], [(u'food', 0.030915742768915375), (u'foodporn', 0.03030777683238159), (u'day', 0.02158525753702872), (u'instafood', 0.020540403766244577), (u'yummy', 0.019046245223605044)]]
def _flattenTopic(indexedTopic):
    """Expand one ``(terms, topicId)`` entry into ``(term, probability, topicId)`` rows."""
    terms, topicId = indexedTopic
    return [(entry[0], entry[1], topicId) for entry in terms]


# Number the topics by their position, then emit one row per (topic, term).
indexedTermsRDD = termsRDD.zipWithIndex()
# termsRDD is rebound here: from now on it holds the flattened rows.
termsRDD = indexedTermsRDD.flatMap(_flattenTopic)
termDF = termsRDD.toDF(['term', 'probability', 'topicId'])
termDF.take(10)
[Row(term=u'snow', probability=0.06482984277484072, topicId=0), Row(term=u'ski', probability=0.045950294558274096, topicId=0), Row(term=u'mountains', probability=0.039156100706073844, topicId=0), Row(term=u'winter', probability=0.031679928472898536, topicId=0), Row(term=u'skiing', probability=0.030337389898223453, topicId=0), Row(term=u'nature', probability=0.030827512537037274, topicId=1), Row(term=u'lake', probability=0.02478979431897514, topicId=1), Row(term=u'spring', probability=0.024462219076508293, topicId=1), Row(term=u'landscape', probability=0.0224125857946134, topicId=1), Row(term=u'flowers', probability=0.02067768513339612, topicId=1)]
# One JSON object string per (term, probability, topicId) row.
rawJson = termDF.toJSON().collect()
from IPython.core.display import display, HTML
from IPython.display import Javascript
# Comma-separated JSON objects without surrounding brackets, ready to be
# placed inside a JSON array. Joined in a single pass instead of growing a
# string with += and slicing off the trailing comma.
stringJson = ",".join(str(line) for line in rawJson)
stringJson
'{"term":"snow","probability":0.06482984277484072,"topicId":0},{"term":"ski","probability":0.045950294558274096,"topicId":0},{"term":"mountains","probability":0.039156100706073844,"topicId":0},{"term":"winter","probability":0.031679928472898536,"topicId":0},{"term":"skiing","probability":0.030337389898223453,"topicId":0},{"term":"nature","probability":0.030827512537037274,"topicId":1},{"term":"lake","probability":0.02478979431897514,"topicId":1},{"term":"spring","probability":0.024462219076508293,"topicId":1},{"term":"landscape","probability":0.0224125857946134,"topicId":1},{"term":"flowers","probability":0.02067768513339612,"topicId":1},{"term":"luzern","probability":0.036862489880800375,"topicId":2},{"term":"switzerland","probability":0.0338325739317431,"topicId":2},{"term":"verbier","probability":0.02953757773230965,"topicId":2},{"term":"zurich","probability":0.02188718968235265,"topicId":2},{"term":"swiss","probability":0.02123679980747416,"topicId":2},{"term":"art","probability":0.03698756198224295,"topicId":3},{"term":"zurich","probability":0.03313016298308508,"topicId":3},{"term":"switzerland","probability":0.03157849816347215,"topicId":3},{"term":"easter","probability":0.025158555831459168,"topicId":3},{"term":"family","probability":0.0223036711631496,"topicId":3},{"term":"travel","probability":0.06901255967841895,"topicId":4},{"term":"instatravel","probability":0.037299454043090645,"topicId":4},{"term":"europe","probability":0.034330202644613936,"topicId":4},{"term":"travelgram","probability":0.03368921018190022,"topicId":4},{"term":"trip","probability":0.03304379062370829,"topicId":4},{"term":"switzerland","probability":0.034456317666373186,"topicId":5},{"term":"goodtimes","probability":0.03152461878027823,"topicId":5},{"term":"zurich","probability":0.02950826562264097,"topicId":5},{"term":"weekend","probability":0.022231218091606136,"topicId":5},{"term":"tb","probability":0.018888255116828026,"topicId":5},{"term":"geneva","probability":0.05857098842284779
,"topicId":6},{"term":"car","probability":0.031332107870771786,"topicId":6},{"term":"switzerland","probability":0.0276575550779648,"topicId":6},{"term":"lamborghini","probability":0.02190996776182064,"topicId":6},{"term":"ferrari","probability":0.02052769480440934,"topicId":6},{"term":"visitswitzerland","probability":0.03794345847628957,"topicId":7},{"term":"vscocam","probability":0.034333155243237684,"topicId":7},{"term":"switzerlandwonderland","probability":0.03226477690795611,"topicId":7},{"term":"vsco","probability":0.03213684652910808,"topicId":7},{"term":"myswitzerland","probability":0.028912085943440736,"topicId":7},{"term":"fitness","probability":0.050251802705119475,"topicId":8},{"term":"healthy","probability":0.02958433988432552,"topicId":8},{"term":"sport","probability":0.029353669207932892,"topicId":8},{"term":"workout","probability":0.028909345997833683,"topicId":8},{"term":"motivation","probability":0.028554499225599026,"topicId":8},{"term":"suisse","probability":0.04139109858365735,"topicId":9},{"term":"montreux","probability":0.03704523460681258,"topicId":9},{"term":"lacleman","probability":0.03250671538638928,"topicId":9},{"term":"lausanne","probability":0.02801144795578456,"topicId":9},{"term":"switzerland","probability":0.02688108605034323,"topicId":9},{"term":"instagood","probability":0.03645624632489141,"topicId":10},{"term":"picoftheday","probability":0.03076251964509694,"topicId":10},{"term":"photooftheday","probability":0.029703059421505945,"topicId":10},{"term":"instadaily","probability":0.028632883460075267,"topicId":10},{"term":"instalike","probability":0.026769381245962745,"topicId":10},{"term":"music","probability":0.040575034449173424,"topicId":11},{"term":"party","probability":0.03921196577945221,"topicId":11},{"term":"friends","probability":0.027131120772706,"topicId":11},{"term":"konstanz","probability":0.024898402612021024,"topicId":11},{"term":"bodensee","probability":0.02318591615707108,"topicId":11},{"term":"baselworld2016","prob
ability":0.056018990170552174,"topicId":12},{"term":"baselworld","probability":0.05048503702936801,"topicId":12},{"term":"basel","probability":0.04590056891456354,"topicId":12},{"term":"luxury","probability":0.041513618850625066,"topicId":12},{"term":"watches","probability":0.03597274460743156,"topicId":12},{"term":"swiss","probability":0.02934004343949627,"topicId":13},{"term":"nofilter","probability":0.02346935547247407,"topicId":13},{"term":"switzerland","probability":0.021960428794446603,"topicId":13},{"term":"selfie","probability":0.020904185010864756,"topicId":13},{"term":"love","probability":0.019373152419980853,"topicId":13},{"term":"food","probability":0.030915742768915375,"topicId":14},{"term":"foodporn","probability":0.03030777683238159,"topicId":14},{"term":"day","probability":0.02158525753702872,"topicId":14},{"term":"instafood","probability":0.020540403766244577,"topicId":14},{"term":"yummy","probability":0.019046245223605044,"topicId":14}'
# HTML page with an embedded D3 script (v3.5.5, loaded from cdnjs) that draws
# one bubble per (topic, term) entry using d3.layout.pack: the entry's
# probability is the bubble's value and the fill color is chosen by topicId.
# Every bubble is labeled with the term (truncated according to the radius)
# and the probability rounded up to 4 decimals.
# stringJson is substituted for the single %s placeholder, i.e. into the
# "children" array of the "topics" node.
# NOTE(review): %-formatting treats every '%' in the template as a directive;
# the template currently contains none - escape any that get added as '%%'.
html_code = """
<!DOCTYPE html>
<meta charset="utf-8">
<style>
circle {
fill: rgb(31, 119, 180);
fill-opacity: 0.5;
stroke: rgb(31, 119, 180);
stroke-width: 1px;
}
.leaf circle {
fill: #ff7f0e;
fill-opacity: 1;
}
text {
font: 14px sans-serif;
}
</style>
<body>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>
<script>
var json = {
"name": "data",
"children": [
{
"name": "topics",
"children": [
%s
]
}
]
};
var r = 1500,
format = d3.format(",d"),
fill = d3.scale.category20c();
var bubble = d3.layout.pack()
.sort(null)
.size([r, r])
.padding(1.5);
var vis = d3.select("body").append("svg")
.attr("width", r)
.attr("height", r)
.attr("class", "bubble");
var node = vis.selectAll("g.node")
.data(bubble.nodes(classes(json))
.filter(function(d) { return !d.children; }))
.enter().append("g")
.attr("class", "node")
.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
color = d3.scale.category20();
node.append("title")
.text(function(d) { return d.className + ": " + format(d.value); });
node.append("circle")
.attr("r", function(d) { return d.r; })
.style("fill", function(d) {return color(d.topicName);});
var text = node.append("text")
.attr("text-anchor", "middle")
.attr("dy", ".3em")
.text(function(d) { return d.className.substring(0, d.r / 3)});
text.append("tspan")
.attr("dy", "1.2em")
.attr("x", 0)
.text(function(d) {return Math.ceil(d.value * 10000) /10000; });
// Returns a flattened hierarchy containing all leaf nodes under the root.
function classes(root) {
var classes = [];
function recurse(term, node) {
if (node.children) node.children.forEach(function(child) { recurse(node.term, child); });
else classes.push({topicName: node.topicId, className: node.term, value: node.probability});
}
recurse(null, root);
return {children: classes};
}
</script>""" % stringJson
# Render the page inline in the notebook output.
display(HTML(html_code))