Clustering Instagram users using hashtags. Topic analysis and visualization in D3JS

In [1]:
import pymongo as pm
import unicodedata

Reading the data from Mongo

In [2]:
client = pm.MongoClient()
db = client.instagram
tagsDB = db.tags

Extracting tags data

In [3]:
rawTags = []
for user in tagsDB.find():
    rawTags.extend(user['tags'])
In [4]:
len(rawTags)
Out[4]:
424113
In [5]:
rawTags[:10]
Out[5]:
[u'contiki',
 u'swissalps',
 u'newfriends',
 u'freezingmynutsoff',
 u'walkabout',
 u'jungfraujoch',
 u'yolo',
 u'travel',
 u'noregrets',
 u'goodtimes']
In [6]:
tagsRDD = sc.parallelize(rawTags)
In [7]:
tagsRDD.count()
Out[7]:
424113

Cleaning

Note, if you want to keep language specific features and words, you have to clean the data in a different way.

In [8]:
countsRDD = (
    tagsRDD
    .map(lambda tag: (unicodedata.normalize('NFKD', tag).encode('ascii','ignore'), 1))
    .reduceByKey(lambda a, b: a + b)
)
In [9]:
countsRDD.count()
Out[9]:
106083

Explore the data

In [10]:
ordered = countsRDD.takeOrdered(500, lambda (key, value): -value)
In [11]:
ordered
Out[11]:
[('', 9146),
 ('switzerland', 8973),
 ('zurich', 3990),
 ('love', 2605),
 ('swiss', 2593),
 ('easter', 2526),
 ('mountains', 1999),
 ('travel', 1978),
 ('spring', 1963),
 ('snow', 1809),
 ('sun', 1754),
 ('nature', 1742),
 ('lake', 1565),
 ('geneva', 1516),
 ('beautiful', 1485),
 ('schweiz', 1462),
 ('happy', 1408),
 ('instagood', 1403),
 ('photooftheday', 1265),
 ('picoftheday', 1256),
 ('suisse', 1238),
 ('friends', 1199),
 ('alps', 1165),
 ('happyeaster', 1096),
 ('ski', 1085),
 ('fun', 1044),
 ('basel', 994),
 ('landscape', 928),
 ('sky', 923),
 ('skiing', 860),
 ('sunset', 839),
 ('swissalps', 820),
 ('family', 805),
 ('bern', 800),
 ('nofilter', 799),
 ('luzern', 770),
 ('amazing', 755),
 ('view', 754),
 ('europe', 737),
 ('instadaily', 736),
 ('weekend', 713),
 ('geneve', 701),
 ('fashion', 700),
 ('art', 680),
 ('holiday', 678),
 ('sunnyday', 666),
 ('me', 659),
 ('mountain', 657),
 ('food', 648),
 ('lausanne', 589),
 ('instalike', 583),
 ('smile', 577),
 ('style', 572),
 ('like4like', 565),
 ('lucerne', 565),
 ('followme', 563),
 ('clouds', 561),
 ('architecture', 546),
 ('blue', 544),
 ('wanderlust', 540),
 ('zermatt', 537),
 ('instatravel', 536),
 ('selfie', 535),
 ('instamood', 533),
 ('life', 532),
 ('winter', 529),
 ('city', 519),
 ('ostern', 515),
 ('trip', 515),
 ('sunny', 513),
 ('photography', 512),
 ('flowers', 501),
 ('blackandwhite', 497),
 ('baselworld2016', 496),
 ('travelgram', 492),
 ('bluesky', 492),
 ('sunshine', 491),
 ('instagram', 485),
 ('girl', 478),
 ('foodporn', 476),
 ('home', 471),
 ('verbier', 471),
 ('party', 462),
 ('igers', 462),
 ('music', 458),
 ('vscocam', 451),
 ('beauty', 443),
 ('tbt', 436),
 ('montreux', 435),
 ('vsco', 429),
 ('luxury', 426),
 ('instapic', 424),
 ('baselworld', 422),
 ('fitness', 422),
 ('follow', 422),
 ('svizzera', 420),
 ('chocolate', 418),
 ('visitswitzerland', 407),
 ('lifestyle', 397),
 ('snowboarding', 396),
 ('night', 395),
 ('relax', 385),
 ('matterhorn', 380),
 ('lacleman', 378),
 ('photo', 372),
 ('water', 370),
 ('lugano', 365),
 ('holidays', 362),
 ('interlaken', 357),
 ('myswitzerland', 356),
 ('vacation', 354),
 ('design', 349),
 ('switzerlandwonderland', 348),
 ('summer', 348),
 ('goodtimes', 346),
 ('stmoritz', 342),
 ('morning', 341),
 ('day', 340),
 ('cute', 340),
 ('2016', 333),
 ('traveling', 332),
 ('enjoy', 328),
 ('tagsforlikes', 326),
 ('sunday', 322),
 ('tree', 322),
 ('saturday', 320),
 ('green', 319),
 ('bestoftheday', 314),
 ('goodmorning', 314),
 ('loveit', 309),
 ('travelling', 309),
 ('instafood', 297),
 ('river', 295),
 ('happiness', 294),
 ('white', 294),
 ('hiking', 292),
 ('nice', 291),
 ('germany', 287),
 ('snowboard', 285),
 ('coffee', 281),
 ('france', 281),
 ('konstanz', 279),
 ('inlovewithswitzerland', 276),
 ('black', 276),
 ('ticino', 274),
 ('follow4follow', 271),
 ('valais', 269),
 ('healthy', 265),
 ('instacool', 264),
 ('work', 264),
 ('adventure', 263),
 ('watch', 257),
 ('sport', 257),
 ('trees', 256),
 ('likeforlike', 256),
 ('zuri', 255),
 ('bodensee', 255),
 ('awesome', 254),
 ('watches', 253),
 ('_', 251),
 ('springtime', 250),
 ('light', 246),
 ('italy', 246),
 ('paques', 246),
 ('fruhling', 244),
 ('yummy', 243),
 ('street', 243),
 ('breakfast', 242),
 ('graubunden', 242),
 ('train', 241),
 ('naturelovers', 241),
 ('dinner', 240),
 ('explore', 240),
 ('davos', 239),
 ('best', 236),
 ('swissmade', 236),
 ('girls', 234),
 ('red', 234),
 ('peace', 230),
 ('laax', 229),
 ('travelingram', 229),
 ('sunrise', 227),
 ('chill', 226),
 ('like', 224),
 ('workout', 223),
 ('panorama', 223),
 ('switzerlandpictures', 221),
 ('cool', 221),
 ('gopro', 219),
 ('ootd', 217),
 ('delicious', 216),
 ('beautifulday', 215),
 ('zurichsee', 215),
 ('sweet', 215),
 ('model', 214),
 ('throwback', 213),
 ('ig_switzerland', 211),
 ('photographer', 210),
 ('car', 210),
 ('dog', 209),
 ('suiza', 205),
 ('beautifuldestinations', 205),
 ('see', 205),
 ('colorful', 205),
 ('walk', 204),
 ('colors', 204),
 ('lunch', 204),
 ('new', 202),
 ('training', 202),
 ('live', 201),
 ('gym', 201),
 ('foodie', 200),
 ('forest', 200),
 ('motivation', 198),
 ('cold', 197),
 ('world', 195),
 ('beer', 194),
 ('ischgl', 194),
 ('familytime', 192),
 ('castle', 192),
 ('pasqua', 190),
 ('running', 190),
 ('fit', 188),
 ('switzerland_vacations', 187),
 ('restaurant', 187),
 ('good', 187),
 ('pink', 186),
 ('bunny', 186),
 ('roadtrip', 186),
 ('homesweethome', 186),
 ('time', 185),
 ('my', 184),
 ('l4l', 184),
 ('picture', 183),
 ('memories', 183),
 ('lakegeneva', 181),
 ('nike', 181),
 ('alpes', 180),
 ('inspiration', 180),
 ('nikon', 179),
 ('tb', 178),
 ('instalove', 177),
 ('sonne', 176),
 ('grindelwald', 173),
 ('church', 171),
 ('canon', 171),
 ('weather', 171),
 ('travelphotography', 168),
 ('engelberg', 168),
 ('repost', 167),
 ('concert', 166),
 ('goodlife', 166),
 ('tattoo', 165),
 ('neverstopexploring', 164),
 ('engadin', 164),
 ('bridge', 164),
 ('with', 163),
 ('pretty', 163),
 ('iloveswitzerland', 163),
 ('instaphoto', 162),
 ('lovely', 162),
 ('watchporn', 162),
 ('jungfrau', 161),
 ('passion', 161),
 ('wallis', 160),
 ('airport', 159),
 ('perfect', 158),
 ('hotel', 158),
 ('tourism', 157),
 ('shopping', 156),
 ('friendship', 156),
 ('funny', 155),
 ('monday', 154),
 ('easterweekend', 153),
 ('swag', 152),
 ('instamoment', 152),
 ('flower', 152),
 ('berge', 150),
 ('the', 150),
 ('froheostern', 150),
 ('thun', 150),
 ('iphoneonly', 149),
 ('mylove', 147),
 ('pic', 147),
 ('skyporn', 147),
 ('bar', 147),
 ('brunch', 146),
 ('neuchatel', 145),
 ('loveswitzerlandcontest', 145),
 ('powder', 144),
 ('fresh', 143),
 ('evening', 143),
 ('makeup', 143),
 ('boy', 142),
 ('hair', 142),
 ('vegan', 142),
 ('hot', 142),
 ('wonderful', 141),
 ('in', 141),
 ('color', 140),
 ('house', 140),
 ('tourist', 140),
 ('instafollow', 140),
 ('eurotrip', 139),
 ('swizerland', 139),
 ('top', 139),
 ('friday', 139),
 ('lago', 139),
 ('traveller', 138),
 ('suica', 138),
 ('pictureoftheday', 138),
 ('instago', 138),
 ('eggs', 138),
 ('gold', 138),
 ('potd', 138),
 ('rhein', 138),
 ('polymanga', 137),
 ('wine', 137),
 ('stgallen', 136),
 ('rolex', 136),
 ('mood', 136),
 ('austria', 136),
 ('dance', 136),
 ('swan', 136),
 ('fribourg', 135),
 ('goodday', 134),
 ('alpen', 134),
 ('igdaily', 134),
 ('printemps', 134),
 ('swisslife', 134),
 ('lamborghini', 134),
 ('paris', 133),
 ('apresski', 133),
 ('march', 133),
 ('boat', 132),
 ('schnee', 132),
 ('portrait', 132),
 ('oldtown', 132),
 ('crazy', 132),
 ('lac', 132),
 ('birthday', 132),
 ('urban', 131),
 ('f4f', 131),
 ('arosa', 130),
 ('tflers', 130),
 ('latergram', 130),
 ('swissmountains', 129),
 ('winterwonderland', 128),
 ('vaud', 128),
 ('jetdeau', 128),
 ('streetart', 128),
 ('cat', 128),
 ('bmw', 127),
 ('ig_europe', 126),
 ('titlis', 126),
 ('look', 125),
 ('fitfam', 125),
 ('enjoylife', 125),
 ('and', 124),
 ('traveltheworld', 124),
 ('blessed', 124),
 ('paradise', 124),
 ('montagne', 123),
 ('outdoors', 123),
 ('ig_swiss', 122),
 ('vevey', 122),
 ('dessert', 122),
 ('couple', 121),
 ('sunglasses', 121),
 ('bike', 121),
 ('zug', 120),
 ('winterthur', 120),
 ('hiphop', 120),
 ('cars', 119),
 ('baby', 119),
 ('club', 118),
 ('animal', 118),
 ('ferrari', 117),
 ('vintage', 117),
 ('natur', 116),
 ('friend', 116),
 ('museum', 116),
 ('qualitytime', 116),
 ('carporn', 116),
 ('goodvibes', 116),
 ('loveyou', 115),
 ('wood', 115),
 ('igtravel', 115),
 ('lindt', 115),
 ('instagramers', 115),
 ('italia', 114),
 ('goodtime', 114),
 ('buonapasqua', 114),
 ('nature_perfection', 114),
 ('fly', 113),
 ('nofilterneeded', 113),
 ('today', 113),
 ('audi', 113),
 ('bw', 112),
 ('eat', 112),
 ('shooting', 112),
 ('watchesofinstagram', 111),
 ('walking', 111),
 ('supercar', 110),
 ('igerssuisse', 110),
 ('lakezurich', 110),
 ('garden', 110),
 ('likes', 110),
 ('great', 110),
 ('dj', 109),
 ('traveler', 109),
 ('super_switzerland', 109),
 ('yellow', 109),
 ('artist', 109),
 ('porsche', 109),
 ('landscape_lovers', 108),
 ('drinks', 108),
 ('happyday', 108),
 ('handmade', 108),
 ('run', 108),
 ('naturephotography', 108),
 ('goodnight', 107),
 ('vierwaldstattersee', 107),
 ('people', 106),
 ('blonde', 106),
 ('visitzurich', 106),
 ('london', 105),
 ('cheese', 105),
 ('easterbunny', 105),
 ('outdoor', 105),
 ('fondue', 105),
 ('ascona', 104),
 ('followforfollow', 104),
 ('watchoftheday', 104),
 ('leman', 103),
 ('lagomaggiore', 102),
 ('streetphotography', 102),
 ('reflection', 102),
 ('lights', 101),
 ('building', 101),
 ('ice', 101),
 ('iphone', 100),
 ('genevalake', 100),
 ('health', 100),
 ('freeride', 100),
 ('bodybuilding', 99),
 ('igersgeneva', 99),
 ('champagne', 99),
 ('waterfall', 99),
 ('beard', 99),
 ('chilling', 98),
 ('cloudporn', 98),
 ('sister', 98),
 ('primavera', 97),
 ('dream', 97),
 ('starbucks', 97),
 ('instafashion', 97),
 ('aviation', 97),
 ('springbreak', 97),
 ('rheinfall', 97),
 ('dogsofinstagram', 97),
 ('polymanga2016', 96),
 ('liechtenstein', 96),
 ('zurichcity', 96),
 ('igersswitzerland', 95),
 ('blogger', 95),
 ('instanature', 95),
 ('scenery', 95),
 ('schaffhausen', 95),
 ('outfit', 95),
 ('horology', 94),
 ('liveauthentic', 94),
 ('shoes', 94),
 ('nightlife', 94),
 ('animals', 94),
 ('adidas', 93),
 ('interiordesign', 93),
 ('instatraveling', 93),
 ('jewelry', 92),
 ('homemade', 92),
 ('cake', 92),
 ('tasty', 92),
 ('nightout', 92),
 ('wow', 91),
 ('zurichairport', 91),
 ('gstaad', 91),
 ('mylife', 91),
 ('rigi', 91),
 ('video', 91),
 ('mercedes', 91),
 ('a', 91),
 ('all_shots', 90),
 ('flying', 90),
 ('moment', 90),
 ('deutschland', 89),
 ('zurisee', 89),
 ('mytravelgram', 89),
 ('forever', 89),
 ('beach', 89),
 ('park', 88)]
In [12]:
for order in ordered:
    print order[0],",",
 , switzerland , zurich , love , swiss , easter , mountains , travel , spring , snow , sun , nature , lake , geneva , beautiful , schweiz , happy , instagood , photooftheday , picoftheday , suisse , friends , alps , happyeaster , ski , fun , basel , landscape , sky , skiing , sunset , swissalps , family , bern , nofilter , luzern , amazing , view , europe , instadaily , weekend , geneve , fashion , art , holiday , sunnyday , me , mountain , food , lausanne , instalike , smile , style , like4like , lucerne , followme , clouds , architecture , blue , wanderlust , zermatt , instatravel , selfie , instamood , life , winter , city , ostern , trip , sunny , photography , flowers , blackandwhite , baselworld2016 , travelgram , bluesky , sunshine , instagram , girl , foodporn , home , verbier , party , igers , music , vscocam , beauty , tbt , montreux , vsco , luxury , instapic , baselworld , fitness , follow , svizzera , chocolate , visitswitzerland , lifestyle , snowboarding , night , relax , matterhorn , lacleman , photo , water , lugano , holidays , interlaken , myswitzerland , vacation , design , switzerlandwonderland , summer , goodtimes , stmoritz , morning , day , cute , 2016 , traveling , enjoy , tagsforlikes , sunday , tree , saturday , green , bestoftheday , goodmorning , loveit , travelling , instafood , river , happiness , white , hiking , nice , germany , snowboard , coffee , france , konstanz , inlovewithswitzerland , black , ticino , follow4follow , valais , healthy , instacool , work , adventure , watch , sport , trees , likeforlike , zuri , bodensee , awesome , watches , _ , springtime , light , italy , paques , fruhling , yummy , street , breakfast , graubunden , train , naturelovers , dinner , explore , davos , best , swissmade , girls , red , peace , laax , travelingram , sunrise , chill , like , workout , panorama , switzerlandpictures , cool , gopro , ootd , delicious , beautifulday , zurichsee , sweet , model , throwback , ig_switzerland , photographer , car , dog , suiza , beautifuldestinations , see , colorful , walk , colors , lunch , new , training , live , gym , foodie , forest , motivation , cold , world , beer , ischgl , familytime , castle , pasqua , running , fit , switzerland_vacations , restaurant , good , pink , bunny , roadtrip , homesweethome , time , my , l4l , picture , memories , lakegeneva , nike , alpes , inspiration , nikon , tb , instalove , sonne , grindelwald , church , canon , weather , travelphotography , engelberg , repost , concert , goodlife , tattoo , neverstopexploring , engadin , bridge , with , pretty , iloveswitzerland , instaphoto , lovely , watchporn , jungfrau , passion , wallis , airport , perfect , hotel , tourism , shopping , friendship , funny , monday , easterweekend , swag , instamoment , flower , berge , the , froheostern , thun , iphoneonly , mylove , pic , skyporn , bar , brunch , neuchatel , loveswitzerlandcontest , powder , fresh , evening , makeup , boy , hair , vegan , hot , wonderful , in , color , house , tourist , instafollow , eurotrip , swizerland , top , friday , lago , traveller , suica , pictureoftheday , instago , eggs , gold , potd , rhein , polymanga , wine , stgallen , rolex , mood , austria , dance , swan , fribourg , goodday , alpen , igdaily , printemps , swisslife , lamborghini , paris , apresski , march , boat , schnee , portrait , oldtown , crazy , lac , birthday , urban , f4f , arosa , tflers , latergram , swissmountains , winterwonderland , vaud , jetdeau , streetart , cat , bmw , ig_europe , titlis , look , fitfam , enjoylife , and , traveltheworld , blessed , paradise , montagne , outdoors , ig_swiss , vevey , dessert , couple , sunglasses , bike , zug , winterthur , hiphop , cars , baby , club , animal , ferrari , vintage , natur , friend , museum , qualitytime , carporn , goodvibes , loveyou , wood , igtravel , lindt , instagramers , italia , goodtime , buonapasqua , nature_perfection , fly , nofilterneeded , today , audi , bw , eat , shooting , watchesofinstagram , walking , supercar , igerssuisse , lakezurich , garden , likes , great , dj , traveler , super_switzerland , yellow , artist , porsche , landscape_lovers , drinks , happyday , handmade , run , naturephotography , goodnight , vierwaldstattersee , people , blonde , visitzurich , london , cheese , easterbunny , outdoor , fondue , ascona , followforfollow , watchoftheday , leman , lagomaggiore , streetphotography , reflection , lights , building , ice , iphone , genevalake , health , freeride , bodybuilding , igersgeneva , champagne , waterfall , beard , chilling , cloudporn , sister , primavera , dream , starbucks , instafashion , aviation , springbreak , rheinfall , dogsofinstagram , polymanga2016 , liechtenstein , zurichcity , igersswitzerland , blogger , instanature , scenery , schaffhausen , outfit , horology , liveauthentic , shoes , nightlife , animals , adidas , interiordesign , instatraveling , jewelry , homemade , cake , tasty , nightout , wow , zurichairport , gstaad , mylife , rigi , video , mercedes , a , all_shots , flying , moment , deutschland , zurisee , mytravelgram , forever , beach , park ,
In [13]:
with open('/home/volodymyrmiz/Desktop/rawTags.txt', 'w') as f:
    for tag in ordered:
        if tag[0] != '':
            f.write((tag[0] + ' ')*(tag[1] / 10))
In [14]:
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np
In [15]:
frequentTags = [tag[0] for tag in ordered]
In [16]:
frequency = [tag[1] for tag in ordered]
In [17]:
y_pos = np.arange(len(frequentTags))
In [19]:
#plt.barh(y_pos, frequency, alpha=0.5)
#plt.yticks(y_pos, frequentTags)
#plt.show()

Find words co-occurences

In [20]:
userTags = []
for user in tagsDB.find():
    userTags.append([unicodedata.normalize('NFKD', tag).encode('ascii','ignore') 
                     for tag in user['tags'] 
                     if unicodedata.normalize('NFKD', tag).encode('ascii','ignore') != ''])
In [21]:
userTags[0]
Out[21]:
['contiki',
 'swissalps',
 'newfriends',
 'freezingmynutsoff',
 'walkabout',
 'jungfraujoch',
 'yolo',
 'travel',
 'noregrets']
In [22]:
from collections import Counter
search_word = "train"
count_search = Counter()
for tag in userTags:
    if search_word in tag:
        count_search.update(tag)
print("Co-occurrence for %s:" % search_word)
for word in count_search.most_common(21):
    print word[0]
Co-occurrence for train:
train
switzerland
travel
zurich
mountains
swiss
easter
snow
nature
lake
photooftheday
alps
sky
beautiful
europe
spring
sbb
view
clouds
love
instagood

Topic analysis using LDA

LDA. As with many clustering models, such a model restricts a document to being associated with a single topic. LDA, on the other hand, involves three levels, and notably the topic node is sampled repeatedly within the document. Under this model, documents can be associated with multiple topics.

In [23]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
In [24]:
tagsList = []
for tag in tagsDB.find():
    tagsList.append((str(tag['_id']), [unicodedata.normalize('NFKD', t).encode('ascii','ignore') 
                                    for t in tag['tags']
                                       if unicodedata.normalize('NFKD', t).encode('ascii','ignore') != '']))

Filter tag list of each user. Remove the most common and rarely used ones

In [25]:
filteredList = []
for tag in tagsList:
    filteredList.append((tag[0], list(set(tag[1]).intersection(frequentTags[:]))))
In [26]:
tagsListDF = sc.parallelize(filteredList).toDF(["id", "tokens"])

Vectorize tags arrays for each user

In [28]:
from pyspark.ml.feature import CountVectorizer
In [29]:
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features").fit(tagsListDF)
In [30]:
countVectors = vectorizer.transform(tagsListDF).select("id", "features")
In [31]:
countVectors.take(1)
Out[31]:
[Row(id=u'234933728', features=SparseVector(499, {6: 1.0, 30: 1.0}))]

Find TF-IDF coefficients for each word instead of bag of words

In [32]:
from pyspark.mllib.feature import IDF
In [33]:
frequencyVectors = countVectors.map(lambda vector: vector[1])
In [34]:
frequencyVectors.take(2)
Out[34]:
[SparseVector(499, {6: 1.0, 30: 1.0}), SparseVector(499, {113: 1.0, 210: 1.0})]
In [35]:
frequencyVectors.cache()
idf = IDF().fit(frequencyVectors)
tfidf = idf.transform(frequencyVectors)
In [36]:
tfidf.take(1)
Out[36]:
[SparseVector(499, {6: 2.8768, 30: 3.7561})]
In [37]:
#just in case, if ids are needed
tfidf_with_ids = countVectors.map(lambda vector: int(vector[0])).zip(tfidf).map(lambda pair: [pair[0], pair[1]])
In [38]:
tfidf_with_ids.take(1)
Out[38]:
[[234933728, SparseVector(499, {6: 2.8768, 30: 3.7561})]]
In [39]:
corpus = tfidf.map(lambda x: [1, x]).cache()
In [40]:
corpus.take(10)
Out[40]:
[[1, SparseVector(499, {6: 2.8768, 30: 3.7561})],
 [1, SparseVector(499, {113: 4.6173, 210: 5.1634})],
 [1, SparseVector(499, {})],
 [1, SparseVector(499, {22: 3.4672})],
 [1,
  SparseVector(499, {2: 2.6026, 8: 2.9656, 13: 3.1635, 16: 3.2196, 17: 3.3231, 18: 3.3302, 20: 3.3766, 23: 3.4764, 35: 3.8386, 43: 3.946, 45: 3.9744, 63: 4.1881, 76: 4.2805, 85: 4.3708, 89: 4.4099, 109: 4.5946, 127: 4.7141, 327: 5.5614, 401: 5.7571})],
 [1, SparseVector(499, {418: 5.7753})],
 [1, SparseVector(499, {})],
 [1, SparseVector(499, {2: 2.6026, 24: 3.5149, 114: 4.6289, 176: 5.0243})],
 [1, SparseVector(499, {158: 4.9412})],
 [1, SparseVector(499, {})]]

Build Latent Dirichlet Allocation model for clustering

In [42]:
ldaModel = LDA.train(corpus, k = 15, maxIterations=100, optimizer="online", docConcentration=2.0, topicConcentration=3.0)

Note: LDA does not perform well with the EMLDAOptimizer which is used by default. In the case of EMLDAOptimizer we have significant bies to the most popular hashtags. I used the OnlineLDAOptimizer instead. The Optimizer implements the Online variational Bayes LDA algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic distribution adaptively.

In [43]:
len(ldaModel.topicsMatrix())
Out[43]:
499
In [44]:
topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)
In [45]:
topicIndices[0]
Out[45]:
([8, 23, 5, 64, 28],
 [0.06482984277484072,
  0.045950294558274096,
  0.039156100706073844,
  0.031679928472898536,
  0.030337389898223453])
In [46]:
vocablist = vectorizer.vocabulary
In [47]:
ldaModel.vocabSize
Out[47]:
<bound method LDAModel.vocabSize of <pyspark.mllib.clustering.LDAModel object at 0x7f959069d210>>
In [48]:
# from operator import itemgetter 
# for topic in topicIndices:
#     text = itemgetter(*topic[0])(vocablist)
#     print "TOPIC"
#     for tag in text:
#         print tag, topic[1][text.index(tag)]

Visualization

In [49]:
topicsRDD = sc.parallelize(topicIndices)
In [50]:
import operator
termsRDD = topicsRDD.map(lambda topic: (zip(operator.itemgetter(*topic[0])(vocablist), topic[1])))
In [51]:
termsRDD.take(25)
Out[51]:
[[(u'snow', 0.06482984277484072),
  (u'ski', 0.045950294558274096),
  (u'mountains', 0.039156100706073844),
  (u'winter', 0.031679928472898536),
  (u'skiing', 0.030337389898223453)],
 [(u'nature', 0.030827512537037274),
  (u'lake', 0.02478979431897514),
  (u'spring', 0.024462219076508293),
  (u'landscape', 0.0224125857946134),
  (u'flowers', 0.02067768513339612)],
 [(u'luzern', 0.036862489880800375),
  (u'switzerland', 0.0338325739317431),
  (u'verbier', 0.02953757773230965),
  (u'zurich', 0.02188718968235265),
  (u'swiss', 0.02123679980747416)],
 [(u'art', 0.03698756198224295),
  (u'zurich', 0.03313016298308508),
  (u'switzerland', 0.03157849816347215),
  (u'easter', 0.025158555831459168),
  (u'family', 0.0223036711631496)],
 [(u'travel', 0.06901255967841895),
  (u'instatravel', 0.037299454043090645),
  (u'europe', 0.034330202644613936),
  (u'travelgram', 0.03368921018190022),
  (u'trip', 0.03304379062370829)],
 [(u'switzerland', 0.034456317666373186),
  (u'goodtimes', 0.03152461878027823),
  (u'zurich', 0.02950826562264097),
  (u'weekend', 0.022231218091606136),
  (u'tb', 0.018888255116828026)],
 [(u'geneva', 0.05857098842284779),
  (u'car', 0.031332107870771786),
  (u'switzerland', 0.0276575550779648),
  (u'lamborghini', 0.02190996776182064),
  (u'ferrari', 0.02052769480440934)],
 [(u'visitswitzerland', 0.03794345847628957),
  (u'vscocam', 0.034333155243237684),
  (u'switzerlandwonderland', 0.03226477690795611),
  (u'vsco', 0.03213684652910808),
  (u'myswitzerland', 0.028912085943440736)],
 [(u'fitness', 0.050251802705119475),
  (u'healthy', 0.02958433988432552),
  (u'sport', 0.029353669207932892),
  (u'workout', 0.028909345997833683),
  (u'motivation', 0.028554499225599026)],
 [(u'suisse', 0.04139109858365735),
  (u'montreux', 0.03704523460681258),
  (u'lacleman', 0.03250671538638928),
  (u'lausanne', 0.02801144795578456),
  (u'switzerland', 0.02688108605034323)],
 [(u'instagood', 0.03645624632489141),
  (u'picoftheday', 0.03076251964509694),
  (u'photooftheday', 0.029703059421505945),
  (u'instadaily', 0.028632883460075267),
  (u'instalike', 0.026769381245962745)],
 [(u'music', 0.040575034449173424),
  (u'party', 0.03921196577945221),
  (u'friends', 0.027131120772706),
  (u'konstanz', 0.024898402612021024),
  (u'bodensee', 0.02318591615707108)],
 [(u'baselworld2016', 0.056018990170552174),
  (u'baselworld', 0.05048503702936801),
  (u'basel', 0.04590056891456354),
  (u'luxury', 0.041513618850625066),
  (u'watches', 0.03597274460743156)],
 [(u'swiss', 0.02934004343949627),
  (u'nofilter', 0.02346935547247407),
  (u'switzerland', 0.021960428794446603),
  (u'selfie', 0.020904185010864756),
  (u'love', 0.019373152419980853)],
 [(u'food', 0.030915742768915375),
  (u'foodporn', 0.03030777683238159),
  (u'day', 0.02158525753702872),
  (u'instafood', 0.020540403766244577),
  (u'yummy', 0.019046245223605044)]]
In [52]:
indexedTermsRDD = termsRDD.zipWithIndex()
In [53]:
termsRDD = indexedTermsRDD.flatMap(lambda term: [(t[0], t[1], term[1]) for t in term[0]])
In [54]:
termDF = termsRDD.toDF(['term', 'probability', 'topicId'])
In [55]:
termDF.take(10)
Out[55]:
[Row(term=u'snow', probability=0.06482984277484072, topicId=0),
 Row(term=u'ski', probability=0.045950294558274096, topicId=0),
 Row(term=u'mountains', probability=0.039156100706073844, topicId=0),
 Row(term=u'winter', probability=0.031679928472898536, topicId=0),
 Row(term=u'skiing', probability=0.030337389898223453, topicId=0),
 Row(term=u'nature', probability=0.030827512537037274, topicId=1),
 Row(term=u'lake', probability=0.02478979431897514, topicId=1),
 Row(term=u'spring', probability=0.024462219076508293, topicId=1),
 Row(term=u'landscape', probability=0.0224125857946134, topicId=1),
 Row(term=u'flowers', probability=0.02067768513339612, topicId=1)]
In [56]:
rawJson = termDF.toJSON().collect()
In [57]:
from IPython.core.display import display, HTML
from IPython.display import Javascript

s = ""
for line in rawJson:
    s += (str(line) +',')
stringJson = s[:-1]
In [58]:
stringJson
Out[58]:
'{"term":"snow","probability":0.06482984277484072,"topicId":0},{"term":"ski","probability":0.045950294558274096,"topicId":0},{"term":"mountains","probability":0.039156100706073844,"topicId":0},{"term":"winter","probability":0.031679928472898536,"topicId":0},{"term":"skiing","probability":0.030337389898223453,"topicId":0},{"term":"nature","probability":0.030827512537037274,"topicId":1},{"term":"lake","probability":0.02478979431897514,"topicId":1},{"term":"spring","probability":0.024462219076508293,"topicId":1},{"term":"landscape","probability":0.0224125857946134,"topicId":1},{"term":"flowers","probability":0.02067768513339612,"topicId":1},{"term":"luzern","probability":0.036862489880800375,"topicId":2},{"term":"switzerland","probability":0.0338325739317431,"topicId":2},{"term":"verbier","probability":0.02953757773230965,"topicId":2},{"term":"zurich","probability":0.02188718968235265,"topicId":2},{"term":"swiss","probability":0.02123679980747416,"topicId":2},{"term":"art","probability":0.03698756198224295,"topicId":3},{"term":"zurich","probability":0.03313016298308508,"topicId":3},{"term":"switzerland","probability":0.03157849816347215,"topicId":3},{"term":"easter","probability":0.025158555831459168,"topicId":3},{"term":"family","probability":0.0223036711631496,"topicId":3},{"term":"travel","probability":0.06901255967841895,"topicId":4},{"term":"instatravel","probability":0.037299454043090645,"topicId":4},{"term":"europe","probability":0.034330202644613936,"topicId":4},{"term":"travelgram","probability":0.03368921018190022,"topicId":4},{"term":"trip","probability":0.03304379062370829,"topicId":4},{"term":"switzerland","probability":0.034456317666373186,"topicId":5},{"term":"goodtimes","probability":0.03152461878027823,"topicId":5},{"term":"zurich","probability":0.02950826562264097,"topicId":5},{"term":"weekend","probability":0.022231218091606136,"topicId":5},{"term":"tb","probability":0.018888255116828026,"topicId":5},{"term":"geneva","probability":0.05857098842284779,"topicId":6},{"term":"car","probability":0.031332107870771786,"topicId":6},{"term":"switzerland","probability":0.0276575550779648,"topicId":6},{"term":"lamborghini","probability":0.02190996776182064,"topicId":6},{"term":"ferrari","probability":0.02052769480440934,"topicId":6},{"term":"visitswitzerland","probability":0.03794345847628957,"topicId":7},{"term":"vscocam","probability":0.034333155243237684,"topicId":7},{"term":"switzerlandwonderland","probability":0.03226477690795611,"topicId":7},{"term":"vsco","probability":0.03213684652910808,"topicId":7},{"term":"myswitzerland","probability":0.028912085943440736,"topicId":7},{"term":"fitness","probability":0.050251802705119475,"topicId":8},{"term":"healthy","probability":0.02958433988432552,"topicId":8},{"term":"sport","probability":0.029353669207932892,"topicId":8},{"term":"workout","probability":0.028909345997833683,"topicId":8},{"term":"motivation","probability":0.028554499225599026,"topicId":8},{"term":"suisse","probability":0.04139109858365735,"topicId":9},{"term":"montreux","probability":0.03704523460681258,"topicId":9},{"term":"lacleman","probability":0.03250671538638928,"topicId":9},{"term":"lausanne","probability":0.02801144795578456,"topicId":9},{"term":"switzerland","probability":0.02688108605034323,"topicId":9},{"term":"instagood","probability":0.03645624632489141,"topicId":10},{"term":"picoftheday","probability":0.03076251964509694,"topicId":10},{"term":"photooftheday","probability":0.029703059421505945,"topicId":10},{"term":"instadaily","probability":0.028632883460075267,"topicId":10},{"term":"instalike","probability":0.026769381245962745,"topicId":10},{"term":"music","probability":0.040575034449173424,"topicId":11},{"term":"party","probability":0.03921196577945221,"topicId":11},{"term":"friends","probability":0.027131120772706,"topicId":11},{"term":"konstanz","probability":0.024898402612021024,"topicId":11},{"term":"bodensee","probability":0.02318591615707108,"topicId":11},{"term":"baselworld2016","probability":0.056018990170552174,"topicId":12},{"term":"baselworld","probability":0.05048503702936801,"topicId":12},{"term":"basel","probability":0.04590056891456354,"topicId":12},{"term":"luxury","probability":0.041513618850625066,"topicId":12},{"term":"watches","probability":0.03597274460743156,"topicId":12},{"term":"swiss","probability":0.02934004343949627,"topicId":13},{"term":"nofilter","probability":0.02346935547247407,"topicId":13},{"term":"switzerland","probability":0.021960428794446603,"topicId":13},{"term":"selfie","probability":0.020904185010864756,"topicId":13},{"term":"love","probability":0.019373152419980853,"topicId":13},{"term":"food","probability":0.030915742768915375,"topicId":14},{"term":"foodporn","probability":0.03030777683238159,"topicId":14},{"term":"day","probability":0.02158525753702872,"topicId":14},{"term":"instafood","probability":0.020540403766244577,"topicId":14},{"term":"yummy","probability":0.019046245223605044,"topicId":14}'
In [59]:
html_code = """
<!DOCTYPE html>
<meta charset="utf-8">
<style>

circle {
  fill: rgb(31, 119, 180);
  fill-opacity: 0.5;
  stroke: rgb(31, 119, 180);
  stroke-width: 1px;
}

.leaf circle {
  fill: #ff7f0e;
  fill-opacity: 1;
}

text {
  font: 14px sans-serif;
}

</style>
<body>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>

<script>

var json = {
 "name": "data",
 "children": [
  {
     "name": "topics",
     "children": [
      %s
     ]
    }
   ]
};

var r = 1500,
    format = d3.format(",d"),
    fill = d3.scale.category20c();

var bubble = d3.layout.pack()
    .sort(null)
    .size([r, r])
    .padding(1.5);

var vis = d3.select("body").append("svg")
    .attr("width", r)
    .attr("height", r)
    .attr("class", "bubble");

  
var node = vis.selectAll("g.node")
    .data(bubble.nodes(classes(json))
    .filter(function(d) { return !d.children; }))
    .enter().append("g")
    .attr("class", "node")
    .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
    color = d3.scale.category20();
  
  node.append("title")
      .text(function(d) { return d.className + ": " + format(d.value); });

  node.append("circle")
      .attr("r", function(d) { return d.r; })
      .style("fill", function(d) {return color(d.topicName);});

var text = node.append("text")
    .attr("text-anchor", "middle")
    .attr("dy", ".3em")
    .text(function(d) { return d.className.substring(0, d.r / 3)});
  
  text.append("tspan")
      .attr("dy", "1.2em")
      .attr("x", 0)
      .text(function(d) {return Math.ceil(d.value * 10000) /10000; });

// Returns a flattened hierarchy containing all leaf nodes under the root.
function classes(root) {
  var classes = [];

  function recurse(term, node) {
    if (node.children) node.children.forEach(function(child) { recurse(node.term, child); });
    else classes.push({topicName: node.topicId, className: node.term, value: node.probability});
  }

  recurse(null, root);
  return {children: classes};
}

</script>""" % stringJson
In [60]:
display(HTML(html_code))