In [1]:
# Mount Google Drive into this Colab session and switch the working
# directory to the project folder so relative paths resolve from there.
import os

from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/sklearn/topic_modelling/imdb')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [2]:
!pip install pyldavis
Requirement already satisfied: pyldavis in /usr/local/lib/python3.6/dist-packages (2.1.2)
Requirement already satisfied: wheel>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.35.1)
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.16.0)
Requirement already satisfied: funcy in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.15)
Requirement already satisfied: pytest in /usr/local/lib/python3.6/dist-packages (from pyldavis) (3.6.4)
Requirement already satisfied: numexpr in /usr/local/lib/python3.6/dist-packages (from pyldavis) (2.7.1)
Requirement already satisfied: jinja2>=2.7.2 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (2.11.2)
Requirement already satisfied: scipy>=0.18.0 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.4.1)
Requirement already satisfied: pandas>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.1.4)
Requirement already satisfied: joblib>=0.8.4 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (0.17.0)
Requirement already satisfied: numpy>=1.9.2 in /usr/local/lib/python3.6/dist-packages (from pyldavis) (1.18.5)
Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (8.6.0)
Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (1.9.0)
Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (1.4.0)
Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (0.7.1)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (1.15.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (50.3.2)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyldavis) (20.2.0)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.7.2->pyldavis) (1.1.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.0->pyldavis) (2018.9)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.0->pyldavis) (2.8.1)
In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
In [4]:
def get_idx2word(_index_from=3):
  """Build an index -> word lookup for the Keras IMDB dataset.

  Keras' `load_data()` shifts every raw word id up by `index_from`
  (default 3) to reserve the low ids for special tokens, so the same
  shift is applied here before inverting the mapping.

  Args:
    _index_from: offset added to each raw word index (must match the
      `index_from` used when loading the data).

  Returns:
    dict mapping integer index -> word string, including the reserved
    special tokens at indices 0-2.
  """
  raw_index = tf.keras.datasets.imdb.get_word_index()
  word2idx = {word: idx + _index_from for word, idx in raw_index.items()}
  # Reserved special tokens that load_data() inserts into each sequence.
  word2idx["<pad>"] = 0
  word2idx["<start>"] = 1
  word2idx["<unk>"] = 2
  return {idx: word for word, idx in word2idx.items()}
In [5]:
N_TOPICS = 10    # number of latent topics to fit
MAX_TERMS = 10   # top terms to display per topic

# Load the IMDB reviews as integer index sequences and decode them back to
# text. Element 0 of each sequence is the <start> token, hence the [1:] slice.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()
idx2word = get_idx2word()
documents = [' '.join([idx2word[idx] for idx in x_train[1:]]) for x_train in X_train] + \
            [' '.join([idx2word[idx] for idx in x_test[1:]]) for x_test in X_test]

# TF-IDF document-term matrix; max_df=.1 drops corpus-specific very frequent
# words and max_features caps the vocabulary at the 5000 strongest terms.
tfidf = TfidfVectorizer(stop_words = 'english',
                        max_df = .1,
                        max_features = 5000)
tfidf_repr = tfidf.fit_transform(documents)

# Batch variational Bayes LDA; evaluate_every=1 logs perplexity each
# iteration so convergence (and early stopping) is visible in the output.
lda = LatentDirichletAllocation(n_components = N_TOPICS,
                                learning_method = 'batch',
                                max_iter = 100,
                                evaluate_every = 1,
                                verbose = 1)
lda.fit(tfidf_repr)

# Hoisted out of the loop: get_feature_names() rebuilds the full vocabulary
# list on every call, so the original (called once per term per topic) did
# O(topics * terms * vocab) work. Fetch it once instead.
feature_names = tfidf.get_feature_names()
for topic_idx, term_vals in enumerate(lda.components_):
  # argsort ascending, then the reversed tail slice yields the indices of
  # the MAX_TERMS highest-weighted terms for this topic.
  message = "Topic #{}: ".format(topic_idx)
  message += " ".join([feature_names[i]
                       for i in term_vals.argsort()[:-MAX_TERMS-1:-1]])
  print(message)
iteration: 1 of max_iter: 100, perplexity: 8939.3527
iteration: 2 of max_iter: 100, perplexity: 8595.3923
iteration: 3 of max_iter: 100, perplexity: 8213.2271
iteration: 4 of max_iter: 100, perplexity: 7957.1807
iteration: 5 of max_iter: 100, perplexity: 7772.2066
iteration: 6 of max_iter: 100, perplexity: 7623.4844
iteration: 7 of max_iter: 100, perplexity: 7500.7866
iteration: 8 of max_iter: 100, perplexity: 7405.2129
iteration: 9 of max_iter: 100, perplexity: 7338.7826
iteration: 10 of max_iter: 100, perplexity: 7292.9838
iteration: 11 of max_iter: 100, perplexity: 7258.4895
iteration: 12 of max_iter: 100, perplexity: 7228.7228
iteration: 13 of max_iter: 100, perplexity: 7200.3117
iteration: 14 of max_iter: 100, perplexity: 7173.6886
iteration: 15 of max_iter: 100, perplexity: 7147.6127
iteration: 16 of max_iter: 100, perplexity: 7123.2638
iteration: 17 of max_iter: 100, perplexity: 7100.2275
iteration: 18 of max_iter: 100, perplexity: 7077.0315
iteration: 19 of max_iter: 100, perplexity: 7057.2736
iteration: 20 of max_iter: 100, perplexity: 7040.4939
iteration: 21 of max_iter: 100, perplexity: 7026.0795
iteration: 22 of max_iter: 100, perplexity: 7014.5144
iteration: 23 of max_iter: 100, perplexity: 7006.6735
iteration: 24 of max_iter: 100, perplexity: 6999.5444
iteration: 25 of max_iter: 100, perplexity: 6993.2719
iteration: 26 of max_iter: 100, perplexity: 6987.2895
iteration: 27 of max_iter: 100, perplexity: 6982.5594
iteration: 28 of max_iter: 100, perplexity: 6978.5212
iteration: 29 of max_iter: 100, perplexity: 6977.3304
iteration: 30 of max_iter: 100, perplexity: 6975.4359
iteration: 31 of max_iter: 100, perplexity: 6972.4135
iteration: 32 of max_iter: 100, perplexity: 6970.6153
iteration: 33 of max_iter: 100, perplexity: 6968.0227
iteration: 34 of max_iter: 100, perplexity: 6967.3125
iteration: 35 of max_iter: 100, perplexity: 6964.9861
iteration: 36 of max_iter: 100, perplexity: 6963.6762
iteration: 37 of max_iter: 100, perplexity: 6962.8917
iteration: 38 of max_iter: 100, perplexity: 6961.9312
iteration: 39 of max_iter: 100, perplexity: 6960.7749
iteration: 40 of max_iter: 100, perplexity: 6960.1940
iteration: 41 of max_iter: 100, perplexity: 6959.4689
iteration: 42 of max_iter: 100, perplexity: 6958.4544
iteration: 43 of max_iter: 100, perplexity: 6957.5381
iteration: 44 of max_iter: 100, perplexity: 6956.7342
iteration: 45 of max_iter: 100, perplexity: 6955.8613
iteration: 46 of max_iter: 100, perplexity: 6954.7920
iteration: 47 of max_iter: 100, perplexity: 6954.0679
iteration: 48 of max_iter: 100, perplexity: 6953.5016
iteration: 49 of max_iter: 100, perplexity: 6953.1949
iteration: 50 of max_iter: 100, perplexity: 6952.6801
iteration: 51 of max_iter: 100, perplexity: 6952.0704
iteration: 52 of max_iter: 100, perplexity: 6950.9139
iteration: 53 of max_iter: 100, perplexity: 6950.6813
iteration: 54 of max_iter: 100, perplexity: 6949.3882
iteration: 55 of max_iter: 100, perplexity: 6948.1241
iteration: 56 of max_iter: 100, perplexity: 6947.3978
iteration: 57 of max_iter: 100, perplexity: 6946.5738
iteration: 58 of max_iter: 100, perplexity: 6945.9589
iteration: 59 of max_iter: 100, perplexity: 6945.6921
iteration: 60 of max_iter: 100, perplexity: 6944.6255
iteration: 61 of max_iter: 100, perplexity: 6943.9718
iteration: 62 of max_iter: 100, perplexity: 6943.0728
iteration: 63 of max_iter: 100, perplexity: 6942.6194
iteration: 64 of max_iter: 100, perplexity: 6942.3690
iteration: 65 of max_iter: 100, perplexity: 6941.0041
iteration: 66 of max_iter: 100, perplexity: 6940.4905
iteration: 67 of max_iter: 100, perplexity: 6939.7616
iteration: 68 of max_iter: 100, perplexity: 6939.4104
iteration: 69 of max_iter: 100, perplexity: 6938.7219
iteration: 70 of max_iter: 100, perplexity: 6937.4327
iteration: 71 of max_iter: 100, perplexity: 6937.9312
iteration: 72 of max_iter: 100, perplexity: 6936.9022
iteration: 73 of max_iter: 100, perplexity: 6936.3936
iteration: 74 of max_iter: 100, perplexity: 6936.1589
iteration: 75 of max_iter: 100, perplexity: 6934.9518
iteration: 76 of max_iter: 100, perplexity: 6934.8222
iteration: 77 of max_iter: 100, perplexity: 6934.2274
iteration: 78 of max_iter: 100, perplexity: 6933.5479
iteration: 79 of max_iter: 100, perplexity: 6933.4949
Topic #0: robin woody hamlet allen match branagh scarlett shakespeare williams soccer
Topic #1: war book feel family beautiful different true documentary music american
Topic #2: eddie murphy cartoon bugs barney santa cartoons doo scooby bunny
Topic #3: role plays wife john action woman played murder performance police
Topic #4: sci fi space batman alien trek planet star robot lugosi
Topic #5: series episode season episodes disney dvd animation tv remember kids
Topic #6: columbo khan holmes bollywood freeman ali salman kapoor che betty
Topic #7: comedy music role performance wonderful fun musical excellent songs loved
Topic #8: worst horror minutes awful stupid waste terrible guy effects money
Topic #9: martial arts fu kung game kong chan hong ninja fight
In [6]:
import pyLDAvis
import pyLDAvis.sklearn
In [7]:
# Build the interactive pyLDAvis panel from the fitted model and the
# document-term matrix, then export it as a standalone HTML file.
panel = pyLDAvis.sklearn.prepare(lda, tfidf_repr, tfidf)
pyLDAvis.save_html(panel, 'lda.html')