# Load the corpus: one plain-text document per row, no header row.
# Column types are inferred from the first 100 lines (see log below).
import graphlab
graphlab.canvas.set_target("ipynb")
sf = graphlab.SFrame.read_csv("/Users/datalab/bigdata/w15",
header=False)
This non-commercial license of GraphLab Create for academic use is assigned to wangchengjun@nju.edu.cn and will expire on March 14, 2019.
[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1525423344.log
Finished parsing file /Users/datalab/bigdata/w15
Parsing completed. Parsed 100 lines in 0.693838 secs.
------------------------------------------------------ Inferred types from first 100 line(s) of file as column_type_hints=[str] If parsing fails due to incorrect types, you can correct the inferred type list above and pass it to read_csv in the column_type_hints argument ------------------------------------------------------
Read 12278 lines. Lines per second: 11539.9
Finished parsing file /Users/datalab/bigdata/w15
Parsing completed. Parsed 72269 lines in 2.39912 secs.
sf
X1 |
---|
aynrand born and educated in russia rand migrated ... |
asphalt in american english asphalt or ... |
actinopterygii the actinopterygii consti ... |
altaiclanguages these language families share ... |
argon the name argon is derived from the greek ... |
augustderleth a 1938 guggenheim fellow der ... |
amateur amateurism can be seen in both a negative ... |
assemblyline an assembly line is a manufacturing ... |
astronomicalunit an astronomical unit ... |
abbess an abbess latin abbatissa feminine form ... |
dir(sf['X1'])
['_SArray__check_min_observations', '_SArray__construct_ctr', '__abs__', '__add__', '__and__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__div__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__get_content_identifier__', '__getattribute__', '__getitem__', '__gt__', '__has_size__', '__hash__', '__init__', '__is_materialized__', '__iter__', '__le__', '__len__', '__lt__', '__materialize__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__proxy__', '__radd__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__rpow__', '__rsub__', '__rtruediv__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '_count_ngrams', '_count_words', '_getitem_cache', '_save_as_text', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'astype', 'clip', 'clip_lower', 'clip_upper', 'contains', 'cumulative_max', 'cumulative_mean', 'cumulative_min', 'cumulative_std', 'cumulative_sum', 'cumulative_var', 'date_range', 'datetime_to_str', 'dict_has_all_keys', 'dict_has_any_keys', 'dict_keys', 'dict_trim_by_keys', 'dict_trim_by_values', 'dict_values', 'dropna', 'dtype', 'fillna', 'filter', 'from_avro', 'from_const', 'from_sequence', 'hash', 'head', 'is_in', 'is_materialized', 'item_length', 'materialize', 'max', 'mean', 'min', 'nnz', 'num_missing', 'pixel_array_to_image', 'random_integers', 'random_split', 'rolling_count', 'rolling_max', 'rolling_mean', 'rolling_min', 'rolling_stdv', 'rolling_sum', 'rolling_var', 'sample', 'save', 'shape', 'show', 'size', 'sketch_summary', 'sort', 'split_datetime', 'std', 'str_to_datetime', 'subslice', 'sum', 'tail', 'to_numpy', 'topk_index', 'unique', 'unpack', 'var', 'vector_slice', 'where']
# Bag-of-words representation: an SArray of {word: count} dicts,
# one dict per document in column X1.
bow = sf['X1']._count_words()
type(sf['X1'])
graphlab.data_structures.sarray.SArray
type(bow)
graphlab.data_structures.sarray.SArray
bow.dict_has_any_keys(['limited'])
dtype: int Rows: 72269 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... ]
bow.dict_values()[0][:20]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1]
sf
X1 |
---|
aynrand born and educated in russia rand migrated ... |
asphalt in american english asphalt or ... |
actinopterygii the actinopterygii consti ... |
altaiclanguages these language families share ... |
argon the name argon is derived from the greek ... |
augustderleth a 1938 guggenheim fellow der ... |
amateur amateurism can be seen in both a negative ... |
assemblyline an assembly line is a manufacturing ... |
astronomicalunit an astronomical unit ... |
abbess an abbess latin abbatissa feminine form ... |
# Attach the bag-of-words dicts as a new 'bow' column alongside the text.
sf['bow'] = bow
sf
X1 | bow |
---|---|
aynrand born and educated in russia rand migrated ... |
{'limited': 3, 'writings': 2, ... |
asphalt in american english asphalt or ... |
{'all': 1, 'accadian': 1, 'similarity': 1, ... |
actinopterygii the actinopterygii consti ... |
{'andreolepis': 1, 'all': 1, 'evolutionary': 2, ... |
altaiclanguages these language families share ... |
{'sergei': 3, 'all': 6, 'todays': 1, 'chinese': ... |
argon the name argon is derived from the greek ... |
{'limited': 1, 'embolism': 1, ... |
augustderleth a 1938 guggenheim fellow der ... |
{'evelyn': 1, 'detective': 4, ... |
amateur amateurism can be seen in both a negative ... |
{'since': 1, 'subpar': 1, 'lack': 2, 'valuable' ... |
assemblyline an assembly line is a manufacturing ... |
{'all': 3, 'concept': 6, 'consider': 1, 'chine ... |
astronomicalunit an astronomical unit ... |
{'precise': 1, 'all': 2, 'chinese': 1, 'suns': 1, ... |
abbess an abbess latin abbatissa feminine form ... |
{'kildares': 1, 'they': 4, 'founder': 1, ... |
type(sf['bow'])
graphlab.data_structures.sarray.SArray
len(sf['bow'])
72269
sf['bow'][0].items()[:5]
[('limited', 3), ('writings', 2), ('personally', 1), ('four', 1), ('controversial', 1)]
# TF-IDF weighting computed directly from the raw text column:
# an SArray of {word: tf-idf score} dicts, stored as a 'tfidf' column.
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['X1'])
sf
X1 | bow | tfidf |
---|---|---|
aynrand born and educated in russia rand migrated ... |
{'limited': 3, 'writings': 2, ... |
{'limited': 10.04705669672047, ... |
asphalt in american english asphalt or ... |
{'all': 1, 'accadian': 1, 'similarity': 1, ... |
{'all': 1.3891905239989626, ... |
actinopterygii the actinopterygii consti ... |
{'andreolepis': 1, 'all': 1, 'evolutionary': 2, ... |
{'andreolepis': 11.188150547181156, ... |
altaiclanguages these language families share ... |
{'sergei': 3, 'all': 6, 'todays': 1, 'chinese': ... |
{'sergei': 20.031873121992916, ... |
argon the name argon is derived from the greek ... |
{'limited': 1, 'embolism': 1, ... |
{'limited': 3.3490188989068232, ... |
augustderleth a 1938 guggenheim fellow der ... |
{'evelyn': 1, 'detective': 4, ... |
{'evelyn': 6.7937013925087175, ... |
amateur amateurism can be seen in both a negative ... |
{'since': 1, 'subpar': 1, 'lack': 2, 'valuable' ... |
{'since': 1.8775124538896095, ... |
assemblyline an assembly line is a manufacturing ... |
{'all': 3, 'concept': 6, 'consider': 1, 'chine ... |
{'all': 4.167571571996888, ... |
astronomicalunit an astronomical unit ... |
{'precise': 1, 'all': 2, 'chinese': 1, 'suns': 1, ... |
{'precise': 5.491057060675752, 'a ... |
abbess an abbess latin abbatissa feminine form ... |
{'kildares': 1, 'they': 4, 'founder': 1, ... |
{'kildares': 11.188150547181156, ... |
sf['tfidf'][0].items()[:5]
[('limited', 10.04705669672047), ('writings', 9.76010421134325), ('personally', 5.001941923280662), ('four', 2.1272386886969024), ('controversial', 4.375805453003677)]
# Prune the vocabulary before topic modelling: keep only words that
# occur at least twice within a document, then drop English stopwords
# (exclude=True removes the listed keys instead of keeping them).
docs = (sf['bow']
        .dict_trim_by_values(2)
        .dict_trim_by_keys(graphlab.text_analytics.stopwords(),
                           exclude=True))
help(graphlab.topic_model.create)
Help on function create in module graphlab.toolkits.topic_model.topic_model: create(dataset, num_topics=10, initial_topics=None, alpha=None, beta=0.1, num_iterations=10, num_burnin=5, associations=None, verbose=False, print_interval=10, validation_set=None, method='auto') Create a topic model from the given data set. A topic model assumes each document is a mixture of a set of topics, where for each topic some words are more likely than others. One statistical approach to do this is called a "topic model". This method learns a topic model for the given document collection. Parameters ---------- dataset : SArray of type dict or SFrame with a single column of type dict A bag of words representation of a document corpus. Each element is a dictionary representing a single document, where the keys are words and the values are the number of times that word occurs in that document. num_topics : int, optional The number of topics to learn. initial_topics : SFrame, optional An SFrame with a column of unique words representing the vocabulary and a column of dense vectors representing probability of that word given each topic. When provided, these values are used to initialize the algorithm. alpha : float, optional Hyperparameter that controls the diversity of topics in a document. Smaller values encourage fewer topics per document. Provided value must be positive. Default value is 50/num_topics. beta : float, optional Hyperparameter that controls the diversity of words in a topic. Smaller values encourage fewer words per topic. Provided value must be positive. num_iterations : int, optional The number of iterations to perform. num_burnin : int, optional The number of iterations to perform when inferring the topics for documents at prediction time. verbose : bool, optional When True, print most probable words for each topic while printing progress. print_interval : int, optional The number of iterations to wait between progress reports. 
associations : SFrame, optional An SFrame with two columns named "word" and "topic" containing words and the topic id that the word should be associated with. These words are not considered during learning. validation_set : SArray of type dict or SFrame with a single column A bag of words representation of a document corpus, similar to the format required for `dataset`. This will be used to monitor model performance during training. Each document in the provided validation set is randomly split: the first portion is used estimate which topic each document belongs to, and the second portion is used to estimate the model's performance at predicting the unseen words in the test data. method : {'cgs', 'alias'}, optional The algorithm used for learning the model. - *cgs:* Collapsed Gibbs sampling - *alias:* AliasLDA method. Returns ------- out : TopicModel A fitted topic model. This can be used with :py:func:`~TopicModel.get_topics()` and :py:func:`~TopicModel.predict()`. While fitting is in progress, several metrics are shown, including: +------------------+---------------------------------------------------+ | Field | Description | +==================+===================================================+ | Elapsed Time | The number of elapsed seconds. | +------------------+---------------------------------------------------+ | Tokens/second | The number of unique words processed per second | +------------------+---------------------------------------------------+ | Est. Perplexity | An estimate of the model's ability to model the | | | training data. See the documentation on evaluate. | +------------------+---------------------------------------------------+ See Also -------- TopicModel, TopicModel.get_topics, TopicModel.predict, graphlab.SArray.dict_trim_by_keys, TopicModel.evaluate References ---------- - `Wikipedia - Latent Dirichlet allocation <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_ - Alias method: Li, A. et al. 
(2014) `Reducing the Sampling Complexity of Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_. KDD 2014. Examples -------- The following example includes an SArray of documents, where each element represents a document in "bag of words" representation -- a dictionary with word keys and whose values are the number of times that word occurred in the document: >>> docs = graphlab.SArray('https://static.turi.com/datasets/nytimes') Once in this form, it is straightforward to learn a topic model. >>> m = graphlab.topic_model.create(docs) It is also easy to create a new topic model from an old one -- whether it was created using GraphLab Create or another package. >>> m2 = graphlab.topic_model.create(docs, initial_topics=m['topics']) To manually fix several words to always be assigned to a topic, use the `associations` argument. The following will ensure that topic 0 has the most probability for each of the provided words: >>> from graphlab import SFrame >>> associations = SFrame({'word':['hurricane', 'wind', 'storm'], 'topic': [0, 0, 0]}) >>> m = graphlab.topic_model.create(docs, associations=associations) More advanced usage allows you to control aspects of the model and the learning method. >>> import graphlab as gl >>> m = gl.topic_model.create(docs, num_topics=20, # number of topics num_iterations=10, # algorithm parameters alpha=.01, beta=.1) # hyperparameters To evaluate the model's ability to generalize, we can create a train/test split where a portion of the words in each document are held out from training. >>> train, test = gl.text_analytics.random_split(.8) >>> m = gl.topic_model.create(train) >>> results = m.evaluate(test) >>> print results['perplexity']
help(graphlab.text_analytics.random_split)
Help on function random_split in module graphlab.toolkits.text_analytics._util: random_split(dataset, prob=0.5) Utility for performing a random split for text data that is already in bag-of-words format. For each (word, count) pair in a particular element, the counts are uniformly partitioned in either a training set or a test set. Parameters ---------- dataset : SArray of type dict, SFrame with columns of type dict A data set in bag-of-words format. prob : float, optional Probability for sampling a word to be placed in the test set. Returns ------- train, test : SArray Two data sets in bag-of-words format, where the combined counts are equal to the counts in the original data set. Examples -------- >>> docs = graphlab.SArray([{'are':5, 'you':3, 'not': 1, 'entertained':10}]) >>> train, test = graphlab.text_analytics.random_split(docs) >>> print train [{'not': 1.0, 'you': 3.0, 'are': 3.0, 'entertained': 7.0}] >>> print test [{'are': 2.0, 'entertained': 3.0}]
# NOTE(review): per random_split's docstring (shown above), `prob` is the
# probability that a word count is placed in the *test* set, so .8 holds
# out ~80% of the tokens for testing and trains on ~20% -- confirm this
# is intended; a conventional 80/20 train/test split would pass .2 here.
train, test = graphlab.text_analytics.random_split(docs, .8)
# Fit a 100-topic model with collapsed Gibbs sampling (method='auto').
# alpha=None uses the default 50/num_topics (per the help text above);
# smaller alpha -> fewer topics per document, smaller beta -> fewer
# words per topic.
m = graphlab.topic_model.create(train,
num_topics=100, # number of topics
num_iterations=100, # algorithm parameters
alpha=None, beta=.1) # hyperparameters
Learning a topic model
Number of documents 72269
Vocabulary size 107782
Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time | Tokens/Second | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10 | 2.03s | 5.015e+06 | 0 |
| 20 | 3.86s | 5.37519e+06 | 0 |
| 30 | 5.64s | 5.12154e+06 | 0 |
| 40 | 7.49s | 5.30373e+06 | 0 |
| 50 | 9.29s | 5.00983e+06 | 0 |
| 60 | 11.22s | 5.00837e+06 | 0 |
| 70 | 13.12s | 4.94135e+06 | 0 |
| 80 | 15.02s | 4.60452e+06 | 0 |
| 90 | 16.96s | 4.7366e+06 | 0 |
| 100 | 18.93s | 4.84249e+06 | 0 |
+-----------+---------------+----------------+-----------------+
# Evaluate on the held-out word counts; perplexity measures how well
# the model predicts unseen words (lower is better).
results = m.evaluate(test)
print results['perplexity']
4527.13385741
m
Class : TopicModel Schema ------ Vocabulary Size : 171005 Settings -------- Number of Topics : 10 alpha : 5.0 beta : 0.1 Iterations : 10 Training time : 3.6698 Verbose : False Accessible fields : m['topics'] : An SFrame containing the topics. m['vocabulary'] : An SArray containing the words in the vocabulary. Useful methods : m.get_topics() : Get the most probable words per topic. m.predict(new_docs) : Make predictions for new documents.
m.get_topics()
topic | word | score |
---|---|---|
0 | time | 0.00422209137382 |
0 | 2001 | 0.00201402456976 |
0 | worked | 0.00181329122394 |
0 | gonzales | 0.00174638010866 |
0 | stunt | 0.00167946899339 |
1 | god | 0.0134496294644 |
1 | century | 0.0111281287498 |
1 | temple | 0.00998245307251 |
1 | people | 0.00690721835975 |
1 | great | 0.00648512732074 |
help(m.get_topics)
Help on method get_topics in module graphlab.toolkits.topic_model.topic_model: get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0, output_type='topic_probabilities') method of graphlab.toolkits.topic_model.topic_model.TopicModel instance Get the words associated with a given topic. The score column is the probability of choosing that word given that you have chosen a particular topic. Parameters ---------- topic_ids : list of int, optional The topics to retrieve words. Topic ids are zero-based. Throws an error if greater than or equal to m['num_topics'], or if the requested topic name is not present. num_words : int, optional The number of words to show. cdf_cutoff : float, optional Allows one to only show the most probable words whose cumulative probability is below this cutoff. For example if there exist three words where .. math:: p(word_1 | topic_k) = .1 p(word_2 | topic_k) = .2 p(word_3 | topic_k) = .05 then setting :math:`cdf_{cutoff}=.3` would return only :math:`word_1` and :math:`word_2` since :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}` output_type : {'topic_probabilities' | 'topic_words'}, optional Determine the type of desired output. See below. Returns ------- out : SFrame If output_type is 'topic_probabilities', then the returned value is an SFrame with a column of words ranked by a column of scores for each topic. Otherwise, the returned value is a SArray where each element is a list of the most probable words for each topic. Examples -------- Get the highest ranked words for all topics. 
>>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text') >>> m = graphlab.topic_model.create(docs, num_iterations=50) >>> m.get_topics() +-------+----------+-----------------+ | topic | word | score | +-------+----------+-----------------+ | 0 | cell | 0.028974400831 | | 0 | input | 0.0259470208503 | | 0 | image | 0.0215721599763 | | 0 | visual | 0.0173635081992 | | 0 | object | 0.0172447874156 | | 1 | function | 0.0482834508265 | | 1 | input | 0.0456270024091 | | 1 | point | 0.0302662839454 | | 1 | result | 0.0239474934631 | | 1 | problem | 0.0231750116011 | | ... | ... | ... | +-------+----------+-----------------+ Get the highest ranked words for topics 0 and 1 and show 15 words per topic. >>> m.get_topics([0, 1], num_words=15) +-------+----------+------------------+ | topic | word | score | +-------+----------+------------------+ | 0 | cell | 0.028974400831 | | 0 | input | 0.0259470208503 | | 0 | image | 0.0215721599763 | | 0 | visual | 0.0173635081992 | | 0 | object | 0.0172447874156 | | 0 | response | 0.0139740298286 | | 0 | layer | 0.0122585145062 | | 0 | features | 0.0115343177265 | | 0 | feature | 0.0103530459301 | | 0 | spatial | 0.00823387994361 | | ... | ... | ... | +-------+----------+------------------+ If one wants to instead just get the top words per topic, one may change the format of the output as follows. >>> topics = m.get_topics(output_type='topic_words') dtype: list Rows: 10 [['cell', 'image', 'input', 'object', 'visual'], ['algorithm', 'data', 'learning', 'method', 'set'], ['function', 'input', 'point', 'problem', 'result'], ['model', 'output', 'pattern', 'set', 'unit'], ['action', 'learning', 'net', 'problem', 'system'], ['error', 'function', 'network', 'parameter', 'weight'], ['information', 'level', 'neural', 'threshold', 'weight'], ['control', 'field', 'model', 'network', 'neuron'], ['hidden', 'layer', 'system', 'training', 'vector'], ['component', 'distribution', 'local', 'model', 'optimal']]
# List the 10 most probable words of every topic: unstack the
# (topic, word, score) rows into one {word: score} dict per topic,
# then keep only the word keys.
topics = m.get_topics(num_words=10).unstack(['word','score'], \
new_column_name='topic_words')['topic_words'].apply(lambda x: x.keys())
for topic in topics:
print topic
['goldstone', '1990', 'school', 'group', 'district', 'list', 'arnold', 'glass', 'hercules', 'time'] ['foundation', 'full', 'bar', 'charles', 'trees', 'research', 'men', 'small', '2008', 'walk'] ['water', 'plant', 'made', 'food', 'traditional', 'small', 'iron', 'popular', 'found', 'called'] ['sault', 'wisconsin', 'lewis', 'percent', 'number', 'sierra', 'call', 'time', 'campbell', '2008'] ['jones', 'bar', 'commission', '1966', '1971', 'arbroath', 'engineering', 'represented', 'served', 'pangasinan'] ['states', 'bars', 'pcbs', 'start0', 'high', 'visitors', 'part', 'including', 'plotdata', 'sharp'] ['school', 'made', 'group', 'singapore', 'malaysian', 'years', 'malaysia', '2007', 'association', 'left'] ['division', 'force', 'army', 'german', 'men', 'general', 'forces', 'battle', 'military', 'war'] ['miami', 'gonzales', 'due', 'height', 'greek', '2001', 'time', 'stunt', 'worked', 'ha'] ['city', 'francisco', 'san', 'chicago', 'angeles', 'los', 'california', 'york', 'time', 'served'] ['foundation', 'lankan', 'scales', 'sri', 'fitzgerald', 'due', 'rinpoche', 'time', 'medicine', 'lanka'] ['album', 'released', 'song', 'recorded', 'tour', 'band', 'single', 'music', 'records', 'songs'] ['building', 'city', 'north', 'built', 'area', 'west', 'park', 'town', 'street', 'road'] ['swedish', 'danish', 'made', 'finnish', 'denmark', 'finland', 'norwegian', 'sweden', 'time', 'norway'] ['lessons', 'feet', 'lee', 'bell', 'national', '2002', 'society', 'part', 'originally', 'williams'] ['championship', 'champion', 'title', 'wrestling', 'won', 'team', 'world', 'ring', 'round', 'match'] ['operations', 'training', 'united', 'force', 'group', 'aircraft', 'base', 'flight', 'air', 'war'] ['europe', 'school', 'club', 'phillip', 'alan', 'roberts', 'phillips', 'valid', 'beth', 'springfield'] ['division', 'set', 'skanderbeg', 'university', 'member', 'street', 'college', 'cloud', 'fleetwood', 'red'] ['1991', 'city', 'major', 'morley', 'cj', 'village', 'pipe', 'child', 'davis', '2010'] ['tara', 
'cameras', 'award', 'illinois', 'lens', 'camera', 'master', '1936', 'part', 'red'] ['house', 'dudley', 'award', 'mcphee', 'part', '5', 'long', 'brunswick', 'babylon', 'centauri'] ['television', 'network', 'show', 'tv', 'broadcast', 'station', 'radio', 'news', 'local', 'channel'] ['head', '1996', 'national', 'point', 'wilson', 'years', 'bernard', 'points', '2007', 'hat'] ['school', 'made', 'brothers', 'clark', 'wright', 'club', 'rico', 'served', 'puerto', '2010'] ['group', 'wallace', 'post', 'company', 'tradition', 'place', 'members', 'model', 'john', 'korean'] ['naval', 'island', 'ships', 'japanese', 'coast', 'sea', 'islands', 'navy', 'fleet', 'ship'] ['united', 'people', 'german', 'national', 'government', 'political', 'states', 'union', 'state', 'war'] ['salford', 'head', 'orlando', 'long', 'youth', 'born', 'street', 'manchester', 'listed', 'call'] ['dna', 'cells', 'proteins', 'site', 'cell', 'enzyme', 'protein', 'acid', 'called', 'structure'] ['canada', 'toronto', 'ontario', 'iowa', 'university', 'canadian', 'hospital', 'health', 'dr', 'medical'] ['blue', 'named', 'gmina', 'wisconsin', '1979', '1997', 'country', 'broken', 'wayne', 'martin'] ['copper', 'mining', 'oil', 'gold', 'company', 'gas', 'mine', 'mines', 'coal', 'part'] ['des', 'la', 'paris', 'louis', 'de', 'french', 'france', 'le', 'jean', 'du'] ['hong', 'korea', 'chinese', 'modern', 'japanese', 'dynasty', 'china', 'kong', 'japan', 'emperor'] ['mr', 'smith', 'smiths', 'years', 'project', 'famous', 'state', 'london', 'allen', 'plates'] ['robert', 'regional', 'award', 'director', 'connecticut', 'wood', 'place', 'international', 'burns', 'university'] ['brazil', 'city', 'portuguese', 'mexico', 'la', 'de', 'spanish', 'el', 'spain', 'italian'] ['great', 'filipino', 'marcos', 'philippines', 'philippine', 'including', 'manila', 'college', 'post', 'university'] ['simon', 'stores', 'rights', 'british', '2000', '2001', 'ghostbusters', 'closed', 'worked', 'store'] ['named', 'family', '1981', 'rose', 'cat', 'born', 
'grid', 'rangers', 'beaver', 'george'] ['ukraine', 'ukrainian', 'moscow', 'war', 'late', 'part', 'polish', 'russian', 'austrian', 'poland'] ['made', 'working', 'michigan', 'act', 'york', 'time', 'half', 'walker', 'soft', 'bond'] ['town', 'khmer', '1995', 'rescue', 'unknown', 'national', 'year', 'joined', 'rouge', 'williams'] ['engine', 'built', 'engines', 'models', 'car', 'power', 'high', 'production', 'design', 'model'] ['pakistan', 'khan', 'district', 'preov', 'region', 'india', 'indian', 'bystrica', 'singh', 'koice'] ['king', 'great', 'ancient', 'century', 'god', 'people', 'jesus', 'found', 'temple', 'called'] ['cornish', 'van', 'village', 'cornwall', 'cats', 'harrison', 'part', '2008', '2009', '2010'] ['200', 'university', 'miller', 'national', 'southern', 'british', 'years', 'professor', '2009', 'originally'] ['irish', 'parade', 'northern', 'dublin', 'cork', 'county', '2006', 'ireland', 'senior', 'local'] ['television', 'theatre', 'show', 'appeared', 'movie', 'production', 'role', 'series', 'films', 'film'] ['zealand', 'cricket', 'australia', 'england', 'played', 'made', 'wales', 'australian', 'day', 'south'] ['shown', 'group', 'groups', 'number', 'site', 'society', '2', 'members', 'action', 'jon'] ['made', 'end', 'began', 'year', 'years', 'early', 'including', 'time', 'continued', 'called'] ['named', 'america', 'michigan', 'top', 'malta', 'hayes', 'design', 'maltese', 'john', 'side'] ['jones', 'school', 'carter', 'journal', 'st', 'member', 'pa', 'college', 'years', 'crawford'] ['study', 'disease', 'research', 'patients', 'health', 'blood', 'treatment', 'found', 'children', 'effects'] ['1984', 'dean', 'miss', 'mildred', 'location', '2005', 'claimed', 'position', 'dr', 'mode'] ['october', 'march', 'september', 'june', '1', '2', '2007', '2008', '2009', '2010'] ['lao', 'weld', 'torture', 'forum', 'club', 'years', 'jordan', '4', 'miranda', 'orange'] ['star', 'center', 'launch', 'sun', 'space', 'mission', 'moon', 'solar', 'time', 'earth'] ['king', 'england', 
'family', 'sir', 'married', 'son', 'william', 'london', 'john', 'died'] ['town', 'city', 'named', 'episode', '1994', 'tree', '1974', 'carroll', 'group', '2009'] ['government', 'general', 'member', 'elected', 'state', 'election', 'council', 'president', 'party', 'minister'] ['story', 'episode', 'released', 'series', 'character', 'player', 'game', 'version', 'characters', 'original'] ['management', 'financial', 'business', 'corporation', 'company', 'million', 'companies', 'market', 'sold', 'bank'] ['life', 'death', 'love', 'family', 'time', 'father', 'back', 'mother', 'tells', 'man'] ['championship', 'won', 'series', 'car', 'team', 'points', 'time', 'race', 'grand', 'racing'] ['major', 'dance', 'opera', 'work', 'orchestra', 'festival', 'music', 'performed', 'works', 'musical'] ['city', 'famous', 'minnesota', 'met', 'minister', 'martin', 'served', 'women', 'guide', 'engineer'] ['service', 'evans', 'fire', 'university', 'stamp', 'stamps', '2007', 'duck', 'served', 'postal'] ['florida', 'family', 'moved', 'hurricane', 'damage', 'tropical', 'gang', 'storm', 'texas', 'york'] ['case', 'united', 'court', 'legal', 'states', 'state', 'members', 'act', 'police', 'law'] ['nfl', 'goal', 'season', 'football', 'bowl', 'field', 'game', 'yards', 'pass', 'quarter'] ['city', 'german', 'west', 'von', 'munich', 'mark', 'berlin', 'germany', 'east', 'syracuse'] ['life', 'theory', 'people', 'work', 'research', 'society', 'human', 'social', 'world', 'history'] ['arts', 'art', 'house', 'museum', 'work', 'style', 'collection', 'design', 'artists', 'works'] ['states', 'united', 'washington', 'virginia', 'county', 'american', 'black', 'york', 'white', 'war'] ['level', 'union', 'cemetery', 'professor', 'hammond', 'kitty', 'porky', 'youth', 'university', 'stage'] ['living', 'family', 'people', '18', 'age', 'median', 'years', 'income', 'average', 'population'] ['league', 'played', 'cup', 'club', 'season', 'football', 'game', 'won', 'games', 'team'] ['olympic', 'won', 'gold', 'events', 'held', 
'championships', 'world', 'international', 'medal', 'event'] ['species', 'family', 'long', 'order', 'large', 'black', 'small', 'found', 'white', 'birds'] ['mountain', 'valley', 'north', 'creek', 'area', 'park', 'lake', 'water', 'river', 'south'] ['israel', 'jews', 'palestinian', 'jewish', 'romanian', 'camp', 'israeli', 'years', 'palestine', '1939'] ['catholic', 'christian', 'saint', 'century', 'parish', 'st', 'roman', 'council', 'church', 'bishop'] ['horses', 'dates', '1994', 'family', 'horse', 'link', 'year', 'orange', 'called', 'preferences'] ['school', 'students', 'university', 'research', 'high', 'program', 'college', 'year', 'schools', 'education'] ['garden', 'form', '1981', 'crow', 'point', 'ii', 'middle', 'japan', 'nerman', 'cotton'] ['iran', 'ali', 'muslim', 'al', 'years', 'arab', 'islamic', 'muhammad', 'muslims', 'islam'] ['body', 'city', 'de', 'junior', 'beer', 'kent', 'year', 'east', 'open', 'brewery'] ['life', 'work', 'books', 'story', 'writing', 'magazine', 'book', 'stories', 'published', 'wrote'] ['sound', 'register', 'club', 'house', 'hotel', 'county', 'historic', 'listed', 'home', 'properties'] ['service', 'route', 'lines', 'station', 'london', 'trains', 'services', 'railway', 'line', 'class'] ['school', 'fair', 'rogers', 'dallas', 'work', 'andrews', 'time', 'eugene', 'sidney', 'left'] ['development', 'government', 'energy', 'countries', 'system', 'trade', 'states', 'economic', 'world', 'national'] ['letters', 'language', 'die', 'der', 'languages', 'written', 'names', 'words', 'english', 'word'] ['information', 'code', 'users', 'technology', 'system', 'computer', 'systems', 'internet', 'data', 'software'] ['case', 'function', 'set', 'form', 'light', 'energy', 'number', 'surface', '1', 'method'] ['town', 'city', 'reverse', 'people', 'coins', 'list', '1933', 'green', 'date', 'coin']
m['vocabulary']
dtype: str Rows: 171005 ['duke', 'studies', 'journal', 'chris', 'research', 'matthew', 'crisis', 'financial', 'paul', '1987', 'reagan', 'traditional', 'rightwing', 'nominee', 'libertarianism', 'cato', 'chief', 'smith', 'line', 'south', 'nick', '1999', 'documentary', 'animated', 'shows', 'references', 'commentator', 'powerful', 'ethics', 'rush', 'neil', 'lives', 'cited', 'produced', 'night', 'originality', 'interest', '2007', 'individual', 'authors', 'admirer', 'married', 'club', 'library', 'essays', 'recent', '2009', 'burns', 'inspiration', 'artist', 'women', 'early', 'barbara', 'organized', 'gave', 'referred', 'company', 'personalist', 'criticism', 'john', 'reviewers', 'language', 'understanding', 'writes', 'fewer', 'attention', 'positive', 'masterful', 'review', 'times', 'critic', 'praise', 'theory', 'randian', 'importance', 'calling', 'nonfiction', 'academics', 'kant', 'philosophers', 'italian', 'remarked', 'wife', 'house', 'subject', 'scholarly', 'edward', 'system', 'influence', 'acknowledged', '100', 'branden', 'criticized', 'sacrificing', 'exist', 'selfinterest', 'rational', 'communism', 'journals', 'copies', ... ]
m['topics']
topic_probabilities | vocabulary |
---|---|
[6.69111152745e-06, 3.01493599291e-06, ... |
affair |
[6.69111152745e-06, 0.000515554054787, ... |
writings |
[6.69111152745e-06, 3.01493599291e-06, ... |
collectivism |
[6.69111152745e-06, 3.01493599291e-06, ... |
rosenbaum |
[6.69111152745e-06, 3.01493599291e-06, ... |
reviewers |
[0.000140513342076, 3.01493599291e-06, ... |
rest |
[6.69111152745e-06, 3.01493599291e-06, ... |
years |
[6.69111152745e-06, 3.01493599291e-06, ... |
produced |
[6.69111152745e-06, 0.000123612375709, ... |
held |
[6.69111152745e-06, 3.01493599291e-06, ... |
including |
def print_topics(m, num_words=5):
    """Print the top words of every topic in a fitted topic model.

    Parameters
    ----------
    m : graphlab topic model
        A trained model exposing ``get_topics`` (returns an SFrame of
        topic / word / score rows).
    num_words : int, optional
        How many top words to show per topic. Defaults to 5, matching
        the original hard-coded behavior.
    """
    topics = m.get_topics(num_words=num_words)
    # Pivot (word, score) pairs into a single {word: score} dict per topic.
    topics = topics.unstack(['word', 'score'], new_column_name='topic_words')['topic_words']
    # Keep only the words; the scores are discarded for display.
    topics = topics.apply(lambda x: x.keys())
    for topic in topics:
        # print() with a single argument is valid in both Python 2 and 3.
        print(topic)
# Show the top words of each topic for the first model.
print_topics(m)
['1990', 'list', 'group', 'school', 'district'] ['small', '2008', 'charles', 'bar', 'full'] ['food', 'water', 'made', 'called', 'iron'] ['campbell', 'lewis', '2008', 'wisconsin', 'time'] ['represented', 'arbroath', 'bar', '1971', 'served'] ['high', 'sharp', 'start0', 'part', 'visitors'] ['school', 'group', '2007', 'association', 'singapore'] ['battle', 'division', 'war', 'forces', 'army'] ['gonzales', 'worked', '2001', 'time', 'stunt'] ['angeles', 'city', 'california', 'san', 'los'] ['lanka', 'rinpoche', 'sri', 'fitzgerald', 'time'] ['album', 'band', 'song', 'music', 'released'] ['town', 'city', 'street', 'road', 'area'] ['swedish', 'norwegian', 'sweden', 'made', 'time'] ['society', 'national', 'part', 'lee', 'bell'] ['title', 'world', 'championship', 'match', 'team'] ['force', 'aircraft', 'base', 'flight', 'air'] ['club', 'phillip', 'school', 'beth', 'roberts'] ['member', 'university', 'set', 'college', 'red'] ['davis', 'city', '2010', 'morley', 'village'] ['part', 'camera', 'master', 'red', 'award'] ['part', 'brunswick', '5', 'dudley', 'award'] ['news', 'station', 'radio', 'network', 'show'] ['head', 'years', 'hat', 'national', 'wilson'] ['puerto', 'wright', 'brothers', 'clark', 'rico'] ['place', 'model', 'group', 'post', 'members'] ['island', 'ship', 'ships', 'sea', 'islands'] ['state', 'people', 'political', 'war', 'government'] ['salford', 'born', 'street', 'manchester', 'orlando'] ['cell', 'cells', 'dna', 'enzyme', 'protein'] ['canada', 'hospital', 'health', 'medical', 'canadian'] ['country', 'named', 'gmina', 'wisconsin', 'martin'] ['coal', 'mining', 'oil', 'gas', 'gold'] ['paris', 'la', 'de', 'french', 'france'] ['japan', 'emperor', 'china', 'japanese', 'chinese'] ['project', 'famous', 'state', 'smith', 'mr'] ['robert', 'wood', 'place', 'university', 'award'] ['spanish', 'city', 'de', 'mexico', 'la'] ['university', 'including', 'philippine', 'college', 'philippines'] ['simon', '2001', 'stores', 'ghostbusters', 'british'] ['rose', 'named', 'george', 
'family', '1981'] ['russian', 'polish', 'poland', 'part', 'war'] ['walker', 'bond', 'time', 'made', 'act'] ['town', 'national', 'joined', 'williams', 'rouge'] ['engine', 'models', 'model', 'production', 'power'] ['pakistan', 'region', 'indian', 'india', 'district'] ['god', 'great', 'people', 'temple', 'century'] ['2008', '2009', '2010', 'cornwall', 'village'] ['miller', 'national', '2009', 'university', 'years'] ['county', 'irish', 'dublin', 'northern', 'ireland'] ['films', 'series', 'role', 'film', 'show'] ['zealand', 'wales', 'australian', 'australia', 'south'] ['2', 'group', 'number', 'members', 'site'] ['early', 'years', 'made', 'end', 'time'] ['maltese', 'michigan', 'america', 'side', 'malta'] ['jones', 'member', 'school', 'carter', 'years'] ['found', 'patients', 'blood', 'study', 'disease'] ['miss', 'position', 'dr', 'location', 'mode'] ['2008', '2009', 'june', '2007', '2010'] ['lao', 'weld', '4', 'torture', 'years'] ['earth', 'time', 'star', 'mission', 'space'] ['son', 'king', 'john', 'died', 'sir'] ['episode', 'named', '2009', 'carroll', 'group'] ['elected', 'president', 'state', 'election', 'party'] ['episode', 'game', 'character', 'characters', 'series'] ['market', 'company', 'million', 'business', 'bank'] ['life', 'mother', 'father', 'back', 'man'] ['car', 'race', 'racing', 'won', 'team'] ['dance', 'festival', 'opera', 'orchestra', 'music'] ['city', 'met', 'women', 'minnesota', 'engineer'] ['fire', 'served', 'stamps', 'service', 'evans'] ['tropical', 'texas', 'storm', 'family', 'york'] ['states', 'police', 'state', 'law', 'court'] ['field', 'game', 'yards', 'season', 'pass'] ['german', 'city', 'berlin', 'germany', 'von'] ['work', 'society', 'social', 'theory', 'history'] ['design', 'arts', 'work', 'art', 'museum'] ['states', 'american', 'united', 'white', 'york'] ['youth', 'professor', 'kitty', 'university', 'stage'] ['years', '18', 'age', 'population', 'income'] ['club', 'league', 'game', 'team', 'season'] ['world', 'championships', 'medal', 'won', 
'gold'] ['small', 'white', 'black', 'species', 'family'] ['water', 'river', 'north', 'lake', 'area'] ['jewish', 'israel', 'camp', 'israeli', 'jews'] ['catholic', 'roman', 'st', 'bishop', 'church'] ['horses', 'orange', 'horse', 'link', 'year'] ['students', 'school', 'education', 'college', 'university'] ['middle', 'japan', 'point', 'garden', 'cotton'] ['arab', 'islamic', 'islam', 'al', 'ali'] ['body', 'city', 'beer', 'brewery', 'year'] ['work', 'magazine', 'book', 'books', 'published'] ['register', 'home', 'hotel', 'historic', 'house'] ['line', 'station', 'london', 'railway', 'service'] ['school', 'left', 'time', 'fair', 'eugene'] ['development', 'trade', 'economic', 'system', 'government'] ['languages', 'word', 'words', 'language', 'english'] ['information', 'data', 'system', 'systems', 'software'] ['function', 'energy', 'set', 'number', 'form'] ['date', 'city', 'coins', 'green', 'people']
# NOTE(review): `another_data` is a placeholder for a new SFrame of documents —
# it is not defined in this notebook, hence "do not run".
pred = m.predict(another_data) # do not run
# Same call, but request the full per-topic probability vector for each
# document instead of only the single most likely topic.
pred = m.predict(another_data, output_type='probabilities')
# Warm-start a new 10-topic model, seeding it with the topics/vocabulary
# already learned by `m` (the log below confirms: "Initializing from
# provided topics and vocabulary").
m2 = graphlab.topic_model.create(docs,
num_topics=10,
initial_topics=m['topics'])
Initializing from provided topics and vocabulary.
Learning a topic model
Number of documents 72269
Vocabulary size 171005
Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time | Tokens/Second | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10 | 2.41s | 8.85404e+06 | 0 |
+-----------+---------------+----------------+-----------------+
# Build a one-row association table pinning the word 'recognition' to topic 0.
associations = graphlab.SFrame()
associations['word'] = ['recognition']
associations['topic'] = [0]
# Retrain with 20 topics and 50 Gibbs-sampling iterations, constrained by the
# word-to-topic association above; verbose=False suppresses per-iteration logs
# (the table below was printed before that flag took effect in this session).
m2 = graphlab.topic_model.create(docs,
num_topics=20,
num_iterations=50,
associations=associations,
verbose=False)
Learning a topic model
Number of documents 72269
Vocabulary size 171005
Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time | Tokens/Second | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10 | 2.75s | 8.25271e+06 | 0 |
| 20 | 5.15s | 8.35785e+06 | 0 |
| 30 | 7.50s | 7.61449e+06 | 0 |
| 40 | 9.90s | 7.93222e+06 | 0 |
| 50 | 12.24s | 8.52982e+06 | 0 |
+-----------+---------------+----------------+-----------------+
# Top 10 words per topic for the association-constrained model, with scores.
m2.get_topics(num_words=10)
topic | word | score |
---|---|---|
0 | court | 0.0177612734598 |
0 | king | 0.0127195614005 |
0 | police | 0.0107441537516 |
0 | war | 0.00842083848682 |
0 | people | 0.00717072976569 |
0 | city | 0.00638056670611 |
0 | emperor | 0.00618007757159 |
0 | order | 0.00594125963194 |
0 | made | 0.00587934386981 |
0 | french | 0.0055638683199 |
# Show the top words of each topic for the retrained model.
print_topics(m2)
['king', 'court', 'war', 'police', 'people'] ['information', 'aircraft', 'network', 'service', 'system'] ['states', 'region', 'united', 'state', 'government'] ['la', 'de', 'india', 'indian', 'france'] ['military', 'army', 'force', 'war', 'air'] ['model', 'set', 'number', 'power', 'system'] ['large', 'found', 'small', 'species', 'family'] ['club', 'league', 'football', 'year', 'season'] ['party', 'company', 'election', 'council', 'served'] ['town', 'age', 'years', 'school', 'population'] ['work', 'band', 'book', 'art', 'published'] ['students', 'university', 'national', 'college', 'state'] ['death', 'father', 'time', 'family', 'son'] ['season', 'team', 'final', 'won', 'played'] ['city', 'line', 'river', 'road', 'area'] ['roman', 'church', 'people', 'language', 'century'] ['years', 'john', 'york', 'race', 'time'] ['album', 'released', 'music', 'film', 'song'] ['world', 'game', 'championship', 'games', 'team'] ['engine', 'production', 'company', 'made', 'design']