import os
import pandas as pd

# all you really need to know is that CABLES is the directory where the data (or cables)
# are stored on your machine
DATA = os.environ['DATA']
CABLES = os.path.join(DATA, 'declass', 'cables_short')
RAW = os.path.join(CABLES, 'raw')
PROCESSED = os.path.join(CABLES, 'processed')
SPARSE = os.path.join(CABLES, 'sparse')

sfile_path = os.path.join(SPARSE, 'cables-short.vw')
filtered_sfile_path = os.path.join(PROCESSED, 'cables-short-filtered.vw')
sff_path = os.path.join(PROCESSED, 'sff.pkl')

# filefilter is a module which helps with basic file/dir functions, such as
# retrieving all paths from a given directory and its subdirectories
from rosetta.text import filefilter


def simple_file_streamer(base_path):
    # walk base_path and yield the raw text of each file, one file at a time
    paths = filefilter.get_paths(base_path, get_iter=True)
    for p in paths:
        with open(p) as f:
            text = f.read()
            yield text


def my_iter(N):
    # a toy generator: yields 0, 1, ..., N-1 and then signals exhaustion
    i = 0
    while True:
        if i == N:
            raise StopIteration
        else:
            yield i
            i += 1

mi = my_iter(5)
mi.next()
# keep calling mi.next(); after the fifth value the next call raises StopIteration.
# Note the raised StopIteration; let's see how a for loop handles this
for i in my_iter(5):
    print i

simple_stream = simple_file_streamer(RAW)

# let's look at what this object is
type(simple_stream)

# let's see what .next() yields (and splitlines to make it more readable)
simple_stream.next().splitlines()

from rosetta import TextFileStreamer, TokenizerBasic

text_streamer = TextFileStreamer(text_base_path=RAW, file_type='*',
                                 tokenizer=TokenizerBasic())

from rosetta.text import streamers

stream = text_streamer.info_stream()
stream.next()

text = stream.next()['text']
print text

text_streamer.tokenizer.text_to_token_list(text)
# text_streamer.tokenizer.text_to_counter(text)

# let's look at a few methods
token_stream = text_streamer.token_stream()  # a generator which yields one token list per document
token_stream.next()[:10]  # this is what our basic tokenizer returns (stop words and numerics are skipped by default)

# if you want to use another tokenizer it's easy
import nltk

nltk.word_tokenize(text)

text_streamer_nltk = TextFileStreamer(text_base_path=RAW, file_type='*',
                                      tokenizer_func=nltk.word_tokenize)
stream_nltk = text_streamer_nltk.token_stream()
stream_nltk.next()[:10]

from rosetta.text import text_processors, filefilter, streamers, vw_helpers

# create the VW format file
my_tokenizer = text_processors.TokenizerBasic()
stream = streamers.TextFileStreamer(text_base_path=RAW, tokenizer=my_tokenizer)
stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)

### somewhere here run the VW LDA step in a shell (stick with 5 passes or so...)
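# A minimal sketch of driving the same VW LDA run from Python via subprocess, assuming
# `vw` is installed and on your PATH; it reuses the flags from the shell command just
# below and runs from PROCESSED so the output files land where they are loaded later.
import subprocess

# clear any stale cache, mirroring the `rm -f *cache` step
cache_path = os.path.join(PROCESSED, 'doc_tokens.cache')
if os.path.exists(cache_path):
    os.remove(cache_path)

vw_cmd = ['vw', '--lda', '20', '--cache_file', 'doc_tokens.cache', '--passes', '5',
          '-p', 'prediction.dat', '--readable_model', 'topics.dat',
          '--bit_precision', '16', '--lda_D', '975', '--lda_rho', '0.1',
          '--lda_alpha', '1', sfile_path]
subprocess.check_call(vw_cmd, cwd=PROCESSED)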
# rm -f *cache
# vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/cables-short.vw

# load the sparse file
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)

# remove "gaps" in the sequence of numbers (ids)
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')

sff.to_frame().sort_index(by='doc_fraction', ascending=False).head(10)

# use the LDAResults class from rosetta to convert back to readable, Python-friendly formats
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
                            PROCESSED + '/prediction.dat',
                            PROCESSED + '/sff_basic.pkl')

# look at some of the words
topic_words = lda.pr_token_g_topic.loc[:, 'topic_12'].order(ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]

# look at the first topic
a_topic = lda.pr_token_g_topic.T.loc['topic_00'].copy()
a_topic.sort(ascending=False)
a_topic[:10]

# plot the topic weights of the first document
lda.pr_topic_g_doc.T.loc[[0]].plot(kind='bar', figsize=(12, 7),
                                   title='First Document Topic Weights')

# or look at the average topic probabilities
import random

r = lambda: random.randint(0, 255)
my_colors = ['#%02X%02X%02X' % (r(), r(), r()) for i in range(20)]
# my_colors = 'rgbkymc'
lda.pr_topic_g_doc.mean(axis=1).plot(kind='bar', figsize=(12, 7), color=my_colors,
                                     title='Average Topic Probabilities')
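# A small follow-up sketch, assuming you want to keep the fitted weights around and are
# running this as a plain script rather than in a notebook: write the document-topic
# weights to disk and force the plots above to render. The csv name is illustrative only.
import matplotlib.pyplot as plt

lda.pr_topic_g_doc.T.to_csv(os.path.join(PROCESSED, 'doc_topic_weights.csv'))
plt.show()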