import os
import pandas as pd

# all you really need to know is that CABLES is the directory where the data (or cables)
# are stored on your machine
DATA = os.environ['DATA']
CABLES = os.path.join(DATA, 'declass', 'cables_short')
RAW = os.path.join(CABLES, 'raw')
PROCESSED = os.path.join(CABLES, 'processed')
SPARSE = os.path.join(CABLES, 'sparse')

sfile_path = os.path.join(SPARSE, 'cables-short.vw')
filtered_sfile_path = os.path.join(PROCESSED, 'cables-short-filtered.vw')
sff_path = os.path.join(PROCESSED, 'sff.pkl')

# filefilter is a module which helps with basic file/dir functions, such as
# retrieving all paths from a given directory and its subdirectories
from rosetta.text import filefilter


def simple_file_streamer(base_path):
    # walk base_path and yield the raw text of each file, one file at a time
    paths = filefilter.get_paths(base_path, get_iter=True)
    for p in paths:
        with open(p) as f:
            text = f.read()
            yield text


def my_iter(N):
    # a toy generator: yields 0, 1, ..., N-1 and then signals exhaustion
    i = 0
    while True:
        if i == N:
            raise StopIteration
        else:
            yield i
            i += 1

mi = my_iter(5)
mi.next()
# keep calling mi.next(); after the fifth value the next call raises StopIteration.
# Note the raised StopIteration; let's see how a for loop handles this
for i in my_iter(5):
    print i

simple_stream = simple_file_streamer(RAW)

# let's look at what this object is
type(simple_stream)

# let's see what .next() yields (and splitlines to make it more readable)
simple_stream.next().splitlines()

from rosetta import TextFileStreamer, TokenizerBasic

text_streamer = TextFileStreamer(text_base_path=RAW, file_type='*',
                                 tokenizer=TokenizerBasic())

from rosetta.text import streamers

stream = text_streamer.info_stream()
stream.next()

text = stream.next()['text']
print text

text_streamer.tokenizer.text_to_token_list(text)
# text_streamer.tokenizer.text_to_counter(text)

# let's look at a few methods
token_stream = text_streamer.token_stream()  # a generator which yields one token list per document
token_stream.next()[:10]  # this is what our basic tokenizer returns (stop words and numerics are skipped by default)

# if you want to use another tokenizer it's easy
import nltk

nltk.word_tokenize(text)

text_streamer_nltk = TextFileStreamer(text_base_path=RAW, file_type='*',
                                      tokenizer_func=nltk.word_tokenize)
stream_nltk = text_streamer_nltk.token_stream()
stream_nltk.next()[:10]

from rosetta.text import text_processors, filefilter, streamers, vw_helpers

# create the VW format file
my_tokenizer = text_processors.TokenizerBasic()
stream = streamers.TextFileStreamer(text_base_path=RAW, tokenizer=my_tokenizer)
stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)

### somewhere here run the VW LDA step in a shell (stick with 5 passes or so...)
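# A minimal sketch of driving the same VW LDA run from Python via subprocess, assuming
# `vw` is installed and on your PATH; it reuses the flags from the shell command just
# below and runs from PROCESSED so the output files land where they are loaded later.
import subprocess

# clear any stale cache, mirroring the `rm -f *cache` step
cache_path = os.path.join(PROCESSED, 'doc_tokens.cache')
if os.path.exists(cache_path):
    os.remove(cache_path)

vw_cmd = ['vw', '--lda', '20', '--cache_file', 'doc_tokens.cache', '--passes', '5',
          '-p', 'prediction.dat', '--readable_model', 'topics.dat',
          '--bit_precision', '16', '--lda_D', '975', '--lda_rho', '0.1',
          '--lda_alpha', '1', sfile_path]
subprocess.check_call(vw_cmd, cwd=PROCESSED)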
# rm -f *cache
# vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/cables-short.vw

# load the sparse file
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)

# remove "gaps" in the sequence of numbers (ids)
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')

sff.to_frame().sort_index(by='doc_fraction', ascending=False).head(10)

# use the LDAResults class from rosetta to convert back to readable, Python-friendly formats
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
                            PROCESSED + '/prediction.dat',
                            PROCESSED + '/sff_basic.pkl')

# look at some of the words
topic_words = lda.pr_token_g_topic.loc[:, 'topic_12'].order(ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]

# look at the first topic
a_topic = lda.pr_token_g_topic.T.loc['topic_00'].copy()
a_topic.sort(ascending=False)
a_topic[:10]

# plot the topic weights of the first document
lda.pr_topic_g_doc.T.loc[[0]].plot(kind='bar', figsize=(12, 7),
                                   title='First Document Topic Weights')

# or look at the average topic probabilities
import random

r = lambda: random.randint(0, 255)
my_colors = ['#%02X%02X%02X' % (r(), r(), r()) for i in range(20)]
# my_colors = 'rgbkymc'
lda.pr_topic_g_doc.mean(axis=1).plot(kind='bar', figsize=(12, 7), color=my_colors,
                                     title='Average Topic Probabilities')
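# A small follow-up sketch, assuming you want to keep the fitted weights around and are
# running this as a plain script rather than in a notebook: write the document-topic
# weights to disk and force the plots above to render. The csv name is illustrative only.
import matplotlib.pyplot as plt

lda.pr_topic_g_doc.T.to_csv(os.path.join(PROCESSED, 'doc_topic_weights.csv'))
plt.show()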