import nltk from nltk.book import * text1.dispersion_plot(['Ahab','whale','Ishmael','Queequeg', 'Moby', 'dive']) text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"]) text1.concordance('Ishmael') text1.similar('Ishmael') text1.similar('whale') text1.collocations() text1.concordance("monstrous") text2.concordance("monstrous") text1.similar("monstrous") text2.similar("monstrous") def lexical_diversity(text): return len(text)/(1.0*len(set(text))) def percentage(count, total): return 100*count/total lexical_diversity(text3) lexical_diversity(text4) lexical_diversity(text5) fdist1 = FreqDist(text1) vocabulary1 = fdist1.keys() fdist1['whale'] fdist1['monstrous'] fdist1.plot(50, cumulative=True) hapaxes1 = fdist1.hapaxes() print len(hapaxes1) print hapaxes1[1000:1010] thursday_sents = nltk.corpus.gutenberg.sents('chesterton-thursday.txt') sent22 = thursday_sents[22] ' '.join(sent22) nltk.bigrams(w for w in sent22 if w.isalpha()) import networkx as nx G = nx.Graph() begin_sent = 22 end_sent = 24 sents = thursday_sents[begin_sent:end_sent+1] for sent in sents: G.add_edges_from(nltk.bigrams(w for w in sent if w.isalpha())) nx.draw(G) import codecs, nltk, pprint hard_times_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/786-0.txt" david_copperfield_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/pg766.txt" f = codecs.open(hard_times_path, encoding = 'utf-8') david_copperfield_file = codecs.open(david_copperfield_path, encoding = 'utf-8') hard_times_raw_text = f.read() len(hard_times_raw_text) david_copperfield_raw_text = david_copperfield_file.read() len(david_copperfield_raw_text) sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') sents = sent_tokenizer.tokenize(hard_times_raw_text) print sents[171] len(sents) sents = nltk.tokenize.sent_tokenize(hard_times_raw_text) print sents[171] DC_sents = nltk.tokenize.sent_tokenize(david_copperfield_raw_text) print DC_sents[171] 
tokens = [nltk.tokenize.word_tokenize(s) for s in sents] len(tokens) print tokens[171] DC_tokens = [nltk.tokenize.word_tokenize(s) for s in DC_sents] print DC_tokens[171] # pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens] ' ' .join(tokens[171]) bigram = nltk.bigrams(w for w in tokens[171] if w.isalpha()) import networkx as nx G = nx.Graph() G.add_edges_from(bigram) nx.draw(G) print nltk.ngrams((w for w in tokens[171] if w.isalpha()), 5) fivegrams = [] for t in tokens: fivegrams += nltk.ngrams((w for w in t if w.isalpha()), 5) print ' '.join(fivegrams[2000]) D = {} for gram in fivegrams: if not D.get(gram): D[gram] = 1 else: D[gram] += 1 for gram in D.keys(): if D[gram] > 3: print ' '.join(gram), D[gram] hard_times_raw_text.find('venison with a gold spoon') print hard_times_raw_text[250010:250082] hard_times_raw_text.find('venison with a gold spoon',250057) print hard_times_raw_text[250084:250182] hard_times_raw_text.find('venison with a gold spoon', 250157) five_gram_2 = 'his hands in his pockets' hard_times_raw_text.find(five_gram_2) hard_times_raw_text.find(five_gram_2, 39378) hard_times_raw_text.find(five_gram_2, 41789) hard_times_raw_text.find(five_gram_2, 58760) hard_times_raw_text.find(five_gram_2, 476236) hard_times_raw_text.find(five_gram_2, 514355) DC_bigrams = [] for t in DC_tokens: DC_bigrams += nltk.ngrams((w for w in t if w.isalpha()), 2) DC_D = {} for gram in DC_bigrams: if not DC_D.get(gram): DC_D[gram] = 1 else: DC_D[gram] += 1 for gram in DC_D.keys(): if DC_D[gram] > 500: print ' '.join(gram), DC_D[gram] DC_trigrams = [] for t in DC_tokens: DC_trigrams += nltk.ngrams((w for w in t if w.isalpha()), 3) DC_D_3 = {} for gram in DC_trigrams: if not DC_D_3.get(gram): DC_D_3[gram] = 1 else: DC_D_3[gram] += 1 for gram in DC_D_3.keys(): if DC_D_3[gram] > 60: print ' '.join(gram), DC_D_3[gram] DC_fourgrams = [] for t in DC_tokens: DC_fourgrams += nltk.ngrams((w for w in t if w.isalpha()), 4) DC_D_4 = {} for gram in DC_fourgrams: if not DC_D_4.get(gram): 
DC_D_4[gram] = 1 else: DC_D_4[gram] += 1 for gram in DC_D_4.keys(): if DC_D_4[gram] > 18: print ' '.join(gram), DC_D_4[gram] david_copperfield_raw_text.find("I do know what") david_copperfield_raw_text.find("I don't know what") print david_copperfield_raw_text[19264:19296] test_extract = sents[1024: 1037] print ' '.join(test_extract) import codecs, nltk little_dorrit_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/pg963.txt" f = codecs.open(little_dorrit_path, encoding = 'utf-8') little_dorrit_file = codecs.open(little_dorrit_path, encoding = 'utf-8') little_dorrit_raw = little_dorrit_file.read() len(little_dorrit_raw) little_dorrit_raw.find(u'At the close of this recital') end_phrase = 'producing the money.' little_dorrit_raw.find(end_phrase) task_string = little_dorrit_raw[1725461:1727185 + len(end_phrase)] print task_string import re re_1 = r"'[^']+'" re_2 = r"'[a-zA-Z0-9_,!? ]+(?:[-'][a-zA-Z0-9_,!? ]+)*'" re_3 = r"'[^']+[\.,!?]'" nltk.re_show(re_1, task_string[423:]) re.findall(re_1, task_string) from nltk.corpus import PlaintextCorpusReader corpus_root = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles" wordlists = PlaintextCorpusReader(corpus_root, '.*') wordlists.fileids() len(hard_times_sents_raw) a = 0 for sentence in hard_times_sents_raw: a += len(sentence) print a import re s = hard_times_sents_raw[1000] # re.findall(r'\W+', sentence) print s hard_times_first_sentence = '\xe2 \x80\x98 NOW , what I want is , Facts .' hard_times_first_sentence.split() in hard_times_sents_raw first_sentence_index = hard_times_sents_raw.index(hard_times_first_sentence.split()) ' '.join(hard_times_sents_raw[first_sentence_index]) hard_times_last_sentence = 'We shall sit with lighter bosoms on the hearth , to see the ashes of our fires turn gray and cold .' 
# Locate the closing sentence of the novel body, then trim the sentence
# list to just the span between the first and last sentence (inclusive).
last_sentence_tokens = hard_times_last_sentence.split()
last_sentence_tokens in hard_times_sents_raw  # membership probe (REPL-style; value discarded)
last_sentence_index = hard_times_sents_raw.index(last_sentence_tokens)
' '.join(hard_times_sents_raw[last_sentence_index])  # eyeball the matched sentence
hard_times_sents = hard_times_sents_raw[first_sentence_index:last_sentence_index + 1]