import nltk from nltk.book import * text1.dispersion_plot(['Ahab','whale','Ishmael','Queequeg', 'Moby', 'dive']) text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"]) text1.concordance('Ishmael') text1.similar('Ishmael') text1.similar('whale') text1.collocations() text1.concordance("monstrous") text2.concordance("monstrous") text1.similar("monstrous") text2.similar("monstrous") def lexical_diversity(text): return len(text)/(1.0*len(set(text))) def percentage(count, total): return 100*count/total lexical_diversity(text3) lexical_diversity(text4) lexical_diversity(text5) fdist1 = FreqDist(text1) vocabulary1 = fdist1.keys() fdist1['whale'] fdist1['monstrous'] fdist1.plot(50, cumulative=True) hapaxes1 = fdist1.hapaxes() print len(hapaxes1) print hapaxes1[1000:1010] thursday_sents = nltk.corpus.gutenberg.sents('chesterton-thursday.txt') sent22 = thursday_sents[22] ' '.join(sent22) nltk.bigrams(w for w in sent22 if w.isalpha()) import networkx as nx G = nx.Graph() begin_sent = 22 end_sent = 24 sents = thursday_sents[begin_sent:end_sent+1] for sent in sents: G.add_edges_from(nltk.bigrams(w for w in sent if w.isalpha())) nx.draw(G) import codecs, nltk, pprint hard_times_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/786-0.txt" david_copperfield_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/pg766.txt" f = codecs.open(hard_times_path, encoding = 'utf-8') david_copperfield_file = codecs.open(david_copperfield_path, encoding = 'utf-8') hard_times_raw_text = f.read() len(hard_times_raw_text) david_copperfield_raw_text = david_copperfield_file.read() len(david_copperfield_raw_text) sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') sents = sent_tokenizer.tokenize(hard_times_raw_text) print sents[171] len(sents) sents = nltk.tokenize.sent_tokenize(hard_times_raw_text) print sents[171] DC_sents = nltk.tokenize.sent_tokenize(david_copperfield_raw_text) print DC_sents[171] 
tokens = [nltk.tokenize.word_tokenize(s) for s in sents] len(tokens) print tokens[171] DC_tokens = [nltk.tokenize.word_tokenize(s) for s in DC_sents] print DC_tokens[171] # pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens] ' ' .join(tokens[171]) bigram = nltk.bigrams(w for w in tokens[171] if w.isalpha()) import networkx as nx G = nx.Graph() G.add_edges_from(bigram) nx.draw(G) print nltk.ngrams((w for w in tokens[171] if w.isalpha()), 5) fivegrams = [] for t in tokens: fivegrams += nltk.ngrams((w for w in t if w.isalpha()), 5) print ' '.join(fivegrams[2000]) D = {} for gram in fivegrams: if not D.get(gram): D[gram] = 1 else: D[gram] += 1 for gram in D.keys(): if D[gram] > 3: print ' '.join(gram), D[gram] hard_times_raw_text.find('venison with a gold spoon') print hard_times_raw_text[250010:250082] hard_times_raw_text.find('venison with a gold spoon',250057) print hard_times_raw_text[250084:250182] hard_times_raw_text.find('venison with a gold spoon', 250157) five_gram_2 = 'his hands in his pockets' hard_times_raw_text.find(five_gram_2) hard_times_raw_text.find(five_gram_2, 39378) hard_times_raw_text.find(five_gram_2, 41789) hard_times_raw_text.find(five_gram_2, 58760) hard_times_raw_text.find(five_gram_2, 476236) hard_times_raw_text.find(five_gram_2, 514355) DC_bigrams = [] for t in DC_tokens: DC_bigrams += nltk.ngrams((w for w in t if w.isalpha()), 2) DC_D = {} for gram in DC_bigrams: if not DC_D.get(gram): DC_D[gram] = 1 else: DC_D[gram] += 1 for gram in DC_D.keys(): if DC_D[gram] > 500: print ' '.join(gram), DC_D[gram] DC_trigrams = [] for t in DC_tokens: DC_trigrams += nltk.ngrams((w for w in t if w.isalpha()), 3) DC_D_3 = {} for gram in DC_trigrams: if not DC_D_3.get(gram): DC_D_3[gram] = 1 else: DC_D_3[gram] += 1 for gram in DC_D_3.keys(): if DC_D_3[gram] > 60: print ' '.join(gram), DC_D_3[gram] DC_fourgrams = [] for t in DC_tokens: DC_fourgrams += nltk.ngrams((w for w in t if w.isalpha()), 4) DC_D_4 = {} for gram in DC_fourgrams: if not DC_D_4.get(gram): 
DC_D_4[gram] = 1 else: DC_D_4[gram] += 1 for gram in DC_D_4.keys(): if DC_D_4[gram] > 18: print ' '.join(gram), DC_D_4[gram] david_copperfield_raw_text.find("I do know what") david_copperfield_raw_text.find("I don't know what") print david_copperfield_raw_text[19264:19296] test_extract = sents[1024: 1037] print ' '.join(test_extract) import codecs, nltk little_dorrit_path = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles/pg963.txt" f = codecs.open(little_dorrit_path, encoding = 'utf-8') little_dorrit_file = codecs.open(little_dorrit_path, encoding = 'utf-8') little_dorrit_raw = little_dorrit_file.read() len(little_dorrit_raw) little_dorrit_raw.find(u'At the close of this recital') end_phrase = 'producing the money.' little_dorrit_raw.find(end_phrase) task_string = little_dorrit_raw[1725461:1727185 + len(end_phrase)] print task_string import re re_1 = r"'[^']+'" re_2 = r"'[a-zA-Z0-9_,!? ]+(?:[-'][a-zA-Z0-9_,!? ]+)*'" re_3 = r"'[^']+[\.,!?]'" nltk.re_show(re_1, task_string[423:]) re.findall(re_1, task_string) from nltk.corpus import PlaintextCorpusReader corpus_root = "/home/matthew/workspace/resources/C/Corpus Stylistics/Dickens, Charles" wordlists = PlaintextCorpusReader(corpus_root, '.*') wordlists.fileids() len(hard_times_sents_raw) a = 0 for sentence in hard_times_sents_raw: a += len(sentence) print a import re s = hard_times_sents_raw[1000] # re.findall(r'\W+', sentence) print s hard_times_first_sentence = '\xe2 \x80\x98 NOW , what I want is , Facts .' hard_times_first_sentence.split() in hard_times_sents_raw first_sentence_index = hard_times_sents_raw.index(hard_times_first_sentence.split()) ' '.join(hard_times_sents_raw[first_sentence_index]) hard_times_last_sentence = 'We shall sit with lighter bosoms on the hearth , to see the ashes of our fires turn gray and cold .' 
# Locate the closing sentence of the novel body, then trim the sentence
# list to just the span between the first and last sentence (inclusive).
last_sentence_tokens = hard_times_last_sentence.split()
last_sentence_tokens in hard_times_sents_raw  # membership probe (REPL-style; value discarded)
last_sentence_index = hard_times_sents_raw.index(last_sentence_tokens)
' '.join(hard_times_sents_raw[last_sentence_index])  # eyeball the matched sentence
hard_times_sents = hard_times_sents_raw[first_sentence_index:last_sentence_index + 1]