이 노트북에서는 자연어 처리 데이터셋(구텐베르크 프로젝트의 책)을 정제하고 word2vec을 사용해 단어 벡터로 임베딩합니다.
노트: 후속 애플리케이션에 따라 이 전처리 단계의 일부 또는 전부가 도움이 될 수도 있고, 전혀 도움이 되지 않을 수도 있습니다.
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')
import string
import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline
[nltk_data] Downloading package gutenberg to /root/nltk_data... [nltk_data] Unzipping corpora/gutenberg.zip. [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. [nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
# Bring in the Gutenberg corpus reader and count how many texts (file IDs) it exposes.
from nltk.corpus import gutenberg
len(gutenberg.fileids())
18
# List the file ID of every text in the corpus.
gutenberg.fileids()
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
# Total number of tokens the corpus reader returns when no file ID is given.
len(gutenberg.words())
2621613
# Split the raw corpus text into sentences, then preview the first six.
# Note that the resulting sentence strings still contain the original newline characters.
gberg_sent_tokens = sent_tokenize(gutenberg.raw())
gberg_sent_tokens[0:6]
['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.', "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.", 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.', "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.", 'Between _them_ it was more the intimacy\nof sisters.', "Even before Miss Taylor had ceased to hold the nominal\noffice of governess, the mildness of her temper had hardly allowed\nher to impose any restraint; and the shadow of authority being\nnow long passed away, they had been living together as friend and\nfriend very mutually attached, and Emma doing just what she liked;\nhighly esteeming Miss Taylor's judgment, but directed chiefly by\nher own."]
# Inspect a single sentence (index 1) from the sentence-tokenized corpus.
gberg_sent_tokens[1]
"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."
# Break the same sentence into word-level tokens; punctuation marks come out as separate tokens.
word_tokenize(gberg_sent_tokens[1])
['She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'s", 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.']
# Look up a single token of that sentence by position (index 14).
word_tokenize(gberg_sent_tokens[1])[14]
'father'
# Handle newline characters and tokenize into sentences and words in a single step.
gberg_sents = gutenberg.sents()
gberg_sents[0:6]
[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ['CHAPTER', 'I'], ['Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty', '-', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.'], ['She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'", 's', 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.'], ['Her', 'mother', 'had', 'died', 'too', 'long', 'ago', 'for', 'her', 'to', 'have', 'more', 'than', 'an', 'indistinct', 'remembrance', 'of', 'her', 'caresses', ';', 'and', 'her', 'place', 'had', 'been', 'supplied', 'by', 'an', 'excellent', 'woman', 'as', 'governess', ',', 'who', 'had', 'fallen', 'little', 'short', 'of', 'a', 'mother', 'in', 'affection', '.']]
# Nested indexing: sentence 4, token 14 reaches a single word directly.
gberg_sents[4][14]
'father'
# Full token list for sentence 4, used as the running example in the cells that follow.
gberg_sents[4]
['She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'", 's', 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.']
# Lowercase every token of the example sentence.
list(map(str.lower, gberg_sents[4]))
['she', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'", 's', 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.']
# Combined filter list: NLTK's English stop words followed by each ASCII punctuation character.
stpwrds = [*stopwords.words('english'), *string.punctuation]
stpwrds
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
# Lowercase the example sentence and keep only tokens that are neither stop words nor punctuation.
[token for token in map(str.lower, gberg_sents[4]) if token not in stpwrds]
['youngest', 'two', 'daughters', 'affectionate', 'indulgent', 'father', 'consequence', 'sister', 'marriage', 'mistress', 'house', 'early', 'period']
# Porter-stem each token of the example sentence that survives lowercasing
# and the stop-word/punctuation filter.
stemmer = PorterStemmer()
[stemmer.stem(token) for token in map(str.lower, gberg_sents[4])
 if token not in stpwrds]
['youngest', 'two', 'daughter', 'affection', 'indulg', 'father', 'consequ', 'sister', 'marriag', 'mistress', 'hous', 'earli', 'period']
phrases = Phrases(gberg_sents) # train the phrase detector
bigram = Phraser(phrases) # build a Phraser object, which transforms sentences more efficiently
bigram.phrasegrams # display the count and score of each detected bigram
{(b'two', b'daughters'): (19, 11.966987886528115), (b'her', b'sister'): (195, 17.796341912611076), (b"'", b's'): (9781, 31.066694850762417), (b'very', b'early'): (24, 11.01230173457644), (b'Her', b'mother'): (14, 13.529621959045564), (b'long', b'ago'): (38, 63.224356392701125), (b'more', b'than'): (541, 29.024006819814797), (b'had', b'been'): (1256, 22.306349272800997), (b'an', b'excellent'): (54, 39.064443355850045), (b'Miss', b'Taylor'): (48, 453.76578390553544), (b'very', b'fond'): (28, 24.134631699685762), (b'passed', b'away'): (25, 12.350716162995981), (b'too', b'much'): (173, 31.376458650431253), (b'did', b'not'): (935, 11.728586903044414), (b'any', b'means'): (27, 14.097169263925728), (b'wedding', b'-'): (15, 17.469774011299435), (b'Her', b'father'): (18, 13.129762639674155), (b'after', b'dinner'): (21, 21.5288614259916), (b'self', b'-'): (124, 47.79087603091109), (b'sixteen', b'years'): (12, 107.04772502472798), (b'five', b'years'): (42, 40.129339674923365), (b'years', b'old'): (176, 54.73622181125361), (b'seven', b'years'): (51, 52.59487691468612), (b'each', b'other'): (236, 79.41799630087341), (b'a', b'mile'): (48, 12.783277635060301), (b'must', b'be'): (601, 10.230138529643797), (b'difference', b'between'): (44, 220.52858240070225), (b'could', b'not'): (1049, 10.871141494497287), (b'having', b'been'): (49, 11.538186246573854), (b'miles', b'off'): (16, 34.78731999672721), (b'at', b'Hartfield'): (66, 27.282624103216843), (b'her', b'husband'): (158, 27.544796053941575), (b'in', b'spite'): (96, 13.442110585867532), (b'Emma', b'could'): (61, 11.335276219802779), (b'every', b'body'): (127, 36.973121045951494), (b'no', b'means'): (80, 32.57409228176136), (b'his', b'own'): (773, 10.402539077343869), (b'obliged', b'to'): (179, 10.436780686118585), (b'able', b'to'): (348, 11.446995392578943), (b'very', b'much'): (234, 16.21051090525822), (b'have', b'been'): (986, 17.98145273154076), (b'great', b'deal'): (181, 118.04185550664424), (b'"', b'Poor'): (30, 
10.125733768993836), (b'agree', b'with'): (25, 13.61194200678363), (b'-', b'humoured'): (22, 33.94127522195319), (b'for', b'ever'): (555, 12.476295381735138), (b'This', b'is'): (353, 11.381193790408082), (b'three', b'times'): (36, 35.42629642564782), (b'my', b'dear'): (253, 24.47929874292135), (b'How', b'often'): (12, 12.37814885769022), (b'My', b'dear'): (85, 84.80821711166878), (b'so', b'far'): (98, 10.161780363169663), (b'"', b'No'): (351, 15.063925495032132), (b'We', b'must'): (68, 18.765920000462394), (b'last', b'night'): (63, 23.5929422985217), (b'doubt', b'whether'): (12, 22.92446435569112), (b'anywhere', b'else'): (6, 16.100335841295465), (b'I', b'am'): (2428, 16.95154402454624), (b'very', b'glad'): (46, 18.284606842044536), (b'am', b'sure'): (282, 65.14555013642888), (b'very', b'pretty'): (39, 20.06847092419522), (b'be', b'able'): (121, 11.34777742133673), (b'immediately', b'afterwards'): (10, 41.0611372267814), (b'sensible', b'man'): (17, 14.541599717835169), (b'intimate', b'friend'): (6, 21.899079320113316), (b'connected', b'with'): (31, 18.3761217091579), (b'than', b'usual'): (30, 28.952390048051893), (b'Brunswick', b'Square'): (11, 10881.466275659823), (b'some', b'time'): (146, 12.92674187618596), (b'poor', b'Isabella'): (10, 41.30301208842583), (b'It', b'is'): (777, 11.70604053201059), (b'am', b'afraid'): (65, 25.627764827764825), (b'moonlight', b'night'): (6, 14.74558893657606), (b'Look', b'at'): (33, 13.630663096064623), (b'"', b'Well'): (311, 21.191639295191656), (b'vast', b'deal'): (11, 61.90490490490491), (b'an', b'hour'): (150, 41.75817958294139), (b'pretty', b'well'): (20, 17.716673032849503), (b'tolerably', b'well'): (7, 18.357847866419295), (b'"', b'Ah'): (83, 17.2797604782697), (b'Ah', b'!'): (68, 37.53350320557592), (b"'", b'Tis'): (64, 23.239682149440206), (b'Miss', b'Woodhouse'): (173, 294.5313833270494), (b'you', b'please'): (93, 13.036170437015532), (b'any', b'rate'): (47, 83.92156482630273), (b',"', b'said'): (2583, 
36.033065722366544), (b'My', b'dearest'): (7, 26.665660572611245), (b'so', b'much'): (483, 20.564737038651), (b'much', b'less'): (38, 19.104713600467313), (b'any', b'body'): (93, 21.71675477576872), (b'has', b'been'): (263, 29.261102552816904), (b'been', b'used'): (29, 14.094306941975477), (b'Well', b',"'): (60, 12.493728094244245), (b'tell', b'you'): (296, 11.61233454195183), (b'Every', b'body'): (21, 72.20115873502328), (b'"', b'Dear'): (39, 20.048952862607795), (b'every', b'thing'): (240, 27.277565476570334), (b'very', b'sorry'): (32, 20.256026727225084), (b'turned', b'away'): (50, 19.344906255300334), (b'divided', b'between'): (10, 35.82858268446422), (b'knows', b'how'): (13, 14.801172739783404), (b'how', b'much'): (110, 15.41788827060771), (b'four', b'years'): (21, 16.257841484533913), (b'years', b'ago'): (56, 163.33385119704198), (b'any', b'thing'): (383, 35.72856040672197), (b'need', b'not'): (107, 13.47902882398845), (b'his', b'wife'): (263, 10.871008962598552), (b'Ever', b'since'): (8, 99.63963480128893), (b'leave', b'off'): (18, 10.507399991635475), (b'you', b'mean'): (142, 10.574149798763328), (b'young', b'lady'): (73, 113.30676689703485), (b'depend', b'upon'): (28, 66.33781993881054), (b'quarrel', b'with'): (21, 10.691561721691869), (b'-', b'hearted'): (45, 49.03796213698087), (b'their', b'own'): (279, 10.1646586470654), (b'You', b'are'): (231, 12.600380088963897), (b'more', b'likely'): (16, 11.177045646480481), (b'have', b'done'): (272, 12.664289823059754), (b',"', b'rejoined'): (6, 11.95680754804532), (b'any', b'longer'): (32, 16.396440186651585), (b'very', b'well'): (171, 13.844638642769485), (b'young', b'man'): (260, 25.86418892544471), (b'dine', b'with'): (22, 13.884180846919302), (b'much', b'better'): (38, 10.763540796435533), (b'I', b'dare'): (138, 13.676667311275946), (b'dare', b'say'): (114, 128.21273285427898), (b'Depend', b'upon'): (17, 92.29609730617116), (b'take', b'care'): (59, 72.94080901625021), (b'CHAPTER', b'II'): (11, 
335.55615843733045), (b'entering', b'into'): (14, 16.437697132934044), (b'never', b'seen'): (42, 14.015410764872522), (b'refrain', b'from'): (9, 12.438191682463382), (b'at', b'once'): (263, 21.418483948514538), (b'three', b'years'): (77, 37.35580371637182), (b'any', b'other'): (138, 10.208550533393701), (b'twenty', b'years'): (69, 85.2916825593849), (b'an', b'easy'): (18, 10.427619035266346), (b'according', b'to'): (747, 12.093503438006989), (b'had', b'begun'): (25, 12.033151826531775), (b'passed', b'through'): (43, 31.462657712657712), (b'its', b'being'): (58, 16.0647072143383), (b'deal', b'better'): (14, 19.993210914263546), (b'fine', b'young'): (13, 10.40328871973337), (b'belonging', b'to'): (35, 10.51254678910311), (b'Frank', b'Churchill'): (151, 1750.703455229379), (b'Miss', b'Bates'): (113, 400.43190484184277), (b'a', b'few'): (404, 11.554768952918474), (b'few', b'days'): (53, 35.91581912291018), (b'I', b'suppose'): (210, 12.338337969117697), (b'very', b'handsome'): (21, 19.759725217669143), (b'an', b'irresistible'): (7, 11.369243496644911), (b'good', b'sense'): (28, 17.373623742833203), (b'had', b'already'): (64, 11.99089493884663), (b'She', b'felt'): (26, 13.338526859809706), (b'most', b'fortunate'): (6, 11.471739412714017), (b'long', b'enough'): (38, 15.189751032711843), (b'know', b'how'): (120, 12.783055562146046), (b'dear', b'Emma'): (31, 28.3901872294143), (b'at', b'Randalls'): (39, 27.034148473861507), (b'few', b'weeks'): (19, 134.4705370732768), (b'no', b'longer'): (113, 44.45405534922727), (b'CHAPTER', b'III'): (10, 354.19816723940437), (b'Donwell', b'Abbey'): (9, 753.4937557112396), (b'card', b'-'): (18, 15.662556010130528), (b'drawing', b'-'): (53, 20.08500964173348), (b'-', b'room'): (116, 10.86355694820301), (b'thrown', b'away'): (11, 14.820859395595177), (b'After', b'these'): (18, 11.092149558498896), (b'an', b'invitation'): (11, 10.459704016913319), (b'old', b'lady'): (16, 10.88616203950411), (b'those', b'who'): (150, 15.975875581352883), 
(b'as', b'possible'): (81, 11.709669181717734), (b'young', b'ladies'): (44, 113.63645786708754), (b'-', b'fashioned'): (31, 34.93954802259887), (b'Goddard', b"'"): (34, 15.295062282208422), (b'found', b'herself'): (27, 11.226219866395585), (b's', b'sake'): (142, 28.092409785884055), (b'much', b'pleased'): (18, 13.27637741183309), (b'be', b'allowed'): (32, 10.133422261278234), (b'Miss', b'Smith'): (58, 165.24557352585305), (b'Harriet', b'Smith'): (31, 180.55133848365074), (b'several', b'years'): (10, 17.577623156769786), (b'pretty', b'girl'): (10, 40.456222524597024), (b'blue', b'eyes'): (28, 35.5958547926145), (b'They', b'were'): (188, 10.65352659402321), (b'due', b'time'): (18, 21.041915854217102), (b'its', b'own'): (54, 10.834528109355436), (b'better', b'than'): (170, 42.51429907056041), (b'body', b'else'): (31, 39.47041163749192), (b'apple', b'-'): (26, 28.22040417209909), (b'You', b'need'): (16, 14.652845388359971), (b'half', b'-'): (179, 14.008021557447472), (b'much', b'more'): (159, 10.556098666120453), (b'little', b'girl'): (50, 35.0572859257393), (b'at', b'last'): (420, 22.7569403988895), (b'CHAPTER', b'IV'): (8, 335.55615843733045), (b'every', b'respect'): (14, 12.225158144438588), (b'guided', b'by'): (14, 23.954886635563895), (b'different', b'sort'): (8, 14.491056783566354), (b'-', b'Mill'): (7, 12.705290190035953), (b'good', b'deal'): (62, 39.17741946239356), (b'very', b'happy'): (43, 11.360922087440127), (b'drink', b'tea'): (7, 32.50446757069274), (b'large', b'enough'): (11, 10.829225583329684), (b'had', b'taken'): (121, 10.962706594062695), (b'doing', b'something'): (9, 10.717002712046511), (b'three', b'miles'): (9, 16.651991868276856), (b'thing', b'else'): (26, 12.21037677485836), (b'very', b'obliging'): (14, 25.349647483193962), (b'on', b'purpose'): (35, 10.833519216418129), (b'very', b'clever'): (15, 21.695644242373216), (b'"', b'You'): (493, 11.717232331679762), (b'know', b'what'): (219, 10.688016314679755), (b'Miss', b'Nash'): (13, 
337.6861647669101), (b'does', b'not'): (211, 13.23044396496933), (b'"', b'Oh'): (496, 20.296981145444178), (b'Oh', b'yes'): (11, 23.468344823224335), (b'very', b'entertaining'): (7, 16.05477673935618), (b'soon', b'as'): (271, 12.011817412794189), (b'Oh', b'!'): (285, 31.12744137552917), (b'have', b'seen'): (204, 13.438992082992083), (b'on', b'horseback'): (21, 54.889830696518516), (b'their', b'families'): (95, 35.2636280696419), (b'no', b'doubt'): (117, 40.19018109445808), (b'very', b'respectable'): (9, 10.88459439956351), (b'respectable', b'young'): (8, 27.705368476069587), (b'very', b'odd'): (16, 18.20644784875443), (b'perfectly', b'right'): (12, 16.999175371083012), (b'years', b'hence'): (9, 17.99121428987025), (b'young', b'woman'): (57, 30.400597455143597), (b'very', b'desirable'): (9, 14.595251581232889), (b'Dear', b'Miss'): (9, 32.27882457330758), (b'thirty', b'years'): (35, 72.53374931093936), (b'can', b'afford'): (11, 26.391976955083752), (b'good', b'luck'): (21, 51.578277957902856), (b'acquainted', b'with'): (88, 27.731238215638285), (b'your', b'own'): (181, 10.134839838816427), (b'"', b'Yes'): (349, 27.04643052838071), (b'next', b'day'): (100, 33.668880662020904), (b'an', b'opportunity'): (34, 39.4962781888654), (b'few', b'yards'): (15, 127.2018593936402), (b'Robert', b'Martin'): (31, 1963.7493893502685), (b'few', b'minutes'): (86, 316.3684419939749), (b'Only', b'think'): (9, 11.416782816581593), (b'been', b'able'): (40, 15.917912445432831), (b'-', b'morrow'): (134, 31.19170723124743), (b'should', b'happen'): (13, 20.434509648427174), (b'Do', b'you'): (187, 17.543227495018566), (b'compared', b'with'): (25, 15.313434757631585), (b'"', b'Certainly'): (39, 25.246829530691297), (b'You', b'must'): (84, 12.865027201533609), (b'an', b'old'): (158, 10.46791414565501), (b'old', b'man'): (201, 11.807281822173856), (b'more', b'valuable'): (10, 17.666198180904065), (b',"', b'replied'): (256, 68.63356681944518), (b'very', b'bad'): (36, 15.601820655800676), (b'deal', 
b'too'): (15, 12.720185939364022), (b'no', b'more'): (553, 17.351013056535997), (b'very', b'agreeable'): (21, 21.40636898580824), (b'fixed', b'on'): (31, 10.723013373773426), (b'same', b'time'): (104, 18.367692434617606), (b'pleasing', b'young'): (8, 23.351667715544366), (b'CHAPTER', b'V'): (7, 236.13211149293625), (b'very', b'differently'): (14, 48.16433021806853), (b'"', b'Perhaps'): (40, 10.879276747151517), (b'ever', b'since'): (60, 42.92864084409275), (b'twelve', b'years'): (22, 39.38985552857956), (b'very', b'neatly'): (7, 22.935395341937397), (b'ten', b'years'): (32, 36.45901703775031), (b'being', b'able'): (20, 14.369127392027657), (b'her', b'mother'): (239, 11.54378934477518), (b'have', b'spoken'): (82, 11.590388219544845), (b'Yes', b',"'): (107, 26.304976605699704), (b'"', b'Thank'): (43, 24.57613576706762), (b'Thank', b'you'): (46, 27.91929097443667), (b'"', b'Why'): (191, 10.5490954241727), (b'could', b'possibly'): (21, 30.82056265729735), (b'How', b'can'): (53, 16.088895681879897), (b'much', b'mistaken'): (9, 10.09912469789013), (b'Very', b'well'): (40, 84.54272043745728), (b'oh', b'!'): (27, 22.811112601435184), (b'look', b'at'): (154, 10.282146776177967), (b'any', b'harm'): (10, 10.323684561965813), (b'"', b'Very'): (70, 19.596720843150475), (b'an', b'angel'): (52, 25.819647520741913), (b'an', b'end'): (127, 18.271533362878365), (b'many', b'years'): (54, 19.71931776771305), (b',"', b'cried'): (297, 34.72447183030883), (b'much', b'obliged'): (40, 42.98951729507285), (b'John', b'Knightley'): (58, 175.90626358469606), (b'ill', b'-'): (100, 22.2768930345429), (b'cared', b'for'): (14, 11.00409252669039), (b'I', b'assure'): (105, 13.117682643839952), (b'assure', b'you'): (125, 32.47647355375373), (b'soon', b'afterwards'): (36, 80.81087688682625), (b'CHAPTER', b'VI'): (6, 151.79921453117328), (b'most', b'agreeable'): (13, 28.296957218027913), (b'no', b'scruple'): (10, 27.38112104843708), (b'infinitely', b'superior'): (7, 278.5720720720721), (b'am', 
b'glad'): (34, 17.03178537511871), (b'Exactly', b'so'): (9, 26.019985274008626), (b'Did', b'you'): (73, 15.110106642904366), (b'very', b'interesting'): (15, 17.838640821506868), (b'No', b'sooner'): (14, 65.68793372043619), (b'Don', b"'"): (134, 25.40609319726938), (b"'", b't'): (2200, 30.670409254097855), (b't', b'pretend'): (9, 22.21571621014818), (b'why', b'should'): (57, 22.803466076696164), (b'cannot', b'imagine'): (13, 50.341789024899015), (b'back', b'again'): (74, 19.21576017940612), (b'almost', b'every'): (37, 10.072406158544343), (b'higher', b'than'): (34, 46.27235316124205), (b'ten', b'times'): (16, 32.71691507115478), (b'dear', b'Isabella'): (6, 10.167866890269968), (b'must', b'allow'): (12, 16.270925888340134), (b'sitting', b'down'): (24, 17.45900165672385), (b'fore', b'-'): (11, 15.528688010043942), (b'must', b'confess'): (10, 12.590597413596536), (b'depended', b'on'): (14, 20.583686511194443), (b'no', b'sooner'): (26, 28.750177100858938), (b'after', b'breakfast'): (11, 12.174058544459536), (b'sooner', b'than'): (12, 16.389570366331988), (b'at', b'home'): (154, 15.749758060408496), (b'at', b'least'): (301, 41.37119228766489), (b'Upon', b'my'): (40, 19.147455857896038), (b'Will', b'you'): (82, 14.912362397847469), (b"'", b'd'): (2523, 30.547355552456107), (b'She', b'paused'): (9, 28.954070883468326), (b'replied', b'Emma'): (16, 16.331281539133734), (b'can', b'hardly'): (33, 26.42101103314215), (b'am', b'persuaded'): (12, 14.509837439249205), (b'Are', b'you'): (83, 16.465572091753142), (b'I', b'beg'): (52, 10.231792462195163), (b'beg', b'your'): (38, 44.83857997838066), (b'your', b'pardon'): (41, 42.18341802803452), (b'dear', b'Miss'): (28, 19.298196342757286), (b'little', b'while'): (54, 10.255019543477895), (b'`', b'No'): (8, 11.44562481492449), (b'entered', b'into'): (98, 58.47508652207685), (b'older', b'than'): (15, 59.83493943264058), (b'advise', b'you'): (16, 10.800315623690194), (b'run', b'away'): (46, 41.7018298679982), (b'At', b'last'): (91, 
78.82934999295966), (b'"', b'Indeed'): (53, 16.736517172263895), (b'Dear', b'me'): (17, 11.639793716121261), (b'have', b'borne'): (26, 11.407140974967064), (b'good', b'opinion'): (19, 14.834724620994052), (b'good', b'natured'): (17, 34.33179126572909), (b'thank', b'you'): (59, 16.88776624795194), (b'merely', b'because'): (10, 11.301263472594304), (b'Emma', b'felt'): (19, 16.552506003089487), (b'no', b'difficulty'): (15, 13.258227033980061), (b'protest', b'against'): (6, 10.1573458158824), (b'Let', b'us'): (117, 32.264510238685276), (b'cried', b'Emma'): (27, 14.204966402031332), (b'"', b'Has'): (17, 11.45654449291874), (b'next', b'morning'): (62, 76.77909286541964), (b'dear', b'sir'): (21, 30.139015802234486), (b'am', b'going'): (42, 10.08102475989074), (b'sat', b'down'): (150, 58.92371592771372), (b'depends', b'upon'): (8, 18.878747176262287), (b'has', b'happened'): (14, 16.88674150485437), (b'presently', b'added'): (6, 24.987070707070707), (b'could', b'afford'): (11, 16.18079539508111), (b'Certainly', b',"'): (12, 17.04952187406462), (b'stood', b'up'): (80, 10.286479413623711), (b'"', b'Nonsense'): (8, 10.024476431303897), (b'are', b'mistaken'): (17, 10.345444812472815), (b'does', b'seem'): (9, 14.179296113722343), (b'few', b'moments'): (43, 388.7952484944742), (b'nobody', b'knows'): (7, 38.60362047440699), (b'very', b'likely'): (29, 25.18396351271557), (b'all', b'probability'): (16, 13.453835276434582), (b'no', b'harm'): (25, 27.989590405069016), (b'cannot', b'help'): (16, 20.515795346592878), (b'very', b'different'): (29, 17.756435103435404), (b'common', b'sense'): (26, 145.7873644507308), (b'.--', b'She'): (78, 30.441114197863847), (b'-', b'natured'): (60, 48.04187853107345), (b'an', b'hundred'): (183, 30.382299526934904), (b'exactly', b'what'): (25, 14.38771426976121), (b'every', b'man'): (307, 12.828982038889638), (b'be', b'satisfied'): (66, 12.273087213735986), (b'less', b'than'): (85, 36.58696933460825), (b'large', b'fortune'): (9, 38.26326372776489), 
(b'no', b'use'): (40, 14.694534962661235), (b'these', b'words'): (111, 26.137791068580544), (b'well', b'acquainted'): (9, 11.682266824085007), (b'twenty', b'thousand'): (48, 77.04216497473693), (b'thousand', b'pounds'): (47, 448.5710831721469), (b'Good', b'morning'): (10, 19.263571686664424), (b'walked', b'off'): (15, 10.917066798474792), (b'cast', b'down'): (44, 15.322932013410156), (b'its', b'effects'): (7, 29.72292312498498), (b'deal', b'more'): (28, 10.737653188633582), (b'longer', b'than'): (31, 18.302452061748884), (b'perfectly', b'satisfied'): (12, 93.75833838690116), (b'three', b'hundred'): (77, 49.30380882147769), (b'looking', b'at'): (106, 12.747194191690063), (b'next', b'moment'): (21, 24.667637262918568), (b'ready', b'wit'): (8, 70.13003213003213), (b'very', b'pleasant'): (19, 11.645952038911219), (b'an', b'idea'): (37, 14.230889818929686), (b'Give', b'me'): (67, 25.719795758473396), (b'arrive', b'at'): (10, 11.926830209056545), (b'very', b'superior'): (13, 10.70318449290412), (b'pre', b'-'): (17, 49.32642073778665), (b'have', b'chosen'): (38, 12.418273092369478), (b'without', b'exception'): (8, 55.118538324420676), (b'her', b'cheeks'): (13, 10.316214846771857), (b'sit', b'down'): (54, 36.05010576005672), (b'reason', b'why'): (20, 38.7228669226916), (b'could', b'hardly'): (46, 23.719372787618898), (b'It', b'seemed'): (50, 10.045558012557422), (b'an', b'offering'): (70, 11.10916276306153), (b'let', b'us'): (282, 32.25392476944686), (b'Have', b'you'): (81, 15.50084824691241), (b'"', b'Aye'): (59, 19.5070892717265), (b'Very', b'true'): (18, 128.88708979271206), (b'can', b'easily'): (12, 15.162057467882711), (b'Nobody', b'could'): (9, 15.935631828488972), (b'dear', b'mother'): (21, 13.46066364121149), (b'those', b'things'): (85, 16.270079692293628), (b'next', b'week'): (12, 51.50149900066622), (b'Why', b'should'): (43, 13.317115021941754), (b'.--', b'Poor'): (10, 33.94982433025911), (b'taken', b'away'): (75, 33.88283563223159), (b'stay', b'longer'): (9, 
32.079572569768644), (b'three', b'days'): (97, 38.36047093343905), (b'cannot', b'bear'): (17, 21.20561660980335), (b'We', b'are'): (133, 13.053617112780595), (b'four', b'o'): (8, 13.636624231911327), (b'o', b"'"): (216, 29.052217020325397), (b"'", b'clock'): (67, 18.374166774488803), (b'ask', b'whether'): (9, 11.353068061866079), (b're', b'-'): (54, 17.206410583993414), (b'Of', b'course'): (52, 64.5306321807182), (b'ran', b'away'): (24, 15.68573713220816), (b'who', b'lived'): (27, 14.515587325296064), (b'A', b'few'): (48, 27.799197568033183), (b'.--', b'Emma'): (18, 10.09086002610704), (b'thus', b'began'): (15, 10.569989454451607), (b'Never', b'mind'): (10, 58.8196690127449), (b'good', b'fortune'): (18, 19.836146064643472), (b'Those', b'who'): (17, 18.71431093178666), (b'Jane', b'Fairfax'): (111, 897.7114059953714), (b'nothing', b'else'): (45, 34.14858386055199), (b'present', b'instance'): (6, 15.748153806977339), (b'These', b'are'): (118, 23.59039770019026), (b'once', b'more'): (124, 21.461766513970687), (b'still', b'greater'): (11, 12.724373482572735), (b'here', b'comes'): (14, 13.200933526553092), (b'turned', b'back'): (42, 28.86287494639118), (b'will', b'bring'): (144, 12.922132077825832), (b'each', b'side'): (37, 22.901925567260612), (b'waiting', b'for'): (50, 11.228665843561624), (b'still', b'remained'): (10, 13.211098151305023), (b'she', b'hoped'): (24, 14.771949542264762), (b'ten', b'minutes'): (39, 192.59908585456108), (b'most', b'favourable'): (6, 12.483951713835843), (b'ten', b'days'): (21, 17.36327679451949), (b'many', b'months'): (10, 10.650833562965003), (b'little', b'ones'): (53, 64.86319239593576), (b'-', b'tempered'): (21, 31.944729620661825), (b'passed', b'over'): (52, 23.114649934790215), (b'sir', b',"'): (108, 26.986181179154077), (b'cannot', b'deny'): (9, 34.78674185428415), (b'talking', b'about'): (24, 19.191470943716723), (b'never', b'forget'): (18, 21.690516659921762), (b'cannot', b'tell'): (30, 18.289343248058184), (b'two', b'years'): (53, 
12.946058959906635), (b'indeed', b'!--'): (19, 15.372686467521769), (b'most', b'amiable'): (8, 17.20760911907103), (b',"', b'observed'): (18, 11.71111972171562), (b'our', b'lives'): (16, 19.655161454360538), (b'think', b'differently'): (6, 12.463321241434905), (b'shake', b'hands'): (13, 44.840574981420055), (b'How', b'long'): (48, 18.446894705078492), (b'South', b'End'): (8, 1381.451973194341), (b'perfectly', b'convinced'): (8, 75.6828750917843), (b'tells', b'me'): (15, 12.933104129023622), (b'bad', b'cold'): (7, 14.30739511156867), (b'far', b'off'): (67, 27.665534339247987), (b'am', b'sorry'): (32, 26.675629043853345), (b'Ah', b'!"'): (14, 17.412606445880755), (b'an', b'interval'): (9, 11.127344698843956), (b'perfectly', b'well'): (16, 14.848259303721488), (b'He', b'paused'): (14, 28.882402391182513), (b'can', b'tell'): (51, 12.431003638264086), (b'morrow', b'morning'): (14, 22.316414535277676), (b'own', b'feelings'): (16, 10.723971700076298), (b'sore', b'throat'): (7, 129.1624895572264), (b'&', b'c'): (17, 4365.388235294118), (b'well', b'satisfied'): (17, 19.87189717498996), (b'looked', b'at'): (184, 13.030941652621292), (b'well', b'pleased'): (21, 19.25167566515881), (b'set', b'forward'): (22, 30.104688954112678), (b'eldest', b'daughter'): (10, 86.18032329988851), (b'short', b'time'): (23, 10.749540101684603), (b'Ha', b'!'): (35, 53.03328712107136), (b'"', b'Quite'): (25, 17.43387205444156), (b',"', b'continued'): (103, 41.18031481403468), (b'dining', b'-'): (20, 27.58385370205174), (b'such', b'circumstances'): (10, 10.213103979019891), (b'enter', b'into'): (108, 69.92082379259851), (b'gone', b'through'): (24, 11.359011093968116), (b'turn', b'away'): (49, 25.693197151088075), (b',"', b'repeated'): (29, 16.0233360034719), (b'several', b'times'): (18, 100.66348633961887), (b'great', b'curiosity'): (13, 12.762317494711862), (b'upper', b'end'): (11, 50.59973817705776), (b'an', b'odd'): (25, 26.958000043591028), (b'In', b'short'): (23, 17.859418769192267), 
(b'dearest', b'Emma'): (8, 41.199369337360096), (b'continued', b'Mrs'): (17, 12.8091714520948), (b'go', b'home'): (35, 10.887794606718625), (b'covered', b'with'): (53, 10.080615337595193), (b'hardly', b'knew'): (16, 30.25775488600073), (b'knew', b'how'): (29, 10.771742964262854), (b'set', b'off'): (42, 12.87641808850673), (b'can', b'get'): (31, 10.915918025964645), (b'got', b'home'): (12, 13.023221532639205), (b'most', b'extraordinary'): (16, 46.22770238588718), (b'an', b'inch'): (27, 63.920413436692506), (b'at', b'ease'): (36, 17.60627316575014), (b'tete', b'-'): (7, 10.750630160799654), (b'well', b'known'): (33, 14.650399763103346), (b'Smith', b'!--'): (9, 18.89143450635386), (b'extremely', b'sorry'): (8, 73.47101219705371), (b'Every', b'thing'): (18, 22.468754541491062), (b'many', b'weeks'): (10, 20.758257250268528), (b'Am', b'I'): (43, 15.552324542536647), (b'madam', b',"'): (13, 15.249261800405625), (b'extremely', b'well'): (16, 29.94818401937046), (b'!--', b'Such'): (10, 19.885720533004065), (b'poor', b'Harriet'): (13, 12.146024108216924), (b'-', b'headed'): (29, 37.26885122410547), (b'an', b'instant'): (95, 43.26164345230693), (b'thirty', b'thousand'): (18, 42.25669624085443), (b'so', b'easily'): (19, 10.348857779435248), (b'worth', b'having'): (9, 21.66509020844281), (b'poor', b'girl'): (11, 16.403616188855242), (b'laugh', b'at'): (34, 11.791298047589994), (b'knowing', b'what'): (21, 15.750760884791218), (b'many', b'days'): (49, 14.230461886034641), (b'whole', b'party'): (15, 21.73407276203329), (b'six', b'weeks'): (6, 18.238468797923794), (b'too', b'late'): (56, 87.81582024724356), (b'-', b'minded'): (19, 20.81504988580358), (b'her', b'companions'): (36, 11.854753785126626), (b'drew', b'near'): (34, 135.3933203484773), (b'three', b'months'): (35, 82.38812730639594), (b'other', b'side'): (133, 28.11421841631497), (b'an', b'unnatural'): (10, 19.22739708991419), (b'get', b'rid'): (18, 302.70680372001954), (b'watering', b'-'): (10, 20.552675307411103), 
(b'while', b'ago'): (10, 15.477310722473048), (b'at', b'Weymouth'): (16, 43.731710766540665), (b'present', b'occasion'): (9, 32.73631972474029), (b'No', b',"'): (86, 11.567334989477068), (b'their', b'hearts'): (49, 18.284844184258766), (b'break', b'through'): (12, 11.455134820459898), (b'burst', b'forth'): (18, 49.665727664726894), (b'young', b'men'): (141, 28.05854810702794), (b'-', b'bred'): (21, 27.951638418079096), (b'nobody', b'else'): (20, 96.33066107291948), (b'something', b'else'): (35, 38.925897096435115), (b'walking', b'together'): (9, 11.844821972381299), (b'burst', b'out'): (19, 11.102331509877693), (b'-', b'sized'): (12, 27.1752040175769), (b'how', b'long'): (57, 10.246965742926971), (b'Miss', b'Fairfax'): (125, 273.2315441060061), (b'extremely', b'happy'): (7, 19.519300571284287), (b'don', b"'"): (693, 30.893027225924477), (b'ma', b"'"): (213, 29.826951267640514), (b's', b'handwriting'): (7, 11.483028817587641), (b'Ma', b"'"): (15, 17.287522502879252), (b'without', b'seeming'): (8, 24.2521568627451), (b'Colonel', b'Campbell'): (28, 896.7839354391274), (b'those', b'days'): (84, 24.942982765152095), (b'Miss', b'Campbell'): (12, 75.31725733771769), (b'most', b'charming'): (7, 10.480354525195523), (b'caught', b'hold'): (10, 25.412146614069687), (b'four', b'months'): (9, 21.513976100607053), (b'may', b'guess'): (11, 13.367124175942939), (b'Bless', b'me'): (14, 16.960842272062408), (b'running', b'away'): (12, 11.650622091724552), (b'My', b'father'): (32, 11.189057813492585), (b'five', b'minutes'): (37, 145.59430269856685), (b'nine', b'years'): (9, 22.04328958038157), (b'hundred', b'pounds'): (11, 62.910379437794575), (b'more', b'honourable'): (10, 10.64811945150382), (b'rather', b'than'): (75, 19.03838981947655), (b'few', b'months'): (17, 59.138874943221204), (b'she', b'wished'): (31, 11.81925902403813), (b'without', b'feeling'): (12, 13.66868744277099), (b'twelve', b'thousand'): (24, 59.18646236299163), (b'passed', b'between'): (17, 19.791026625704045), 
(b",'", b'said'): (250, 30.38023579892928), (b'Miss', b'Hawkins'): (18, 356.68101153504875), (b'dear', b'Jane'): (14, 28.08747388500318), (b'three', b'minutes'): (10, 10.882525806031556), (b'have', b'suffered'): (29, 12.705290190035953), (b'hour', b'ago'): (10, 35.65917844869341), (b'looked', b'round'): (26, 11.609514648854784), (b'help', b'thinking'): (10, 32.39549502357255), (b'a', b'series'): (16, 10.46445052916564), (b'laughed', b'at'): (28, 12.564141746945062), (b'weeks', b'ago'): (7, 66.07864088043594), (b'She', b'wished'): (10, 10.48200653568184), (b'twenty', b'miles'): (8, 32.07957256976865), (b'elder', b'sister'): (6, 20.892905405405404), (b'alas', b'!'): (23, 57.08877378327094), (b'no', b'fault'): (13, 10.496096401900884), (b'driven', b'away'): (10, 15.10864307317955), (b'setting', b'off'): (8, 17.077411634756995), (b'little', b'farther'): (16, 13.37803343166175), (b'spot', b'where'): (18, 40.48947421434327), (b'front', b'door'): (15, 45.13527518483108), (b'they', b'parted'): (20, 10.44325386818452), (b'without', b'delay'): (8, 17.83246828143022), (b'six', b'months'): (21, 149.72732500075657), (b'months', b'ago'): (7, 33.90422411666347), (b'leaned', b'back'): (6, 12.550583460172502), (b'at', b'Oxford'): (10, 14.908537761320684), (b'turned', b'round'): (20, 11.2590300905922), (b'pass', b'through'): (62, 22.188217566016075), (b'clock', b'struck'): (16, 287.9462433862434), (b'four', b'hours'): (15, 47.34066169603626), (b'faster', b'than'): (10, 39.88995962176039), (b'musical', b'society'): (6, 113.41096644049148), (b'worth', b'while'): (16, 39.41555130656469), (b'mixed', b'with'): (23, 10.67000615370459), (b'extremely', b'glad'): (10, 72.79072504708098), (b'knew', b'nothing'): (25, 12.449045733530072), (b'make', b'amends'): (9, 67.18048992450166), (b'amends', b'for'): (12, 15.10365640918289), (b'oftener', b'than'): (9, 50.683713401766134), (b'old', b'woman'): (61, 19.444732663616787), (b'post', b'-'): (19, 12.228841807909605), (b'just', b'going'): (25, 
13.2603307202772), (b'At', b'least'): (17, 28.03569269825919), (b'their', b'lives'): (30, 14.906122976297906), (b'six', b'days'): (15, 14.208028157365117), (b'may', b'prove'): (9, 10.69369934075435), (b'stronger', b'than'): (29, 58.08695243797917), (b'particular', b'friend'): (10, 19.818171330419286), (b'Hum', b'!'): (7, 26.958587619877946), (b'good', b'tidings'): (14, 31.69088424528839), (b'among', b'themselves'): (30, 14.912909361688993), (b'next', b'summer'): (8, 20.514042459088895), (b'breaking', b'up'): (12, 10.278411586632634), (b'perfectly', b'safe'): (6, 16.445856823742155), (b'two', b'ladies'): (12, 10.207136726744569), (b'same', b'moment'): (29, 15.561086589572348), (b'well', b'worth'): (11, 11.682266824085007), (b',"', b'added'): (51, 22.000525888403388), (b'little', b'girls'): (15, 24.596997116436313), (b'be', b'ashamed'): (88, 15.575445327520244), (b'been', b'staying'): (9, 13.958784759841098), (b'shut', b'up'): (64, 33.02129026711253), (b'too', b'large'): (19, 13.75905031306614), (b'At', b'first'): (26, 13.6225491635793), (b'worse', b'than'): (55, 50.56473754871035), (b'opposite', b'side'): (9, 21.485013505649793), (b'short', b'pause'): (11, 86.62091182855941), (b'large', b'party'): (8, 14.159924899255097), (b'six', b'years'): (19, 24.750919080861966), (b'who', b'knows'): (23, 17.15478502080444), (b'extremely', b'fond'): (6, 34.254458845685164), (b'or', b'twice'): (16, 10.480088120657514), (b'somebody', b'else'): (14, 146.9730657512543), (b'five', b'couple'): (7, 31.78281426662555), (b'"', b'Don'): (54, 12.435426459085848), (b'bad', b'news'): (7, 32.31086729362592), (b'baked', b'apples'): (6, 613.5218253968253), (b'will', b'send'): (73, 16.24812706949644), (b'William', b'Larkins'): (13, 5074.297435897436), (b'low', b'voice'): (39, 55.284472898891764), (b'one', b'leg'): (17, 10.74596003475239), (b'an', b'immediate'): (12, 14.761679056127669), (b'Tell', b'me'): (40, 25.093033554681703), (b',"', b'resumed'): (18, 28.980058972381027), (b'many', b'times'): 
(18, 11.52332014677216), (b'Nothing', b'can'): (12, 15.51466345550789), (b'few', b'words'): (18, 11.709874520256639), (b'no', b'objection'): (17, 30.845671058647493), (b'It', b'seems'): (24, 18.2281699492411), (b'astonished', b'at'): (22, 13.98318024510078), (b'four', b'times'): (13, 17.904877713359248), (b'other', b'end'): (40, 10.241169643435896), (b'few', b'hours'): (23, 78.0796666877091), (b'an', b'extraordinary'): (13, 10.356142591003286), (b'look', b'forward'): (11, 11.731760911835845), (b'Alas', b'!'): (16, 24.207711332135293), (b'immediately', b'followed'): (7, 12.604814218453825), (b'wait', b'till'): (17, 29.399581656260896), (b'-', b'bye'): (37, 39.93091202582728), (b'contrast', b'between'): (12, 166.24462365591398), (b'dared', b'not'): (22, 11.783553500216318), (b'three', b'weeks'): (9, 21.40970383064167), (b'-', b'sighted'): (11, 32.25189048239896), (b'Maple', b'Grove'): (31, 16731.716961498438), (b'My', b'brother'): (17, 11.063412365232326), (b'at', b'Maple'): (10, 11.542093750699882), (b'almost', b'fancy'): (9, 16.725625422582826), (b'left', b'behind'): (27, 29.19181841393264), (b'barouche', b'-'): (7, 13.975819209039548), (b'-', b'landau'): (7, 19.96545601291364), (b'whose', b'name'): (60, 31.461202630580967), (b'most', b'serious'): (8, 10.186904598490049), (b'We', b'cannot'): (19, 11.868641936045467), (b'waited', b'for'): (38, 10.711948477309232), (b'E', b'.,'): (6, 566.3278388278388), (b'person', b'who'): (30, 10.40945693009978), (b'greater', b'part'): (10, 16.18384415693171), (b'drew', b'back'): (11, 15.195773696172985), (b'Her', b'manners'): (6, 10.694300338936156), (b'third', b'time'): (23, 13.583823987016235), (b'very', b'extraordinary'): (12, 11.127072987672598), (b'better', b'acquainted'): (7, 13.449978251413658), (b'According', b'to'): (45, 10.652714079624486), (b'have', b'committed'): (34, 16.773728020950244), (b'hardly', b'less'): (8, 13.000147148472811), (b'will', b'shew'): (48, 12.349874144320705), (b'little', b'boys'): (17, 
17.749724946185125), (b'easily', b'believe'): (8, 21.824887069452284), (b'my', b'lord'): (180, 36.09264842223215), (b'"', b'Excuse'): (11, 17.184816739378107), (b'Excuse', b'me'): (13, 37.690760604583126), (b'put', b'forth'): (36, 10.696233535526662), (b'drawing', b'near'): (8, 18.702897235831365), (b'great', b'joy'): (24, 10.008185299508712), (b'eight', b'o'): (9, 65.15276021913189), (b'spread', b'abroad'): (14, 194.3118977796397), (b'few', b'lines'): (10, 43.57841479226563), (b'good', b'news'): (18, 24.79518258080434), (b'most', b'likely'): (12, 19.419480443744643), (b'talk', b'about'): (35, 21.824887069452284), (b'tells', b'us'): (12, 30.44567755366135), (b'dear', b'madam'): (10, 68.52258121703674), (b'eleven', b'years'): (6, 16.99170238487746), (b'your', b'sister'): (79, 15.965251961999174), (b'two', b'hours'): (17, 15.078877429107843), (b'two', b'months'): (19, 19.98674940210717), (b'door', b'opened'): (19, 32.960457440450135), (b'Who', b'can'): (30, 10.97976511828241), (b'began', b'talking'): (9, 13.575011249766773), (b'mean', b'?"'): (59, 26.41782366663845), (b'In', b'spite'): (9, 11.90627917946151), (b'many', b'hours'): (12, 13.124575551782682), (b'few', b'steps'): (8, 22.235285657785926), (b'most', b'excellent'): (10, 12.940681654585935), (b'later', b'than'): (8, 11.966987886528115), (b'whole', b'story'): (18, 36.39536252354049), (b'whole', b'history'): (8, 19.2441498630819), (b'lined', b'with'): (12, 13.540300206747927), (b'-', b'plaister'): (9, 17.469774011299435), (b'Lord', b'bless'): (12, 16.7014274691358), (b'these', b'things'): (325, 42.22085680150196), (b'laid', b'down'): (26, 12.408697820671941), (b'forty', b'years'): (63, 161.26670263465516), (b'faint', b'smile'): (6, 24.271193092621665), (b'turned', b'towards'): (11, 11.117376349756114), (b'totally', b'different'): (7, 158.32821300563236), (b'Box', b'Hill'): (18, 8589.305555555555), (b'some', b'surprise'): (19, 22.39760968543046), (b'may', b'depend'): (9, 21.16461327857632), (b',"', 
b'interrupted'): (25, 30.235605293907703), (b'whatever', b'else'): (8, 18.739735159540622), (b'mid', b'-'): (22, 25.824883321051338), (b'larger', b'than'): (11, 19.006392525662303), (b'were', b'assembled'): (17, 18.151746404461402), (b'insisted', b'on'): (9, 11.61131033964815), (b'clothed', b'with'): (36, 12.659106066308777), (b'twenty', b'minutes'): (7, 11.181261808550069), (b'quite', b'alone'): (14, 11.341420176217916), (b'etc', b'.,'): (11, 3964.2948717948716), (b'As', b'soon'): (46, 20.83462134191184), (b'without', b'knowing'): (18, 34.56996044031647), (b',"', b'whispered'): (18, 26.71599186516376), (b'shan', b"'"): (18, 22.473779253743025), (b'looking', b'round'): (23, 17.251931821351857), (b'Pardon', b'me'): (7, 10.14751247046469), (b',"', b'answered'): (143, 22.57516648996616), (b'An', b'old'): (15, 19.494934210941096), (b'Shall', b'we'): (25, 13.344625941350367), (b'old', b'age'): (36, 48.23121704303023), (b'an', b'infant'): (9, 23.772054583893908), (b'be', b'forgiven'): (36, 18.33341939975366), (b'lie', b'down'): (41, 31.292094007783852), (b'four', b'miles'): (7, 16.3062279175236), (b'great', b'hurry'): (16, 19.653968941856267), (b'without', b'waiting'): (12, 19.24774354186119), (b'comes', b'back'): (13, 15.420512101235838), (b'heightened', b'by'): (7, 11.875072007373555), (b'In', b'fact'): (28, 39.68320704393532), (b'cut', b'off'): (213, 155.50163439778729), (b'never', b'mind'): (29, 11.197398746902147), (b'trembling', b'voice'): (7, 11.95022270316229), (b'More', b'than'): (14, 24.23315047021944), (b'time', b'past'): (23, 13.326716908397632), (b'second', b'time'): (44, 20.945614179827096), (b'five', b'hundred'): (65, 85.8882839842231), (b'turning', b'away'): (13, 10.732346458879267), (b'an', b'arrow'): (10, 14.857534114933694), (b'--', b'oh'): (17, 12.811006767021128), (b'presented', b'themselves'): (6, 11.894714571472534), (b'at', b'random'): (11, 18.668082066349374), (b'far', b'distant'): (10, 26.63579981049186), (b'few', b'seconds'): (9, 
96.54294969363461), (b'passing', b'through'): (10, 15.455340630779228), (b'will', b'heal'): (10, 10.577600656791981), (b'rose', b'early'): (12, 50.25143069404622), (b'east', b'wind'): (22, 148.38828510938603), (b'gone', b'mad'): (10, 20.604717798360767), (b'freed', b'from'): (10, 25.54271506220159), (b'sinned', b'against'): (43, 81.41747505543238), (b'locked', b'up'): (11, 13.137819321259757), (b'deep', b'sigh'): (7, 47.76258881680568), (b'ten', b'thousand'): (75, 127.07863651308065), (b'happier', b'than'): (10, 23.413671951902835), (b'contend', b'with'): (14, 10.67000615370459), (b'had', b'formerly'): (10, 11.686041677689511), (b'little', b'boy'): (63, 26.45202064896755), (b'fancying', b'herself'): (6, 29.03972577009767), (b'right', b'hand'): (196, 45.67341533298018), (b'surrounded', b'by'): (18, 30.407381352214102), (b'infinitely', b'more'): (8, 12.605071134482898), (b'such', b'cases'): (8, 14.91498581087056), (b'No', b'wonder'): (13, 19.253810919251713), (b'poor', b'fellow'): (31, 72.63322416713721), (b'Poor', b'fellow'): (7, 45.43103764921947), (b'days', b'ago'): (11, 15.442862018162295), (b'help', b'laughing'): (7, 20.6971218206158), (b'draw', b'near'): (18, 83.03474416971349), (b'at', b'intervals'): (30, 31.386395286990908), (b'into', b'temptation'): (8, 11.801423582619314), (b'stood', b'before'): (56, 10.39258282946439), (b'Sir', b'Walter'): (136, 1001.3265848443274), (b'Walter', b'Elliot'): (16, 158.52745152870995), (b'Kellynch', b'Hall'): (24, 4945.357744107744), (b'arising', b'from'): (7, 10.217086024880635), (b'Charles', b'Musgrove'): (14, 248.92084078711986), (b'first', b'year'): (71, 36.52590150555186), (b'Lady', b'Elliot'): (12, 34.95647609819121), (b'seventeen', b'years'): (7, 50.975107154632376), (b'an', b'awful'): (13, 15.611498532706445), (b'Lady', b'Russell'): (147, 1370.6424223505542), (b'Anne', b'Elliot'): (23, 69.51776079136691), (b'Miss', b'Elliot'): (48, 81.92993320516614), (b'everybody', b'else'): (20, 116.64529027877325), (b'her', 
b'mistress'): (30, 10.118550146240644), (b'Mr', b'Elliot'): (174, 154.42474881796687), (b'Mr', b'Shepherd'): (26, 153.51099290780144), (b'anybody', b'else'): (21, 167.7979955569876), (b'reference', b'to'): (30, 10.087797423886824), (b'an', b'honest'): (28, 22.11150665340132), (b'descend', b'into'): (11, 19.585341264772477), (b'Mrs', b'Clay'): (66, 287.0487212850306), (b'Miss', b'Anne'): (19, 13.817194691451805), (b'their', b'fathers'): (151, 21.038778684865168), (b'an', b'example'): (14, 17.829040937920432), (b'Admiral', b'Croft'): (14, 1020.8859134262656), (b'Mrs', b'Croft'): (41, 207.3760688537417), (b'walked', b'along'): (8, 11.42244112667385), (b'Frederick', b'Wentworth'): (6, 23.25274477365017), (b'either', b'side'): (18, 19.36359410488185), (b'Captain', b'Wentworth'): (196, 976.2801057938673), (b'eldest', b'son'): (15, 39.75891221190009), (b'removed', b'from'): (36, 14.303920434832891), (b'good', b'humour'): (23, 57.21965210954848), (b'The', b'Crofts'): (8, 10.24976796605675), (b'startled', b'by'): (14, 14.177381886354143), (b'most', b'important'): (14, 33.80609933127228), (b'replied', b'Anne'): (11, 13.874646644430818), (b'at', b'Uppercross'): (20, 13.940450893702454), (b'Great', b'House'): (13, 1177.9619047619049), (b'left', b'alone'): (16, 14.135954084898053), (b'Mr', b'Musgrove'): (21, 32.3891325695581), (b'Miss', b'Musgroves'): (22, 227.52634882160712), (b'Mrs', b'Musgrove'): (66, 156.77276316336287), (b'flower', b'-'): (23, 13.524986331328593), (b'grown', b'up'): (19, 12.21909069739544), (b'their', b'faces'): (62, 22.65730692397282), (b'surprised', b'at'): (27, 14.056621317816642), (b'ere', b'long'): (20, 33.88286215209292), (b'anything', b'else'): (30, 74.81176994319226), (b'quite', b'different'): (12, 19.349519727167486), (b'their', b'sakes'): (13, 16.878317708546554), (b'twentieth', b'year'): (13, 185.54755475547557), (b'on', b'board'): (69, 34.25748298789808), (b'eight', b'years'): (22, 61.898344402053596), (b'-', b'bone'): (22, 13.4993708269132), 
(b'their', b'heads'): (77, 21.621494582846132), (b'Your', b'sister'): (11, 24.014833799316555), (b'dressing', b'-'): (19, 29.64567711008389), (b'up', b'stairs'): (15, 14.512707389763687), (b'waited', b'till'): (7, 12.054695723363611), (b'third', b'part'): (39, 80.67984559777145), (b'Phoo', b'!'): (7, 23.96318899544706), (b'dear', b'fellow'): (11, 20.631526271893247), (b'good', b'cheer'): (14, 58.85449931267844), (b'Mrs', b'Harville'): (24, 84.64015847289754), (b'"', b'Ay'): (34, 18.45776612748019), (b'fifteen', b'years'): (9, 52.059683902603275), (b'Charles', b'Hayter'): (33, 2649.332925336598), (b'came', b'near'): (42, 11.627447632578933), (b'Her', b'husband'): (9, 21.944148747427437), (b'two', b'hundred'): (102, 34.52951381693766), (b'Dr', b'Shirley'): (9, 1086.3943785682916), (b'went', b'up'): (206, 10.893037774183895), (b'within', b'reach'): (7, 18.479352178330245), (b'-', b'yard'): (19, 16.03782532184866), (b'turn', b'back'): (15, 10.596177405398922), (b'walking', b'along'): (8, 19.124729409339242), (b'leaning', b'against'): (12, 23.700473570392266), (b'trodden', b'under'): (9, 72.2464953271028), (b'under', b'foot'): (15, 22.229690869877782), (b'Louisa', b'Musgrove'): (15, 189.5280416794361), (b'provoke', b'me'): (18, 16.48970776450512), (b'Very', b'good'): (11, 10.325350756610252), (b'good', b'humoured'): (9, 26.157555250079305), (b'Captain', b'Harville'): (37, 475.4296696696697), (b'at', b'Lyme'): (24, 20.29341259451412), (b'earnest', b'desire'): (6, 32.79056203605514), (b'Captain', b'Benwick'): (56, 811.83861003861), (b'an', b'officer'): (9, 10.895525017618041), (b'place', b'where'): (114, 29.883792170944712), (b'-', b'coat'): (34, 13.074153453617642), (b'an', b'introduction'): (7, 10.895525017618041), (b'preceding', b'evening'): (6, 48.94191199746755), (b'an', b'agony'): (10, 15.203058164118199), (b'catching', b'hold'): (10, 135.53144860837168), (b'raised', b'up'): (35, 20.917757019882853), (b'could', b'scarcely'): (17, 17.384325631078877), (b'passed', 
b'along'): (11, 16.544408774745854), (b'leaning', b'over'): (16, 33.66105049605383), (b't', b'talk'): (19, 11.570685526118844), (b'Camden', b'Place'): (29, 11505.67441860465), (b'straight', b'forward'): (6, 11.997865942380443), (b'same', b'hour'): (17, 12.921871463147081), (b'-', b'glasses'): (16, 16.354682053131388), (b'poring', b'over'): (6, 41.31128924515698), (b'thirty', b'feet'): (8, 12.120929017084244), (b'Colonel', b'Wallis'): (23, 967.3885461023725), (b'Mrs', b'Wallis'): (11, 54.17933330413071), (b'-', b'haired'): (38, 60.68447814451383), (b'at', b'length'): (74, 17.89024531358482), (b'carried', b'away'): (73, 68.93872057625379), (b'greater', b'than'): (56, 48.18287227996847), (b'Miss', b'Carteret'): (12, 320.09834368530016), (b'contact', b'with'): (11, 11.02567302549474), (b'Lady', b'Dalrymple'): (25, 1027.2923588039866), (b'Laura', b'Place'): (7, 777.4104336895035), (b'be', b'established'): (41, 13.256300819785361), (b'Mrs', b'Smith'): (64, 112.00140587397476), (b'Westgate', b'Buildings'): (7, 8589.305555555555), (b'buried', b'him'): (40, 10.212164360501543), (b'at', b'liberty'): (25, 14.03156495183123), (b'human', b'nature'): (9, 43.511573911208046), (b'five', b'thousand'): (31, 37.911149464312665), (b'whose', b'names'): (9, 20.291362480518416), (b'her', b'ladyship'): (21, 34.12286449316844), (b'-', b'maker'): (21, 26.620608017218185), (b'old', b'gentleman'): (31, 27.054706126823717), (b'almost', b'entirely'): (13, 36.1061120233534), (b'lower', b'part'): (8, 15.927696983224877), (b'staring', b'at'): (33, 25.046343439018745), (b'an', b'oath'): (37, 44.98797426629384), (b'wiser', b'than'): (8, 29.3735157214781), (b'prejudice', b'against'): (7, 39.17833386126069), (b'both', b'sides'): (30, 99.67218081951572), (b'my', b'soul'): (234, 16.443748679233014), (b'rejoice', b'over'): (13, 10.117050427385381), (b'same', b'instant'): (19, 25.16281097419205), (b'every', b'one'): (375, 14.671605951506955), (b'their', b'seats'): (11, 12.658738281409915), (b'their', 
b'mouths'): (24, 26.497528436510585), (b'short', b'silence'): (9, 19.427323846323), (b'-', b'blooded'): (7, 17.469774011299435), (b'general', b'character'): (6, 10.833683694205032), (b'fifty', b'pounds'): (6, 35.38131472052177), (b'be', b'saved'): (61, 13.40991848436365), (b'threw', b'himself'): (8, 10.030058440961655), (b'some', b'moments'): (13, 21.006453804347828), (b'exclaimed', b'Mrs'): (11, 12.149305043956582), (b'compassion', b'on'): (20, 13.012675380640166), (b'an', b'explanation'): (12, 16.343287526427062), (b'our', b'hearts'): (17, 14.94442027934851), (b'minutes', b'afterwards'): (7, 22.217312424781305), (b'make', b'haste'): (26, 50.38536744337624), (b"'", b'n'): (26, 22.53339140030468), (b'n', b"'"): (20, 16.0952795716462), (b'rising', b'sun'): (7, 13.065928609910946), (b'-', b'faced'): (30, 42.60920490560838), (b'an', b'atonement'): (66, 89.61263272917309), (b'atonement', b'for'): (64, 24.316159515907604), (b'"', b'Look'): (42, 10.092670148523652), (b'Look', b'here'): (14, 26.312064784218066), ...}
# Whitespace-tokenize a short sample sentence and display the token list.
sample_text = "Jon lives in New York City"
tokenized_sentence = sample_text.split()
tokenized_sentence
['Jon', 'lives', 'in', 'New', 'York', 'City']
# Index `bigram` (defined earlier in the notebook) with the token list; in the
# output below, 'New' and 'York' come back joined as the single token
# 'New_York'.
bigram[tokenized_sentence]
['Jon', 'lives', 'in', 'New_York', 'City']
# Following Maas et al. (2011):
# - keep stop words (they can carry sentiment);
# - do not stem (the model learns similar representations for words that
#   share a stem).
#
# Lowercase every token and drop tokens that are exactly one punctuation
# character. Multi-character punctuation tokens such as ',"' or '--' are not
# members of string.punctuation and are therefore kept.
punctuation_chars = set(string.punctuation)  # built once; O(1) membership test
lower_sents = []
for s in gberg_sents:
    # Lowercase each token once, then filter on the lowercased form.
    lower_sents.append(
        [lw for lw in (w.lower() for w in s) if lw not in punctuation_chars]
    )
# Preview the first five cleaned sentences.
lower_sents[0:5]
[['emma', 'by', 'jane', 'austen', '1816'], ['volume', 'i'], ['chapter', 'i'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', 'and', 'had', 'lived', 'nearly', 'twenty', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her'], ['she', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', 'indulgent', 'father', 'and', 'had', 'in', 'consequence', 'of', 'her', 'sister', 's', 'marriage', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period']]
# Build a Phrases model from the lowercased, punctuation-filtered sentences
# and wrap it in a Phraser.
lower_bigram = Phraser(Phrases(lower_sents))
# Display the detected phrases. In the output below each key is a pair of
# byte-string tokens and each value is a 2-tuple (presumably count and score;
# confirm against the gensim documentation for the installed version).
lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston
{(b'two', b'daughters'): (19, 11.080894472729938), (b'her', b'sister'): (201, 16.93985297075414), (b'very', b'early'): (25, 10.517085686126077), (b'her', b'mother'): (253, 10.708214678014947), (b'long', b'ago'): (38, 59.22693146255728), (b'more', b'than'): (562, 28.530162383333433), (b'had', b'been'): (1260, 21.58337149316804), (b'an', b'excellent'): (58, 37.41890603576534), (b'sixteen', b'years'): (15, 131.43021613989356), (b'miss', b'taylor'): (48, 420.4375727213963), (b'mr', b'woodhouse'): (132, 104.1999395195192), (b'very', b'fond'): (30, 24.18592621729314), (b'passed', b'away'): (25, 11.75157033589844), (b'too', b'much'): (177, 30.363341094363626), (b'did', b'not'): (977, 10.846285856844784), (b'any', b'means'): (28, 14.29426622702947), (b'after', b'dinner'): (22, 18.607525024015455), (b'mr', b'weston'): (162, 91.63366549621855), (b'five', b'years'): (42, 37.66459722425521), (b'years', b'old'): (176, 48.59949606902839), (b'seven', b'years'): (53, 50.334976394001785), (b'each', b'other'): (239, 71.31335962645632), (b'well', b'informed'): (8, 14.185145241835274), (b'a', b'mile'): (49, 11.700207443348628), (b'difference', b'between'): (44, 207.8695602382043), (b'mrs', b'weston'): (249, 180.67939002300875), (b'could', b'not'): (1059, 10.213417567175872), (b'having', b'been'): (49, 10.723839064161645), (b'sixteen', b'miles'): (6, 105.04149305555556), (b'miles', b'off'): (16, 32.99209331376903), (b'at', b'hartfield'): (67, 25.556203673424893), (b'her', b'husband'): (168, 26.67864790728878), (b'in', b'spite'): (105, 13.346546665784308), (b'emma', b'could'): (61, 10.886178015450438), (b'every', b'body'): (148, 39.26143302367287), (b'no', b'means'): (80, 26.766268123208324), (b'able', b'to'): (349, 10.854560919017272), (b'very', b'much'): (241, 15.432046093862464), (b'have', b'been'): (986, 17.20636681938197), (b'great', b'deal'): (182, 110.17005431763471), (b'agree', b'with'): (26, 13.12659190346559), (b'good', b'humoured'): (30, 149.07578968117085), (b'for', 
b'ever'): (565, 10.477931089592536), (b'three', b'times'): (41, 38.14473048229484), (b'my', b'dear'): (340, 26.34347480365353), (b'last', b'night'): (70, 23.230729926576135), (b'doubt', b'whether'): (12, 19.56454034377786), (b'anywhere', b'else'): (6, 15.306592794980771), (b'i', b'am'): (2445, 16.33041776770322), (b'very', b'glad'): (46, 16.952677708033637), (b'am', b'sure'): (282, 60.92117707314396), (b'very', b'pretty'): (40, 18.02800780155912), (b'be', b'able'): (121, 10.915362249869167), (b'immediately', b'afterwards'): (10, 37.531108492029034), (b'mr', b'knightley'): (277, 179.5673552402442), (b'sensible', b'man'): (17, 13.46925645592164), (b'intimate', b'friend'): (6, 20.194893190921228), (b'connected', b'with'): (31, 16.865252849915358), (b'elder', b'brother'): (6, 14.418048803736536), (b'than', b'usual'): (30, 27.96507779799145), (b'brunswick', b'square'): (11, 2374.2537606278615), (b'some', b'time'): (149, 11.67826679527302), (b'poor', b'isabella'): (11, 43.037237258598005), (b'am', b'afraid'): (65, 24.404507887593226), (b'moonlight', b'night'): (6, 13.233573928258966), (b'look', b'at'): (188, 10.167753397802551), (b'vast', b'deal'): (11, 58.66570782159018), (b'an', b'hour'): (155, 40.46495358980144), (b'pretty', b'well'): (22, 13.99149138187848), (b'tolerably', b'well'): (7, 13.77985537778284), (b'miss', b'woodhouse'): (173, 272.89862807742907), (b'you', b'please'): (94, 10.458101345278243), (b'any', b'rate'): (47, 81.39512045124775), (b'very', b'true'): (50, 13.110826412772036), (b',"', b'said'): (2585, 35.20909116868378), (b'my', b'dearest'): (20, 15.98957177137179), (b'so', b'much'): (501, 16.68921990823078), (b'much', b'less'): (40, 18.957027837059744), (b'any', b'body'): (93, 20.814200280371566), (b'has', b'been'): (266, 28.015388177187834), (b'been', b'used'): (29, 13.604916942276521), (b'dear', b'emma'): (33, 26.724674718257425), (b'every', b'thing'): (258, 26.81380543889372), (b'very', b'sorry'): (34, 20.451800038697975), (b'turned', b'away'): 
(50, 18.475990225490946), (b'divided', b'between'): (10, 32.86826379834854), (b'how', b'much'): (142, 13.223254805593912), (b'four', b'years'): (23, 17.088037732474124), (b'years', b'ago'): (56, 157.9226942623328), (b'any', b'thing'): (384, 34.60382364241243), (b'oh', b'dear'): (22, 13.071118312364975), (b'need', b'not'): (108, 12.811139000583829), (b'ever', b'since'): (68, 42.54674314875834), (b'leave', b'off'): (19, 10.45663799403504), (b'match', b'making'): (6, 19.51486904915495), (b'young', b'lady'): (73, 46.68733304737199), (b'depend', b'upon'): (45, 79.94120406154416), (b'more', b'likely'): (16, 10.639357811071253), (b'have', b'done'): (272, 12.079650113150391), (b',"', b'rejoined'): (6, 10.723078831702821), (b'mr', b'elton'): (214, 139.40990783410138), (b'any', b'longer'): (32, 15.647155677008353), (b'very', b'well'): (211, 12.391214426766558), (b'young', b'man'): (266, 24.286534956791346), (b'dine', b'with'): (23, 13.166490359099223), (b'much', b'better'): (40, 10.497942188943838), (b'i', b'dare'): (138, 13.033593291934318), (b'dare', b'say'): (115, 119.19858224619772), (b'take', b'care'): (71, 74.57188636223579), (b'chapter', b'ii'): (11, 279.3347183748846), (b'entering', b'into'): (14, 14.76425085407516), (b'never', b'seen'): (42, 13.0265069565268), (b'mrs', b'churchill'): (59, 72.7025861493478), (b'refrain', b'from'): (10, 13.332723666813575), (b'at', b'once'): (270, 18.77244440167035), (b'three', b'years'): (80, 36.12190386580951), (b'mother', b's'): (212, 10.433577049626086), (b'twenty', b'years'): (71, 82.70079278294149), (b'according', b'to'): (792, 11.428318841579392), (b'had', b'begun'): (25, 11.498991764971958), (b'passed', b'through'): (45, 29.981665287433785), (b'its', b'being'): (58, 15.03567583737666), (b'deal', b'better'): (14, 18.324643289810204), (b'belonging', b'to'): (36, 10.00745855749289), (b'mr', b'frank'): (50, 51.13428151809727), (b'frank', b'churchill'): (151, 1615.1483580779638), (b'mrs', b'perry'): (11, 23.55291278198416), 
(b'miss', b'bates'): (113, 368.53088940274097), (b'a', b'few'): (452, 11.993753858188938), (b'few', b'days'): (53, 34.43918034342092), (b'i', b'suppose'): (210, 11.32108319421537), (b'very', b'handsome'): (21, 18.293355102534452), (b'an', b'irresistible'): (7, 10.743738402393657), (b'good', b'sense'): (28, 15.484646541076454), (b'had', b'already'): (64, 11.161440716387299), (b'long', b'enough'): (39, 14.424680952514514), (b'at', b'randalls'): (39, 24.915008599181263), (b'few', b'weeks'): (19, 130.1719018932874), (b'no', b'longer'): (117, 37.27195664059191), (b'mr', b'perry'): (36, 95.91613823715916), (b'chapter', b'iii'): (10, 294.8533138401559), (b'donwell', b'abbey'): (9, 737.1781906792567), (b'card', b'table'): (7, 52.05532134560785), (b'drawing', b'room'): (49, 219.78168548972988), (b'thrown', b'away'): (11, 13.99171343117908), (b'mrs', b'goddard'): (58, 292.68153482471274), (b'an', b'invitation'): (13, 13.178985773602884), (b'mrs', b'bates'): (30, 54.66699555102586), (b'those', b'who'): (174, 14.383367684078921), (b'as', b'possible'): (81, 10.51537185153746), (b'young', b'ladies'): (47, 111.39299334578259), (b'old', b'fashioned'): (38, 181.02950323229942), (b'coming', b'back'): (15, 10.421088167057363), (b'goddard', b's'): (34, 30.051743214173165), (b'found', b'herself'): (27, 10.853841368001415), (b's', b'sake'): (143, 27.305132602365575), (b'much', b'pleased'): (18, 12.52916167550209), (b'miss', b'smith'): (58, 148.87908909420122), (b'harriet', b'smith'): (31, 171.76221256523925), (b'several', b'years'): (10, 16.07162969101959), (b'pretty', b'girl'): (10, 36.10104059762763), (b'blue', b'eyes'): (28, 33.954794112767054), (b'due', b'time'): (18, 19.500789650495978), (b'its', b'own'): (54, 10.339621687357946), (b'an', b'egg'): (7, 16.473732217003608), (b'better', b'than'): (175, 40.469371165789006), (b'body', b'else'): (31, 37.603598047511824), (b'much', b'more'): (163, 10.196819531234667), (b'little', b'girl'): (54, 33.82072690767634), (b'at', b'last'): (512, 
25.084326428238846), (b'chapter', b'iv'): (8, 252.7314118629908), (b'every', b'respect'): (14, 10.883823423596287), (b'guided', b'by'): (14, 22.059530561317086), (b'different', b'sort'): (8, 13.936709152334153), (b'abbey', b'mill'): (11, 1868.3654143077713), (b'good', b'deal'): (62, 34.98896475458069), (b'very', b'happy'): (45, 10.60950295929063), (b'mrs', b'martin'): (8, 10.98253798261059), (b'drink', b'tea'): (7, 29.382235819735826), (b'large', b'enough'): (11, 10.328422669853191), (b'had', b'taken'): (121, 10.546982214197534), (b'mr', b'martin'): (37, 92.33536178249174), (b'three', b'miles'): (9, 15.396961522801304), (b'thing', b'else'): (26, 11.724198736581018), (b'very', b'obliging'): (14, 23.82950204145935), (b'on', b'purpose'): (36, 10.243987813963042), (b'very', b'clever'): (15, 19.348740973834513), (b'miss', b'nash'): (13, 312.8837750484809), (b'does', b'not'): (218, 11.75597879763697), (b'oh', b'yes'): (33, 23.312359983486996), (b'very', b'entertaining'): (7, 15.092017959590923), (b'soon', b'as'): (277, 10.479021912326246), (b'have', b'seen'): (204, 12.72963685420248), (b'on', b'horseback'): (21, 51.189898049832905), (b'their', b'families'): (95, 33.21834852311409), (b'no', b'doubt'): (125, 34.594842365372315), (b'very', b'respectable'): (9, 10.061345306393948), (b'respectable', b'young'): (8, 26.124309153713302), (b'very', b'odd'): (20, 23.338172102460184), (b'perfectly', b'right'): (12, 15.18214895111914), (b'six', b'years'): (23, 29.166705499538022), (b'years', b'hence'): (10, 17.922302200894578), (b'young', b'woman'): (57, 28.12555022760024), (b'very', b'desirable'): (9, 13.720016326900836), (b'dear', b'miss'): (39, 23.615886026541766), (b'thirty', b'years'): (36, 70.51736596736596), (b'can', b'afford'): (11, 24.000753694092754), (b'good', b'luck'): (24, 50.83866673742493), (b'acquainted', b'with'): (88, 25.94064590446771), (b'harriet', b's'): (91, 10.391341493029481), (b'next', b'day'): (103, 31.5528616052661), (b'an', b'opportunity'): (36, 
39.48600763353957), (b'few', b'yards'): (15, 121.4937751004016), (b'robert', b'martin'): (31, 1822.1955287848953), (b'few', b'minutes'): (86, 306.2550554916762), (b'been', b'able'): (40, 15.56410454288213), (b'should', b'happen'): (13, 19.568675965231453), (b'compared', b'with'): (28, 15.20617102370327), (b'well', b'bred'): (15, 56.080806770046436), (b'an', b'old'): (175, 10.0932285327629), (b'old', b'man'): (225, 11.391586430765056), (b'more', b'valuable'): (10, 16.9262510630679), (b',"', b'replied'): (256, 67.14742919145301), (b'very', b'bad'): (37, 14.502840081288573), (b'deal', b'too'): (15, 11.983343236284412), (b'no', b'more'): (597, 15.083646516961776), (b'good', b'humour'): (28, 64.86811388829325), (b'very', b'agreeable'): (21, 20.122690612787896), (b'fixed', b'on'): (32, 10.217338522043262), (b'same', b'time'): (104, 17.450627768409742), (b'pleasing', b'young'): (8, 22.39226498889711), (b'chapter', b'v'): (7, 186.22314558325638), (b'very', b'differently'): (14, 45.276053878772764), (b'twelve', b'years'): (25, 44.13701288280007), (b'very', b'neatly'): (7, 21.56002565655846), (b'ten', b'years'): (32, 33.47750788468987), (b'being', b'able'): (20, 13.768660452584244), (b'have', b'spoken'): (82, 11.097296183547952), (b'yes', b',"'): (117, 21.319257322202063), (b'thank', b'you'): (105, 18.729503350352864), (b'could', b'possibly'): (21, 29.277733420435023), (b'grown', b'up'): (21, 13.110608209041132), (b'any', b'harm'): (11, 11.883432028204146), (b'an', b'angel'): (58, 20.85448584795839), (b'excuse', b'me'): (31, 17.30538907663604), (b'an', b'end'): (129, 17.098851519880977), (b'many', b'years'): (55, 18.586925601179164), (b',"', b'cried'): (297, 33.94188638327614), (b'much', b'obliged'): (41, 42.51058597592393), (b'mrs', b'john'): (39, 22.746858271991545), (b'john', b'knightley'): (58, 169.27026599029787), (b'be', b'satisfied'): (68, 11.946162525033142), (b'ill', b'humour'): (6, 26.632582093494147), (b'i', b'assure'): (105, 12.733360622358129), (b'assure', 
b'you'): (126, 28.43663511862174), (b'soon', b'afterwards'): (38, 78.59659889385321), (b'chapter', b'vi'): (6, 126.3657059314954), (b'most', b'agreeable'): (13, 26.484526154519585), (b'no', b'scruple'): (10, 22.499181900667867), (b'infinitely', b'superior'): (7, 270.28769265132905), (b'am', b'glad'): (34, 16.093602872722435), (b'very', b'interesting'): (15, 16.768908843989912), (b'no', b'sooner'): (40, 38.535832829867296), (b'don', b't'): (830, 258.780833955456), (b't', b'pretend'): (9, 21.45528368794326), (b'why', b'should'): (100, 17.7206514366753), (b'cannot', b'imagine'): (13, 46.70673151150224), (b'back', b'again'): (74, 17.518919801292988), (b'an', b'artist'): (10, 15.444123953440881), (b'higher', b'than'): (34, 44.04193566200464), (b'ten', b'times'): (17, 32.73356326503009), (b'mr', b'john'): (33, 14.765125870249578), (b'must', b'allow'): (12, 15.1281361623089), (b'sitting', b'down'): (24, 16.50991895653123), (b'must', b'confess'): (10, 11.475198100360734), (b'depended', b'on'): (14, 19.196211768687338), (b'after', b'breakfast'): (11, 10.176904121801002), (b'sooner', b'than'): (12, 15.493843103397815), (b'at', b'home'): (158, 14.817295862949209), (b'at', b'least'): (318, 40.00546011393844), (b'yes', b'indeed'): (18, 13.328024495550268), (b'replied', b'emma'): (16, 15.977656729388833), (b'can', b'hardly'): (33, 23.714099741177424), (b'am', b'persuaded'): (12, 13.900999100678101), (b'beg', b'your'): (40, 41.76970841001303), (b'your', b'pardon'): (42, 35.8319536079339), (b'tell', b'me'): (198, 10.69565177169633), (b'entered', b'into'): (99, 56.4162431009376), (b'older', b'than'): (15, 56.232480761366595), (b'run', b'away'): (47, 37.87484965056332), (b'have', b'borne'): (26, 10.466767991300909), (b'good', b'opinion'): (19, 13.341942820780586), (b'good', b'natured'): (66, 159.1384054846499), (b'emma', b'felt'): (19, 16.140645126868346), (b'no', b'difficulty'): (16, 11.983774780776779), (b'let', b'us'): (399, 31.769872582196264), (b'cried', b'emma'): (27, 
13.88482906384106), (b'bond', b'street'): (8, 172.3757834757835), (b'some', b'weeks'): (10, 11.200775302864253), (b'next', b'morning'): (69, 78.1475383699458), (b'without', b'ceremony'): (6, 10.070891174806086), (b'dear', b'sir'): (24, 14.956727816809783), (b'sat', b'down'): (150, 55.27743735890119), (b'depends', b'upon'): (8, 17.986770913847433), (b'has', b'happened'): (14, 15.99137490529135), (b'presently', b'added'): (6, 18.898019740129936), (b'could', b'afford'): (11, 15.539720046230899), (b'does', b'seem'): (9, 12.372353151679363), (b'few', b'moments'): (43, 372.3196333721985), (b'nobody', b'knows'): (7, 29.65223357592688), (b'very', b'likely'): (33, 27.440032653801673), (b'good', b'tempered'): (10, 28.20352777751881), (b'all', b'probability'): (16, 12.433663070384382), (b'no', b'harm'): (25, 22.74642565781806), (b'cannot', b'help'): (16, 18.92381204221828), (b'very', b'different'): (29, 16.464019592281005), (b'common', b'sense'): (30, 161.56777397991883), (b'an', b'hundred'): (186, 29.194636402849085), (b'every', b'man'): (327, 11.997511974569205), (b'less', b'than'): (90, 36.539494213739246), (b'large', b'fortune'): (9, 31.791868637110017), (b'no', b'use'): (43, 12.937029592884024), (b'these', b'words'): (121, 23.298153994257166), (b'twenty', b'thousand'): (49, 74.70455718935906), (b'thousand', b'pounds'): (48, 447.52175109658555), (b'walked', b'off'): (15, 10.460708308551986), (b'cast', b'down'): (44, 13.997539984885172), (b'its', b'effects'): (8, 41.35228049391716), (b'deal', b'more'): (29, 10.514188895646884), (b'longer', b'than'): (32, 18.06310234103062), (b'perfectly', b'satisfied'): (12, 88.02579290850896), (b'three', b'hundred'): (78, 46.954702502955406), (b'well', b'known'): (41, 14.093033909096086), (b'destin', b'd'): (7, 66.59024873431653), (b'looking', b'at'): (107, 11.578403307709019), (b'next', b'moment'): (22, 24.11220301236594), (b'ready', b'wit'): (8, 64.00497196657146), (b'very', b'pleasant'): (20, 11.43334693908403), (b'an', b'idea'): (37, 
13.223062649099885), (b'arrive', b'at'): (10, 10.99191555846232), (b'nobody', b'could'): (24, 14.34346357032523), (b'have', b'chosen'): (38, 11.818762830211462), (b'without', b'exception'): (8, 49.916591040343214), (b'her', b'cheeks'): (14, 10.713121301309494), (b'sit', b'down'): (61, 35.66534421470248), (b'reason', b'why'): (21, 17.06402120878811), (b'could', b'hardly'): (47, 23.031181176009962), (b'an', b'offering'): (71, 10.631678549435184), (b'can', b'easily'): (12, 13.582516083099756), (b'dear', b'mother'): (25, 13.62829366741899), (b'those', b'things'): (86, 14.8616035851137), (b'next', b'week'): (13, 53.88002448934157), (b'taken', b'away'): (75, 32.3942512515154), (b'stay', b'longer'): (10, 35.24389533529055), (b'three', b'days'): (100, 36.85571892072123), (b'cannot', b'bear'): (17, 14.755609208857674), (b'o', b'clock'): (67, 157.92042603351013), (b'ask', b'whether'): (11, 13.903668723357807), (b'ran', b'away'): (24, 14.924494326591024), (b'who', b'lived'): (27, 11.875088544445298), (b'never', b'mind'): (39, 14.649520876892138), (b'good', b'fortune'): (20, 17.889094761740502), (b'jane', b'fairfax'): (111, 878.2730646508635), (b'nothing', b'else'): (45, 30.00884087783841), (b'present', b'instance'): (6, 15.177578767810555), (b'once', b'more'): (141, 21.75868665121339), (b'still', b'greater'): (11, 11.400593546360994), (b'here', b'comes'): (20, 16.698101230888117), (b'turned', b'back'): (42, 27.420219470101824), (b'will', b'bring'): (145, 11.54256820986443), (b'each', b'side'): (37, 20.499807297291575), (b'still', b'remained'): (10, 12.133783892186747), (b'she', b'hoped'): (30, 15.572180252968563), (b'ten', b'minutes'): (42, 194.73753664413653), (b'most', b'favourable'): (6, 11.684349774052759), (b'ten', b'days'): (21, 15.980164743558), (b'little', b'ones'): (53, 58.130829972277546), (b'mr', b'wingfield'): (9, 102.72308998302208), (b'passed', b'over'): (52, 21.874958076020096), (b'yes', b'sir'): (25, 17.04815440969287), (b'sir', b',"'): (121, 
14.215738794028884), (b'cannot', b'deny'): (9, 32.95419389978213), (b'talking', b'about'): (25, 19.03311706185364), (b'never', b'forget'): (18, 19.811960968040946), (b'cannot', b'tell'): (30, 16.076959296294408), (b'two', b'years'): (53, 11.958267348869844), (b'indeed', b'!--'): (21, 14.904457500400301), (b'dear', b'madam'): (15, 86.09958447176685), (b'madam', b",'"): (7, 23.782979559748426), (b'most', b'amiable'): (9, 20.90883643777862), (b',"', b'observed'): (18, 11.457536285929041), (b'five', b'times'): (10, 11.197582958562359), (b'our', b'lives'): (16, 17.148659373051412), (b'think', b'differently'): (6, 11.912561527859815), (b'grow', b'up'): (20, 10.504684266328379), (b'shake', b'hands'): (15, 50.540371218069744), (b'how', b'long'): (106, 12.747654023789359), (b'perfectly', b'convinced'): (8, 72.52055615486036), (b'tells', b'me'): (15, 12.056004001139769), (b'bad', b'cold'): (7, 12.93191412052622), (b'far', b'off'): (83, 31.654147573003083), (b'am', b'sorry'): (32, 25.556309428082436), (b'mrs', b'campbell'): (9, 25.554140665420718), (b'ah', b'!"'): (15, 17.2162909678631), (b'an', b'interval'): (11, 15.772722335428986), (b'perfectly', b'well'): (16, 10.916140577050193), (b'ill', b'judged'): (6, 16.561437604357703), (b'can', b'tell'): (52, 10.598771687692016), (b'morrow', b'morning'): (14, 21.336997025943646), (b'own', b'feelings'): (16, 10.441084233959339), (b'sore', b'throat'): (9, 237.4797370228633), (b'well', b'satisfied'): (17, 14.614998127951495), (b'looked', b'at'): (184, 11.997273688809486), (b'well', b'pleased'): (26, 18.6180031299088), (b'set', b'forward'): (22, 28.094460681216024), (b'eldest', b'daughter'): (10, 81.36619150080689), (b'short', b'time'): (23, 10.057907277428889), (b',"', b'continued'): (103, 40.16032072000802), (b'dining', b'room'): (18, 272.72909153952844), (b'enter', b'into'): (110, 32.02988305119611), (b'half', b'hour'): (17, 17.306277530939532), (b'gone', b'through'): (24, 10.232780528794587), (b'turn', b'away'): (53, 
24.325152043340207), (b'own', b'sake'): (17, 10.579315711019701), (b'an', b'effort'): (8, 12.355299162752706), (b',"', b'repeated'): (29, 15.364411460350311), (b'several', b'times'): (19, 99.0012388966807), (b'great', b'curiosity'): (13, 12.092919602258533), (b'upper', b'end'): (11, 46.57012007389162), (b'an', b'odd'): (25, 25.474843634541664), (b'dearest', b'emma'): (8, 38.81440851937388), (b'continued', b'mrs'): (17, 12.451539878373788), (b'go', b'home'): (37, 10.007255826550702), (b'judge', b'between'): (11, 14.608117243710463), (b'hardly', b'knew'): (16, 28.982515807625983), (b'set', b'off'): (42, 12.027600179663624), (b'got', b'home'): (12, 12.615003589161626), (b'most', b'extraordinary'): (16, 42.01871937976666), (b'an', b'inch'): (28, 63.14930683184716), (b'at', b'ease'): (36, 15.97262729589056), (b'three', b'quarters'): (8, 46.1908845684039), (b'smith', b'!--'): (9, 17.971811322996494), (b'extremely', b'sorry'): (8, 71.2760936150161), (b'many', b'weeks'): (10, 19.402474377557436), (b'madam', b',"'): (14, 12.063463685665674), (b'extremely', b'well'): (16, 22.290942522884006), (b'without', b'knowing'): (19, 31.149500610446733), (b'poor', b'harriet'): (15, 13.183343429017174), (b'an', b'instant'): (99, 42.5420557252291), (b'thirty', b'thousand'): (18, 40.06880794701987), (b'somebody', b'else'): (17, 161.27922164467546), (b'worth', b'having'): (9, 20.026446445121145), (b'poor', b'girl'): (16, 25.656814519548806), (b'laugh', b'at'): (34, 10.68487881101924), (b'knowing', b'what'): (22, 10.324648874148785), (b'many', b'days'): (50, 13.474789292130438), (b'whole', b'party'): (15, 20.807448930462893), (b'six', b'weeks'): (6, 16.914705060106233), (b'too', b'late'): (56, 82.78532737735924), (b'her', b'companions'): (37, 11.248377921136292), (b'drew', b'near'): (34, 127.06577013042501), (b'three', b'months'): (36, 79.13878668714453), (b'other', b'side'): (133, 27.098888471327907), (b'an', b'unnatural'): (10, 18.16955759228339), (b'get', b'rid'): (18, 
208.69503038021702), (b'watering', b'place'): (9, 80.21198462150338), (b'while', b'ago'): (10, 12.96130710105312), (b'at', b'weymouth'): (16, 40.30369038102851), (b'present', b'occasion'): (9, 30.82215995924605), (b'their', b'hearts'): (50, 16.688643994865927), (b'break', b'through'): (12, 10.13448685951659), (b'burst', b'forth'): (18, 46.35221285874242), (b'young', b'men'): (142, 26.258476552154963), (b'nobody', b'else'): (21, 79.07262286913833), (b'something', b'else'): (35, 35.09791978466929), (b'walking', b'together'): (9, 11.011119603989226), (b'burst', b'out'): (19, 10.336463766135193), (b'mrs', b'cole'): (30, 133.5308579852927), (b'mr', b'cole'): (23, 75.77932867599989), (b'miss', b'fairfax'): (125, 253.16322047491198), (b'extremely', b'happy'): (7, 17.871217379746273), (b'ma', b'am'): (216, 180.33592685038826), (b's', b'handwriting'): (7, 11.116318806496656), (b'without', b'seeming'): (8, 22.51140380250772), (b'colonel', b'campbell'): (28, 852.6897671568628), (b'those', b'days'): (84, 22.563719575520683), (b'mrs', b'dixon'): (14, 66.64403730356881), (b'mr', b'dixon'): (22, 99.22116646087359), (b'miss', b'campbell'): (12, 69.78535178777393), (b'caught', b'hold'): (10, 22.460761166547872), (b'four', b'months'): (12, 35.223787622984226), (b'may', b'guess'): (11, 12.384171115697546), (b'running', b'away'): (12, 10.939419925249963), (b'five', b'minutes'): (37, 138.27388748830532), (b'nine', b'years'): (9, 20.394343883776585), (b'hundred', b'pounds'): (11, 61.548167237462266), (b'rather', b'than'): (78, 18.774280659635497), (b'few', b'months'): (17, 56.65512828516137), (b'she', b'wished'): (41, 13.111265209712547), (b'without', b'feeling'): (12, 12.941338417866758), (b'ill', b'health'): (7, 24.482125154267912), (b'twelve', b'thousand'): (24, 56.81398141741622), (b'mr', b'churchill'): (19, 14.856645245478399), (b'passed', b'between'): (17, 18.633976326622793), (b",'", b'said'): (252, 29.90465644929233), (b'miss', b'hawkins'): (18, 330.483487394958), (b'dear', 
b'jane'): (15, 27.279076268282562), (b'three', b'minutes'): (10, 10.2220491437685), (b'have', b'suffered'): (30, 12.671631950727493), (b'hour', b'ago'): (10, 34.5823521342509), (b'ford', b's'): (10, 13.291250746898175), (b'looked', b'round'): (26, 11.112235527475757), (b'help', b'thinking'): (10, 29.995785987665336), (b'can', b't'): (299, 33.884042781775456), (b'human', b'nature'): (10, 30.588113365891143), (b'brown', b's'): (105, 12.277058922837673), (b'laughed', b'at'): (28, 11.579269824945039), (b'weeks', b'ago'): (7, 64.64782562239554), (b'twenty', b'miles'): (8, 30.04364737817797), (b'elder', b'sister'): (6, 19.200272911906577), (b'driven', b'away'): (10, 13.818976228325019), (b'setting', b'off'): (8, 15.77704088728183), (b'little', b'farther'): (16, 12.063492840311763), (b'spot', b'where'): (18, 30.983518539673284), (b'front', b'door'): (16, 47.781783068175294), (b'they', b'parted'): (23, 10.793474382760099), (b'without', b'delay'): (8, 16.40116562754134), (b'six', b'months'): (21, 137.42102349350557), (b'months', b'ago'): (7, 32.826357051786346), (b'leaned', b'back'): (6, 11.97267240526368), (b'ill', b'disposed'): (6, 23.462036606173417), (b'at', b'oxford'): (10, 13.739894448077902), (b'turned', b'round'): (21, 11.459424650853364), (b'pass', b'through'): (64, 20.610331926490034), (b'clock', b'struck'): (16, 271.616904052565), (b'"\'', b'tis'): (7, 65.71510806994678), (b'four', b'hours'): (15, 44.14409747555815), (b'parlour', b'door'): (6, 13.546761301300853), (b'faster', b'than'): (10, 38.52966274389933), (b'musical', b'society'): (6, 97.72879987078016), (b'worth', b'while'): (16, 32.80928460158145), (b'kind', b'hearted'): (6, 22.29983045849919), (b'mixed', b'with'): (29, 12.314926306023153), (b'extremely', b'glad'): (10, 69.6487855416139), (b'knew', b'nothing'): (25, 11.012602701827063), (b'make', b'amends'): (9, 63.131759488717876), (b'amends', b'for'): (12, 12.758421973797969), (b'oftener', b'than'): (9, 48.95533619224856), (b'old', b'woman'): (61, 
16.85474514064448), (b'just', b'going'): (25, 11.441649458022253), (b'their', b'lives'): (30, 13.550608371899687), (b'six', b'days'): (24, 24.79935497788804), (b'may', b'prove'): (10, 12.110371414160014), (b'stronger', b'than'): (29, 55.48271435121504), (b'particular', b'friend'): (10, 18.112011830422627), (b'good', b'tidings'): (14, 28.459923484587165), (b'among', b'themselves'): (31, 14.391325508421081), (b'next', b'summer'): (9, 24.61759739599227), (b'on', b'tuesday'): (9, 11.261777570963238), (b'breaking', b'up'): (13, 11.204996550750268), (b'after', b'tea'): (9, 10.099806363302507), (b'perfectly', b'safe'): (6, 15.18214895111914), (b'two', b'ladies'): (13, 10.349667933920346), (b'same', b'moment'): (29, 15.168003008335596), (b'mr', b'cox'): (13, 82.17847198641766), (b',"', b'added'): (51, 21.446157663405643), (b'little', b'girls'): (15, 21.933623346021392), (b'be', b'ashamed'): (88, 14.83924678279627), (b'been', b'staying'): (9, 13.648522445296638), (b'shut', b'up'): (64, 31.30735269741568), (b'too', b'large'): (19, 13.112807771198575), (b'an', b'elderly'): (7, 12.355299162752706), (b'worse', b'than'): (59, 50.382285453457605), (b'opposite', b'side'): (9, 20.5505493945621), (b'short', b'pause'): (11, 83.26224770642202), (b'large', b'party'): (8, 13.460266963292549), (b'who', b'knows'): (35, 23.27234193136448), (b'extremely', b'fond'): (6, 32.59205990088343), (b'five', b'couple'): (8, 45.27718326723041), (b'mr', b'william'): (9, 12.681862960866923), (b'bow', b'window'): (8, 25.074140074595938), (b'bad', b'news'): (9, 59.57160439127652), (b'baked', b'apples'): (6, 587.9873663751215), (b'mrs', b'wallis'): (14, 79.25236868532507), (b'will', b'send'): (73, 14.065402297907488), (b'william', b'larkins'): (13, 4596.687559354225), (b'low', b'voice'): (39, 51.584875095916104), (b'one', b'leg'): (18, 10.201271801948558), (b'an', b'immediate'): (12, 13.30570679065676), (b',"', b'resumed'): (18, 27.880004962427332), (b'many', b'times'): (21, 13.085195623230133), (b'few', 
b'words'): (18, 11.20685248087905), (b'no', b'objection'): (18, 27.458185258366093), (b'astonished', b'at'): (22, 12.45750429959063), (b'four', b'times'): (13, 16.70830356064136), (b'c', b'.,'): (6, 484.806891025641), (b'few', b'hours'): (23, 74.55299835706462), (b'an', b'extraordinary'): (14, 10.692085813920611), (b'immediately', b'followed'): (7, 11.605013810035294), (b'wait', b'till'): (20, 30.636127032993738), (b'good', b'bye'): (45, 141.49566478212827), (b'contrast', b'between'): (12, 157.767666232073), (b'dared', b'not'): (22, 11.170731204026199), (b'three', b'weeks'): (11, 30.165475636508674), (b'self', b'command'): (15, 129.45887538514208), (b'mrs', b'elton'): (142, 115.93946807097048), (b'maple', b'grove'): (31, 6513.877432712216), (b'mr', b'suckling'): (10, 67.58098025198821), (b'almost', b'fancy'): (9, 14.733690490685499), (b'left', b'behind'): (27, 27.450614763395492), (b'barouche', b'landau'): (7, 17286.828571428574), (b'whose', b'name'): (60, 27.21462469955437), (b'mr', b'e'): (10, 15.659007619363122), (b'e', b'.,'): (6, 189.192933083177), (b'good', b'breeding'): (8, 36.830489215348095), (b'greater', b'part'): (10, 15.073969804175594), (b'drew', b'back'): (11, 14.116251307516126), (b'third', b'time'): (23, 12.455612727466114), (b'very', b'extraordinary'): (13, 11.60924458430071), (b'better', b'acquainted'): (7, 12.58662367380903), (b'have', b'committed'): (34, 16.060120198292402), (b'drawing', b'rooms'): (8, 127.10903361344538), (b'hardly', b'less'): (8, 12.215771125528304), (b'will', b'shew'): (48, 11.000113735916194), (b'little', b'boys'): (17, 15.382021567339674), (b'post', b'office'): (12, 378.6226533166458), (b'easily', b'believe'): (8, 20.68038054004785), (b'put', b'forth'): (41, 11.449193667080172), (b'mrs', b'bragge'): (6, 46.54504192630203), (b'drawing', b'near'): (8, 17.917525467898603), (b'great', b'joy'): (26, 10.054633379705905), (b'spread', b'abroad'): (14, 187.6671836228288), (b'few', b'lines'): (10, 41.418332420591454), (b'good', 
b'news'): (18, 22.239175181945157), (b'most', b'likely'): (12, 18.05763146899063), (b'talk', b'about'): (37, 21.43960312714548), (b'tells', b'us'): (12, 28.498862810540196), (b'sixty', b'five'): (6, 24.21802825921627), (b'eleven', b'years'): (6, 15.56410454288213), (b'your', b'sister'): (93, 17.156892959360377), (b'two', b'hours'): (18, 15.05994294248296), (b'two', b'months'): (20, 19.807816544517244), (b'twenty', b'four'): (17, 24.69076638463422), (b'door', b'opened'): (19, 31.970356671070014), (b'began', b'talking'): (9, 12.980535814851565), (b'mean', b'?"'): (59, 24.673034270425305), (b'pretty', b'soon'): (13, 15.281994613759855), (b'many', b'hours'): (12, 12.100088566367637), (b'few', b'steps'): (8, 21.35632765436747), (b'most', b'excellent'): (11, 13.620613450895787), (b'surrounded', b'by'): (19, 28.432283834586467), (b'later', b'than'): (9, 15.131649368513193), (b'whole', b'story'): (18, 33.8121045120022), (b'another', b'minute'): (9, 10.472151066186651), (b'whole', b'history'): (9, 21.86545480828304), (b'lined', b'with'): (12, 12.666009731414164), (b'court', b'plaister'): (9, 660.5229257641921), (b'these', b'things'): (366, 38.76493340077559), (b'laid', b'down'): (26, 11.768166775804247), (b'forty', b'years'): (68, 158.55517564110565), (b'faint', b'smile'): (6, 23.553371223917786), (b'turned', b'towards'): (11, 10.359842928650108), (b'totally', b'different'): (7, 152.78762626262628), (b'box', b'hill'): (18, 162.9717796241427), (b'some', b'surprise'): (19, 20.088187863437586), (b'may', b'depend'): (9, 14.38565331621432), (b',"', b'interrupted'): (25, 29.580907121938818), (b'whatever', b'else'): (9, 19.419818171605563), (b'larger', b'than'): (11, 18.09218946235273), (b'were', b'assembled'): (17, 17.322670000548754), (b'insisted', b'on'): (9, 10.828632279772345), (b'clothed', b'with'): (37, 11.957971920341324), (b'twenty', b'minutes'): (8, 15.956791968492862), (b'quite', b'alone'): (14, 10.13386446620588), (b'etc', b'.,'): (11, 3723.316923076923), (b',"', 
b'whispered'): (18, 25.73538919608677), (b'shan', b't'): (20, 201.14328457446808), (b'looking', b'round'): (23, 16.131457721715485), (b',"', b'answered'): (143, 22.031536656699593), (b'yes', b'yes'): (31, 34.42415794264907), (b'old', b'age'): (51, 59.178054288059265), (b'an', b'infant'): (10, 23.760190697601356), (b'be', b'forgiven'): (36, 17.634811346477495), (b'lie', b'down'): (41, 29.047526585175245), (b'mrs', b'smallridge'): (7, 65.16305869682283), (b'four', b'miles'): (7, 15.174533507223114), (b'great', b'hurry'): (16, 18.475293836783866), (b'without', b'waiting'): (12, 17.062783773875278), (b'comes', b'back'): (13, 14.131678904573523), (b'heightened', b'by'): (7, 10.935493782533257), (b'cut', b'off'): (217, 148.51915132090852), (b'trembling', b'voice'): (7, 11.589231329132108), (b'time', b'past'): (23, 12.303250492267754), (b'second', b'time'): (44, 18.990965084469945), (b'five', b'hundred'): (67, 84.28885553403468), (b'an', b'arrow'): (10, 13.72811018083634), (b'presented', b'themselves'): (6, 11.270798405424538), (b'at', b'random'): (13, 22.939649861138758), (b'far', b'distant'): (12, 33.91691492087898), (b'few', b'seconds'): (9, 91.12033132530121), (b'passing', b'through'): (12, 17.88838955740177), (b'domestic', b'happiness'): (6, 55.01354791780324), (b'western', b'sun'): (7, 33.12559540104024), (b'rose', b'early'): (12, 38.93680417015252), (b'east', b'wind'): (22, 116.46036526681686), (b'gone', b'mad'): (10, 19.017413169888417), (b'freed', b'from'): (11, 28.570122143171943), (b'sinned', b'against'): (43, 77.52306997194648), (b'locked', b'up'): (11, 12.679338202164779), (b'deep', b'sigh'): (7, 42.620386024232175), (b'ten', b'thousand'): (82, 129.3626084662696), (b'happier', b'than'): (11, 25.47675658984364), (b'nay', b'nay'): (7, 32.486187548659025), (b'had', b'formerly'): (10, 11.277857307953266), (b'little', b'boy'): (67, 21.623607468339106), (b'fancying', b'herself'): (6, 25.9628819086852), (b'right', b'hand'): (199, 41.801008401685465), (b'infinitely', 
b'more'): (8, 12.077108866621423), (b'such', b'cases'): (8, 12.89145596590909), (b'poor', b'fellow'): (38, 75.64792734629856), (b'days', b'ago'): (11, 14.965717112586058), (b'help', b'laughing'): (7, 15.773064991266716), (b'draw', b'near'): (18, 72.06606928524963), (b'at', b'intervals'): (34, 32.97574667538696), (b'into', b'temptation'): (8, 11.357116041596276), (b'sir', b'walter'): (136, 503.2387873015873), (b'walter', b'elliot'): (16, 153.52777393310265), (b'kellynch', b'hall'): (25, 1284.9930975894658), (b'charles', b'musgrove'): (14, 242.12321031569587), (b'first', b'year'): (71, 32.9850895198761), (b'lady', b'elliot'): (12, 19.25745581528584), (b'seventeen', b'years'): (7, 49.286331052460085), (b'an', b'awful'): (13, 14.535646073826713), (b'thirteen', b'years'): (7, 35.844604401789155), (b'lady', b'russell'): (147, 757.7061090581977), (b'anne', b'elliot'): (23, 67.77714022553583), (b'miss', b'elliot'): (48, 75.64966706405747), (b'everybody', b'else'): (22, 107.67396310951992), (b'russell', b's'): (30, 10.258347891901277), (b'mr', b'elliot'): (174, 150.17475957725546), (b'mr', b'shepherd'): (26, 56.76802341167009), (b'ill', b'used'): (8, 18.889563018388817), (b'anybody', b'else'): (21, 154.67714824401622), (b'an', b'honest'): (29, 20.450150338349307), (b'descend', b'into'): (11, 16.714246249896405), (b'mrs', b'clay'): (66, 167.01456220614256), (b'therefore', b'thus'): (66, 16.596178785600454), (b'miss', b'anne'): (19, 12.802348709267878), (b'their', b'fathers'): (151, 19.01913287990061), (b'an', b'example'): (14, 16.848135221935507), (b'admiral', b'croft'): (14, 929.5580402867872), (b'mrs', b'croft'): (41, 202.2301821625536), (b'walked', b'along'): (8, 10.917931320713858), (b'frederick', b'wentworth'): (6, 22.564294771388084), (b'either', b'side'): (18, 17.751889049381603), (b'captain', b'wentworth'): (196, 617.1163877348314), (b'eldest', b'son'): (15, 33.79917323054578), (b'removed', b'from'): (36, 13.162880053223592), (b'startled', b'by'): (14, 
12.543654632905795), (b'most', b'important'): (14, 31.640805582833135), (b'replied', b'anne'): (11, 13.574215887165527), (b'at', b'uppercross'): (20, 12.847693509891025), (b'left', b'alone'): (17, 14.251469219988458), (b'mr', b'musgrove'): (21, 31.607104610160636), (b'miss', b'musgroves'): (22, 210.8149825783972), (b'mrs', b'musgrove'): (66, 152.88256078869972), (b'piano', b'forte'): (7, 11524.55238095238), (b'their', b'faces'): (63, 20.62178822688734), (b'surprised', b'at'): (28, 13.306003044454387), (b'ere', b'long'): (23, 32.475628447890266), (b'anything', b'else'): (31, 72.7257403863046), (b'quite', b'different'): (12, 17.841743196562472), (b'their', b'sakes'): (13, 15.501895977453241), (b'twentieth', b'year'): (13, 176.01134545454545), (b'on', b'board'): (70, 31.771507904193165), (b'eight', b'years'): (24, 64.58208896529253), (b'their', b'heads'): (79, 20.78152721615108), (b'dressing', b'room'): (14, 228.86357331988398), (b'up', b'stairs'): (15, 14.006245688437838), (b'waited', b'till'): (7, 10.336362859827453), (b'third', b'part'): (39, 74.57648218907926), (b'dear', b'fellow'): (11, 17.04239197791674), (b'good', b'cheer'): (15, 59.63031587246834), (b'mrs', b'harville'): (24, 82.53987434930892), (b'fifteen', b'years'): (10, 60.350609451991936), (b'charles', b'hayter'): (33, 2576.9838758746578), (b'came', b'near'): (42, 11.125296005240008), (b'mansion', b'house'): (8, 28.450109717868337), (b'two', b'hundred'): (105, 33.2716152575442), (b'dr', b'shirley'): (9, 1057.7604895104896), (b'went', b'up'): (207, 10.546879068033903), (b'within', b'reach'): (7, 16.566879329700722), (b'turn', b'back'): (16, 10.061923328702072), (b'walking', b'along'): (8, 17.574038573254327), (b'leaning', b'against'): (13, 24.291194507733536), (b'trodden', b'under'): (9, 65.1314925453469), (b'under', b'foot'): (15, 20.58754074709241), (b'louisa', b'musgrove'): (15, 183.23410054512416), (b'provoke', b'me'): (18, 15.672805201481697), (b'captain', b'harville'): (37, 300.52383391540553), 
(b'at', b'lyme'): (26, 20.671363587556005), (b'earnest', b'desire'): (6, 31.26493385696569), (b'sea', b'shore'): (14, 26.870717986676535), (b'captain', b'benwick'): (56, 513.1712788957259), (b'an', b'officer'): (9, 10.08595850020629), (b'place', b'where'): (125, 24.673951016868287), (b'breakfast', b'table'): (9, 46.889526097570425), (b'great', b'coat'): (15, 13.6532963251306), (b'mean', b'while'): (30, 21.602178501755198), (b'preceding', b'evening'): (6, 46.9167959057072), (b'dark', b'blue'): (7, 10.384974511251094), (b'an', b'agony'): (10, 14.366626933433379), (b'catching', b'hold'): (10, 119.13968966603655), (b'raised', b'up'): (35, 19.63919232400523), (b'every', b'one'): (395, 13.121967878025114), (b'could', b'scarcely'): (18, 16.706187581507773), (b'passed', b'along'): (11, 15.7751212389842), (b'leaning', b'over'): (17, 32.991030289811604), (b't', b'talk'): (19, 10.789294958017445), (b'camden', b'place'): (29, 304.80554156171286), (b'straight', b'forward'): (6, 11.08942448680352), (b'same', b'hour'): (17, 12.502011213202374), (b'looking', b'glasses'): (6, 21.695316982214575), (b'poring', b'over'): (6, 39.40595284616386), (b'thirty', b'feet'): (8, 11.48226847165992), (b'colonel', b'wallis'): (23, 919.8228040540541), (b'at', b'length'): (101, 22.856835240701436), (b'carried', b'away'): (73, 65.90036455897334), (b'greater', b'than'): (58, 46.9242105417191), (b'miss', b'carteret'): (12, 296.5877450980392), (b'lady', b'dalrymple'): (25, 567.8984418997559), (b'laura', b'place'): (7, 20.594969024440058), (b'be', b'established'): (41, 12.515027407177577), (b'mrs', b'smith'): (79, 133.20625258466546), (b'westgate', b'buildings'): (7, 3878.455128205128), (b'at', b'liberty'): (25, 11.17821921199558), (b'five', b'thousand'): (31, 35.86192793881296), (b'whose', b'names'): (9, 17.589365660794233), (b'her', b'ladyship'): (22, 32.97701536370165), (b'ladyship', b's'): (10, 11.322176562172519), (b'old', b'gentleman'): (32, 24.080053832071062), (b'their', b'minds'): (18, 
15.50189597745324), (b'almost', b'entirely'): (13, 33.521094767168066), (b'lower', b'part'): (8, 14.361011772897019), (b'staring', b'at'): (33, 23.083022672770873), (b'ay', b'ay'): (6, 102.04739416427729), (b'an', b'oath'): (39, 43.30723417872082), (b'wiser', b'than'): (8, 28.371842565962236), (b'prejudice', b'against'): (7, 37.304334422590784), (b'both', b'sides'): (31, 93.46785578477042), (b'my', b'soul'): (259, 16.066883986530073), (b'same', b'instant'): (19, 24.469748442934566), (b'their', b'seats'): (12, 12.918246647877702), (b'their', b'mouths'): (24, 24.960679963695895), (b'short', b'silence'): (9, 17.74307917888563), (b'fifty', b'pounds'): (6, 33.295124367158266), (b'be', b'saved'): (61, 12.820271978084348), (b'hard', b'hearted'): (6, 30.946703493427446), (b'some', b'moments'): (14, 21.245341542207033), (b'exclaimed', b'mrs'): (11, 11.847828853967787), (b'compassion', b'on'): (20, 12.135536175606939), (b'an', b'explanation'): (12, 15.444123953440883), (b'our', b'hearts'): (21, 17.06660837126648), (b'minutes', b'afterwards'): (7, 21.36625761454931), (b'make', b'haste'): (38, 69.71834069521798), (b'n', b't'): (19, 110.43160721735502), (b'rising', b'sun'): (7, 11.452998409934125), (b'an', b'atonement'): (66, 83.74147210310166), (b'atonement', b'for'): (65, 20.656492719482426), (b'next', b'instant'): (8, 11.76775260138092), (b'she', b'doted'): (7, 14.81087366282343), (b'god', b'forbid'): (30, 46.14475859838801), (b'i', b'll'): (384, 11.175120311389023), (b'll', b'answer'): (12, 13.01399029006883), (b'market', b'place'): (13, 39.585135267754914), (b'poured', b'out'): (53, 41.58992740848729), (b'at', b'norland'): (19, 17.421149186996885), (b'many', b'generations'): (11, 16.534282513048943), (b'seven', b'thousand'): (27, 31.25935371753323), (b'mr', b'dashwood'): (15, 10.19078273641092), (b'john', b'dashwood'): (37, 157.76252403767805), (b'four', b'thousand'): (45, 51.452722885418765), (b'three', b'thousand'): (45, 26.103457945941283), (b'mrs', b'dashwood'): (121, 
149.9784684291954), (b'miss', b'dashwoods'): (23, 254.21806722689078), (b'edward', b'ferrars'): (13, 135.88747894441326), (b'younger', b'brother'): (8, 31.679093146238024), (b'few', b'miles'): (7, 14.237551769578314), (b'replied', b'elinor'): (26, 38.56266294368484), (b'mrs', b'ferrars'): (73, 170.4264612070751), (b'barton', b'park'): (12, 511.69179654464176), (b'from', b'whence'): (44, 15.9501908897463), (b'barton', b'cottage'): (7, 104.58755401901469), (b'sir', b'john'): (113, 127.7876444705192), (b'at', b'barton'): (35, 22.230840455317054), (b'lady', b'middleton'): (95, 500.3860397158689), (b'be', b'fulfilled'): (39, 13.72615909174315), (b'their', b'arrival'): (15, 11.39845292459797), (b'present', b'case'): (12, 20.32475765428544), (b'mrs', b'jennings'): (229, 317.3157640888764), (b'colonel', b'brandon'): (132, 1667.5337022569443), (b'now', b'therefore'): (145, 11.124733982335009), (b'ill', b'natured'): (11, 147.81083061889248), (b'blue', b'sky'): (11, 51.839749814359976), (b'rose', b'up'): (112, 34.00672106200659), (b'at', b'allenham'): (8, 10.991915558462322), (b'miss', b'dashwood'): (70, 131.14424102974525), (b'cried', b'marianne'): (34, 25.079372634443647), (b'mr', b'willoughby'): (36, 36.68681785107931), (b'miss', b'marianne'): (31, 20.9166764174024), (b'aye', b'aye'): (36, 468.905225), (b'have', b'erred'): (10, 17.591206708068757), (b'pronounce', b'him'): (19, 17.59404209004578), (b'by', b'reason'): (70, 10.359296240180047), (b'an', b'everlasting'): (37, 33.79227121436637), (b'seven', b'days'): (103, 82.78022840230078), (b'by', b'accident'): (20, 16.659541309328006), (b'went', b'out'): (262, 11.679327162094795), (b'won', b't'): (219, 217.3972873683645), (b'miss', b'williams'): (6, 72.63373349339736), (b'laughed', b'heartily'): (6, 104.96859819569742), (b'considerable', b'time'): (10, 11.403522990282188), (b'two', b'sides'): (13, 12.98652600625674), (b'at', b'delaford'): (11, 13.190298670154785), (b'two', b'thousand'): (76, 23.9670207602225), (b'seven', 
b'hundred'): (50, 63.02139464474197), (b'can', b'possibly'): (10, 14.130878533659684), (b'burst', b'into'): (18, 13.516567683308244), (b'turning', b'round'): (15, 21.605294920047708), (b'mr', b'ferrars'): (26, 41.48432480083584), (b'combe', b'magna'): (11, 18907.46875), (b'mrs', b'palmer'): (37, 135.40375833106043), (b'mr', b'palmer'): (35, 100.05495777567086), (b'without', b'ceasing'): (8, 82.0058281377067), (b'stared', b'at'): (14, 13.490078185385578), (b't', b'think'): (70, 10.296761958921383), (b'miss', b'steeles'): (29, 406.7489075630252), (b'most', b'beautiful'): (16, 16.30577169961094), (b'human', b'beings'): (7, 191.5286483064261), (b'sugar', b'plums'): (24, 5662.926600985222), (b'two', b'boys'): (13, 13.239510279105899), (b'miss', b'steele'): (27, 243.16510778224333), (b'lucy', b'steele'): (10, 285.9352551984877), (b'i', b'm'): (438, 16.271176474972332), (b'm', b'sure'): (88, 102.49205464802071), (b'robert', b'ferrars'): (7, 95.96177636796193), (b'mr', b'pratt'): (8, 85.60257498585173), (b'at', b'longstaple'): (7, 16.48787333769348), (b'poor', b'edward'): (10, 12.172941195406368), (b'their', b'names'): (36, 14.302344503007454), (b'latter', b'end'): (12, 42.204171316964285), (b'i', b've'): (218, 13.517978176897723), (b'lifted', b'up'): (151, 72.07476264919077), (b'third', b'day'): (65, 33.791685361045076), (b'starting', b'up'): (9, 11.47178218291099), (b't', b'know'): (147, 12.457680257147297), (b'returned', b'home'): (11, 12.907360988149462), (b'berkeley', b'street'): (16, 1449.984531590414), (b'conduit', b'street'): (6, 203.71683501683503), (b'lit', b'up'): (23, 36.13611387616962), (b'as', b'follows'): (17, 14.27876809314034), (b'having', b'received'): (10, 11.384897636609963), (b'who', b'cares'): (8, 12.52599580423441), (b'miss', b'grey'): (10, 10.250728517213336), (b'fifty', b'thousand'): (11, 20.37397014255248), (b'why', b'don'): (28, 12.795521166648276), (b'thousand', b'times'): (12, 12.06372712383394), (b'walked', b'across'): (7, 13.912300670276734), 
(b'fourteen', b'years'): (6, 15.56410454288213), (b'your', b'sakes'): (16, 32.49086604178871), (b'bartlett', b's'): (6, 10.189958905955267), (b'dressing', b'gown'): (6, 509.2920875420876), (b'wild', b'beasts'): (16, 105.2408127767236), (b'miss', b'morton'): (15, 267.59796550199025), (b'six', b'hundred'): (66, 132.00536142208233), (b'harley', b'street'): (16, 1540.6085648148148), (b'most', b'high'): (60, 22.250238368104544), (b'filled', b'with'): (114, 14.087704701266775), (b'two', b'thirds'): (6, 42.47676214546476), (b'public', b'school'): (6, 43.03876796130318), (b",'", b'says'): (13, 42.67600070534298), (b'fell', b'upon'): (62, 13.948924382167398), (b's', b'office'): (29, 12.330706575273602), (b'yes', b'ma'): (9, 15.105959603525328), (b'come', b'near'): (47, 11.571120236270598), (b'give', b'ear'): (30, 35.78164549476025), (b'reminds', b'me'): (7, 14.199293601342392), (b'ten', b'guineas'): (6, 26.425532844164923), (b'south', b'east'): (7, 17.113977399691684), (b'mr', b'harris'): (10, 85.60257498585173), (b'quicker', b'than'): (6, 12.23883404806214), (b'bent', b'over'): (10, 11.258843670332531), (b'justified', b'by'): (17, 11.526601554562083), (b'from', b'thence'): (103, 27.33487278970147), (b'latter', b'days'): (12, 29.775541338582677), (b'sprung', b'up'): (13, 24.090742584113084), (b'or', b'later'): (13, 13.913940352137981), (b'living', b'creature'): (16, 82.52032187670486), (b'first', b'month'): (33, 25.952660129387272), (b'have', b'transgressed'): (20, 23.003885695166836), ...}
lower_bigram["jon lives in new york city".split()]
['jon', 'lives', 'in', 'new_york', 'city']
# Rebuild the bigram detector over lower_sents, this time with min_count=32
# and threshold=64, and rebind lower_bigram to the new Phraser.
lower_bigram = Phraser(Phrases(lower_sents,
min_count=32, threshold=64))
# Display the token pairs the detector kept; in the output each pair maps to a
# two-element tuple (presumably count and score -- confirm against gensim docs).
lower_bigram.phrasegrams
{(b'miss', b'taylor'): (48, 156.44188752424046), (b'mr', b'woodhouse'): (132, 82.04719647206235), (b'mr', b'weston'): (162, 75.8750096465504), (b'mrs', b'weston'): (249, 160.68617883193815), (b'great', b'deal'): (182, 93.36445281155484), (b'mr', b'knightley'): (277, 161.7426545362494), (b'miss', b'woodhouse'): (173, 229.03991999355654), (b'years', b'ago'): (56, 74.31656200580369), (b'mr', b'elton'): (214, 121.40001543448062), (b'dare', b'say'): (115, 89.940748422131), (b'frank', b'churchill'): (151, 1316.456538433409), (b'miss', b'bates'): (113, 276.3981670520557), (b'drawing', b'room'): (49, 84.9156512119411), (b'mrs', b'goddard'): (58, 143.57962085740624), (b'miss', b'smith'): (58, 73.03502483866474), (b'few', b'minutes'): (86, 204.17003699445084), (b'john', b'knightley'): (58, 83.03824369335366), (b'don', b't'): (830, 250.31164302600473), (b'good', b'natured'): (66, 88.70009486029666), (b'few', b'moments'): (43, 107.77673597616273), (b'thousand', b'pounds'): (48, 166.5197213382644), (b'o', b'clock'): (67, 89.14862759956216), (b'jane', b'fairfax'): (111, 654.5620010133792), (b'miss', b'fairfax'): (125, 196.20149586805675), (b'ma', b'am'): (216, 157.25976559465136), (b'mrs', b'elton'): (142, 93.09008385260405), (b'forty', b'years'): (68, 90.60295750920322), (b'cut', b'off'): (217, 129.60397638852865), (b'ten', b'thousand'): (82, 84.00169380926596), (b'sir', b'walter'): (136, 399.5178158730159), (b'lady', b'russell'): (147, 613.6352291668503), (b'mr', b'elliot'): (174, 126.18234236668802), (b'mrs', b'clay'): (66, 93.09008385260405), (b'captain', b'wentworth'): (196, 529.8800397304311), (b'mrs', b'musgrove'): (66, 85.21323060353755), (b'charles', b'hayter'): (33, 92.03513842409491), (b'captain', b'benwick'): (56, 241.49236653916512), (b'mrs', b'smith'): (79, 84.60397123620643), (b'mrs', b'dashwood'): (121, 115.06968698446889), (b'mrs', b'ferrars'): (73, 102.75713102191294), (b'sir', b'john'): (113, 95.84073335288942), (b'lady', b'middleton'): (95, 
350.27022780110826), (b'mrs', b'jennings'): (229, 279.06788181030646), (b'colonel', b'brandon'): (132, 1313.0186631944446), (b'miss', b'dashwood'): (70, 76.66894090969721), (b'won', b't'): (219, 189.9686576536643), (b'm', b'sure'): (88, 69.15126578661638), (b'six', b'hundred'): (66, 73.57675882542294), (b'gathered', b'together'): (84, 103.28151426020274), (b'thou', b'shalt'): (1282, 66.88288454162686), (b'burnt', b'offerings'): (86, 299.1594956644355), (b'sweet', b'savour'): (43, 286.1811575507396), (b'unleavened', b'bread'): (43, 237.7023822279367), (b'burnt', b'offering'): (184, 297.52711249720966), (b'afar', b'off'): (52, 108.1430971616501), (b'take', b'heed'): (58, 86.38525449882758), (b'sent', b'messengers'): (43, 79.21620881736811), (b'thus', b'saith'): (444, 144.03010304370085), (b'without', b'blemish'): (46, 83.71428289057559), (b'peace', b'offerings'): (83, 176.2591765391338), (b'sin', b'offering'): (118, 129.96187065094136), (b'meat', b'offering'): (122, 210.66899051760493), (b'fine', b'flour'): (36, 86.07753592260636), (b'high', b'places'): (99, 129.81341185361669), (b'fig', b'tree'): (37, 121.73822937625756), (b'fir', b'tree'): (36, 72.67953992612391), (b'mercy', b'endureth'): (41, 269.07896427336067), (b'chief', b'priests'): (65, 116.32043880243987), (b'jesus', b'christ'): (199, 172.16959234722393), (b'holy', b'ghost'): (90, 313.0330942696068), (b'o', b'er'): (82, 108.1508294008294), (b'couldn', b't'): (89, 171.76280480516377), (b'didn', b't'): (180, 220.5126379038613), (b'little', b'jackal'): (61, 69.81311821111686), (b'wasn', b't'): (58, 120.22357238933725), (b'isn', b't'): (63, 131.967022683778), (b'doesn', b't'): (53, 106.26437675632276), (b'wouldn', b't'): (58, 120.22357238933725), (b'father', b'brown'): (207, 91.68353015338629), (b'buster', b'bear'): (142, 479.8780734011104), (b'green', b'forest'): (66, 336.3801160984384), (b'little', b'joe'): (111, 133.28894187197614), (b'joe', b'otter'): (47, 1271.6246321984027), (b'farmer', b'brown'): (100, 
386.0549863003415), (b'mock', b'turtle'): (56, 2528.8986415882964), (b'dr', b'bull'): (65, 680.7926554828151), (b'guinea', b'hen'): (51, 905.88975571316), (b'sir', b'arthur'): (71, 131.42033416875523), (b'miss', b'somers'): (49, 160.06322751322753), (b'mr', b'gresham'): (49, 87.31462648556875), (b'mrs', b'theresa'): (67, 170.20201898423875), (b'de', b'grey'): (77, 603.2159473590925), (b'dr', b'middleton'): (40, 162.73238300161378), (b'moby', b'dick'): (84, 4115.911564625851), (b'sperm', b'whale'): (183, 297.3696872050255), (b'mast', b'heads'): (37, 77.73653510124372), (b'wee', b'l'): (35, 450.40124069478907)}
# Run every lower-cased sentence through the bigram detector so that detected
# pairs are merged into single tokens (e.g. 'miss', 'taylor' -> 'miss_taylor').
clean_sents = [lower_bigram[s] for s in lower_sents]
clean_sents[0:9]
[['emma', 'by', 'jane', 'austen', '1816'], ['volume', 'i'], ['chapter', 'i'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', 'and', 'had', 'lived', 'nearly', 'twenty', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her'], ['she', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', 'indulgent', 'father', 'and', 'had', 'in', 'consequence', 'of', 'her', 'sister', 's', 'marriage', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period'], ['her', 'mother', 'had', 'died', 'too', 'long', 'ago', 'for', 'her', 'to', 'have', 'more', 'than', 'an', 'indistinct', 'remembrance', 'of', 'her', 'caresses', 'and', 'her', 'place', 'had', 'been', 'supplied', 'by', 'an', 'excellent', 'woman', 'as', 'governess', 'who', 'had', 'fallen', 'little', 'short', 'of', 'a', 'mother', 'in', 'affection'], ['sixteen', 'years', 'had', 'miss_taylor', 'been', 'in', 'mr_woodhouse', 's', 'family', 'less', 'as', 'a', 'governess', 'than', 'a', 'friend', 'very', 'fond', 'of', 'both', 'daughters', 'but', 'particularly', 'of', 'emma'], ['between', '_them_', 'it', 'was', 'more', 'the', 'intimacy', 'of', 'sisters'], ['even', 'before', 'miss_taylor', 'had', 'ceased', 'to', 'hold', 'the', 'nominal', 'office', 'of', 'governess', 'the', 'mildness', 'of', 'her', 'temper', 'had', 'hardly', 'allowed', 'her', 'to', 'impose', 'any', 'restraint', 'and', 'the', 'shadow', 'of', 'authority', 'being', 'now', 'long', 'passed', 'away', 'they', 'had', 'been', 'living', 'together', 'as', 'friend', 'and', 'friend', 'very', 'mutually', 'attached', 'and', 'emma', 'doing', 'just', 'what', 'she', 'liked', 'highly', 'esteeming', 'miss_taylor', 's', 'judgment', 'but', 'directed', 'chiefly', 'by', 'her', 'own']]
clean_sents[6]
['sixteen', 'years', 'had', 'miss_taylor', 'been', 'in', 'mr_woodhouse', 's', 'family', 'less', 'as', 'a', 'governess', 'than', 'a', 'friend', 'very', 'fond', 'of', 'both', 'daughters', 'but', 'particularly', 'of', 'emma']
# Training recipe, left commented out; the saved model file is downloaded
# below instead.
# max_vocab_size can be used instead of min_count.
# model = Word2Vec(sentences=clean_sents, size=64,
# sg=1, window=10, iter=5,
# min_count=10, workers=4)
# model.save('clean_gutenberg_model.w2v')
# When running on Colab, run the following command.
!wget https://git.io/Jt02A -O clean_gutenberg_model.w2v
--2022-12-07 02:50:02-- https://git.io/Jt02A Resolving git.io (git.io)... 140.82.112.22 Connecting to git.io (git.io)|140.82.112.22|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://github.com/rickiepark/dl-illustrated/raw/master/notebooks/clean_gutenberg_model.w2v [following] --2022-12-07 02:50:03-- https://github.com/rickiepark/dl-illustrated/raw/master/notebooks/clean_gutenberg_model.w2v Resolving github.com (github.com)... 20.205.243.166 Connecting to github.com (github.com)|20.205.243.166|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://raw.githubusercontent.com/rickiepark/dl-illustrated/master/notebooks/clean_gutenberg_model.w2v [following] --2022-12-07 02:50:04-- https://raw.githubusercontent.com/rickiepark/dl-illustrated/master/notebooks/clean_gutenberg_model.w2v Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 8609925 (8.2M) [application/octet-stream] Saving to: ‘clean_gutenberg_model.w2v’ clean_gutenberg_mod 100%[===================>] 8.21M --.-KB/s in 0.02s 2022-12-07 02:50:05 (372 MB/s) - ‘clean_gutenberg_model.w2v’ saved [8609925/8609925]
# Skip model training by loading the saved model with the following code.
model = gensim.models.Word2Vec.load('clean_gutenberg_model.w2v')
len(model.wv.vocab) # vocabulary size; original note: "about ten thousand if no preprocessing were done" -- TODO confirm
10329
model.wv['dog']
array([ 0.38401067, 0.01232518, -0.37594706, -0.00112308, 0.38663676, 0.01287549, 0.398965 , 0.0096426 , -0.10419296, -0.02877572, 0.3207022 , 0.27838793, 0.62772304, 0.34408906, 0.23356602, 0.24557391, 0.3398472 , 0.07168821, -0.18941355, -0.10122284, -0.35172758, 0.4038952 , -0.12179806, 0.096336 , 0.00641343, 0.02332107, 0.7743452 , 0.03591069, -0.20103034, -0.1688079 , -0.01331445, -0.29832968, 0.08522387, -0.02750671, 0.32494134, -0.14266558, -0.4192913 , -0.09291836, -0.23813559, 0.38258648, 0.11036541, 0.005807 , -0.16745028, 0.34308755, -0.20224966, -0.77683043, 0.05146591, -0.5883941 , -0.0718769 , -0.18120563, 0.00358319, -0.29351747, 0.153776 , 0.48048878, 0.22479494, 0.5465321 , 0.29695514, 0.00986911, -0.2450937 , -0.19344331, 0.3541134 , 0.3426432 , -0.10496043, 0.00543602], dtype=float32)
len(model.wv['dog'])
64
model.wv.most_similar('dog', topn=3)
[('puppy', 0.7834004163742065), ('cage', 0.7651870846748352), ('brahmin', 0.7646074295043945)]
model.wv.most_similar('eat', topn=3)
[('drink', 0.8292896747589111), ('bread', 0.8157557845115662), ('meat', 0.763256311416626)]
model.wv.most_similar('day', topn=3)
[('morning', 0.7578363418579102), ('night', 0.7324314713478088), ('week', 0.7262506484985352)]
model.wv.most_similar('father', topn=3)
[('mother', 0.8257375359535217), ('brother', 0.7275018692016602), ('sister', 0.7177823781967163)]
model.wv.most_similar('ma_am', topn=3)
[('madam', 0.8472708463668823), ('nancy', 0.8370794057846069), ('betty', 0.8337127566337585)]
model.wv.doesnt_match("mother father sister brother dog".split())
/usr/local/lib/python3.8/dist-packages/gensim/models/keyedvectors.py:895: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future. vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
'dog'
model.wv.similarity('father', 'dog')
0.44234338
# Analogy-style query: rank words by similarity to 'father' and 'woman' while
# steering away from 'man' (roughly father - man + woman).
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])
[('mother', 0.7650133371353149), ('husband', 0.7556628584861755), ('sister', 0.7482180595397949), ('daughter', 0.7390402555465698), ('wife', 0.7284981608390808), ('sarah', 0.6856439113616943), ('daughters', 0.6652647256851196), ('conceived', 0.6637862920761108), ('rebekah', 0.6580977439880371), ('dearly', 0.6398962736129761)]
# Analogy-style query: rank words by similarity to 'husband' and 'woman' while
# steering away from 'man' (roughly husband - man + woman).
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])
[('wife', 0.707526445388794), ('sister', 0.6973985433578491), ('maid', 0.6911259889602661), ('daughter', 0.6799546480178833), ('mother', 0.6583081483840942), ('child', 0.6433471441268921), ('conceived', 0.6391384601593018), ('harlot', 0.6089693307876587), ('daughters', 0.6069822907447815), ('marriage', 0.5894294381141663)]
# Project the 64-dimensional word vectors down to 2-D for plotting.
# init and learning_rate are pinned to the values this scikit-learn version
# already uses by default: the FutureWarning emitted by this cell states that
# both defaults change in 1.2, so spelling them out keeps the projection
# procedure the same after an upgrade.
tsne = TSNE(n_components=2, n_iter=1000, init='random', learning_rate=200.0)
X_2d = tsne.fit_transform(model.wv[model.wv.vocab])
/usr/local/lib/python3.8/dist-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.8/dist-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn(
# Pair each 2-D coordinate with its vocabulary token; the key order matches the
# order in which the vectors were fed to t-SNE.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y']).assign(
    token=list(model.wv.vocab.keys())
)
coords_df.head()
x | y | token | |
---|---|---|---|
0 | 7.573422 | 63.395519 | emma |
1 | -47.785515 | 18.459305 | by |
2 | 4.948685 | 63.898209 | jane |
3 | -15.264286 | 19.649439 | volume |
4 | -25.539066 | 26.625835 | i |
# Persist the 2-D coordinates and reload them from disk (presumably so the
# t-SNE step does not have to be rerun -- confirm intent).
coords_df.to_csv('clean_gutenberg_tsne.csv', index=False)
# keep_default_na=False: the token column holds arbitrary words, and pandas
# would otherwise read words such as 'null' or 'nan' back as missing values
# (NaN) instead of text.
coords_df = pd.read_csv('clean_gutenberg_tsne.csv', keep_default_na=False)
# Static overview of the whole vocabulary as small translucent dots.
_ = coords_df.plot.scatter('x', 'y', figsize=(12, 12),
                           marker='.', s=10, alpha=0.2)
# Send bokeh output to the notebook.
output_notebook()
# Draw a random sample of 5000 tokens as text labels at their 2-D positions.
subset_df = coords_df.sample(n=5000)
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df['x'], y=subset_df['y'], text=subset_df['token'])
show(p)