Download corpora and models.
import nltk
# in the downloader window, install the gutenberg corpus and the punkt model (tokenizer and sentence segmenter)
nltk.download()
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
True
Alternatively:
import nltk
nltk.download('punkt')
nltk.download('gutenberg')
[nltk_data] Downloading package punkt to /home/francolq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/francolq/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
True
from nltk.corpus import gutenberg
gutenberg.fileids()
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
gutenberg.sents('austen-emma.txt')
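sents() returns a lazy, list-like view of the file: each element is a sentence, itself a list of word strings, so it can be iterated or indexed without loading the whole corpus into memory. A quick sanity check (a minimal sketch; the exact values depend on the corpus version installed):

from nltk.corpus import gutenberg

emma_sents = gutenberg.sents('austen-emma.txt')
print(len(emma_sents))   # number of sentences in the file
print(emma_sents[0])     # first tokenized sentence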
Basic version using plain dictionaries:
count = {}
for sent in gutenberg.sents('austen-emma.txt'):
    for word in sent:
        if word in count:
            count[word] += 1
        else:
            count[word] = 1

count
{'fearless': 1, 'involvement': 1, 'instigator': 1, 'uninterruptedly': 1, 'Hughes': 3, 'Farmer': 1, 'Making': 1, 'divisions': 3, 'unpleasant': 13, 'fried': 2, 'short': 67, 'threaten': 1, 'convenient': 5, 'Something': 8, 'mails': 1, 'eaten': 3, 'faultless': 1, 'distance': 25, 'Extracts': 2, 'grandpapa': 2, 'memory': 10, 'reproached': 1, 'mine': 25, 'taking': 28, 'travels': 1, 'dinner': 47, ...}
Improved version using defaultdict:
from collections import defaultdict

count = defaultdict(int)
for sent in gutenberg.sents('austen-emma.txt'):
    for word in sent:
        count[word] += 1

print('10 most frequent words:', sorted(count.items(), key=lambda x: -x[1])[:10])
print('Vocabulary:', len(count))
print('Tokens:', sum(count.values()))
10 most frequent words: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulary: 7806
Tokens: 192484
Version using the Counter class:
from collections import Counter

count = Counter()
for sent in gutenberg.sents('austen-emma.txt'):
    count.update(sent)

print('10 most frequent words:', count.most_common(10))
print('Vocabulary:', len(count))
print('Tokens:', sum(count.values()))
10 most frequent words: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulary: 7806
Tokens: 192484
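For reference, NLTK ships its own frequency counter, nltk.FreqDist, which in current versions is a Counter subclass, so it supports most_common() as well. A minimal sketch equivalent to the version above:

from nltk import FreqDist
from nltk.corpus import gutenberg

fd = FreqDist(word for sent in gutenberg.sents('austen-emma.txt') for word in sent)
print('10 most frequent words:', fd.most_common(10))
print('Vocabulary:', len(fd))
print('Tokens:', fd.N())   # N() returns the total token count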
First, create a file example.txt with the following content: "Estimados Sr. y sra. Gómez. Se los cita por el art. 32 de la ley 21.234."
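If you prefer to create the file from Python itself, a minimal sketch (the filename example.txt is the one the corpus reader below expects to find in the current directory):

text = "Estimados Sr. y sra. Gómez. Se los cita por el art. 32 de la ley 21.234."
with open('example.txt', 'w', encoding='utf-8') as f:
    f.write(text)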
from nltk.corpus import PlaintextCorpusReader
help(PlaintextCorpusReader)
Help on class PlaintextCorpusReader in module nltk.corpus.reader.plaintext:

class PlaintextCorpusReader(nltk.corpus.reader.api.CorpusReader)
 |  Reader for corpora that consist of plaintext documents. Paragraphs
 |  are assumed to be split using blank lines. Sentences and words can
 |  be tokenized using the default tokenizers, or by custom tokenizers
 |  specificed as parameters to the constructor.
 |
 |  Methods defined here:
 |
 |  __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=56), sent_tokenizer=<nltk.tokenize.punkt.PunktSentenceTokenizer object at 0x7f60bb13f630>, para_block_reader=<function read_blankline_block at 0x7f60bb14f2f0>, encoding='utf8')
 |      Construct a new plaintext corpus reader for a set of documents
 |      located at the given root directory. Example usage:
 |
 |      >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
 |      >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP
 |
 |      :param root: The root directory for this corpus.
 |      :param fileids: A list or regexp specifying the fileids in this corpus.
 |      :param word_tokenizer: Tokenizer for breaking sentences or
 |          paragraphs into words.
 |      :param sent_tokenizer: Tokenizer for breaking paragraphs
 |          into words.
 |      :param para_block_reader: The block reader used to divide the
 |          corpus into paragraph blocks.
 |
 |  paras(self, fileids=None)
 |      :return: the given file(s) as a list of paragraphs, each encoded
 |          as a list of sentences, which are in turn encoded as lists
 |          of word strings.
 |      :rtype: list(list(list(str)))
 |
 |  raw(self, fileids=None)
 |      :return: the given file(s) as a single string.
 |      :rtype: str
 |
 |  sents(self, fileids=None)
 |      :return: the given file(s) as a list of sentences or utterances,
 |          each encoded as a list of word strings.
 |      :rtype: list(list(str))
 |
 |  words(self, fileids=None)
 |      :return: the given file(s) as a list of words
 |          and punctuation symbols.
 |      :rtype: list(str)
 |
 |  ...
corpus = PlaintextCorpusReader('.', 'example.txt')
list(corpus.sents())
[['Estimados', 'Sr', '.', 'y', 'sra', '.'], ['Gómez', '.'], ['Se', 'los', 'cita', 'por', 'el', 'art', '.'], ['32', 'de', 'la', 'ley', '21', '.', '234', '.']]
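The default word_tokenizer is a WordPunctTokenizer (visible in the __init__ signature in the help output above), which simply splits on the pattern \w+|[^\w\s]+, so every period becomes a token of its own. A quick isolated check of just the word tokenizer:

from nltk.tokenize import WordPunctTokenizer

WordPunctTokenizer().tokenize("Estimados Sr. y sra. Gómez.")
# expected: ['Estimados', 'Sr', '.', 'y', 'sra', '.', 'Gómez', '.']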
From the NLTK documentation we can take a regular expression for tokenizing:
pattern = r'''(?x)       # set flag to allow verbose regexps
      (?:[A-Z]\.)+       # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*       # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
    | \.\.\.             # ellipsis
    | [][.,;"'?():-_`]   # these are separate tokens; includes ], [
'''
Let's try it:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern)
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())
[['Estimados', 'Sr', '.', 'y', 'sra', '.'], ['Gómez', '.'], ['Se', 'los', 'cita', 'por', 'el', 'art', '.'], ['32', 'de', 'la', 'ley', '21', '.', '234', '.']]
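Note why "21.234" still comes out split even though the pattern has a \$?\d+(?:\.\d+)?%? alternative: regex alternation tries the branches left to right, and \w+ appears first, so it grabs "21" and the number branch never gets a chance. A quick check with the same tokenizer:

tokenizer.tokenize("la ley 21.234")
# expected: ['la', 'ley', '21', '.', '234']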
We see that it tokenizes all the abbreviations and the number "21.234" incorrectly. We improve the regular expression, putting the thousands-separator numbers first and adding the Spanish abbreviations explicitly, and try again:
pattern = r'''(?x)                # set flag to allow verbose regexps
      (?:\d{1,3}(?:\.\d{3})+)     # numbers with '.' as thousands separator, e.g. 21.234
    | (?:[Ss]r\.|[Ss]ra\.|art\.)  # common Spanish abbreviations
    | (?:[A-Z]\.)+                # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*                # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                      # ellipsis
    | [][.,;"'?():-_`]            # these are separate tokens; includes ], [
'''
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern)
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())
[['Estimados', 'Sr.', 'y', 'sra.'], ['Gómez', '.'], ['Se', 'los', 'cita', 'por', 'el', 'art.'], ['32', 'de', 'la', 'ley', '21.234', '.']]
Now it tokenizes correctly!
(Sentence segmentation is still wrong, but fixing that is outside the scope of this class.)
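For reference only, one possible direction: the Punkt segmenter can be given a set of known abbreviations. A hedged sketch, assuming the abbreviation set below is enough for this small example:

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

punkt_params = PunktParameters()
punkt_params.abbrev_types = {'sr', 'sra', 'art'}   # assumed abbreviation list for this example
sent_tokenizer = PunktSentenceTokenizer(punkt_params)

corpus = PlaintextCorpusReader('.', 'example.txt',
                               word_tokenizer=tokenizer,
                               sent_tokenizer=sent_tokenizer)
list(corpus.sents())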