Download corpora and models.
import nltk
# in the downloader window, install the gutenberg corpus and the punkt model (tokenizer and sentence segmenter)
nltk.download()
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
True
Alternatively:
import nltk
nltk.download('punkt')
nltk.download('gutenberg')
[nltk_data] Downloading package punkt to /home/francolq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/francolq/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
True
from nltk.corpus import gutenberg
gutenberg.fileids()
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
gutenberg.sents('austen-emma.txt')
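sents() returns a lazy, list-like view of the file: each element is a sentence, itself a list of word strings, so it can be iterated or indexed without loading the whole corpus into memory. A quick sanity check (a minimal sketch; the exact values depend on the corpus version installed):

from nltk.corpus import gutenberg

emma_sents = gutenberg.sents('austen-emma.txt')
print(len(emma_sents))   # number of sentences in the file
print(emma_sents[0])     # first tokenized sentence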
Basic version using plain dictionaries:
count = {}
for sent in gutenberg.sents('austen-emma.txt'):
    for word in sent:
        if word in count:
            count[word] += 1
        else:
            count[word] = 1

count
{'fearless': 1, 'involvement': 1, 'instigator': 1, 'uninterruptedly': 1, 'Hughes': 3, 'Farmer': 1, 'Making': 1, 'divisions': 3, 'unpleasant': 13, 'fried': 2, 'short': 67, 'threaten': 1, 'convenient': 5, 'Something': 8, 'mails': 1, 'eaten': 3, 'faultless': 1, 'distance': 25, 'Extracts': 2, 'grandpapa': 2, 'memory': 10, 'reproached': 1, 'mine': 25, 'taking': 28, 'travels': 1, 'dinner': 47, ...}
Improved version using defaultdict:
from collections import defaultdict

count = defaultdict(int)
for sent in gutenberg.sents('austen-emma.txt'):
    for word in sent:
        count[word] += 1

print('10 most frequent words:', sorted(count.items(), key=lambda x: -x[1])[:10])
print('Vocabulary:', len(count))
print('Tokens:', sum(count.values()))
10 most frequent words: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulary: 7806
Tokens: 192484
Version using the Counter class:
from collections import Counter

count = Counter()
for sent in gutenberg.sents('austen-emma.txt'):
    count.update(sent)

print('10 most frequent words:', count.most_common(10))
print('Vocabulary:', len(count))
print('Tokens:', sum(count.values()))
10 most frequent words: [(',', 11454), ('.', 6928), ('to', 5183), ('the', 4844), ('and', 4672), ('of', 4279), ('I', 3178), ('a', 3004), ('was', 2385), ('her', 2381)]
Vocabulary: 7806
Tokens: 192484
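For reference, NLTK ships its own frequency counter, nltk.FreqDist, which in current versions is a Counter subclass, so it supports most_common() as well. A minimal sketch equivalent to the version above:

from nltk import FreqDist
from nltk.corpus import gutenberg

fd = FreqDist(word for sent in gutenberg.sents('austen-emma.txt') for word in sent)
print('10 most frequent words:', fd.most_common(10))
print('Vocabulary:', len(fd))
print('Tokens:', fd.N())   # N() returns the total token count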
First, create a file example.txt with the following content: "Estimados Sr. y sra. Gómez. Se los cita por el art. 32 de la ley 21.234."
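If you prefer to create the file from Python itself, a minimal sketch (the filename example.txt is the one the corpus reader below expects to find in the current directory):

text = "Estimados Sr. y sra. Gómez. Se los cita por el art. 32 de la ley 21.234."
with open('example.txt', 'w', encoding='utf-8') as f:
    f.write(text)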
from nltk.corpus import PlaintextCorpusReader
help(PlaintextCorpusReader)
Help on class PlaintextCorpusReader in module nltk.corpus.reader.plaintext:

class PlaintextCorpusReader(nltk.corpus.reader.api.CorpusReader)
 |  Reader for corpora that consist of plaintext documents. Paragraphs
 |  are assumed to be split using blank lines. Sentences and words can
 |  be tokenized using the default tokenizers, or by custom tokenizers
 |  specificed as parameters to the constructor.
 |
 |  Methods defined here:
 |
 |  __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=56), sent_tokenizer=<nltk.tokenize.punkt.PunktSentenceTokenizer object at 0x7f60bb13f630>, para_block_reader=<function read_blankline_block at 0x7f60bb14f2f0>, encoding='utf8')
 |      Construct a new plaintext corpus reader for a set of documents
 |      located at the given root directory. Example usage:
 |
 |      >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
 |      >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP
 |
 |      :param root: The root directory for this corpus.
 |      :param fileids: A list or regexp specifying the fileids in this corpus.
 |      :param word_tokenizer: Tokenizer for breaking sentences or
 |          paragraphs into words.
 |      :param sent_tokenizer: Tokenizer for breaking paragraphs
 |          into words.
 |      :param para_block_reader: The block reader used to divide the
 |          corpus into paragraph blocks.
 |
 |  paras(self, fileids=None)
 |      :return: the given file(s) as a list of paragraphs, each encoded
 |          as a list of sentences, which are in turn encoded as lists
 |          of word strings.
 |      :rtype: list(list(list(str)))
 |
 |  raw(self, fileids=None)
 |      :return: the given file(s) as a single string.
 |      :rtype: str
 |
 |  sents(self, fileids=None)
 |      :return: the given file(s) as a list of sentences or utterances,
 |          each encoded as a list of word strings.
 |      :rtype: list(list(str))
 |
 |  words(self, fileids=None)
 |      :return: the given file(s) as a list of words
 |          and punctuation symbols.
 |      :rtype: list(str)
 |
 |  ...
corpus = PlaintextCorpusReader('.', 'example.txt')
list(corpus.sents())
[['Estimados', 'Sr', '.', 'y', 'sra', '.'], ['Gómez', '.'], ['Se', 'los', 'cita', 'por', 'el', 'art', '.'], ['32', 'de', 'la', 'ley', '21', '.', '234', '.']]
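The default word_tokenizer is a WordPunctTokenizer (visible in the __init__ signature in the help output above), which simply splits on the pattern \w+|[^\w\s]+, so every period becomes a token of its own. A quick isolated check of just the word tokenizer:

from nltk.tokenize import WordPunctTokenizer

WordPunctTokenizer().tokenize("Estimados Sr. y sra. Gómez.")
# expected: ['Estimados', 'Sr', '.', 'y', 'sra', '.', 'Gómez', '.']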
From the NLTK documentation we can take a regular expression for tokenizing:
pattern = r'''(?x)       # set flag to allow verbose regexps
      (?:[A-Z]\.)+       # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*       # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
    | \.\.\.             # ellipsis
    | [][.,;"'?():-_`]   # these are separate tokens; includes ], [
'''
Let's try it:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern)
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())
[['Estimados', 'Sr', '.', 'y', 'sra', '.'], ['Gómez', '.'], ['Se', 'los', 'cita', 'por', 'el', 'art', '.'], ['32', 'de', 'la', 'ley', '21', '.', '234', '.']]
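Note why "21.234" still comes out split even though the pattern has a \$?\d+(?:\.\d+)?%? alternative: regex alternation tries the branches left to right, and \w+ appears first, so it grabs "21" and the number branch never gets a chance. A quick check with the same tokenizer:

tokenizer.tokenize("la ley 21.234")
# expected: ['la', 'ley', '21', '.', '234']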
We see that it tokenizes all the abbreviations and the number "21.234" incorrectly. We improve the regular expression, putting the thousands-separator numbers first and adding the Spanish abbreviations explicitly, and try again:
pattern = r'''(?x)                # set flag to allow verbose regexps
      (?:\d{1,3}(?:\.\d{3})+)     # numbers with '.' as thousands separator, e.g. 21.234
    | (?:[Ss]r\.|[Ss]ra\.|art\.)  # common Spanish abbreviations
    | (?:[A-Z]\.)+                # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*                # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                      # ellipsis
    | [][.,;"'?():-_`]            # these are separate tokens; includes ], [
'''
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern)
corpus = PlaintextCorpusReader('.', 'example.txt', word_tokenizer=tokenizer)
list(corpus.sents())
[['Estimados', 'Sr.', 'y', 'sra.'], ['Gómez', '.'], ['Se', 'los', 'cita', 'por', 'el', 'art.'], ['32', 'de', 'la', 'ley', '21.234', '.']]
Now it tokenizes correctly!
(Sentence segmentation is still wrong, but fixing that is outside the scope of this class.)
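For reference only, one possible direction: the Punkt segmenter can be given a set of known abbreviations. A hedged sketch, assuming the abbreviation set below is enough for this small example:

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

punkt_params = PunktParameters()
punkt_params.abbrev_types = {'sr', 'sra', 'art'}   # assumed abbreviation list for this example
sent_tokenizer = PunktSentenceTokenizer(punkt_params)

corpus = PlaintextCorpusReader('.', 'example.txt',
                               word_tokenizer=tokenizer,
                               sent_tokenizer=sent_tokenizer)
list(corpus.sents())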