def get_text(file_path, *, start_marker='*** START OF THIS PROJECT GUTENBERG',
             encoding=None):
    """Yield the lines of a text file that come after its header marker line.

    The file is read lazily. Every line up to and including the first line
    that starts with *start_marker* is skipped; each later line is yielded
    unchanged, line ending included. If no line matches, nothing is yielded.

    Args:
        file_path: Path of the text file to read.
        start_marker: Prefix, or tuple of prefixes, identifying the last
            line of the preamble. The default is the marker this function
            has always looked for.
        encoding: Forwarded to ``open()``; ``None`` keeps the platform
            default, as before.

    Yields:
        str: Each line that follows the marker line.
    """
    with open(file_path, encoding=encoding) as f:
        # Consume the preamble, stopping right after the marker line.
        for line in f:
            if line.startswith(start_marker):
                break
        # The file iterator resumes where the loop left off; if the marker
        # was never found it is already exhausted and this yields nothing.
        yield from f
import string
import re
def clean_text(lines):
    """Yield each line with everything but ASCII letters and whitespace removed.

    Whitespace here means every character in ``string.whitespace``, so
    newlines and tabs are kept along with plain spaces. Digits, punctuation
    and non-ASCII characters are deleted, not replaced, so the text on either
    side of a removed run is joined together.

    Args:
        lines: Iterable of strings.

    Yields:
        str: The filtered version of each input line, in order.
    """
    allowed = string.ascii_letters + string.whitespace
    # One or more consecutive characters that are NOT in the allowed set.
    strip_disallowed = re.compile('[^' + allowed + ']+').sub
    yield from (strip_disallowed("", raw_line) for raw_line in lines)
def break_into_words(lines):
    """Yield every word found in *lines*, lower-cased.

    Words are separated by any run of whitespace (spaces, tabs, newlines),
    so a word at the end of a line does not carry its trailing newline and
    blank lines contribute nothing.

    Args:
        lines: Iterable of strings.

    Yields:
        str: Each non-empty word, in order of appearance, in lower case.
    """
    for line in lines:
        # split() with no argument splits on any whitespace and never
        # produces empty strings, so no separate emptiness check is needed.
        for word in line.split():
            yield word.lower()
from collections import Counter
# Demo: chain the three generators (get_text -> clean_text -> break_into_words)
# and feed the resulting word stream into a Counter to get word frequencies
# for 'demo.txt'.
cnt = Counter(break_into_words(clean_text(get_text('demo.txt'))))
# Ten most frequent words as (word, count) pairs.
cnt.most_common(10)
# NOTE(review): the literal below appears to be interpreter output of the call
# above, pasted into the file; as a statement it is evaluated and discarded.
[('the', 3894), ('and', 2794), ('i', 2596), ('of', 2537), ('to', 2005), ('my', 1574), ('a', 1301), ('in', 1083), ('that', 973), ('was', 948)]
def push_counter(counter):
    """Coroutine that records every value sent to it in *counter*.

    The caller must advance the coroutine once with ``next()`` before the
    first ``send()``. Each sent value is then passed to ``counter.update``
    as a one-element list. The loop never ends on its own.

    Args:
        counter: Object with an ``update(iterable)`` method.
    """
    while True:
        # Suspend here until the caller sends the next item.
        received = yield
        counter.update([received])
def count_words_and_letters(file_name):
    """Count word and character frequencies of a text file in a single pass.

    The word stream produced by ``break_into_words(clean_text(get_text(...)))``
    is pushed into two ``push_counter`` coroutines: one receives each word,
    the other receives each character of each word.

    Args:
        file_name: Path of the text file, passed to ``get_text``.

    Returns:
        tuple: ``(word_count, letter_count)``, two ``Counter`` objects.
    """
    word_count = Counter()
    letter_count = Counter()
    word_counter = push_counter(word_count)
    letter_counter = push_counter(letter_count)
    # Prime both coroutines so they are paused at their first `yield` and
    # ready to accept send().
    next(word_counter)
    next(letter_counter)
    try:
        for word in break_into_words(clean_text(get_text(file_name))):
            # push_counter loops forever, so send() cannot raise
            # StopIteration; any exception here is a real error and is
            # allowed to propagate instead of being silently swallowed.
            word_counter.send(word)
            for letter in word:
                letter_counter.send(letter)
    finally:
        # Shut the coroutines down explicitly instead of leaving them
        # suspended until garbage collection.
        word_counter.close()
        letter_counter.close()
    return word_count, letter_count
# Demo: get word and letter frequencies for 'demo.txt' from one call.
wc,lc = count_words_and_letters('demo.txt')
# Ten most frequent words as (word, count) pairs.
wc.most_common(10)
# NOTE(review): the literal below appears to be pasted interpreter output of
# the call above; as a statement it is evaluated and discarded.
[('the', 3894), ('and', 2794), ('i', 2596), ('of', 2537), ('to', 2005), ('my', 1574), ('a', 1301), ('in', 1083), ('that', 973), ('was', 948)]
# Twenty most frequent characters as (character, count) pairs.
lc.most_common(20)
# NOTE(review): apparently pasted interpreter output again. The '\n' entry
# shows that newline characters were counted as letters in this recorded run.
[('e', 45984), ('t', 30322), ('a', 26720), ('o', 25188), ('i', 24591), ('n', 24333), ('s', 21130), ('r', 20788), ('h', 19709), ('d', 16855), ('l', 12723), ('m', 10597), ('u', 10394), ('c', 9233), ('f', 8722), ('y', 7903), ('\n', 7634), ('w', 7626), ('p', 6116), ('g', 5960)]