In [8]:
def get_text(file_path):
    '''Skip preamble'''
    past_header = False
    with open(file_path) as f:
        for line in f:
            if past_header:
                yield line
            else:
                if line.startswith('*** START OF THIS PROJECT GUTENBERG'):
                    past_header = True
In [9]:
import string
import re
def clean_text(lines):
    '''Remove all but ASCII alpha and spaces'''
    valid_chars = string.ascii_letters + string.whitespace
    match = '[^' + valid_chars + ']+'
    pattern = re.compile(match)
    for line in lines:
        yield pattern.sub("", line)
In [10]:
def break_into_words(lines):
    for line in lines:
        for word in line.split(' '):
            if word: # check for null strings
                yield word.lower()
In [11]:
from collections import Counter
cnt = Counter(break_into_words(clean_text(get_text('demo.txt'))))
In [12]:
cnt.most_common(10)
Out[12]:
[('the', 3894),
 ('and', 2794),
 ('i', 2596),
 ('of', 2537),
 ('to', 2005),
 ('my', 1574),
 ('a', 1301),
 ('in', 1083),
 ('that', 973),
 ('was', 948)]
In [13]:
def push_counter(counter):
    while True:
        value = yield
        counter.update([value])

def count_words_and_letters(file_name):
    word_count = Counter()
    letter_count = Counter()
    
    word_counter = push_counter(word_count)
    letter_counter = push_counter(letter_count)
    next(word_counter)
    next(letter_counter)
    
    for word in break_into_words(clean_text(get_text(file_name))):
        try:
            word_counter.send(word)
        except StopIteration:
            pass
        for letter in word:
            try:
                letter_counter.send(letter)
            except StopIteration:
                pass
    return word_count,letter_count

wc,lc = count_words_and_letters('demo.txt')
In [14]:
wc.most_common(10)
Out[14]:
[('the', 3894),
 ('and', 2794),
 ('i', 2596),
 ('of', 2537),
 ('to', 2005),
 ('my', 1574),
 ('a', 1301),
 ('in', 1083),
 ('that', 973),
 ('was', 948)]
In [16]:
lc.most_common(20)
Out[16]:
[('e', 45984),
 ('t', 30322),
 ('a', 26720),
 ('o', 25188),
 ('i', 24591),
 ('n', 24333),
 ('s', 21130),
 ('r', 20788),
 ('h', 19709),
 ('d', 16855),
 ('l', 12723),
 ('m', 10597),
 ('u', 10394),
 ('c', 9233),
 ('f', 8722),
 ('y', 7903),
 ('\n', 7634),
 ('w', 7626),
 ('p', 6116),
 ('g', 5960)]
In [ ]: