#!/usr/bin/env python
# coding: utf-8

# ## Text analysis with Python
#
# Copyright 2019 Allen Downey
#
# [MIT License](https://opensource.org/licenses/MIT)

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt

# Word Frequencies
# ----------------
#
# Let's look at frequencies of words, bigrams and trigrams in a text.
#
# The following function reads lines from a file and splits them into words:

# In[2]:

def iterate_words(filename):
    """Read lines from a text file and yield whitespace-delimited words.

    The file is opened in a `with` block so the handle is closed
    deterministically rather than waiting for garbage collection.
    """
    with open(filename) as f:
        for line in f:
            # split() with no separator splits on runs of whitespace and
            # never yields empty or whitespace-padded tokens, so no
            # additional strip() is needed.
            yield from line.split()

# Here's an example using a book. `wc` is a Counter of words, that is, a
# dictionary that maps from each word to the number of times it appears:

# In[3]:

import os

# Originally from https://archive.org/stream/TheFaultInOurStarsJohnGreen/The+Fault+In+Our+Stars+-+John+Green_djvu.txt
filename = 'the_fault_in_our_stars.txt'

if not os.path.exists(filename):
    get_ipython().system('wget https://raw.githubusercontent.com/AllenDowney/CompStats/master/the_fault_in_our_stars.txt')

# In[4]:

from collections import Counter

wc = Counter(iterate_words(filename))

# Here are the 20 most common words:

# In[5]:

wc.most_common(20)

# Word frequencies in natural languages follow a predictable pattern called
# Zipf's law (which is an instance of Stigler's law, which is also an
# instance of Stigler's law).
#
# We can see the pattern by lining up the words in descending order of
# frequency and plotting their counts (6507, 5250, 2707) versus ranks
# (1st, 2nd, 3rd, ...):

# In[6]:

def counter_ranks(wc):
    """Return parallel sequences of ranks and counts.

    Counts are sorted in descending order and paired with 1-based ranks
    (1st, 2nd, 3rd, ...), so the result can be plotted on a log-scale
    x-axis without producing an unplottable rank of zero.
    """
    counts = sorted(wc.values(), reverse=True)
    return zip(*enumerate(counts, start=1))

# In[7]:

ranks, counts = counter_ranks(wc)
plt.plot(ranks, counts)
plt.xlabel('Rank')
plt.ylabel('Count')
plt.title('Word count versus rank, linear scale');

# Huh. Maybe that's not so clear after all. The problem is that the counts
# drop off very quickly.
# If we use the highest count to scale the figure, most of the other
# counts are indistinguishable from zero.
#
# Also, there are more than 10,000 words, but most of them appear only a
# few times, so we are wasting most of the space in the figure in a
# regime where nothing is happening.
#
# This kind of thing happens a lot. A common way to deal with it is to
# compute the log of the quantities or to plot them on a log scale:

# In[8]:

ranks, counts = counter_ranks(wc)
plt.plot(ranks, counts)
plt.xlabel('Rank')
plt.ylabel('Count')
plt.xscale('log')
plt.yscale('log')
plt.title('Word count versus rank, log-log scale');

# This (approximately) straight line is characteristic of Zipf's law.
#
# n-grams
# -------
#
# On to the next topic: bigrams and trigrams.

# In[9]:

from itertools import tee

def pairwise(iterator):
    """Iterate through a sequence in overlapping pairs.

    If the sequence is 1, 2, 3, 4, the result is (1, 2), (2, 3), (3, 4).
    A sequence with fewer than two elements yields nothing.
    """
    a, b = tee(iterator)
    # Advance the second copy by one; the default value keeps a
    # too-short input from raising StopIteration here.
    next(b, None)
    return zip(a, b)

# `bigrams` is the histogram of word pairs:

# In[10]:

bigrams = Counter(pairwise(iterate_words(filename)))

# And here are the 20 most common:

# In[11]:

bigrams.most_common(20)

# Similarly, we can iterate the trigrams:

# In[12]:

def triplewise(iterator):
    """Iterate through a sequence in overlapping triples.

    If the sequence is 1, 2, 3, 4, the result is (1, 2, 3), (2, 3, 4).
    A sequence with fewer than three elements yields nothing.
    """
    a, b, c = tee(iterator, 3)
    # Use the same defaulted next() as pairwise() so that short inputs
    # produce an empty result instead of raising StopIteration.
    next(b, None)
    next(c, None)
    next(c, None)
    return zip(a, b, c)

# And make a Counter:

# In[13]:

trigrams = Counter(triplewise(iterate_words(filename)))

# Here are the 20 most common:

# In[14]:

trigrams.most_common(20)

# ### Markov analysis
#
# And now for a little fun. I'll make a dictionary that maps from each
# word pair to a Counter of the words that can follow.
# In[15]:

from collections import defaultdict

# `d` maps each word pair (prefix) to a Counter of the words that can
# follow it (suffixes), weighted by how often each trigram occurred.
d = defaultdict(Counter)

# Iterate items() so each trigram's count arrives with its key, instead
# of re-hashing the same tuple for a second lookup on every iteration.
for (a, b, c), count in trigrams.items():
    d[a, b][c] += count

# Now we can look up a pair and see what might come next:

# In[16]:

d['I', 'said']

# Here are the most common words that follow "into the":

# In[17]:

d['into', 'the'].most_common(10)

# The following function chooses a random word from the suffixes in a Counter:

# In[18]:

import random

def choice(counter):
    """Choose a random element from a Counter, weighted by its counts.

    random.choices() draws directly with the counts as weights, which
    avoids materializing list(counter.elements()) -- a list whose length
    is the *total* count rather than the number of distinct suffixes.
    """
    return random.choices(list(counter), weights=counter.values())[0]

# In[19]:

choice(d['into', 'the'])

# Given a prefix, we can choose a random suffix:

# In[20]:

prefix = 'into', 'the'
suffix = choice(d[prefix])
suffix

# Then we can shift the words and compute the next prefix:

# In[21]:

prefix = prefix[1], suffix
prefix

# Repeating this process, we can generate random new text that has the
# same correlation structure between words as the original:

# In[22]:

for i in range(100):
    suffix = choice(d[prefix])
    print(suffix, end=' ')
    prefix = prefix[1], suffix

# With a prefix of two words, we typically get text that flirts with
# sensibility.

# In[ ]: