# Credit: Data8.org
# Standard setup code that appears at the top of each notebook;
# you don't need to change anything in it.
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
from urllib.request import urlopen
import re
def read_url(url):
    """Download a text resource and collapse its whitespace.

    Args:
        url: Address handed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8, with every run of whitespace
        characters (spaces, tabs, newlines) replaced by a single space.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if reading or decoding
    # raises; previously the response object was never closed.
    with urlopen(url) as response:
        raw_text = response.read().decode('utf-8')
    return re.sub(r'\s+', ' ', raw_text)
# Quick arithmetic check: the expression evaluates to 5 (displayed as the
# cell output when it is the last expression of a notebook cell).
2 + 3
# Fetch the full text of two novels and cut each one into chapters.
huck_finn_url = 'https://www.inferentialthinking.com/data/huck_finn.txt'
little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'

# Huckleberry Finn: split on the 'CHAPTER ' marker and drop the first 44
# pieces (presumably front matter and table-of-contents entries -- confirm
# against the text), leaving one string per chapter.
huck_finn_text = read_url(huck_finn_url)
huck_finn_chapters = huck_finn_text.split('CHAPTER ')[44:]

# Little Women: only the piece before the first 'CHAPTER ' marker is dropped.
little_women_text = read_url(little_women_url)
little_women_chapters = little_women_text.split('CHAPTER ')[1:]
# Display the chapters of Huckleberry Finn, one row per chapter.
Table().with_column('Chapters', huck_finn_chapters)

# Count how many times the names Jim, Tom, and Huck appear in each chapter.
# NOTE: np.char.count counts substring occurrences, so a name is also
# counted when it occurs inside a longer word (e.g. 'Tom' in "Tommy").
counts = Table().with_columns([
    'Jim', np.char.count(huck_finn_chapters, 'Jim'),
    'Tom', np.char.count(huck_finn_chapters, 'Tom'),
    'Huck', np.char.count(huck_finn_chapters, 'Huck'),
])

# Plot the cumulative counts:
# how many times in Chapter 1, how many times in Chapters 1 and 2, and so on.
# The chapter numbers are derived from the number of chapters actually
# parsed rather than a hard-coded range, so the new column always has the
# same length as the table.
cum_counts = counts.cumsum().with_column(
    'Chapter', np.arange(1, len(huck_finn_chapters) + 1))
# Select the x-axis column by its label instead of by position, so the call
# does not depend on how many name columns precede it.
cum_counts.plot(column_for_xticks='Chapter')
# The trailing semicolon suppresses the echoed return value in a notebook.
plots.title('Cumulative Number of Times Name Appears');
# The chapters of Little Women, one row per chapter.
Table().with_column('Chapters', little_women_chapters)

# Counts of names in the chapters of Little Women.
# NOTE: np.char.count counts substring occurrences, so a short name is also
# counted inside longer words (e.g. 'Jo' in "John").
people = ['Amy', 'Beth', 'Jo', 'Laurie', 'Meg']
people_counts = {pp: np.char.count(little_women_chapters, pp) for pp in people}
# Build the alternating [label, values, label, values, ...] list directly
# from `people`, so the table columns cannot drift from the list of names.
counts = Table().with_columns([
    item for pp in people for item in (pp, people_counts[pp])
])

# Plot the cumulative counts.
# The chapter numbers are derived from the number of chapters actually
# parsed rather than a hard-coded range, so the new column always has the
# same length as the table.
cum_counts = counts.cumsum().with_column(
    'Chapter', np.arange(1, len(little_women_chapters) + 1))
# Select the x-axis column by its label instead of by position, so the call
# does not depend on the number of entries in `people`.
cum_counts.plot(column_for_xticks='Chapter')
# The trailing semicolon suppresses the echoed return value in a notebook.
plots.title('Cumulative Number of Times Name Appears');
# For every chapter, record its "length" (the total number of characters)
# alongside the number of periods it contains.
chars_periods_hf = Table().with_columns([
    'HF Chapter Length', list(map(len, huck_finn_chapters)),
    'Number of Periods', np.char.count(huck_finn_chapters, '.'),
])
chars_periods_lw = Table().with_columns([
    'LW Chapter Length', list(map(len, little_women_chapters)),
    'Number of Periods', np.char.count(little_women_chapters, '.'),
])

# The counts for Huckleberry Finn
chars_periods_hf

# The counts for Little Women
chars_periods_lw

# Scatter plot with periods on the x-axis and characters on the y-axis:
# Huckleberry Finn chapters in dark blue, Little Women chapters in gold.
plots.figure(figsize=(10, 10))
plots.scatter(
    chars_periods_hf['Number of Periods'],
    chars_periods_hf['HF Chapter Length'],
    color='darkblue',
)
plots.scatter(
    chars_periods_lw['Number of Periods'],
    chars_periods_lw['LW Chapter Length'],
    color='gold',
)
plots.xlabel('Number of periods in chapter')
# The trailing semicolon suppresses the echoed return value in a notebook.
plots.ylabel('Number of characters in chapter');