Explore harvested text files

In [ ]:
import os
import pandas as pd
import fileinput
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob
from operator import itemgetter
from pathlib import Path
import nltk
import numpy as np
import altair as alt

nltk.download('stopwords')
nltk.download('punkt')

stopwords = nltk.corpus.stopwords.words('english')
# Add a couple of common OCR errors for 'the'
stopwords += ['tho', 'tbe']

# Are you using Jupyter Lab?
# If so, either don't run this cell or leave the line below commented out

# alt.renderers.enable('notebook')

# If you forget and run this cell, and then get strange warnings when you make a chart,
# uncomment the following line and run the cell again to reset the chart renderer

# alt.renderers.enable('default')

# alt.data_transformers.enable('json')
In [ ]:
# Import a harvest zip file you've created previously
# First upload the zip file to the data directory, then run this cell
import zipfile
for zipped in sorted(Path('data').glob('*.zip')):
    print(f'Unzipping {zipped}...')
    with zipfile.ZipFile(zipped, 'r') as zip_file:
        zip_file.extractall(Path(f'data/{zipped.stem}'))
In [ ]:
def get_latest_harvest():
    '''
    Get the path of the most recent harvest directory.
    '''
    harvests = sorted([d for d in Path('data').iterdir() if d.is_dir() and not d.name.startswith('.')])
    try:
        harvest = harvests[-1]
    except IndexError:
        print('No harvests!')
        harvest = None
    return harvest
In [ ]:
def get_docs(harvest):
    '''Yield the text of each file in the harvest.'''
    docs_path = get_docs_path(harvest)
    for p in docs_path:
        yield p.read_text(encoding='utf-8').strip()

def get_docs_path(harvest):
    '''Get a sorted list of paths to the text files in the harvest.'''
    path = Path(harvest, 'text')
    docs_path = sorted(path.glob('*.txt'))
    return docs_path

def get_file_names(harvest):
    '''Get the file names (minus extensions) of the harvested text files.'''
    return [p.stem for p in get_docs_path(harvest)]
In [ ]:
harvest = get_latest_harvest()
In [ ]:
harvest
In [ ]:
# Count word frequencies across all the harvested documents
vectorizer = CountVectorizer(stop_words=frozenset(stopwords), max_features=10000, ngram_range=(1,1))
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())
# Note: use get_feature_names() instead of get_feature_names_out() on scikit-learn versions before 1.0
df_freq = pd.DataFrame(X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest))
In [ ]:
df_freq.sum().nlargest(20)
In [ ]:
# Reshape into a long dataframe with one row per document/word pair
df_freq.unstack().to_frame().reset_index().dropna(axis=0, subset=[0])
In [ ]:
%%time
# The number of words you want to show for each document
num_words = 10
# Build a dataframe listing the top `num_words` most frequent words in each document
top_words = pd.DataFrame({n: df_freq.T[col].nlargest(num_words).index.tolist() for n, col in enumerate(df_freq.T)}).T
top_words.index = get_file_names(harvest)
top_words.head()
In [ ]:
df_freq.T

Add a 'year' column to the dataframe

Each file name includes the date on which the article was published. For example, 18601224-13-5696044 was published on 24 December 1860. We can easily extract the year by just slicing the first four characters off the index.

In [ ]:
df_freq['article_year'] = df_freq.index.str.slice(0, 4)
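
If you want the full publication date rather than just the year, you could parse the first eight characters of each file name instead. This is an optional sketch, not part of the original workflow, and it assumes every file name starts with an eight-digit YYYYMMDD date as in the example above.

In [ ]:
# Optional: parse the YYYYMMDD prefix of the file names into proper dates
# (kept as a separate series so it doesn't interfere with the grouping below)
article_dates = pd.to_datetime(df_freq.index.str.slice(0, 8), format='%Y%m%d')
article_dates[:5]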

Most frequent words each year

In [ ]:
# Group by year and sum the word counts
year_groups = df_freq.groupby(by='article_year')
year_group_totals = year_groups.sum()
In [ ]:
# Reshape so that we have columns for year, word, and count
words_by_year = year_group_totals.unstack().to_frame().reset_index()
words_by_year.columns = ['word', 'year', 'count']
In [ ]:
top_words_by_year = words_by_year.sort_values('count', ascending=False).groupby(by=['year']).head(10).reset_index(drop=True)
In [ ]:
top_words_by_year['word'].value_counts()[:25]

Visualise top ten words per year

In [ ]:
alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y('word:N', sort='-x'),
    x='count:Q',
    facet=alt.Facet('year', columns=4)
).properties(
    width=120, height=120
).resolve_scale(
    x='independent',
    y='independent'
)
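
If you'd like to keep a copy of this chart outside the notebook, Altair charts can be saved as standalone HTML files. The sketch below simply re-creates the chart above, assigns it to a variable, and saves it; the file name is just an example.

In [ ]:
# Optional: save the faceted chart as a standalone HTML file
chart = alt.Chart(top_words_by_year).mark_bar().encode(
    y=alt.Y('word:N', sort='-x'),
    x='count:Q',
    facet=alt.Facet('year', columns=4)
).properties(
    width=120, height=120
).resolve_scale(
    x='independent',
    y='independent'
)
chart.save('top_words_by_year.html')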

Visualise word frequencies over time

Create a faceted chart

In [ ]:
alt.Chart(words_by_year.loc[words_by_year['word'].isin(['storm', 'cyclone', 'snow'])]).mark_line().encode(
    x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),
    y='count:Q',
    color='word:N',
    facet=alt.Facet('word:N', columns=1)
).properties(width=700, height=100).resolve_scale(
    y='independent'
)
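
Keep in mind that these are raw counts, so a peak might just reflect a year with more harvested articles. One way to check, sketched below as an optional extra step, is to plot each word's count as a proportion of all the word counts for that year.

In [ ]:
# Optional: convert counts to proportions of each year's total word counts
words_by_year['proportion'] = words_by_year['count'] / words_by_year.groupby('year')['count'].transform('sum')

alt.Chart(words_by_year.loc[words_by_year['word'].isin(['storm', 'cyclone', 'snow'])]).mark_line().encode(
    x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),
    y='proportion:Q',
    color='word:N',
    facet=alt.Facet('word:N', columns=1)
).properties(width=700, height=100).resolve_scale(
    y='independent'
)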

Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.