Word frequencies in digitised journals

This notebook uses word frequency to explore the OCRd texts harvested from Trove's digitised journals. More documentation coming...

In [ ]:
import re
import tarfile
import zipfile
from io import BytesIO
import pandas as pd
import requests
from tqdm.auto import tqdm
import altair as alt
import os
from pathlib import Path
import ipywidgets as widgets
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

Select a journal

Create a dropdown widget to select a digitised journal. The cells below will use this widget to get the value of the currently selected journal.

In [ ]:
# Load details of digitised journals from CSV
df_journals = pd.read_csv('digital-journals-with-text.csv').sort_values(by='title')
journal_list = [
    (f"{j['title']} ({j['issues_with_text']} issues)", j['directory'])
    for j in df_journals[['title', 'directory', 'issues_with_text']].to_dict('records')
]
journals = widgets.Dropdown(options=journal_list, disabled=False)
display(journals)

Download all the issues of the journal

In [ ]:
def get_docs_path(journal):
    '''
    Get a sorted list of paths to the issue text files for the given journal.
    '''
    path = os.path.join('downloads', journal, 'texts')
    return sorted(Path(path).glob('*.txt'))

def download_journal(journal):
    '''
    Download and unzip the OCRd texts of all the issues of the given journal.
    '''
    path = os.path.join('downloads', journal)
    os.makedirs(path, exist_ok=True)
    params = {
        'path': f'/{journal}/texts'
    }
    response = requests.get('https://cloudstor.aarnet.edu.au/plus/s/QOmnqpGQCNCSC2h/download', params=params)
    # Make a failed download obvious, rather than raising a confusing BadZipFile error below
    response.raise_for_status()
    zipped = zipfile.ZipFile(BytesIO(response.content))
    zipped.extractall(path)
    print(f'{len(get_docs_path(journal))} issues downloaded')
In [ ]:
download_journal(journals.value)

Calculate word frequencies

In [ ]:
def get_docs(journal):
    '''
    Yield the text of each issue of the given journal.
    '''
    docs_path = get_docs_path(journal)
    for p in docs_path:
        yield p.read_text(encoding='utf-8').strip()

def get_file_names(journal):
    '''
    Get the file names (without extensions) of the issue text files.
    '''
    return [p.stem for p in get_docs_path(journal)]
In [ ]:
vectorizer = CountVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs(journals.value)).todense())
df_freq = pd.DataFrame(
    X_freq,
    columns=vectorizer.get_feature_names_out(),
    index=get_file_names(journals.value)
)
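
TfidfVectorizer is imported above but not used. If you'd rather weight words by how distinctive they are to each issue, instead of using raw counts, you could swap it in. Here's a minimal sketch mirroring the parameters above (the tfidf_ names are just suggestions).

In [ ]:
# A sketch: TF-IDF weighting instead of raw counts
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
X_tfidf = np.asarray(tfidf_vectorizer.fit_transform(get_docs(journals.value)).todense())
df_tfidf = pd.DataFrame(
    X_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=get_file_names(journals.value)
)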
In [ ]:
# Save to CSV
# df_freq.to_csv(f'{journals.value}-word-frequencies.csv')

Most frequent words in the journal

Change the number in nlargest() to show more or fewer words.

In [ ]:
df_freq.sum().nlargest(20)
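
If you'd rather see these as a chart, here's a quick bar chart sketch using Altair. (Run it before the 'year' column is added below, or 'year' will turn up in the totals.)

In [ ]:
# A sketch: chart the most frequent words as a bar chart
df_top = df_freq.sum().nlargest(20).to_frame().reset_index()
df_top.columns = ['word', 'count']
alt.Chart(df_top).mark_bar().encode(
    x='count:Q',
    y=alt.Y('word:N', sort='-x')
)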

Frequency of a specific word

In [ ]:
word = 'captain cook'
In [ ]:
# If the word's not in the vocabulary you'll get a KeyError -- don't worry, just try another word!
df_freq[word].sum()

Find the issue that this word occurs in most frequently.

In [ ]:
df_freq[word].idxmax()
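
Raw counts favour longer issues. For a rough relative measure you could divide the word's count by each issue's total (of the 10,000 terms counted above) -- a minimal sketch:

In [ ]:
# A sketch: normalise by each issue's total count so longer issues don't dominate
relative_freq = df_freq[word] / df_freq.sum(axis=1)
relative_freq.idxmax()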

Most frequent words per issue

Get the most frequent words for each issue of the journal. Set num_words to the number of words you want to show.

In [ ]:
# The number of words you want to show
num_words = 20
# For each issue (a column in the transposed dataframe), get the num_words most frequent words
top_words = pd.DataFrame(
    {n: df_freq.T[col].nlargest(num_words).index.tolist() for n, col in enumerate(df_freq.T)}
).T
top_words.index = get_file_names(journals.value)
top_words.head()
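
As with the full frequency table, you can save these top words to a CSV file (the filename here is just a suggestion).

In [ ]:
# Save the top words per issue to CSV
top_words.to_csv(f'{journals.value}-top-words-per-issue.csv')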

Get the top words for a specific issue.

In [ ]:
top_words.loc[top_words.index.str.contains('nla.obj-774168904')]

Track word frequencies over time

In [ ]:
def extract_year(name):
    '''
    Try to extract the year from the filename.
    '''
    try:
        years = re.findall(r'-((?:18|19|20)\d{2})-', name)
        year = int(years[-1])
    except IndexError:
        year = 0
        print(f'YEAR NOT FOUND: {name}')
    return year
    
df_freq['year'] = [extract_year(name) for name in df_freq.index]
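
The pattern assumes file names include a hyphen-delimited four-digit year somewhere (1800s to 2000s). A quick check with a made-up file name:

In [ ]:
# A hypothetical file name -- the last '-YYYY-' match is used
extract_year('nla.obj-123456789-1920-05')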
In [ ]:
# Top words per year
year_groups = df_freq.groupby(by='year')
year_group_totals = year_groups.sum()
df_years = pd.DataFrame(
    {n: year_group_totals.T[col].nlargest(10).index.tolist() for n, col in enumerate(year_group_totals.T)}
).T
df_years.index = [name for name, _ in year_groups]
df_years.head()
In [ ]:
def words_by_year(df, words):
    '''
    Get the yearly totals for each of the supplied words.
    '''
    word_dfs = []
    for word in words:
        try:
            df_word = df.groupby(by='year').sum()[word].to_frame().reset_index().rename({word: 'count'}, axis=1)
        except KeyError:
            print(f"'{word}' not found")
        else:
            df_word['word'] = word
            word_dfs.append(df_word)
    # DataFrame.append() was removed in pandas 2.0, so gather the frames and concat
    return pd.concat(word_dfs, ignore_index=True) if word_dfs else pd.DataFrame()

Make a list of words that we want to compare.

In [ ]:
words = ['queen', 'captain cook', 'chinese', 'kangaroo']

Get the data for those words.

In [ ]:
df_words = words_by_year(df_freq, words)

Create a faceted line chart.

In [ ]:
alt.Chart(df_words.loc[df_words['year'] > 0]).mark_line().encode(
    x=alt.X('year:Q', axis=alt.Axis(format='d', title='Year')),
    y='count:Q',
    color='word:N',
    facet='word:N'
).properties(width=700, height=100, columns=1)
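
OCR errors and uneven numbers of issues per year can make these lines jumpy. If you want to smooth them out, here's a minimal sketch using a rolling mean (the 3-year window is an arbitrary choice):

In [ ]:
# A sketch: smooth each word's yearly counts with a 3-year rolling mean
df_smooth = df_words.loc[df_words['year'] > 0].copy()
df_smooth['smoothed'] = df_smooth.groupby('word')['count'].transform(
    lambda s: s.rolling(3, min_periods=1).mean()
)
alt.Chart(df_smooth).mark_line().encode(
    x=alt.X('year:Q', axis=alt.Axis(format='d', title='Year')),
    y=alt.Y('smoothed:Q', title='Smoothed count'),
    color='word:N'
).properties(width=700, height=250)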

Or perhaps you prefer bubblelines.

In [ ]:
# Create a chart
alt.Chart(df_words.loc[df_words['year'] > 0]).mark_circle(
    
    # Style the circles
    opacity=0.8,
    stroke='black',
    strokeWidth=1
).encode(
    
    # Year on the X axis
    x=alt.X('year:O', axis=alt.Axis(format='d', title='Year', labelAngle=0)),
    
    # Word on the Y axis
    y=alt.Y('word:N', title='Word'),
    
    # Size of the circles represents the word's frequency
    size=alt.Size('count:Q',
        scale=alt.Scale(range=[0, 2000]),
        legend=alt.Legend(title='Frequency')
    ),
    
    # Color the circles by word
    color=alt.Color('word:N', legend=None),
    
    # More details on hover
    tooltip=[alt.Tooltip('word:N', title='Word'), alt.Tooltip('year:O', title='Year'), alt.Tooltip('count:Q', title='Frequency', format=',')]
).properties(
    width=700,
    height=300
)
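
To keep a copy of a chart, assign it to a variable and use Altair's save() method -- saving to HTML needs no extra dependencies (PNG or SVG export needs an additional package installed). A minimal sketch with an arbitrary file name:

In [ ]:
# A sketch: save a chart as a standalone HTML file
chart = alt.Chart(df_words.loc[df_words['year'] > 0]).mark_line().encode(
    x=alt.X('year:Q', axis=alt.Axis(format='d', title='Year')),
    y='count:Q',
    color='word:N'
)
chart.save(f'{journals.value}-word-frequencies.html')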