TF-IDF in digitised journals

This notebook calculates TF-IDF values for words in digitised journals harvested from Trove. See also the notebook on word frequencies in digitised journals. More documentation coming...
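
TF-IDF (term frequency–inverse document frequency) gives a word a high score in a document when it occurs often in that document but rarely in the rest of the collection, so it's handy for picking out distinctive vocabulary. Here's a minimal sketch on a made-up three-document corpus, just to show the shape of the output this notebook builds for whole journals (the sample sentences are invented for illustration):

In [ ]:
# Toy illustration only -- not journal data
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

toy_docs = ['wheat harvest report', 'wheat prices rise', 'city council meeting']
toy_vectorizer = TfidfVectorizer()
toy_matrix = toy_vectorizer.fit_transform(toy_docs)
# Each row is a document, each column a word -- higher scores mark more distinctive words
pd.DataFrame(toy_matrix.toarray(), columns=toy_vectorizer.get_feature_names_out())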

In [ ]:
import re
import tarfile
import zipfile
from io import BytesIO
import pandas as pd
import requests
from tqdm.auto import tqdm
import altair as alt
import os
from pathlib import Path
import ipywidgets as widgets
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

Select a journal

Create a dropdown widget to select a digitised journal. The cells below will use this widget to get the value of the currently selected journal.

In [ ]:
# Load details of digitised journals from CSV
df_journals = pd.read_csv('digital-journals-with-text.csv').sort_values(by='title')
journal_list = [(f"{j['title']} ({j['issues_with_text']} issues)", j['directory']) for j in df_journals[['title', 'directory', 'issues_with_text']].to_dict('records')]
journals = widgets.Dropdown(options=journal_list, disabled=False)
display(journals)
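
The selected journal's directory name is available from the widget's value attribute — that's what the cells below use. If you want to check which journal is currently selected:

In [ ]:
# Show the directory name of the currently selected journal
journals.value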

Download all the issues of the journal

Download a zip file containing the OCRd text of all the selected journal's available issues from the repository on CloudStor. Then unzip!

In [ ]:
def get_docs_path(journal):
    path = os.path.join('downloads', journal, 'texts')
    docs_path = sorted(Path(path).glob('*.txt'))
    return docs_path

def download_journal(journal):
    '''
    Download the OCRd text of the selected journal from the repository on CloudStor.
    '''
    
    # Create a directory to put the downloaded files
    path = os.path.join('downloads', journal)
    os.makedirs(path, exist_ok=True)
    
    # To get a sub-folder on Cloudstor you add a 'path' parameter
    params = {
        'path': f'/{journal}/texts'
    }
    
    # Get the zipped texts folder from Cloudstor -- note the 'download' in the url to get the zipped folder
    response = requests.get('https://cloudstor.aarnet.edu.au/plus/s/QOmnqpGQCNCSC2h/download', params=params)
    
    # Unzip the zip!
    zipped = zipfile.ZipFile(BytesIO(response.content))
    zipped.extractall(path)
    
    print(f'{len(get_docs_path(journal))} issues downloaded')
In [ ]:
# Get the OCRd text of the selected journal
download_journal(journals.value)
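
As a quick check, you can list a few of the text files that have just been downloaded (the file names will vary depending on the journal you selected):

In [ ]:
# List the first few downloaded issue files as a sanity check
get_docs_path(journals.value)[:5]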

Calculate the TF-IDF values

In [ ]:
def get_docs(journal):
    docs_path = get_docs_path(journal)
    for p in docs_path:
        yield p.read_text(encoding='utf-8').strip()

def get_file_names(journal):
    return [p.stem for p in get_docs_path(journal)]

def get_years(journal):
    '''
    Get a list of years extracted from the filenames of the issues.
    '''
    years = []
    for doc in get_docs_path(journal):
        try:
            matches = re.findall(r'-((?:18|19|20)\d{2})-', doc.stem)
            years.append(int(matches[-1]))
        except IndexError:
            print(f'YEAR NOT FOUND: {doc}')
    return sorted(list(set(years)))

def get_docs_year(journal):
    '''
    Combine all the issues from a year into a single document ready to be fed into the pipeline.
    '''
    docs_year = {}
    for doc in get_docs_path(journal):
        try:
            matches = re.findall(r'-((?:18|19|20)\d{2})-', doc.stem)
            year = int(matches[-1])
        except IndexError:
            print(f'YEAR NOT FOUND: {doc}')
        else:
            try:
                docs_year[year].append(doc)
            except KeyError:
                docs_year[year] = [doc]
    for y in sorted(docs_year.keys()):
        year_doc = ' '.join([p.read_text(encoding='utf-8').strip() for p in docs_year[y]])
        yield year_doc
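
The year of each issue is pulled out of its file name with a regular expression that looks for a four-digit year surrounded by hyphens. Here's a quick illustration of what the pattern matches on an invented file name (the real names in your downloads folder will differ):

In [ ]:
# Illustration only -- check what the year-extraction pattern matches on a made-up file name
re.findall(r'-((?:18|19|20)\d{2})-', 'nla.obj-123456789-1925-06-10')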

Calculate the TF-IDF values for each year.

In [ ]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1,1), min_df=5, max_df=0.5)
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs_year(journals.value)).todense())
# Note: on scikit-learn versions older than 1.0 use vectorizer.get_feature_names() instead
df_tfidf_years = pd.DataFrame(X_freq, columns=vectorizer.get_feature_names_out(), index=get_years(journals.value))
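
Each row of df_tfidf_years is a year and each column a word. With TfidfVectorizer's defaults the idf is smoothed and every row is L2-normalised, so scores are best compared within a year rather than across the whole table. As a quick check, here are the words that reach the highest score in any single year:

In [ ]:
# Words with the highest single-year TF-IDF score anywhere in the table
df_tfidf_years.max().nlargest(20)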
In [ ]:
# Save as a CSV
#df_tfidf_years.to_csv(f'{journals.value}-tfidf-years.csv')
In [ ]:
# Display the results
df_tfidf_years.head()
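
You can also pull out the scores for a single word across all the years — the word used below is just a placeholder and may not be in this journal's vocabulary, so swap in any term you're interested in:

In [ ]:
# Track a single word's TF-IDF score across years ('wheat' is only a placeholder)
word = 'wheat'
if word in df_tfidf_years.columns:
    display(df_tfidf_years[word])
else:
    print(f"'{word}' is not in this journal's vocabulary")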

Let's display the words with the highest TF-IDF scores for each year.

In [ ]:
# Top 10 words per year -- for each year (a column of the transposed dataframe),
# take the 10 words with the highest TF-IDF scores
df_years_top = pd.DataFrame({n: df_tfidf_years.T[col].nlargest(10).index.tolist() for n, col in enumerate(df_tfidf_years.T)}).T
df_years_top.index = get_years(journals.value)
df_years_top.head()

And now we'll display the results in one huuuge chart.

In [ ]:
compound_chart = alt.vconcat()
years = get_years(journals.value)
# Number of columns
cols = 4
start = 0
# Work through the years in groups of `cols`, building one row of charts at a time
while start < len(years):
    row = alt.hconcat()
    for year in years[start:start+cols]:
        df_year_word_count = pd.DataFrame([{'word': w, 'count': df_tfidf_years.loc[year][w]} for w in df_years_top.loc[year].tolist()])
        chart = alt.Chart(df_year_word_count).mark_bar().encode(
            y='word:N',
            x='count:Q',
        ).properties(width=120, height=120, title=str(year))
        row |= chart
    compound_chart &= row
    start += cols
compound_chart
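
If you'd like to keep the chart, Altair can save it as a self-contained HTML file you can open in a browser (the file name here is just a suggestion):

In [ ]:
# Save the compound chart to a standalone HTML file
compound_chart.save(f'{journals.value}-tfidf-top-words.html')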
In [ ]: