Created by Nathan Kelber and Ted Lawless for JSTOR Labs under Creative Commons CC BY License
For questions/comments/improvements, email [email protected]
Description: This notebook finds the word frequencies for a dataset. Optionally, it can take two inputs, both described below: a pre-processed CSV of filtered document IDs and a custom stop words CSV.
Use Case: For Researchers (Mostly code without explanation, not ideal for learners)
Take me to the Learning Version of this notebook ->
Difficulty: Intermediate
Completion time: 5-10 minutes
Knowledge Required:
Knowledge Recommended:
Data Format: JSON Lines (.jsonl)
Libraries Used: tdm_client, pandas, csv, NLTK, collections (Counter)
Research Pipeline:
# Creating a variable `dataset_id` to hold our dataset ID
# The default dataset is Shakespeare Quarterly, 1950-present
dataset_id = "7e41317e-740f-e86a-4729-20dab492e925"
# Pull in the dataset that matches `dataset_id`
# in the form of a gzipped JSON lines file.
import tdm_client
dataset_file = tdm_client.get_dataset(dataset_id)
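To confirm the download worked, you can peek at the first document before running the full analysis. This is an optional sketch; the fields available beyond `id` and `unigramCount` vary by dataset.
# Optional: inspect the first document in the dataset.
# `dataset_reader` yields one document dictionary per line of the JSON lines file.
for document in tdm_client.dataset_reader(dataset_file):
    print('Document ID:', document.get('id'))
    print('Available fields:', sorted(document.keys()))
    break  # Stop after the first document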
If you completed pre-processing with the "Exploring Metadata and Pre-processing" notebook, you can use your CSV file of dataset IDs to automatically filter the dataset.
# Import a pre-processed CSV file of filtered dataset IDs.
# If you do not have a pre-processed CSV file, the analysis
# will run on the full dataset and may take longer to complete.
import pandas as pd
import os
pre_processed_file_name = f'data/pre-processed_{dataset_id}.csv'
if os.path.exists(pre_processed_file_name):
    df = pd.read_csv(pre_processed_file_name)
    filtered_id_list = df["id"].tolist()
    use_filtered_list = True
    print(f'Pre-processed CSV found. Successfully read in {len(df)} documents.')
else:
    use_filtered_list = False
    print('No pre-processed CSV file found. The full dataset will be used.')
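The loading cell above only requires that the CSV have an `id` column whose values match the document IDs in the dataset. If you want to build a filter file by hand rather than with the pre-processing notebook, a minimal sketch follows; the IDs shown are placeholders, and the cell above would need to be re-run after the file is written.
# Write a minimal filtered-ID CSV in the format the loading cell above expects.
# Replace the placeholder IDs with real document IDs from your dataset.
manual_ids = pd.DataFrame({'id': ['placeholder-document-id-1', 'placeholder-document-id-2']})
manual_ids.to_csv(f'data/pre-processed_{dataset_id}.csv', index=False)
print(f'Wrote {len(manual_ids)} document IDs to data/pre-processed_{dataset_id}.csv')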
The default stop words list is the NLTK English stop words list. You can also create a custom stop words CSV with the "Creating Stop Words" notebook.
# Load a custom data/stop_words.csv if available
# Otherwise, load the nltk stopwords list in English
# Create an empty Python list to hold the stopwords
stop_words = []
# The filename of the custom data/stop_words.csv file
stopwords_list_filename = 'data/stop_words.csv'
if os.path.exists(stopwords_list_filename):
    import csv
    with open(stopwords_list_filename, 'r') as f:
        stop_words = list(csv.reader(f))[0]
    print('Custom stop words list loaded from CSV')
else:
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    print('NLTK stop words list loaded')
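The loader above reads the stop words from a single CSV row, so a custom list can be saved in that same format. A minimal sketch for extending the current list and writing it to data/stop_words.csv; the added terms are examples only.
# Optional: add domain-specific terms to the stop words list and save it
# as a single CSV row, the format the loading cell above expects.
import csv
import os

extra_terms = ['thee', 'thou', 'thy']  # Example terms only; choose your own
stop_words = list(stop_words) + [term for term in extra_terms if term not in stop_words]

os.makedirs('data', exist_ok=True)  # Make sure the data directory exists
with open('data/stop_words.csv', 'w', newline='') as f:
    csv.writer(f).writerow(stop_words)
print(f'Saved {len(stop_words)} stop words to data/stop_words.csv')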
from collections import Counter
# Hold our word counts in a Counter Object
transformed_word_frequency = Counter()
# Count unigrams across the dataset, applying the filtered ID list (if available)
# and removing stop words and non-alphabetic tokens
for document in tdm_client.dataset_reader(dataset_file):
    if use_filtered_list is True:
        document_id = document['id']
        # Skip documents not in our filtered_id_list
        if document_id not in filtered_id_list:
            continue
    unigrams = document.get("unigramCount", {})
    for gram, count in unigrams.items():
        clean_gram = gram.lower()  # Lowercase the unigram
        if clean_gram in stop_words:  # Skip unigrams that are stop words
            continue
        if not clean_gram.isalpha():  # Skip unigrams with non-alphabetic characters
            continue
        transformed_word_frequency[clean_gram] += count
# Print the most common processed unigrams and their counts
for gram, count in transformed_word_frequency.most_common(100):
print(gram.ljust(20), count)
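Optionally, the full frequency table can be saved for use outside this notebook. A short sketch using the pandas import from earlier; the output filename is arbitrary.
# Optional: save every word and its count to a CSV, most common first
word_freq_df = pd.DataFrame(transformed_word_frequency.most_common(), columns=['word', 'count'])
word_freq_df.to_csv(f'data/word_frequencies_{dataset_id}.csv', index=False)
print(f'Saved {len(word_freq_df)} word frequencies to data/word_frequencies_{dataset_id}.csv')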