# This example is based on the dataset 'A Medical History of British India',
# provided by the National Library of Scotland's Data Foundry. It uses the
# trial data version of the dataset (15.5 MB compressed). This dataset forms
# the first half of the Medical History of British India collection, which is
# itself part of the broader India Papers collection held by the Library.
import nltk
# Fetch the NLTK resources the rest of the script relies on:
# - 'punkt' / 'punkt_tab': tokenizer models needed by word_tokenize().
#   Recent NLTK releases (3.9+) look up 'punkt_tab' rather than 'punkt', so
#   both are requested to keep the script working across NLTK versions.
# - 'stopwords': the stop-word lists read through nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize
# Read the whole sample document into memory. The context manager closes the
# file handle as soon as the text has been read, instead of leaving it open
# for the rest of the run.
# NOTE(review): assumes the Data Foundry text files are UTF-8 encoded —
# confirm. Without an explicit encoding the platform default would be used.
with open("nls-text-indiaPapers/74457530.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Split the raw text into a flat list of tokens.
text_tokens = word_tokenize(text)
from nltk.corpus import stopwords
# Build the English stop-word collection once, as a set. Calling
# stopwords.words('english') inside the comprehension would rebuild the list
# for every token and then scan it linearly; a set gives constant-time
# membership tests and the resulting token list is the same.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stop words.
# NOTE(review): the membership test is case-sensitive, so capitalised forms
# such as "The" are kept if the stop-word list is all lower-case — confirm
# whether tokens should be lower-cased before filtering.
filtered_words = [word for word in text_tokens if word not in english_stopwords]
print(filtered_words)
from nltk.probability import FreqDist
# Count how often each remaining token occurs.
fdist = FreqDist(filtered_words)
print(fdist)

# Show the two most frequent tokens with their counts. The result is printed
# explicitly: a bare expression is only echoed in an interactive session and
# is silently discarded when this file runs as a script.
print(fdist.most_common(2))
# Frequency Distribution Plot
import matplotlib.pyplot as plt
# Plot the frequency distribution, limited to 30 samples.
fdist.plot(30)
# NOTE(review): FreqDist.plot() may already display the figure itself —
# confirm whether this extra plt.show() call is still needed.
plt.show()