#!/usr/bin/env python
# coding: utf-8

# # Natural Language Toolkit
#
# Walk-through: load a text, tokenize it, keep only word-like tokens, count
# word frequencies, then recount after removing English stopwords.
#
# NOTE(review): nltk.word_tokenize and nltk.corpus.stopwords need their NLTK
# data packages installed locally (tokenizer models and the "stopwords"
# corpus) -- confirm they are available in the target environment.

# Let's load _The Gold Bug_

# In[ ]:


# Explicit encoding so decoding does not depend on the platform locale.
# NOTE(review): assumes data/goldBug.txt is UTF-8 -- confirm.
with open("data/goldBug.txt", "r", encoding="utf-8") as f:
    goldBugString = f.read()

print(goldBugString[:100])


# Let's tokenize!

# In[ ]:


import nltk

# Lower-case before tokenizing so that counts are case-insensitive.
goldBugTokens = nltk.word_tokenize(goldBugString.lower())
# Bare expression: has no effect when run as a plain script; presumably kept
# as the notebook cell's displayed value.
goldBugTokens[:10]


# In[ ]:


# Same filter written twice on the first ten tokens: first as an explicit
# loop, then as the equivalent list comprehension.
filterTokens = []
for word in goldBugTokens[:10]:
    if word.isalpha():
        filterTokens.append(word)
print(filterTokens)

print([word for word in goldBugTokens[:10] if word.isalpha()])


# In[ ]:


# Keep every token containing at least one alphabetic character. This is
# looser than str.isalpha(): tokens mixing letters with other characters are
# kept, while tokens with no letters at all are dropped.
goldBugWords = [
    word for word in goldBugTokens if any(char.isalpha() for char in word)
]


# In[ ]:


wordFrequencies = nltk.FreqDist(goldBugWords)
wordFrequencies.most_common(10)


# In[ ]:


# The stopwords corpus file id is lower-case; "English" only resolves on
# case-insensitive filesystems.
stopwords = nltk.corpus.stopwords.words("english")
print(stopwords)


# In[ ]:


# Build the set once: membership tests are O(1) instead of scanning the
# stopword list for every word in the text.
stopwordSet = set(stopwords)
goldBugFilteredWords = [word for word in goldBugWords if word not in stopwordSet]
nltk.FreqDist(goldBugFilteredWords).most_common(20)


# In[ ]: