# !pip install nltk
# !pip install gensim
import gensim, logging, nltk, string
from nltk.corpus import brown
from nltk.util import ngrams
from random import shuffle
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
First we need to get the Brown Corpus, which is easily accessible through the Natural Language Toolkit (nltk).
nltk.download('brown')
[nltk_data] Downloading package brown to /Users/ethan/nltk_data... [nltk_data] Package brown is already up-to-date!
True
You can view the words in this corpus quite easily:
brown.words()[0:20]
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']
# Get the brown docs: one sequence of word tokens per file id in the corpus.
brown_docs = [brown.words(file_id) for file_id in brown.fileids()]
len(brown_docs)
500
brown_docs[0:2]
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], ['Austin', ',', 'Texas', '--', 'Committee', 'approval', ...]]
Our first step is to build a vocabulary and a bag-of-words representation of the brown corpus documents.
The bag-of-words representation of the corpus is simply a matrix representation of the documents in which each row represents a document and each column a token. We use this representation to build the Tf-Idf model.
The vocabulary is the set of tokens (i.e. the column names in the bag-of-words representation) in our corpus. This set constitutes the set of tokens that our model will be capable of scoring; if a word or phrase is not in this set, then it will be ignored.
from itertools import chain
def tokenize(tokenized_doc):
    """Yield the alphabetic unigrams and bigrams of an already-tokenized document.

    All unigrams are produced first, followed by all bigrams, each rendered
    as a single space-joined string. An n-gram is kept only when every word
    in it is purely alphabetic (``str.isalpha``), so punctuation and numbers
    drop out together with any bigram that contains them.

    Args:
        tokenized_doc: Iterable of word strings. It is materialized once so
            that one-shot iterators work too; previously the unigram pass
            would exhaust such an input and no bigrams were produced.

    Returns:
        A generator of token strings.
    """
    words = list(tokenized_doc)
    unigrams = ((word,) for word in words)
    # Pairing the sequence with itself shifted by one gives adjacent bigrams.
    bigrams = zip(words, words[1:])
    return (
        " ".join(gram)
        for gram in chain(unigrams, bigrams)
        if all(word.isalpha() for word in gram)
    )
list(tokenize(nltk.word_tokenize("I love eating pasta.")))
['I', 'love', 'eating', 'pasta', 'I love', 'love eating', 'eating pasta']
Now let's generate the corpus bag-of-words representation and the dictionary using gensim's Text2BowTransformer.
# Instantiate a transformer that can take a set of documents, tokenize them, and build a dictionary.
# NOTE(review): gensim.sklearn_api appears to have been removed in gensim 4.0, so this import
# presumably requires gensim < 4 — confirm against the installed version.
from gensim.sklearn_api import Text2BowTransformer
# Use our own `tokenize` so the dictionary holds both unigram and bigram tokens.
bow_transformer = Text2BowTransformer(tokenizer=tokenize)
bow_transformer
Text2BowTransformer(prune_at=2000000, tokenizer=<function tokenize at 0x10bafc598>)
%%time
# This will take a while so be patient
corpus_bow = bow_transformer.fit_transform(brown_docs)
2018-07-22 12:09:42,420 : INFO : adding document #0 to Dictionary(0 unique tokens: []) 2018-07-22 12:09:45,300 : INFO : built Dictionary(397456 unique tokens: ['A', 'A Highway', 'A revolving', 'A similar', 'A veteran']...) from 500 documents (total 1819791 corpus positions)
CPU times: user 21.6 s, sys: 1.18 s, total: 22.8 s Wall time: 23.4 s
corpus_bow[0][0:10]
[(0, 4), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1)]
vocab = bow_transformer.gensim_model
len(vocab)
397456
# Look up the token string for every id in the dictionary, then sample every 400th
# of the first 10,000 entries for a quick look at the vocabulary.
tokens = [vocab[token_id] for token_id in vocab]
tokens[0:10000:400]
['A', 'action of', 'does provide', 'legislators act', 'rejected a', 'traditional', 'Navigation', 'called for', 'insurance firms', 'representing the', 'would produce', 'boost', 'immediate action', 'retirement systems', 'year opposed', 'also called', 'element', 'ministers', 'such problems', 'Indicating', 'council voted', 'motorists', 'the rescue', 'Scotch Plains', 'explicit on']
Now we can quite easily build a tfidf model from the bag-of-words corpus.
%%time
# Fit the Tf-Idf model on the bag-of-words corpus: document frequencies are
# collected and IDF weights are calculated from them.
tfidf_model = gensim.models.TfidfModel(corpus_bow)
2018-07-22 12:10:02,645 : INFO : collecting document frequencies 2018-07-22 12:10:02,647 : INFO : PROGRESS: processing document #0 2018-07-22 12:10:03,038 : INFO : calculating IDF weights for 500 documents and 397455 features (1077838 matrix non-zeros)
CPU times: user 1.3 s, sys: 33 ms, total: 1.33 s Wall time: 1.35 s
# Persist the fitted Tf-Idf model and the dictionary so they can be loaded elsewhere.
# NOTE(review): the '.mm' suffix usually suggests a Matrix Market file, but these are
# saved model/dictionary objects — a different extension may be less confusing; confirm
# no consumer depends on these paths before renaming.
tfidf_model.save('./brown_tfidf.mm')
vocab.save('./brown_vocab.mm')
2018-07-22 12:10:04,002 : INFO : saving TfidfModel object under ./brown_tfidf.mm, separately None 2018-07-22 12:10:06,056 : INFO : saved ./brown_tfidf.mm 2018-07-22 12:10:06,057 : INFO : saving Dictionary object under ./brown_vocab.mm, separately None 2018-07-22 12:10:06,362 : INFO : saved ./brown_vocab.mm
And we are done with our setup of the Tf-Idf model and dictionary. Both have been exported so that they could be used in some service.
Now that we have the tf-idf model, we can use it to extract keywords. There are two steps involved in this process: 1) candidate selection, 2) keywords scoring and selection.
To accomplish the candidate selection, we'll use a few functions:
def get_pairs(phrase, tag_combos=(('JJ', 'NN'),)):
    """Yield adjacent word pairs whose POS tags match one of ``tag_combos``.

    The phrase is word-tokenized and POS-tagged with nltk, then every pair
    of neighbouring tokens is checked against the allowed tag combinations.

    Args:
        phrase: Raw text to scan.
        tag_combos: Iterable of ``(first_tag, second_tag)`` combinations to
            accept. The default is an immutable tuple rather than a list so
            it cannot be mutated and shared between calls.

    Yields:
        ``(first_word, second_word)`` tuples, in document order.
    """
    # Normalize to a set of tuples: the zipped tags are tuples, so list
    # combos would otherwise never compare equal, and set lookup is O(1).
    wanted = {tuple(combo) for combo in tag_combos}
    tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
    for bigram in nltk.ngrams(tagged, 2):
        # bigram is ((word1, tag1), (word2, tag2)); split words from tags.
        tokens, tags = zip(*bigram)
        if tags in wanted:
            yield tokens
def get_unigrams(phrase, tags=('NN',)):
    """Return the words of ``phrase`` whose POS tag is one of ``tags``.

    Args:
        phrase: Raw text to scan.
        tags: Iterable of POS tags to accept. A bare string is treated as a
            single tag. The default used to be ``('NN')``, which is just the
            string ``'NN'`` and turned the membership test into a substring
            check; it is now a real one-element tuple.

    Returns:
        A generator of 1-tuples ``(word,)``, in document order.
    """
    if isinstance(tags, str):
        tags = (tags,)
    wanted = frozenset(tags)
    tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
    return ((unigram,) for unigram, tag in tagged if tag in wanted)
def get_tokens(doc):
    """Collect the keyword candidates of ``doc``.

    Candidates are single words tagged ``NN`` or ``NNP`` plus adjacent word
    pairs tagged ``JJ``/``JJR`` followed by ``NN``/``NNS``.

    Args:
        doc: Raw text of the document.

    Returns:
        A list of word tuples: all unigram candidates first (1-tuples),
        followed by all bigram candidates (2-tuples).
    """
    noun_tags = ('NNP', 'NN')
    adjective_noun_combos = (
        ('JJ', 'NN'),
        ('JJ', 'NNS'),
        ('JJR', 'NN'),
        ('JJR', 'NNS'),
    )
    candidates = list(get_unigrams(doc, tags=noun_tags))
    candidates.extend(get_pairs(doc, tag_combos=adjective_noun_combos))
    return candidates
sample_text = """
Just in the case of contract it is the explicit stipulation, which constitutes the true transference of
property (§ 79), so in the case of the ethical bond of marriage the public celebration of consent,
and the corresponding recognition and acceptance of it by the family and the community, constitute its
consummation and reality. The function of the church is a separate feature, which is not to be considered
here. Thus the union is established and completed ethically, only when preceded by social ceremony, the
symbol of language being the most spiritual embodi- ment of the spiritual (§ 78). The sensual element
pertain- ing to the natural life has place in the ethical relation only as an after result and accident
belonging to the external reality of the ethical union. The union can be expressed fully only in mutual
love and assistance.k
"""
get_tokens(sample_text)
[('case',), ('contract',), ('stipulation',), ('transference',), ('property',), ('case',), ('bond',), ('marriage',), ('celebration',), ('consent',), ('recognition',), ('acceptance',), ('family',), ('community',), ('consummation',), ('reality',), ('function',), ('church',), ('feature',), ('union',), ('ceremony',), ('symbol',), ('language',), ('embodi-',), ('ment',), ('element',), ('pertain-',), ('life',), ('place',), ('relation',), ('result',), ('accident',), ('belonging',), ('reality',), ('union',), ('union',), ('love',), ('assistance.k',), ('true', 'transference'), ('ethical', 'bond'), ('public', 'celebration'), ('separate', 'feature'), ('social', 'ceremony'), ('spiritual', 'embodi-'), ('sensual', 'element'), ('natural', 'life'), ('ethical', 'relation'), ('ethical', 'union'), ('mutual', 'love')]
Now that we can extract candidates, all that's left is to score the document using our model. Here's a function that will do that.
def get_keywords(text, model, vocab):
    """Yield the keyword candidates of ``text`` ranked by model score.

    Args:
        text: Raw text to extract keywords from.
        model: Scoring model; indexing it with a bag-of-words returns
            ``(token_id, score)`` pairs.
        vocab: Dictionary providing ``doc2bow`` and id-to-token lookup.
            Candidates that are not in it are ignored by ``doc2bow``.

    Yields:
        ``(token, score)`` pairs, highest score first.
    """
    candidates = [" ".join(words) for words in get_tokens(text)]
    scored = model[vocab.doc2bow(candidates)]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    yield from ((vocab[token_id], weight) for token_id, weight in ranked)
There are two steps that the function takes. First it transforms the set of candidate tokens into a bag-of-words representation and then it scores them by sending the bag-of-words representation into the tfidf model.
list(get_keywords(sample_text, tfidf_model, vocab))
[('union', 0.3997340417677787), ('stipulation', 0.2943272612593516), ('consummation', 0.2943272612593516), ('natural life', 0.2943272612593516), ('mutual love', 0.2943272612593516), ('transference', 0.2943272612593516), ('reality', 0.22001701826569206), ('consent', 0.18076162087590167), ('belonging', 0.17284984744748857), ('celebration', 0.17284984744748857), ('ceremony', 0.1601447173381622), ('bond', 0.15487701001931523), ('accident', 0.14793378876278002), ('feature', 0.1332446805892596), ('element', 0.13018810069374329), ('recognition', 0.13018810069374329), ('contract', 0.12731688522504053), ('symbol', 0.12594401951839607), ('acceptance', 0.12204917790619353), ('relation', 0.11730917020370957), ('marriage', 0.11619475289064184), ('function', 0.10540678050842718), ('property', 0.10121284285347901), ('church', 0.09590290588351243), ('language', 0.09590290588351243), ('case', 0.08726686061042656), ('community', 0.08620372974228124), ('love', 0.07575261252991683), ('family', 0.05830090771763938), ('result', 0.05797762625357845), ('life', 0.02865958850496821), ('place', 0.026788731055855025)]
Here we have a set of keywords, scored by Tf-Idf. The results in this case, though subjective, aren't great. Some phrases like "mutual love" and "consummation" seem representative of the text, whereas others like "place" and "result" are not at all. So there is room for improvement, but this is also a challenging text for which to select keywords.