import collections
import random
import re
import torch
from d2l import torch as d2l
Reads the raw text into a string
class TimeMachine(d2l.DataModule):
"""The Time Machine dataset."""
def _download(self):
fname = d2l.download(d2l.DATA_URL + 'timemachine.txt', self.root,
'090b5e7e70c295757f55df93cb0a180b9691891a')
with open(fname) as f:
return f.read()
data = TimeMachine()
raw_text = data._download()
raw_text[:60]
'The Time Machine, by H. G. Wells [1898]\n\n\n\n\nI\n\n\nThe Time Tra'
@d2l.add_to_class(TimeMachine)
def _preprocess(self, text):
return re.sub('[^A-Za-z]+', ' ', text).lower()
text = data._preprocess(raw_text)
text[:60]
'the time machine by h g wells i the time traveller for so it'
@d2l.add_to_class(TimeMachine)
def _tokenize(self, text):
return list(text)
tokens = data._tokenize(text)
','.join(tokens[:30])
't,h,e, ,t,i,m,e, ,m,a,c,h,i,n,e, ,b,y, ,h, ,g, ,w,e,l,l,s, '
Next, we introduce a class for constructing vocabularies, i.e., objects that associate each distinct token value with a unique index
class Vocab:
"""Vocabulary for text."""
def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
if tokens and isinstance(tokens[0], list):
tokens = [token for line in tokens for token in line]
counter = collections.Counter(tokens)
self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
reverse=True)
self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
token for token, freq in self.token_freqs if freq >= min_freq])))
self.token_to_idx = {token: idx
for idx, token in enumerate(self.idx_to_token)}
def __len__(self):
return len(self.idx_to_token)
def __getitem__(self, tokens):
if not isinstance(tokens, (list, tuple)):
return self.token_to_idx.get(tokens, self.unk)
return [self.__getitem__(token) for token in tokens]
def to_tokens(self, indices):
if hasattr(indices, '__len__') and len(indices) > 1:
return [self.idx_to_token[int(index)] for index in indices]
return self.idx_to_token[indices]
@property
def unk(self):
return self.token_to_idx['<unk>']
Construct a vocabulary
vocab = Vocab(tokens)
indices = vocab[tokens[:10]]
print('indices:', indices)
print('words:', vocab.to_tokens(indices))
indices: [21, 9, 6, 0, 21, 10, 14, 6, 0, 14] words: ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm']
Package everything into the following
build
method of the TimeMachine
class
@d2l.add_to_class(TimeMachine)
def build(self, raw_text, vocab=None):
tokens = self._tokenize(self._preprocess(raw_text))
if vocab is None: vocab = Vocab(tokens)
corpus = [vocab[token] for token in tokens]
return corpus, vocab
corpus, vocab = data.build(raw_text)
len(corpus), len(vocab)
(173428, 28)
words = text.split()
vocab = Vocab(words)
vocab.token_freqs[:10]
[('the', 2261), ('i', 1267), ('and', 1245), ('of', 1155), ('a', 816), ('to', 695), ('was', 552), ('in', 541), ('that', 443), ('my', 440)]
The ten most frequent words stop words plot the figure of the word frequency
freqs = [freq for token, freq in vocab.token_freqs]
d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)',
xscale='log', yscale='log')
What about the other word combinations, such as two consecutive words (bigrams), three consecutive words (trigrams)
bigram_tokens = ['--'.join(pair) for pair in zip(words[:-1], words[1:])]
bigram_vocab = Vocab(bigram_tokens)
bigram_vocab.token_freqs[:10]
[('of--the', 309), ('in--the', 169), ('i--had', 130), ('i--was', 112), ('and--the', 109), ('the--time', 102), ('it--was', 99), ('to--the', 85), ('as--i', 78), ('of--a', 73)]
trigram_tokens = ['--'.join(triple) for triple in zip(
words[:-2], words[1:-1], words[2:])]
trigram_vocab = Vocab(trigram_tokens)
trigram_vocab.token_freqs[:10]
[('the--time--traveller', 59), ('the--time--machine', 30), ('the--medical--man', 24), ('it--seemed--to', 16), ('it--was--a', 15), ('here--and--there', 15), ('seemed--to--me', 14), ('i--did--not', 14), ('i--saw--the', 13), ('i--began--to', 13)]
Visualize the token frequency
bigram_freqs = [freq for token, freq in bigram_vocab.token_freqs]
trigram_freqs = [freq for token, freq in trigram_vocab.token_freqs]
d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token: x',
ylabel='frequency: n(x)', xscale='log', yscale='log',
legend=['unigram', 'bigram', 'trigram'])