In [1]:
"""
We use the following lines because we are running on Google Colab.
If you are running this notebook on a local machine, you don't need this cell.
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/semantic_parsing/tree_slu/data')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [0]:
from pathlib import Path
from collections import Counter
In [0]:
Path('../vocab').mkdir(exist_ok=True)
enc_counter = Counter()  # word frequencies on the encoder (source) side
dec_counter = Counter()  # token frequencies on the decoder (target) side

with open('../data/train.tsv') as f:
  for line in f:
    line = line.rstrip()
    text_raw, text_tokenized, label = line.split('\t')
    enc_counter.update(text_tokenized.lower().split())
    # insert a space after '[' so brackets and intent/slot names become separate tokens
    dec_counter.update(label.replace('[', '[ ').lower().split())

with open('../vocab/source.txt', 'w') as f:
  f.write('<pad>\n')       # id 0 is reserved for padding
  for (w, freq) in enc_counter.most_common():
    f.write(w+'\n')

with open('../vocab/target.txt', 'w') as f:
  f.write('<pad>\n')       # id 0: padding
  f.write('<start>\n')     # id 1: start-of-sequence
  f.write('<end>\n')       # id 2: end-of-sequence
  for (w, freq) in dec_counter.most_common():
    f.write(w+'\n')

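The replace('[', '[ ') call above ensures that opening brackets and the intent / slot names following them become separate decoder tokens. A quick illustration with a made-up TOP-style label (for illustration only, not taken from the data):

label = '[IN:GET_EVENT When is [SL:CATEGORY_EVENT the concert ] ]'  # hypothetical label
print(label.replace('[', '[ ').lower().split())
# ['[', 'in:get_event', 'when', 'is', '[', 'sl:category_event', 'the', 'concert', ']', ']']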
Make Pretrained Embedding

In [0]:
import numpy as np

word2idx = {}  # token -> integer id (its line index in target.txt)
with open('../vocab/target.txt') as f:
  for i, line in enumerate(f):
    line = line.rstrip()
    word2idx[line] = i
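Since <pad>, <start> and <end> were written to target.txt first, they receive the lowest ids. An optional sanity check:

assert word2idx['<pad>'] == 0
assert word2idx['<start>'] == 1
assert word2idx['<end>'] == 2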
In [5]:
embedding = np.zeros((len(word2idx)+1, 300)) # + 1 extra row for unknown words

with open('../data/glove.840B.300d.txt') as f:
  count = 0
  for i, line in enumerate(f):
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    line = line.rstrip()
    sp = line.split(' ')
    word, vec = sp[0], sp[1:]
    if word in word2idx:  # only keep vectors for tokens in the target vocabulary
      count += 1
      embedding[word2idx[word]] = np.asarray(vec, dtype='float32')

print("[%d / %d] words have pre-trained GloVe vectors"%(count, len(word2idx)))
np.save('../vocab/word.npy', embedding)
print('Saved ../vocab/word.npy')
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
[8078 / 8691] words have pre-trained GloVe vectors
Saved ../vocab/word.npy
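
Downstream, the saved matrix can be used to initialise an embedding layer. A minimal sketch, assuming TensorFlow 2 (which this project targets) and the file path used above; the layer configuration below is an assumption for illustration, not this repository's model code:

import numpy as np
import tensorflow as tf

pretrained = np.load('../vocab/word.npy')  # shape: (len(word2idx) + 1, 300)
embed = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],
    output_dim=pretrained.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(pretrained),  # pretrained GloVe rows
    mask_zero=True)  # id 0 is <pad>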