In [0]:
"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need this cell
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/text_matching/snli/main')
In [0]:
import numpy as np
import re

from collections import Counter
from pathlib import Path

Make Data

In [0]:
def normalize(x):
  x = x.lower()
  x = x.replace('.', '')
  x = x.replace(',', '')
  x = x.replace(';', '')
  x = x.replace('!', '')
  x = x.replace('#', '')
  x = x.replace('(', '')
  x = x.replace(')', '')
  x = x.replace(':', '')
  x = x.replace('%', '')
  x = x.replace('&', '')
  x = x.replace('$', '')
  x = x.replace('?', '')
  x = x.replace('"', '')
  x = x.replace('/', ' ')
  x = x.replace('-', ' ')
  x = x.replace("n't", " n't ")
  x = x.replace("'", " ' ")
  x = re.sub(r'\d+', ' <num> ', x)
  x = re.sub(r'\s+', ' ', x)
  return x
In [0]:
def write_text(in_path, out_path):
  with open(in_path) as f_in, open(out_path, 'w') as f_out:
    f_in.readline()
    for line in f_in:
      line = line.rstrip()
      sp = line.split('\t')
      label, sent1, sent2 = sp[0], sp[5], sp[6]

      sent1 = normalize(sent1)
      sent2 = normalize(sent2)

      f_out.write(label+'\t'+sent1+'\t'+sent2+'\n')
In [0]:
write_text('../data/snli_1.0/snli_1.0_train.txt', '../data/train.txt')
write_text('../data/snli_1.0/snli_1.0_test.txt', '../data/test.txt')

Make Vocabulary

In [0]:
counter = Counter()
with open('../data/train.txt') as f:
  for line in f:
    line = line.rstrip()
    label, sent1, sent2 = line.split('\t')
    counter.update(sent1.split())
    counter.update(sent2.split())

words = [w for w, freq in counter.most_common() if freq >= 3]

Path('../vocab').mkdir(exist_ok=True)

with open('../vocab/word.txt', 'w') as f:
  f.write('<pad>'+'\n')
  for w in words:
    f.write(w+'\n')

Make Pretrained Embedding

In [0]:
def norm_weight(nin, nout, scale=0.01):
  W = scale * np.random.randn(nin, nout)
  return W.astype(np.float32)
In [0]:
word2idx = {}
with open('../vocab/word.txt') as f:
  for i, line in enumerate(f):
    line = line.rstrip()
    word2idx[line] = i
In [0]:
embedding = norm_weight(len(word2idx)+1, 300)

with open('../data/glove.840B.300d.txt') as f:
  count = 0
  for i, line in enumerate(f):
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    line = line.rstrip()
    sp = line.split(' ')
    word, vec = sp[0], sp[1:]
    if word in word2idx:
      count += 1
      embedding[word2idx[word]] = np.asarray(vec, dtype=np.float32)
      
print("[%d / %d] words have found pre-trained values"%(count, len(word2idx)))
np.save('../vocab/word.npy', embedding)
print('Saved ../vocab/word.npy')
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
[20333 / 20883] words have found pre-trained values
Saved ../vocab/word.npy