In [1]:
"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need this cell
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/spoken_language_understanding/atis/data')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [2]:
!pip install tf-nightly-2.0-preview
Requirement already satisfied: tf-nightly-2.0-preview in /usr/local/lib/python3.6/dist-packages (2.0.0.dev20190326)
Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.0.9)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.11.0)
Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.0.7)
Requirement already satisfied: tensorflow-estimator-2.0-preview in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.14.0.dev2019032600)
Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.1.0)
Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.33.1)
Requirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.2.2)
Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (3.7.0)
Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.7.1)
Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.7.1)
Requirement already satisfied: tb-nightly<1.15.0a0,>=1.14.0a0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.14.0a20190319)
Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.14.6)
Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (1.15.0)
Requirement already satisfied: google-pasta>=0.1.2 in /usr/local/lib/python3.6/dist-packages (from tf-nightly-2.0-preview) (0.1.4)
Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.6->tf-nightly-2.0-preview) (2.8.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.6.1->tf-nightly-2.0-preview) (40.8.0)
Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a0,>=1.14.0a0->tf-nightly-2.0-preview) (3.0.1)
Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<1.15.0a0,>=1.14.0a0->tf-nightly-2.0-preview) (0.14.1)
In [0]:
from pathlib import Path
from collections import Counter

import numpy as np
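In [0]:
"""
(Optional peek, not part of the original notebook.) Print one raw line of the
ATIS training file so the parsing in the next cell is easier to follow. Judging
from that parsing code, each line should be a tab-separated pair: a token
sequence wrapped in BOS/EOS, and a tag sequence whose last token is the intent
label.
"""
with open('../data/atis.train.w-intent.iob') as f:
  print(repr(f.readline().rstrip()))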
In [0]:
"""
Make Vocabulary (Words, Intents and Slots)
"""

Path('../vocab').mkdir(exist_ok=True)

counter_word = Counter()
counter_intent = Counter()
counter_slot = Counter()

with open('../data/atis.train.w-intent.iob') as f:
  for line in f:
    line = line.rstrip()
    # each line holds a token sequence and a tag sequence, separated by a tab
    text, slot_intent = line.split('\t')
    words = text.split()[1:-1]                                # drop the BOS / EOS markers
    words = ['<digit>' if w.isdigit() else w for w in words]  # collapse pure digits into one token
    slot_intent = slot_intent.split()
    # the first tag aligns to BOS; the final token is the intent label
    slots, intent = slot_intent[1:-1], slot_intent[-1]
    assert len(words) == len(slots)                           # one slot tag per word

    counter_word.update(words)
    counter_intent.update([intent])
    counter_slot.update(slots)

# vocabulary entries sorted by frequency, most frequent first
most_common = lambda x: [w for w, freq in x.most_common()]

words = ['<pad>'] + most_common(counter_word)                 # index 0 is reserved for padding
intents = most_common(counter_intent)
slots = most_common(counter_slot)

for vocab_li, path in zip(
    [words, intents, slots],
    ['../vocab/word.txt', '../vocab/intent.txt', '../vocab/slot.txt']):
  with open(path, 'w') as f:
    for w in vocab_li:
      f.write(w + '\n')
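In [0]:
"""
(Optional sanity check, not part of the original notebook.) Reload the three
vocabulary files just written and print their sizes; '<pad>' should appear at
index 0 of the word vocabulary. The paths are the same as the ones used above.
"""
for path in ['../vocab/word.txt', '../vocab/intent.txt', '../vocab/slot.txt']:
  with open(path) as f:
    vocab = [line.rstrip() for line in f]
  print('{}: {} entries (first entry: {})'.format(path, len(vocab), vocab[0]))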
In [5]:
"""
Make Pretrained Embedding
"""
word2idx = {}
with open('../vocab/word.txt') as f:
  for i, line in enumerate(f):
    line = line.rstrip()
    word2idx[line] = i                            # index = line number in word.txt

embedding = np.zeros((len(word2idx)+1, 300))      # +1 row reserved for unknown words

with open('../data/glove.840B.300d.txt') as f:
  count = 0
  for i, line in enumerate(f):
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    line = line.rstrip()
    sp = line.split(' ')
    word, vec = sp[0], sp[1:]                     # token followed by its 300-d GloVe vector
    if word in word2idx:
      count += 1
      embedding[word2idx[word]] = np.asarray(vec, dtype='float32')

print("[%d / %d] words have pre-trained values"%(count, len(word2idx)))
np.save('../vocab/word.npy', embedding)
print('Saved ../vocab/word.npy')
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
[725 / 749] words have pre-trained values
Saved ../vocab/word.npy
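In [0]:
"""
(Illustrative sketch, not part of the original notebook.) One way the saved
matrix could be consumed by a downstream tf.keras model; the layer name and the
choice to freeze the weights are assumptions, not taken from the original code.
"""
import numpy as np
import tensorflow as tf

pretrained = np.load('../vocab/word.npy')             # shape: (vocab_size + 1, 300)
embed_layer = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],
    output_dim=pretrained.shape[1],
    embeddings_initializer=tf.constant_initializer(pretrained),
    trainable=False)                                   # keep the GloVe vectors frozen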