In [0]:
import os

from google.colab import drive

# Mount Google Drive and switch to the project's data directory.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/multi_turn_rewrite/chinese/data')
In [0]:
import random
from collections import Counter
from pathlib import Path

import numpy as np
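The split in the next cell uses random.random(); seeding the generator first (not done in the original run) would make the train/test split reproducible:

In [ ]:
# Optional, hypothetical cell: fix the RNG seed so the 5% test split below
# is reproducible across runs.
random.seed(42)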
In [3]:
Path('../vocab').mkdir(exist_ok=True)
counter_char = Counter()

# Split the corpus into train/test positive/negative files while counting
# character frequencies for the vocabulary.
with open('corpus.txt') as f, \
     open('train_pos.txt', 'w') as f_train_pos, \
     open('test_pos.txt', 'w') as f_test_pos, \
     open('train_neg.txt', 'w') as f_train_neg, \
     open('test_neg.txt', 'w') as f_test_neg:
  for line in f:
    line = line.rstrip().lower()
    try:
      # Each row holds two history turns, a query, and its rewritten answer.
      h1, h2, q, a = [seg for seg in line.split('\t') if seg != '']
    except ValueError:
      print('Deleted incorrect data:', line)
      continue
    # Drop a few rows known to carry corrupted answers.
    if a == 'a' or a == '哆啦a梦' or a == '分手了很难过':
      print('Deleted incorrect data:', line)
      continue
    # Hold out roughly 5% of the examples for testing.
    if random.random() < 0.05:
      f_test_pos.write(h1+'\t'+h2+'\t'+q+'\t'+a+'\n')
      # Negative example: the answer stands in for the query.
      f_test_neg.write(h1+'\t'+h2+'\t'+a+'\t'+a+'\n')
    else:
      f_train_pos.write(h1+'\t'+h2+'\t'+q+'\t'+a+'\n')
      f_train_neg.write(h1+'\t'+h2+'\t'+a+'\t'+a+'\n')
    counter_char.update(list(h1)+list(h2)+list(q)+list(a))

chars = ['<pad>', '<start>', '<end>'] + [w for w, _ in counter_char.most_common()]
print('Vocabulary size:', len(chars))

with open('../vocab/char.txt', 'w') as f:
  for char in chars:
    f.write(char+'\n')
Deleted incorrect data: 晚上需要开空调吗				回答我		回答我什么时候开始晴天
Deleted incorrect data: 那你认识张琪吗		额是的		她长什么样		a
Deleted incorrect data: 诗乃怎么样		诗乃是不错 		哪里不错		a
Deleted incorrect data: 第五元素最喜欢的吕克贝松电影之一另一部是圣女贞德 				我看过这个电视剧了		我看过第五元素电视剧了
Deleted incorrect data: 哆啦a梦		蓝梦岛金银岛		是新出来的电影		哆啦a梦
Deleted incorrect data: 为啥遇到什么事了		分手了		安啦你一定会找到那个对的人的		分手了很难过
Vocabulary size: 3852
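As a quick sanity check (a hypothetical follow-up, not part of the original run), the four split files can be counted; with the 0.05 threshold above, each test file should hold roughly 5% as many examples as its train counterpart:

In [ ]:
# Hypothetical check: count the examples written to each split file.
for name in ['train_pos.txt', 'train_neg.txt', 'test_pos.txt', 'test_neg.txt']:
  with open(name) as f:
    print(name, sum(1 for _ in f), 'examples')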
In [4]:
# Rebuild the char -> index mapping from the vocabulary file.
char2idx = {}
with open('../vocab/char.txt') as f:
  for i, line in enumerate(f):
    char2idx[line.rstrip('\n')] = i

# One extra row at the end is reserved for unknown characters.
embedding = np.zeros((len(char2idx)+1, 300))

# cc.zh.300.vec holds the fastText Chinese vectors; its first line is a
# header (vocabulary size and dimension) and is skipped.
with open('../vocab/cc.zh.300.vec') as f:
  count = 0
  for i, line in enumerate(f):
    if i == 0:
      continue
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    sp = line.rstrip().split(' ')
    word, vec = sp[0], sp[1:]
    if word in char2idx:
      count += 1
      embedding[char2idx[word]] = np.asarray(vec, dtype='float32')

print('[%d / %d] characters have pre-trained vectors' % (count, len(char2idx)))
np.save('../vocab/char.npy', embedding)
print('Saved ../vocab/char.npy')
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
[3763 / 3852] characters have pre-trained vectors
Saved ../vocab/char.npy
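The saved matrix is meant to seed a character embedding layer downstream. A minimal sketch of how it could be loaded, assuming a tf.keras model (the actual model code in this repo may wire it up differently):

In [ ]:
import numpy as np
import tensorflow as tf

# Load the pre-trained character embeddings saved above; the last all-zero
# row is the slot reserved for unknown characters.
pretrained = np.load('../vocab/char.npy')

# Seed an Embedding layer with the fastText vectors; trainable=False keeps
# them frozen, while True would let the model fine-tune them.
embed = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],
    output_dim=pretrained.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(pretrained),
    trainable=False)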