In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/free_chat/chinese_lccc/data')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [2]:
from collections import Counter
from pathlib import Path

import json
import numpy as np
In [3]:
with open('LCCC-base.json') as f:
  data = json.load(f)
In [4]:
Path('../vocab').mkdir(exist_ok=True)
char_counter = Counter()
src_lens, tgt_lens = [], []
i = 0

# Cap the training set at 2M (source, target) pairs. Dialogues with exactly
# two turns become one pair; longer dialogues are expanded so that every turn
# is paired with the turn that follows it.
with open('train.txt', 'w') as f_out:
  for line in data['train']:
    if i == 2000000:
      break
    if len(line) < 2:
      continue
    elif len(line) == 2:
      src, tgt = line
      src = src.lower().split()
      tgt = tgt.lower().split()
      char_counter.update(src)
      char_counter.update(tgt)
      src_lens.append(len(src))
      tgt_lens.append(len(tgt))
      f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
      i += 1
    else:
      # Multi-turn dialogue: each turn is the source for the turn that follows it
      for src, tgt in zip(line, line[1:]):
        src = src.lower().split()
        tgt = tgt.lower().split()
        char_counter.update(src)
        char_counter.update(tgt)
        src_lens.append(len(src))
        tgt_lens.append(len(tgt))
        f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
        i += 1

print('Source Average Length', sum(src_lens)/len(src_lens))
print('Target Average Length', sum(tgt_lens)/len(tgt_lens))

# Reserve special tokens, then keep only characters that occur at least 50 times
chars = ['<pad>', '<start>', '<end>'] + [char for char, freq in char_counter.most_common() if freq >= 50]
print(len(chars), 'Chars')
with open('../vocab/char.txt', 'w') as f:
  for c in chars:
    f.write(c+'\n')
Source Average Length 12.8481
Target Average Length 11.295644
3926 Chars
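For reference, a multi-turn dialogue of N turns is expanded into N-1 (source, target) pairs by pairing every turn with its reply. A minimal illustration (the dialogue below is made up; LCCC utterances are whitespace-separated, which is why the code splits and then re-joins them):

# Hypothetical 3-turn dialogue -> 2 training pairs
dialog = ['你 好', '你 好 呀', '最 近 怎 么 样']
pairs = [(''.join(s.split()), ''.join(t.split())) for s, t in zip(dialog, dialog[1:])]
# pairs == [('你好', '你好呀'), ('你好呀', '最近怎么样')]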
In [5]:
with open('LCCC-base_test.json') as f:
  data = json.load(f)

with open('test.txt', 'w') as f_out:
  for line in data:
    if len(line) < 2:
      continue
    elif len(line) == 2:
      src, tgt = line
      src = src.lower().split()
      tgt = tgt.lower().split()
      f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
    else:
      # Multi-turn dialogue: pair each turn with its reply, lowercased as for training data
      for src, tgt in zip(line, line[1:]):
        src = src.lower().split()
        tgt = tgt.lower().split()
        f_out.write(''.join(src)+'<SEP>'+''.join(tgt)+'\n')
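As a quick sanity check, a line written above can be read back and split on the '<SEP>' separator; splitting the text into individual characters with list() is an assumption about how the downstream character-level tokenizer consumes these files:

# Read the first pair back and split it into character sequences
with open('train.txt') as f:
  first = f.readline().rstrip('\n')
src_text, tgt_text = first.split('<SEP>')
src_chars, tgt_chars = list(src_text), list(tgt_text)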
In [6]:
char2idx = {}
with open('../vocab/char.txt') as f:
  for i, line in enumerate(f):
    line = line.rstrip('\n')
    char2idx[line] = i

embedding = np.zeros((len(char2idx)+1, 300)) # extra last row for unknown characters

with open('../vocab/cc.zh.300.vec') as f:
  count = 0
  for i, line in enumerate(f):
    if i == 0:
      continue  # first line of the fastText .vec file is a header (vocab size and dimension)
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    line = line.rstrip()
    sp = line.split(' ')
    word, vec = sp[0], sp[1:]
    if word in char2idx:
      count += 1
      embedding[char2idx[word]] = np.asarray(vec, dtype='float32')
      
print("[%d / %d] characters have found pre-trained values"%(count, len(char2idx)))
np.save('../vocab/char.npy', embedding)
print('Saved ../vocab/char.npy')
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
Found pre-trained vectors for 3844 / 3926 characters
Saved ../vocab/char.npy
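A minimal sketch of how the exported vocabulary and embedding matrix could be consumed downstream, assuming a tf.keras model; that the last row serves as the unknown-character vector follows from the len(char2idx)+1 allocation above, but the Embedding-layer wiring itself is an assumption, since the training script is not part of this notebook:

import numpy as np
import tensorflow as tf

# Rebuild the character-to-index mapping from the saved vocab file
char2idx = {}
with open('../vocab/char.txt') as f:
  for i, line in enumerate(f):
    char2idx[line.rstrip('\n')] = i

pretrained = np.load('../vocab/char.npy')  # shape: (len(char2idx) + 1, 300)
unk_idx = len(char2idx)                    # extra last row reserved for unknown characters

# Initialize an embedding layer with the pre-trained character vectors
embed_layer = tf.keras.layers.Embedding(
    input_dim=pretrained.shape[0],
    output_dim=pretrained.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(pretrained))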