import artm
import warnings
# Install a blanket 'ignore' filter: warnings issued after this point are
# suppressed for the rest of the run.
warnings.filterwarnings('ignore')
import re
# Input corpus file and the folder that receives the generated artefacts.
vw_file_path = 'vw.txt'
target_folder = 'batches'

# Build a batch vectorizer over the corpus file (declared as 'vowpal_wabbit'
# data), with `target_folder` as its target folder.
batch_vectorizer = artm.BatchVectorizer(
    data_path=vw_file_path,
    data_format='vowpal_wabbit',
    target_folder=target_folder,
)

# Gather a dictionary from the contents of the target folder and dump it as
# text next to them.
dictionary_path = '/'.join((target_folder, 'dictionary.txt'))
dictionary = artm.Dictionary()
dictionary.gather(data_path=target_folder)
dictionary.save_text(dictionary_path=dictionary_path)
# Derive a vocabulary file from the dictionary text dump: keep only the first
# two ', '-separated fields of every entry, joined by a single space.
vocab_path = target_folder + '/' + 'vocab.txt'

# Both files are opened with an explicit encoding so non-ASCII tokens do not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the dictionary dump is UTF-8 encoded -- confirm.
with open(dictionary_path, 'r', encoding='utf-8') as dictionary_file, \
        open(vocab_path, 'w', encoding='utf-8') as vocab_file:
    # The first two lines are skipped, not parsed -- presumably header rows
    # of the dump (TODO confirm against the artm text dictionary format).
    dictionary_file.readline()
    dictionary_file.readline()
    for line in dictionary_file:
        # Strip the line terminator before splitting so a short line cannot
        # carry its own newline into the output and produce a blank row.
        elems = line.rstrip('\n').split(', ')
        if elems == ['']:
            continue  # nothing to export from an empty line
        vocab_file.write(' '.join(elems[:2]) + '\n')