In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/recommender/movielens/main')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [0]:
!pip install -q paddlepaddle

from collections import Counter
from tqdm import tqdm
from pathlib import Path

import paddle
import io
import re

Make Data

In [0]:
def write_data(creator, f_path):
  with io.open(f_path, 'w', encoding='utf-8') as f:
    for sample in creator():
      # each sample from paddle.dataset.movielens is usr.value() + mov.value() + [[rating]];
      # usr.value() has 4 fields, so the movie id sits right after them
      uid = sample[0]
      mov_id = sample[len(user_info[uid].value())]

      user_dict = user_info[uid].__dict__
      movie_dict = movie_info[mov_id].__dict__

      # keys follow the UserInfo attributes: index, is_male, age, job_id
      user_list = [k+'_'+str(v) for k, v in user_dict.items()]
      
      movie_title = movie_dict['title']
      movie_title = movie_title.lower()
      movie_title = movie_title.replace(',', ' ')
      movie_title = movie_title.replace("'s", ' ')
      movie_title = movie_title.replace("'re'", ' ')
      movie_title = movie_title.replace("'ve'", ' ')
      movie_title = movie_title.replace('!', ' ')
      movie_title = movie_title.replace(':', ' ')
      movie_title = movie_title.replace('(', ' ')
      movie_title = movie_title.replace(')', ' ')
      movie_title = movie_title.replace("'", ' ')
      movie_title = movie_title.replace('th', ' ')
      movie_title = movie_title.replace('...', ' ')
      movie_title = movie_title.replace('.', ' ')
      movie_title = movie_title.replace('&', ' ')
      movie_title = movie_title.replace('$', ' ')
      movie_title = movie_title.replace('-', ' ')
      movie_title = movie_title.replace('#', ' ')
      movie_title = movie_title.replace('/', ' ')
      movie_title = movie_title.replace(';', ' ')
      movie_title = re.sub(r'\d+', ' <d> ', movie_title)
      movie_title = re.sub(r'\s+', ' ', movie_title)
      
      movie_list = ['movie_'+str(movie_dict['index']), ' '.join(movie_dict['categories']), movie_title]
      # paddle rescales the original 1-5 rating to rating*2-5, so (x+5)/2 restores the 1-5 scale
      score_list = [str((float(sample[-1][0])+5.)/2.)]
      
      line = '\t'.join(user_list+movie_list+score_list)
      f.write(line+'\n')
In [0]:
user_info = paddle.dataset.movielens.user_info()
movie_info = paddle.dataset.movielens.movie_info()
train_set_creator = paddle.dataset.movielens.train()
test_set_creator = paddle.dataset.movielens.test()
      
write_data(train_set_creator, '../data/train.txt')
write_data(test_set_creator, '../data/test.txt')
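
Each line of train.txt / test.txt now holds 8 tab-separated fields: the four user attributes, the movie id, the space-joined categories, the cleaned title, and the rating. A quick sanity check (not part of the original pipeline) can confirm the layout:

In [0]:
with open('../data/train.txt') as f:
  fields = f.readline().rstrip().split('\t')
print(len(fields))  # expected: 8
print(fields)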

Make Vocabulary

In [0]:
def write_to_disk(f_path, counter, is_pad_needed=False):
  with open(f_path, 'w') as f:
    if is_pad_needed:
      f.write('<pad>'+'\n')
    for k in sorted(counter.keys()):
      f.write(k+'\n')
In [6]:
Path('../vocab').mkdir(exist_ok=True)

counter_user_id = Counter()
counter_user_age = Counter()
counter_user_job = Counter()
counter_user_gender = Counter()
counter_movie_id = Counter()
counter_movie_types = Counter()
counter_movie_title = Counter()

with open('../data/train.txt') as f:
  for line in tqdm(f, ncols=70, total=900228):  # 900228 = number of lines in train.txt
    line = line.rstrip()
    (user_id, user_gender, user_age, user_job, movie_id, movie_types, movie_title, score) = line.split('\t')
    
    counter_user_id.update([user_id])
    counter_user_age.update([user_age])
    counter_user_job.update([user_job])
    counter_user_gender.update([user_gender])
    
    counter_movie_id.update([movie_id])
    counter_movie_types.update(movie_types.split())
    counter_movie_title.update(movie_title.split())

# drop title words that appear 10 times or fewer
counter_movie_title = {k:v for k, v in counter_movie_title.items() if v > 10}
    
write_to_disk('../vocab/user_id.txt', counter_user_id)
write_to_disk('../vocab/user_age.txt', counter_user_age)
write_to_disk('../vocab/user_job.txt', counter_user_job)
write_to_disk('../vocab/user_gender.txt', counter_user_gender)
write_to_disk('../vocab/movie_id.txt', counter_movie_id)
write_to_disk('../vocab/movie_types.txt', counter_movie_types)
write_to_disk('../vocab/movie_title.txt', counter_movie_title, is_pad_needed=True)
100%|██████████████████████| 900228/900228 [00:16<00:00, 55273.45it/s]
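
Each vocabulary file lists one token per line, so a token's index is simply its line position (with <pad> reserved at index 0 in the title vocab). A minimal sketch of reading one back into a lookup table, assuming a helper named load_vocab (not part of the original notebook):

In [0]:
def load_vocab(f_path):
  # map each token to its line number, which serves as the embedding index
  with open(f_path) as f:
    return {token: i for i, token in enumerate(f.read().splitlines())}

word2idx = load_vocab('../vocab/movie_title.txt')
print(len(word2idx))
print(word2idx['<pad>'])  # 0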