#!/usr/bin/env python # coding: utf-8 # ## Тематическая модель на данных Last.fm # Сначала настраиваем и подключаем нужные библиотеки # In[23]: import os import sys HOME = '/home/vovapolu/Projects/' BIGARTM_PATH = HOME + 'bigartm/' BIGARTM_BUILD_PATH = BIGARTM_PATH + 'build/' sys.path.append(os.path.join(BIGARTM_PATH, 'src/python')) os.environ['ARTM_SHARED_LIBRARY'] = os.path.join(BIGARTM_BUILD_PATH, 'src/artm/libartm.so') # In[24]: get_ipython().run_line_magic('matplotlib', 'inline') import csv, shutil, uuid, glob, time import matplotlib.pyplot as plt import artm.artm_model from artm.artm_model import * plays_file = 'usersha1-artmbid-artname-plays.tsv' batch_path = 'batches' #Папка с батчами if os.path.exists(batch_path): shutil.rmtree(batch_path) artist_id_to_name = {} #Мапа, переводящая artist_id в имя artists_idxs = {} #Мапа, переводящая artist_id в номер в батче artists = [] #Имена артистов в батче # Читаем файл и строим батчи # In[25]: last_user_id = '' handled_users = 0 users_to_handle = 5000 #Данные скольких юзеров обрабатываем users_in_batch = 1000 #Сколько юзеров в батче batch = None with open(plays_file, 'rb') as tsvin: tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE) field = None for row in tsvin: user_id, artist_id, artist_name, plays = row if user_id != last_user_id: if handled_users > users_to_handle or handled_users % users_in_batch == 0: if batch is not None: for artist in artists: batch.token.append(artist.decode('utf8')) artm.library.Library().SaveBatch(batch, batch_path) artists = [] artists_idxs = {} batch = artm.messages_pb2.Batch() batch.id = str(uuid.uuid4()) if handled_users > users_to_handle: break item = batch.item.add() item.id = handled_users field = item.field.add() last_user_id = user_id handled_users += 1 if artist_id not in artist_id_to_name: artist_id_to_name[artist_id] = artist_name if artist_id not in artists_idxs: artists_idxs[artist_id] = len(artists) artists.append(artist_name) field.token_id.append(artists_idxs[artist_id]) field.token_count.append(int(plays)) # Запускаем BigArtm # In[37]: background_topics = [] objective_topics = [] all_topics = [] topic_count = 200 background_topic_count = 5 for i in xrange(topic_count): topic_name = ("background" if i < background_topic_count else "objective") + " topic " + str(i) all_topics.append(topic_name) if i < background_topic_count: background_topics.append(topic_name) else: objective_topics.append(topic_name) model = ArtmModel(topic_names=all_topics) model.num_processors = 4 # Configure scores model.scores.add(SparsityPhiScore(name='ObjectiveSparsityPhiScore', topic_names=objective_topics)) model.scores.add(SparsityThetaScore(name='ObjectiveSparsityThetaScore', topic_names=objective_topics)) model.scores.add(SparsityPhiScore(name='BackgroundSparsityPhiScore', topic_names=background_topics)) model.scores.add(SparsityThetaScore(name='BackgroundSparsityThetaScore', topic_names=background_topics)) model.scores.add(SparsityThetaScore(name='SparsityThetaScore')) model.scores.add(PerplexityScore(name='PerplexityScore')) model.scores.add(TopTokensScore(name='TopTokensScore', num_tokens=20)) # Configure regularizers model.regularizers.add(SmoothSparsePhiRegularizer(name='ObjectiveSparsePhi', topic_names=objective_topics, tau=-0.1)) model.regularizers.add(SmoothSparseThetaRegularizer(name='ObjectiveSparseTheta', topic_names=objective_topics, tau=-2.0)) model.regularizers.add(SmoothSparsePhiRegularizer(name='BackgroundSparsePhi', topic_names=background_topics, tau=0.1)) model.regularizers.add(SmoothSparseThetaRegularizer(name='BackgroundSparseTheta', topic_names=background_topics, tau=2.0)) model.regularizers.add(DecorrelatorPhiRegularizer(name='DecorrelatorPhi', topic_names=objective_topics, tau=100000.0)) model.initialize(data_path=batch_path) start = time.clock() print "Start fitting..." model.fit_offline(data_path=batch_path, num_collection_passes=15) print "Fitting tooks %.1f s" % ((finish - start) / 4) plt.plot(range(model.num_phi_updates), model.scores_info['PerplexityScore'].value, 'r--', linewidth=2) plt.xlabel('Iterations count') plt.ylabel('Perplexity') plt.grid(True) plt.show() plt.plot(range(model.num_phi_updates), model.scores_info['ObjectiveSparsityPhiScore'].value, 'b--', range(model.num_phi_updates), model.scores_info['ObjectiveSparsityThetaScore'].value, 'r--', linewidth=2) plt.xlabel('Iterations count') plt.ylabel('Objective Phi sparsity, Theta sparsity') plt.grid(True) plt.show() plt.plot(range(model.num_phi_updates), model.scores_info['BackgroundSparsityPhiScore'].value, 'b--', range(model.num_phi_updates), model.scores_info['BackgroundSparsityThetaScore'].value, 'r--', linewidth=2) plt.xlabel('Iterations count') plt.ylabel('Background Phi sparsity, Theta sparsity') plt.grid(True) plt.show() # Visualize top token in each topic and a snippet of theta matrix print "Genres" for topic_name in objective_topics: print topic_name + ': ', print model.scores_info['TopTokensScore'].last_topic_info[topic_name].tokens print "Background genres" for topic_name in background_topics: print topic_name + ': ', print model.scores_info['TopTokensScore'].last_topic_info[topic_name].tokens # In[40]: main_musician = "green day" top_matches_count = 20 top_matches = [] for topic in objective_topics: topic_musicians = model.scores_info['TopTokensScore'].last_topic_info[topic].tokens weights = model.scores_info['TopTokensScore'].last_topic_info[topic].weights if main_musician in topic_musicians: main_musician_ind = topic_musicians.index(main_musician) for i in xrange(len(topic_musicians)): if (topic_musicians[i] != main_musician): top_matches.append((weights[i] * weights[main_musician_ind], topic_musicians[i])) for top_match in sorted(top_matches, reverse=True)[:top_matches_count]: print top_match # In[ ]: model.visualise()