import datetime import pandas as pd import numpy as np from collections import defaultdict import scipy.stats as ss from scipy.optimize import fmin as scipyfmin import operator import re import json #if you are using ipython and want to see things inline. %pylab inline def load_files(folder): M = np.load(folder+'M.npy') user_dict = json.load(open(folder+'user_dict.json', 'r')) article_dict = json.load(open(folder+'article_dict.json', 'r')) user_exogenous_ranks = json.load(open(folder+'user_exogenous_ranks.json', 'r')) article_exogenous_ranks = json.load(open(folder+'article_exogenous_ranks.json', 'r')) return {'M':M, 'user_dict':user_dict, 'article_dict':article_dict, 'user_exogenous_ranks':user_exogenous_ranks, 'article_exogenous_ranks':article_exogenous_ranks} feminist_data = load_files('savedata/Category:Feminist_writers/2014-02-18/2014-02-18/') def make_bin_matrix(M): #this returns the a matrix with entry True where the original was nonzero, and zero otherwise. M[M>0] = 1.0 return M def M_test_triangular(M): user_edits_sum = M.sum(axis=1) article_edits_sum = M.sum(axis=0) user_edits_order = user_edits_sum.argsort() article_edits_order = article_edits_sum.argsort() M_sorted = M[user_edits_order,:] M_sorted_sorted = M_sorted[:,article_edits_order] M_bin = make_bin_matrix(M_sorted_sorted) plt.figure(figsize=(10,10)) imshow(M_sorted_sorted, cmap=plt.cm.bone, interpolation='nearest') bin_M = make_bin_matrix(feminist_data['M']) M_test_triangular(bin_M) def Gcp_denominateur(M, p, k_c, beta): M_p = M[:,p] k_c_beta = k_c ** (-1 * beta) return np.dot(M_p, k_c_beta) def Gpc_denominateur(M, c, k_p, alpha): M_c = M[c,:] k_p_alpha = k_p ** (-1 * alpha) return np.dot(M_c, k_p_alpha) def make_G_hat(M, alpha=1, beta=1): '''G hat is Markov chain of length 2 Gcp is a matrix to go from contries to product and then Gpc is a matrix to go from products to ccountries''' k_c = M.sum(axis=1) #aka k_c summing over the rows k_p = M.sum(axis=0) #aka k_p summering over the columns G_cp = np.zeros(shape=M.shape) #Gcp_beta for [c, p], val in np.ndenumerate(M): numerateur = (M[c,p]) * (k_c[c] ** ((-1) * beta)) denominateur = Gcp_denominateur(M, p, k_c, beta) G_cp[c,p] = numerateur / float(denominateur) G_pc = np.zeros(shape=M.T.shape) #Gpc_alpha for [p, c], val in np.ndenumerate(M.T): numerateur = (M.T[p,c]) * (k_p[p] ** ((-1) * alpha)) denominateur = Gpc_denominateur(M, c, k_p, alpha) G_pc[p,c] = numerateur / float(denominateur) return {'G_cp': G_cp, "G_pc" : G_pc} def w_generator(M, alpha, beta): #this cannot return the zeroeth iteration G_hat = make_G_hat(M, alpha, beta) G_cp = G_hat['G_cp'] G_pc = G_hat['G_pc'] # fitness_0 = np.sum(M,1) ubiquity_0 = np.sum(M,0) fitness_next = fitness_0 ubiquity_next = ubiquity_0 i = 0 while True: fitness_prev = fitness_next ubiquity_prev = ubiquity_next i += 1 fitness_next = np.sum( G_cp*ubiquity_prev, axis=1 ) ubiquity_next = np.sum( G_pc* fitness_prev, axis=1 ) yield {'iteration':i, 'fitness': fitness_next, 'ubiquity': ubiquity_next} def w_stream(M, i, alpha, beta): """gets the i'th iteration of reflections of M, but in a memory safe way so we can calculate many generations""" if i < 0: raise ValueError for j in w_generator(M, alpha, beta): if j[0] == i: return {'fitness': j[1], 'ubiquity': j[2]} break def find_convergence(M, alpha, beta, fit_or_ubiq, do_plot=False,): '''finds the convergence point (or gives up after 1000 iterations)''' if fit_or_ubiq == 'fitness': Mshape = M.shape[0] elif fit_or_ubiq == 'ubiquity': Mshape = M.shape[1] rankings = list() scores = list() prev_rankdata = np.zeros(Mshape) iteration = 0 for stream_data in w_generator(M, alpha, beta): iteration = stream_data['iteration'] data = stream_data[fit_or_ubiq] rankdata = data.argsort().argsort() #test for convergence if np.equal(rankdata,prev_rankdata).all(): break if iteration == 1000: break else: rankings.append(rankdata) scores.append(data) prev_rankdata = rankdata if do_plot: plt.figure(figsize=(iteration/10, Mshape / 20)) plt.xlabel('Iteration') plt.ylabel('Rank, higher is better') plt.title('Rank Evolution') p = semilogx(range(1,iteration), rankings, '-,', alpha=0.5) return {fit_or_ubiq:scores[-1], 'iteration':iteration} def w_star_analytic(M, alpha, beta, w_star_type): k_c = M.sum(axis=1) #aka k_c summing over the rows k_p = M.sum(axis=0) #aka k_p summering over the columns A = 1 B = 1 def Gcp_denominateur(M, p, k_c, beta): M_p = M[:,p] k_c_beta = k_c ** (-1 * beta) return np.dot(M_p, k_c_beta) def Gpc_denominateur(M, c, k_p, alpha): M_c = M[c,:] k_p_alpha = k_p ** (-1 * alpha) return np.dot(M_c, k_p_alpha) if w_star_type == 'w_star_c': w_star_c = np.zeros(shape=M.shape[0]) for c in range(M.shape[0]): summand = Gpc_denominateur(M, c, k_p, alpha) k_beta = (k_c[c] ** (-1 * beta)) w_star_c[c] = A * summand * k_beta return w_star_c elif w_star_type == 'w_star_p': w_star_p = np.zeros(shape=M.shape[1]) for p in range(M.shape[1]): summand = Gcp_denominateur(M, p, k_c, beta) k_alpha = (k_p[p] ** (-1 * alpha)) w_star_p[p] = B * summand * k_alpha return w_star_p #purer python #score w_scores = w_star_analytic(M=feminist_data['M'], alpha=0.5, beta=0.5, w_star_type='w_star_c') #identify w_ranks = {name: w_scores[pos] for name, pos in feminist_data['user_dict'].iteritems() } #sort w_ranks_sorted = sorted(w_ranks.iteritems(), key=operator.itemgetter(1)) #or use pandas w_scores_df = pd.DataFrame.from_dict(w_ranks, orient='index') w_scores_df.columns = ['w_score'] w_scores_df.sort(columns=['w_score'], ascending=False).head() convergence = find_convergence(M=feminist_data['M'], alpha=0.5, beta=0.5, fit_or_ubiq='fitness', do_plot=True) convergence['iteration'] w_iter = convergence['fitness'] #rank w_iter_ranks = {name: w_iter[pos] for name, pos in feminist_data['user_dict'].iteritems() } #sort w_ranks_sorted = sorted(w_iter_ranks.iteritems(), key=operator.itemgetter(1)) #or use pandas w_iter_scores_df = pd.DataFrame.from_dict(w_iter_ranks, orient='index') w_iter_scores_df.columns = ['w_score'] w_iter_scores_df.sort(columns=['w_score'], ascending=False).head() '''I'm sure this can be done much more elegantly but this was sort-of drink-a-lot-of-coffee-one-afternoon-and-get-it-done cleaning this up is an exercise for the reader''' def rank_comparison(a_ranks_sorted, b_ranks_sorted, do_plot=False): a_list = list() b_list = list() for atup in a_ranks_sorted: aiden = atup[0] apos = atup[1] #find this in our other list for btup in b_ranks_sorted: biden = btup[0] bpos = btup[1] if aiden == biden: a_list.append(apos) b_list.append(bpos) if do_plot: plt.figure(figsize=(10,20)) plot([1,2], [a_list, b_list], '-o') plt.show() return ss.spearmanr(a_list, b_list) def calibrate_analytic(M, ua, exogenous_ranks_sorted, user_or_art_dict, index_function, title, do_plot=False): if ua == 'users': w_star_type = 'w_star_c' elif ua == 'articles': w_star_type = 'w_star_p' squarelen = range(0,50) alpha_range = map(index_function,squarelen) beta_range = map(index_function,squarelen) landscape = np.zeros(shape=(len(list(alpha_range)),len(list(beta_range)))) top_spearman = {'spearman':None,'alpha':None, 'beta':None, 'ua':ua} for alpha_index, alpha in enumerate(alpha_range): for beta_index, beta in enumerate(beta_range): w_converged = w_star_analytic(M, alpha, beta, w_star_type) w_ranks = {name: w_converged[pos] for name, pos in user_or_art_dict.iteritems() } w_ranks_sorted = sorted(w_ranks.iteritems(), key=operator.itemgetter(1)) spearman = rank_comparison(w_ranks_sorted, exogenous_ranks_sorted) if spearman[1] < 0.05: landscape[alpha_index][beta_index] = spearman[0] if (not top_spearman['spearman']) or (spearman[0] > top_spearman['spearman']): top_spearman['spearman'] = spearman[0] top_spearman['alpha'] = alpha top_spearman['beta'] = beta else: landscape[alpha_index][beta_index] = np.nan if do_plot: plt.figure(figsize=(10,10)) heatmap = imshow(landscape, interpolation='nearest', vmin=-1, vmax=1) #heatmap = plt.pcolor(landscape) colorbar = plt.colorbar(heatmap) plt.xlabel(r'$ \beta $') plt.xticks(squarelen, beta_range, rotation=90) plt.ylabel(r'$ \alpha $') plt.yticks(squarelen, alpha_range) plt.title(title) landscape_file = open(title+'_landscape.npy', 'w') np.save(landscape_file, landscape) plt.savefig(title+'_landscape.eps') return top_spearman user_spearman = calibrate_analytic(M=make_bin_matrix(feminist_data['M']), ua='users', exogenous_ranks_sorted=feminist_data['user_exogenous_ranks'], user_or_art_dict=feminist_data['user_dict'], index_function=lambda x: (x-25)/12.5, title='Grid Search for User of Feminist Writers', do_plot=True) print('Optimizing points from gridsearch', 'rho:', user_spearman['spearman'], 'alpha', user_spearman['alpha'], 'beta', user_spearman['beta'] )