import pymc import daft import csv from scipy.sparse import coo_matrix from sklearn.decomposition import ProjectedGradientNMF from itertools import product, izip def loadData(filename): data = np.loadtxt(filename) users = data[:,0].astype(int) items = data[:,1].astype(int) ratings = data[:,2] nExamples = len(users) return (nExamples, users, items, ratings) TRAIN_FILE = '736/train' TEST_FILE = '736/test' (nTrain, users, items, ratings) = loadData(TRAIN_FILE) (nTest, testUsers, testItems, testRatings) = loadData(TEST_FILE) N = np.max(users) M = np.max(items) plt.figure() plt.hist(ratings) plt.figure() plt.hist(testRatings) outlierIdxs = ratings > 6 print np.flatnonzero(outlierIdxs) print ratings[outlierIdxs] CLEAN_TRAIN_FILE = '736/trainClean' (nTrain, users, items, ratings) = loadData(CLEAN_TRAIN_FILE) plt.hist(ratings) from matplotlib import rc rc("font", family="serif", size=36) rc("text", usetex=True) pgm = daft.PGM([5, 4], grid_unit=5, node_unit=3) pgm.add_node(daft.Node("tau0", r"$\tau_0$", 2, 3.5, fixed=True)) pgm.add_node(daft.Node("u", r"$\mathbf{u}_i$", 1, 2.5)) pgm.add_node(daft.Node("v", r"$\mathbf{v}_j$", 3, 2.5)) pgm.add_node(daft.Node("z", r"$z_{ij}$", 2, 2.5)) pgm.add_node(daft.Node("r", r"$r_{ij}$", 2, 1.5, observed=True)) pgm.add_node(daft.Node("tau1", r"$\tau_1", 0.25, 1.5, fixed=True)) pgm.add_edge("tau0", "u") pgm.add_edge("tau0", "v") pgm.add_edge("u", "z") pgm.add_edge("v", "z") pgm.add_edge("z", "r") pgm.add_edge("tau1", "r") # [start_x, start_y, xlen, ylen] pgm.add_plate(daft.Plate([0.5, 0.5, 2, 2.5], label=r"$i = 1, \ldots, N$")) pgm.add_plate(daft.Plate([1.5, 0.25, 2, 2.7], shift=0.5, label=r"$j = 1, \ldots, M$")) pgm.render() D = 10 tau0 = 5 tau1 = 5 u = pymc.Normal('u', 0, tau0, size=(D, N)) v = pymc.Normal('v', 0, tau0, size=(D, M)) z = pymc.Lambda('z', lambda u=u, v=v: np.dot(u.T, v)) r = np.empty(nTrain, dtype=object) for n, (i, j) in enumerate(izip(users, items)): r[n] = pymc.Lognormal('x_%d_%d' % (i, j), z[i-1, j-1], # mean tau1, # precision (confidence) observed=True, value=ratings[n]) model = pymc.Model([u, v, z, r]) # There was a bug in PyMC; this is just to hack around code. if 3 in model.__dict__: del model.__dict__[3] pymc.MAP(model).fit() mcmc = pymc.MCMC(model) mcmc.sample(10000) zsamples = mcmc.trace('z')[5000:] zmean = np.mean(zsamples, 0) meanExpZ = np.mean(np.exp(zsamples), 0) plt.hist(np.exp(zsamples[:,1,1])) iFirst, jFirst, rFirst = users[1], items[1], ratings[1] print meanExpZ[iFirst,jFirst] print rFirst #trainRMSE = np.sqrt(np.mean((np.exp(rmean[users-1, items-1]) - ratings) ** 2)) #testRMSE = np.sqrt(np.mean((np.exp(rmean[testUsers-1, testItems-1]) - testRatings) ** 2)) trainRMSE = np.sqrt(np.mean( (meanExpZ[users-1, items-1] - ratings) ** 2)) testRMSE = np.sqrt(np.mean( (meanExpZ[testUsers-1, testItems-1] - testRatings) ** 2)) print trainRMSE print testRMSE R = coo_matrix((ratings, (users - 1, items - 1))).todense() skm = ProjectedGradientNMF(n_components=D, sparseness='data', beta=70) W = skm.fit_transform(R.T) reconstruct = np.dot(skm.components_.T, W.T) nmfTrainRMSE = np.sqrt(np.mean( (reconstruct[users-1, items-1] - ratings) ** 2)) nmfTestRMSE = np.sqrt(np.mean( (reconstruct[testUsers-1, testItems-1] - testRatings) ** 2)) print nmfTrainRMSE print nmfTestRMSE #print skm.reconstruction_err_ #pint np.linalg.norm(reconstruct - R)