# %matplotlib inline  -- IPython notebook magic; not valid in a plain .py file.
"""Online logistic regression over the Wikipedia recent-changes feed.

Trains K=3 independent binary classifiers (bot / minor / new edit) with
stochastic gradient descent, using the hashing trick for sparse text
features (comment, username, title), then evaluates fresh samples and
plots an 8x8 confusion matrix over the joint label space.
"""
import itertools
import json
from collections import defaultdict
from datetime import datetime, timedelta
from math import exp, log, sqrt

import matplotlib
import requests
from matplotlib import pyplot as pt

# Recent-changes API: the 100 most recent edits with the fields we need.
URL = ('http://en.wikipedia.org/w/api.php?format=json&action=query&list=recentchanges&rcprop=parsedcomment'
       '%7Ctimestamp%7Ctitle%7Cflags%7Cids%7Csizes%7Cflags%7Cuser&rclimit=100')

D = 2            # number of dense features (bias, standardized edit-size delta)
D_sparse = 2**18  # hashing-trick dimensionality for sparse text features
K = 3            # number of binary targets: bot, minor, new


def get_length_statistics(length, n, mean, M2):
    """One step of Welford's online mean/variance update.

    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Incremental_algorithm

    Parameters:
        length: the new observation.
        n: sample count INCLUDING this observation (the caller increments it).
        mean, M2: running mean and sum of squared deviations so far.

    Returns (mean, std, M2) with `length` folded in.

    Bug fix: the original also incremented `n` inside this function, on top
    of the caller's `length_n += 1`, double-counting every sample and
    biasing both the mean and the standard deviation.
    """
    delta = length - mean
    mean += float(delta) / n
    M2 += delta * (length - mean)
    if n < 2:
        return mean, 0., M2
    variance = float(M2) / (n - 1)
    return mean, sqrt(variance), M2


def get_data():
    """Yield (y, x, x_sparse) training triples from the live feed, forever.

    y        -- [bot, minor, new] 0/1 labels (flag present => 1)
    x        -- dense features: [bias, standardized edit-size delta]
    x_sparse -- hashed feature indices for comment, username and title

    NOTE: the same three lists are mutated and re-yielded on every
    iteration; consumers must use them before advancing the generator.
    NOTE(review): hash() is salted per process in Python 3 unless
    PYTHONHASHSEED is fixed, so sparse weights are not reproducible
    across runs.
    """
    x = [1., 0.]          # bias term, standardized length of edit
    x_sparse = [0, 0, 0]  # hash of comment, hash of username, hash of title
    y = [0, 0, 0]         # bot, minor, new
    n = 0
    mean = 0.
    M2 = 0.
    while True:
        edits = requests.get(URL).json()['query']['recentchanges']
        for el in edits:
            length = abs(el['newlen'] - el['oldlen'])
            n += 1
            mean, std, M2 = get_length_statistics(length, n, mean, M2)
            # Standardize once we have a spread; fall back to the raw value.
            x[1] = (length - mean) / std if std > 0. else length
            x_sparse[0] = abs(hash('comment_' + el['parsedcomment'])) % D_sparse
            x_sparse[1] = abs(hash('username_' + el['user'])) % D_sparse
            x_sparse[2] = abs(hash('title_' + el['title'])) % D_sparse
            # Flags are encoded by key presence in the API response.
            y[0] = 0 if el.get('bot') is None else 1
            y[1] = 0 if el.get('minor') is None else 1
            y[2] = 0 if el.get('new') is None else 1
            yield y, x, x_sparse


def predict(w, w_sparse, x, x_sparse):
    """Return P(y = 1 | (x, x_sparse), (w, w_sparse)) under a logistic model.

    The original wrapped the sigmoid in a try/except OverflowError, but the
    clamp to [-100, 100] makes overflow impossible (exp(100) is finite), so
    that handler was dead code and has been removed.
    """
    wTx = sum(w_i * x_i for w_i, x_i in zip(w, x))
    for i in x_sparse:
        wTx += w_sparse[i]  # sparse features have an implicit value of 1
    wTx = min(max(wTx, -100.), 100.)
    return 1. / (1. + exp(-wTx))


def update(alpha, w, w_sparse, x, x_sparse, p, y):
    """In-place SGD step (log-loss gradient) on dense and sparse weights."""
    step = (y - p) * alpha  # hoisted loop invariant
    for i, val in enumerate(x):
        w[i] += step * val
    for i in x_sparse:
        w_sparse[i] += step  # * feature[i], but feature[i] == 1


def main():
    w = [[0.] * D for _ in range(K)]
    w_sparse = [[0.] * D_sparse for _ in range(K)]
    alpha = .1
    training_time = timedelta(minutes=10)

    # --- training: SGD on the live feed for `training_time` ---
    data = get_data()
    time0 = datetime.now()
    for y, x, x_sparse in data:
        for k in range(K):
            p = predict(w[k], w_sparse[k], x, x_sparse)
            update(alpha, w[k], w_sparse[k], x, x_sparse, p, y[k])
        if (datetime.now() - time0) > training_time:
            break
    print(w[0])
    print(w[1])
    print(w[2])

    # --- evaluation: confusion matrix over the 8 joint label combinations ---
    no_test = 10000
    classes = {c: c_i for c_i, c in enumerate(itertools.product([0, 1], repeat=K))}
    confusion_matrix = [[0] * len(classes) for _ in range(len(classes))]
    predicted = [0] * K
    for test_ctr, (y, x, x_sparse) in enumerate(get_data(), start=1):
        for k in range(K):
            p = predict(w[k], w_sparse[k], x, x_sparse)
            predicted[k] = 1 if p > .5 else 0
        confusion_matrix[classes[tuple(y)]][classes[tuple(predicted)]] += 1
        if test_ctr >= no_test:
            break

    # --- plot the confusion matrix with joint-label tick marks ---
    matplotlib.rcParams['font.size'] = 15
    fig = pt.figure(figsize=(11, 11))
    pt.clf()
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    res = ax.imshow(confusion_matrix, cmap=pt.cm.jet, interpolation='nearest')
    fig.colorbar(res)
    index_to_class = {v: k for k, v in classes.items()}
    labels = [index_to_class[i] for i in range(len(classes))]
    pt.xticks(range(len(classes)), labels)
    pt.yticks(range(len(classes)), labels)
    pt.show()


if __name__ == '__main__':
    main()