# Notebook-style analysis script: relate TripAdvisor points of interest (POIs)
# to grid cells and classify cells as POI / non-POI with an SVM.
#
# NOTE(review): the cells of this notebook are collapsed onto two physical
# lines, with statements separated only by spaces; the text is not runnable
# as-is and needs its original line breaks restored.
#
# NOTE(review): text is missing in the middle of `poi_in_cell`
# ("... or (poi_coords[0] 0 df['lat'] = ..."). Everything between a "<" and a
# later ">" appears to have been stripped. The lost span contains the rest of
# `poi_in_cell` and whatever defined `cells` and `df`; `svm`,
# `cross_validation` and `metrics` are also used below without a visible
# import, so they were presumably imported there too. Recover it from the
# original notebook rather than reconstructing it by hand.
#
# What the visible code does, in order:
#   1. Reads 'data/poi/pois_milano_tripadvisor.csv' into `poi` and displays
#      the rows with the most 'reviews'.
#   2. Loads 'data/poi/milano-grid.geojson' and builds `cell_position`, indexed
#      by 'cellId', from the first position of each feature's first ring.
#      NOTE(review): GeoJSON positions are normally [longitude, latitude], but
#      the columns are labelled ['lat', 'lon'] in that order; confirm the
#      labels are not swapped.
#   3. Begins `poi_in_cell(poi_coords, coords)`; the visible part rejects a
#      point whose components exceed coords[1] (truncated, see above).
#   4. Copies `cell_position.lat` / `.lon` onto `df` and displays rows where
#      `df.poi` is true.
#   5. Builds X = [entropy, log(activity)] and grid-searches an SVC with
#      class_weight {0: 1, 1: 4.5} over 4 gamma values and 3 C values, using a
#      shuffled 3-fold KFold (random_state=3), scoring='precision', n_jobs=4.
#   6. Prints best params/score, a classification report and the ROC AUC, and
#      plots the ROC curve.
#      NOTE(review): `gs.predict(X)` / `gs.decision_function(X)` are called on
#      the same X that `gs.fit` was given, so the report and ROC are in-sample
#      figures, not held-out performance.
#   7. Draws the decision function as filled contours over a mesh (step 0.1 on
#      the entropy axis, 1 on the log-activity axis) and overlays non-POI cells
#      (blue) and POI cells (red).
#      NOTE(review): `index.delete(cells)` removes by integer position, whereas
#      `df.entropy[cells]` selects by label; the two are complementary only if
#      cell ids coincide with row positions. Confirm against how `cells` and
#      `df` are built.
#
# NOTE(review): bare names `array`, `log`, `cm`, `plt`, `np` presumably come
# from `%pylab inline`; confirm.
# NOTE(review): `print x` statements, `DataFrame.sort`, `sklearn.grid_search`
# and `cross_validation.KFold(n, n_folds=...)` are Python 2 / legacy pandas and
# scikit-learn APIs; pin the environment or port them before re-running.
%pylab inline import pandas as pd import geojson as gj from collections import defaultdict, Counter from scipy import stats poi = pd.read_csv('data/poi/pois_milano_tripadvisor.csv') poi.sort('reviews', ascending=False).head() with open('data/poi/milano-grid.geojson') as gf: grid = gj.load(gf) cell_position = pd.DataFrame([([cell["properties"]["cellId"]] + cell["geometry"]["coordinates"][0][0]) for cell in grid['features']], columns=['cellId', 'lat', 'lon']).set_index('cellId') cell_position.head() def poi_in_cell(poi_coords, coords): return not ((poi_coords[0]>coords[1][0]) or (poi_coords[1]>coords[1][1]) or (poi_coords[0] 0 df['lat'] = cell_position.lat df['lon'] = cell_position.lon df[df.poi].head(10) from sklearn import grid_search model = svm.SVC(class_weight={0:1, 1:4.5}) X = array([df.entropy.values, np.log(df.activity.values)]).T cv = cross_validation.KFold(X.shape[0], n_folds=3, shuffle=True, random_state=3) gs = grid_search.GridSearchCV(model, {'gamma': np.logspace(-3, 0, 4), 'C': np.logspace(-1, 1, 3)}, cv=cv, scoring='precision', n_jobs=4) gs.fit(X, df.poi.astype(int)) print gs.best_params_ print gs.best_score_ ypred = gs.predict(X) print metrics.classification_report(df.poi.astype(int).values, ypred) y_pred = gs.decision_function(X) roc_auc = metrics.roc_auc_score(df.poi.astype(int).values, y_pred) print roc_auc fpr, tpr, _ = metrics.roc_curve(df.poi.astype(int).values, y_pred) plt.plot(fpr, tpr, 'b.', label='ROC curve (area = %0.2f)' % roc_auc) plt.legend(); x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 1)) Z = gs.decision_function(np.c_[xx.ravel(), yy.ravel()]) plt.figure(figsize=(7,7)) Z = Z.reshape(xx.shape) ax = plt.subplot(111) norm = plt.cm.colors.Normalize(vmax=abs(Z).max(), vmin=-abs(Z).max()) ax.contourf(xx, yy, Z, 200, cmap=cm.RdBu_r, alpha=.8, norm=norm) xz = 
df.entropy[df.entropy.index.delete(cells)].values yz = log(df.activity[df.activity.index.delete(cells)].values) plt.scatter(xz, yz, c='b', alpha=0.8, label='no-POIs cells') xz = df.entropy[cells].values yz = log(df.activity[cells].values) plt.scatter(xz, yz, c='r', alpha=0.8, label='POIs cells') plt.xlabel('Average daily entropy', fontsize=22) plt.ylabel('Log (average daily activity)', fontsize=22) plt.tick_params(axis='both', which='major', labelsize=15) plt.xlim(0, 4) plt.ylim(y_min, 7.4) plt.legend(loc=4, scatterpoints=1, markerscale=3, fontsize='x-large')