%pylab inline

import pandas as pd
import geojson as gj
from collections import defaultdict, Counter
from scipy import stats

# Load the TripAdvisor POI table; the bare .sort(...).head() / .head()
# expressions are notebook previews and have no effect when run as a script.
poi = pd.read_csv('data/poi/pois_milano_tripadvisor.csv')
poi.sort('reviews', ascending=False).head()

# Load the grid: a GeoJSON feature collection with one polygon per cell.
with open('data/poi/milano-grid.geojson') as gf:
    grid = gj.load(gf)

# One row per cell: its cellId followed by the first vertex of the
# polygon's outer ring.
# NOTE(review): GeoJSON positions are ordered [longitude, latitude], so the
# 'lat'/'lon' labels below may be swapped -- confirm against the data.
cell_rows = []
for cell in grid['features']:
    first_vertex = cell["geometry"]["coordinates"][0][0]
    cell_rows.append([cell["properties"]["cellId"]] + first_vertex)

cell_position = pd.DataFrame(cell_rows, columns=['cellId', 'lat', 'lon'])
cell_position = cell_position.set_index('cellId')
cell_position.head()

def poi_in_cell(poi_coords, coords):
    """Tell whether a point lies inside the box spanned by two ring vertices.

    poi_coords -- two-element sequence with the point's coordinates.
    coords     -- sequence of vertices; coords[1] is used as the upper bound
                  and coords[3] as the lower bound on both axes.
                  (assumes these are opposite corners of the cell -- confirm
                  against the grid's vertex ordering)

    Returns True when neither coordinate exceeds coords[1] nor falls below
    coords[3]; points exactly on an edge count as inside.
    """
    # Beyond the upper corner on either axis.
    if poi_coords[0] > coords[1][0] or poi_coords[1] > coords[1][1]:
        return False
    # Short of the lower corner on either axis.
    if poi_coords[0] < coords[3][0] or poi_coords[1] < coords[3][1]:
        return False
    return True

poi_to_cell_dict={}
cell_reviews=defaultdict(int)

for index, p in poi.iterrows():
    if index%50==0:
        print index, p[1]
    for cell in grid['features']:
        
        if poi_in_cell([p[3],p[2]], cell['geometry']['coordinates'][0]):
            poi_to_cell_dict[p[0]] = cell['id']
            
            cell_reviews[cell['id']]+=p[1]

cell_reviews = pd.Series(cell_reviews)

npoi = pd.Series(Counter(poi_to_cell_dict.values()))
cells = npoi.keys()

# Read the 'entropy_1D' table from the aggregated HDF5 store.
store = pd.HDFStore('./stores/aggregated_dataset.h5')
try:
    fh = store['entropy_1D'].reset_index()
finally:
    # Always release the file handle, even when the key lookup fails.
    store.close()

# Derive the calendar day from each timestamp, then drop the columns that
# are not needed. The aggregation below groups on Square_id only and does
# not read 'day'.
fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['entropy_n', 'n', 'time'], inplace=True, axis=1)

# Mean entropy per grid cell over all rows of that cell.
entropy_avg = fh.groupby('Square_id')['entropy'].mean()
entropy_avg.head()

x = npoi[cells].values
y = entropy_avg[cells].values

plt.figure(figsize=(8,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('Average daily entropy', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0,12,1.5,4.0])
plt.scatter(x, y, s=40, c='r', alpha=0.8)

print "Spearman rank coefficient :", stats.spearmanr(x, y)

# Split the per-cell mean entropy into cells with POIs (y) and all the
# remaining cells (yy).
y = entropy_avg[cells].values
# Select the complement by LABEL. Index.delete() interprets its argument as
# integer positions, which only matches the label lookup above when the
# index labels happen to equal their positions.
yy = entropy_avg[~entropy_avg.index.isin(cells)].values

# Two-sample Kolmogorov-Smirnov test between the two groups.
stats.ks_2samp(yy, y)

# Cumulative, normalised histograms of the two groups on the same axes.
plt.figure(figsize=(8,7))
plt.xlabel('Average Daily Entropy', fontsize=22)
plt.ylabel('P(Avg Daily Entropy)', fontsize=22)
plt.hist(yy, 100, cumulative=1, normed=1, label='no-POIs cells')
plt.hist(y, 50, cumulative=1, normed=1, label='POIs cells', color='r')
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0.0, 3.5, 0, 1.0])
plt.legend(loc=2)

# Read the 'intensity_ni_1D' table from the aggregated HDF5 store.
store = pd.HDFStore('./stores/aggregated_dataset.h5')
try:
    fh = store['intensity_ni_1D'].reset_index()
finally:
    # Always release the file handle, even when the key lookup fails.
    store.close()

# Replace each timestamp by its calendar day.
fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['time'], inplace=True, axis=1)

# Sum the rows of each (cell, day) pair, then average the daily 'Call'
# totals per cell.
activity_avg = fh.groupby(['Square_id', 'day'], as_index=False).sum().groupby('Square_id')['Call'].mean()
activity_avg.head()

x2 = npoi[cells].values
y2 = activity_avg[cells].values

plt.figure(figsize=(7,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('average daily activity',fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.scatter(x2,y2,s=40, c='r', alpha=0.8)

r=stats.spearmanr(x2, y2)
print "Spearman rank coefficient :",r

# Entropy vs. activity for every cell, coloured by whether the cell holds
# any POI.
plt.figure(figsize=(7,7))

# Cells without POIs, selected by LABEL. Index.delete() takes integer
# positions, which would not be the complement of the label lookups used
# for the POI cells unless labels and positions coincide.
xz = entropy_avg[~entropy_avg.index.isin(cells)].values
yz = activity_avg[~activity_avg.index.isin(cells)].values
plt.scatter(xz, yz, c='b', alpha=0.8, label='no-POIs cells')

# Cells with at least one POI.
xz = entropy_avg[cells].values
yz = activity_avg[cells].values
plt.scatter(xz, yz, c='r', alpha=0.8, label='POIs cells')

plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Average daily activity', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc=4)
plt.yscale('log')

from sklearn import svm, grid_search, metrics
from sklearn import cross_validation

# Assemble one feature table indexed like entropy_avg, with a row per cell.
df = pd.DataFrame(entropy_avg)
df['activity'] = activity_avg

# Review totals and POI counts default to 0 and are filled in only for the
# cells that hold POIs.
df['reviews'] = 0
df.loc[cells, 'reviews'] = cell_reviews[cells]
df['npoi'] = 0
df.loc[cells, 'npoi'] = npoi[cells]

# Boolean flag: does the cell hold at least one POI?
df['poi'] = df['npoi'] > 0

# Cell coordinates, aligned on the cell id index.
df['lat'] = cell_position['lat']
df['lon'] = cell_position['lon']

df[df['poi']].head(10)

from sklearn import grid_search

model = svm.SVC(class_weight={0:1, 1:4.5})

X = array([df.entropy.values, np.log(df.activity.values)]).T

cv = cross_validation.KFold(X.shape[0], n_folds=3, shuffle=True, random_state=3)
gs = grid_search.GridSearchCV(model, {'gamma': np.logspace(-3, 0, 4), 'C': np.logspace(-1, 1, 3)},
                              cv=cv, scoring='precision', n_jobs=4)
gs.fit(X, df.poi.astype(int))

print gs.best_params_
print gs.best_score_

ypred = gs.predict(X)

print metrics.classification_report(df.poi.astype(int).values, ypred)

y_pred = gs.decision_function(X)
roc_auc = metrics.roc_auc_score(df.poi.astype(int).values, y_pred)
print roc_auc

fpr, tpr, _ = metrics.roc_curve(df.poi.astype(int).values, y_pred)
plt.plot(fpr, tpr, 'b.', label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend();

# Bounding box of the two features, padded by 0.5 on every side.
x_min = X[:, 0].min() - .5
x_max = X[:, 0].max() + .5
y_min = X[:, 1].min() - .5
y_max = X[:, 1].max() + .5

# Regular grid over that box: step 0.1 on the first feature, 1 on the second.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 1))

# Decision score at every grid node, one (feature 0, feature 1) row each.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = gs.decision_function(grid_points)

# Decision surface of the fitted classifier with the cells drawn on top.
plt.figure(figsize=(7,7))

# Back to the grid's 2-D layout; the colour scale is symmetric around 0.
Z = Z.reshape(xx.shape)
ax = plt.subplot(111)
norm = plt.cm.colors.Normalize(vmax=abs(Z).max(), vmin=-abs(Z).max())
ax.contourf(xx, yy, Z, 200, cmap=cm.RdBu_r, alpha=.8, norm=norm)

# Cells without POIs, selected by LABEL. Index.delete() takes integer
# positions, which would not be the complement of the label lookups used
# for the POI cells unless labels and positions coincide.
no_poi = ~df.index.isin(cells)
xz = df.entropy[no_poi].values
yz = log(df.activity[no_poi].values)
plt.scatter(xz, yz, c='b', alpha=0.8, label='no-POIs cells')

# Cells with at least one POI.
xz = df.entropy[cells].values
yz = log(df.activity[cells].values)
plt.scatter(xz, yz, c='r', alpha=0.8, label='POIs cells')

plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Log (average daily activity)', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlim(0, 4)
plt.ylim(y_min, 7.4)
plt.legend(loc=4, scatterpoints=1, markerscale=3, fontsize='x-large')