Classifier of points of interest based on entropy and phone activity.

In [1]:
%pylab inline

import pandas as pd
import geojson as gj
from collections import defaultdict, Counter
from scipy import stats
Populating the interactive namespace from numpy and matplotlib
In [2]:
poi = pd.read_csv('data/poi/pois_milano_tripadvisor.csv')
poi.sort('reviews', ascending=False).head()
Out[2]:
name reviews lat lon
1 Duomo di Milano 9561 45.46467 9.190500
13 Galleria Vittorio Emanuele II 3488 45.46560 9.190000
3 L'Ultima Cena (Cenacolo Vinciano) 3099 45.46596 9.170649
18 Piazza del Duomo 2600 45.46468 9.190770
17 Castello Sforzesco 2398 45.47045 9.180639
In [3]:
with open('data/poi/milano-grid.geojson') as gf:
    grid = gj.load(gf)
In [4]:
# GeoJSON stores coordinates as [lon, lat]; take the first polygon vertex of each cell as its reference position.
cell_position = pd.DataFrame([([cell["properties"]["cellId"]] + cell["geometry"]["coordinates"][0][0])
                              for cell in grid['features']],
                             columns=['cellId', 'lon', 'lat']).set_index('cellId')
cell_position.head()
cell_position.head()
Out[4]:
lon lat
cellId
1 9.011491 45.358801
2 9.014491 45.358801
3 9.017492 45.358801
4 9.020492 45.358800
5 9.023493 45.358799
In [5]:
def poi_in_cell(poi_coords, coords):
    # poi_coords is [lon, lat]; coords is the cell's polygon ring, whose vertices 1 and 3
    # are assumed to be the upper-right and lower-left corners of the axis-aligned cell.
    return not ((poi_coords[0] > coords[1][0]) or (poi_coords[1] > coords[1][1]) or
                (poi_coords[0] < coords[3][0]) or (poi_coords[1] < coords[3][1]))
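As an illustrative sanity check, the midpoint of a cell's two reference corners should test as inside that cell:

ring = grid['features'][0]['geometry']['coordinates'][0]
mid = [(ring[1][0] + ring[3][0]) / 2.0, (ring[1][1] + ring[3][1]) / 2.0]
print poi_in_cell(mid, ring)   # expected: True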
In [6]:
poi_to_cell_dict = {}
cell_reviews = defaultdict(int)

for index, p in poi.iterrows():
    if index % 50 == 0:
        print index, p[1]   # progress: row index and review count
    for cell in grid['features']:
        # poi columns are (name, reviews, lat, lon): pass [lon, lat] to poi_in_cell
        if poi_in_cell([p[3], p[2]], cell['geometry']['coordinates'][0]):
            poi_to_cell_dict[p[0]] = cell['id']
            # accumulate the cell's total number of TripAdvisor reviews
            cell_reviews[cell['id']] += p[1]

cell_reviews = pd.Series(cell_reviews)
0 784
50 30
100 92
150 16
200 3
250 14
300 10
350 2
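The nested loop above scans every grid cell for every POI. A vectorized alternative is possible (a sketch only, assuming the same corner convention as poi_in_cell): precompute one bounding box per cell and filter with pandas.

bounds = pd.DataFrame(
    [(cell['id'],
      cell['geometry']['coordinates'][0][3][0], cell['geometry']['coordinates'][0][3][1],
      cell['geometry']['coordinates'][0][1][0], cell['geometry']['coordinates'][0][1][1])
     for cell in grid['features']],
    columns=['cellId', 'lon_min', 'lat_min', 'lon_max', 'lat_max']).set_index('cellId')

def cell_of(lon, lat):
    # return the id of the cell whose bounding box contains (lon, lat), or None
    hit = bounds[(bounds.lon_min <= lon) & (lon <= bounds.lon_max) &
                 (bounds.lat_min <= lat) & (lat <= bounds.lat_max)]
    return hit.index[0] if len(hit) else None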
In [7]:
npoi = pd.Series(Counter(poi_to_cell_dict.values()))
cells = npoi.keys()

Entropy and POI

In [8]:
store = pd.HDFStore('./stores/aggregated_dataset.h5')
fh = store['entropy_1D'].reset_index()
store.close()

fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['entropy_n', 'n', 'time'], inplace=True, axis=1)
In [9]:
entropy_avg = fh.groupby('Square_id')['entropy'].mean()
entropy_avg.head()
Out[9]:
Square_id
1            2.444591
2            2.430424
3            2.410835
4            2.477077
5            2.502639
Name: entropy, dtype: float64
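The entropy_1D table is read precomputed from the aggregated store; the values are presumably Shannon entropies of each cell's within-day activity distribution. A minimal sketch of that quantity, assuming hourly call counts per cell:

def shannon_entropy(counts):
    # Shannon entropy (in nats) of the distribution defined by raw counts
    p = np.asarray(counts, dtype=float)
    p = p[p > 0] / p.sum()
    return -(p * np.log(p)).sum()

print shannon_entropy([10, 10, 10, 10])   # uniform activity -> log(4) ~ 1.39
print shannon_entropy([37, 1, 1, 1])      # bursty activity  -> lower entropy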
In [10]:
x = npoi[cells].values
y = entropy_avg[cells].values
In [11]:
plt.figure(figsize=(8,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('Average daily entropy', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0,12,1.5,4.0])
plt.scatter(x, y, s=40, c='r', alpha=0.8)

print "Spearman rank coefficient :", stats.spearmanr(x, y)
Spearman rank coefficient : (0.35557979145601343, 6.3085603657327917e-08)
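Spearman's rho is the Pearson correlation computed on the ranks of the two samples; an equivalent illustrative computation:

from scipy.stats import rankdata, pearsonr
print pearsonr(rankdata(x), rankdata(y))[0]   # should match the rho reported above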
In [12]:
y = entropy_avg[cells].values
yy = entropy_avg[entropy_avg.index.delete(cells)].values
In [13]:
stats.ks_2samp(yy, y)
Out[13]:
(0.56183991047782045, 3.4853019940608143e-60)
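The first value is the KS statistic, the maximum vertical distance between the two empirical CDFs visualized by the cumulative histograms below; the second is the p-value. An illustrative re-computation of the statistic:

grid_vals = np.sort(np.concatenate([y, yy]))
cdf_y = np.searchsorted(np.sort(y), grid_vals, side='right') / float(len(y))
cdf_yy = np.searchsorted(np.sort(yy), grid_vals, side='right') / float(len(yy))
print np.abs(cdf_y - cdf_yy).max()   # should match the KS statistic above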
In [14]:
plt.figure(figsize=(8,7))
plt.xlabel('Average Daily Entropy', fontsize=22)
plt.ylabel('P(Avg Daily Entropy)', fontsize=22)
plt.hist(yy, 100, cumulative=1, normed=1, label='cells without POIs')
plt.hist(y, 50, cumulative=1, normed=1, label='cells with POIs', color='r')
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0.0, 3.5, 0, 1.0])
plt.legend(loc=2)
Out[14]:
<matplotlib.legend.Legend at 0x7f5c3e78dbd0>

Activity and POI

In [15]:
store = pd.HDFStore('./stores/aggregated_dataset.h5')
fh = store['intensity_ni_1D'].reset_index()
store.close()

fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['time'], inplace=True, axis=1)
In [16]:
activity_avg = fh.groupby(['Square_id', 'day'], as_index=False).sum().groupby('Square_id')['Call'].mean()
activity_avg.head()
Out[16]:
Square_id
1            1.132792
2            1.156711
3            1.182170
4            1.063513
5            1.002914
Name: Call, dtype: float32
In [17]:
x2 = npoi[cells].values
y2 = activity_avg[cells].values

plt.figure(figsize=(7,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('Average daily activity', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.scatter(x2, y2, s=40, c='r', alpha=0.8)

r = stats.spearmanr(x2, y2)
print "Spearman rank coefficient :", r
Spearman rank coefficient : (0.48611959477097355, 2.1709159849930512e-14)

Entropy vs. activity and POI

In [18]:
plt.figure(figsize=(7,7))

xz = entropy_avg[entropy_avg.index.delete(cells)].values
yz = activity_avg[activity_avg.index.delete(cells)].values
plt.scatter(xz, yz, c='b', alpha=0.8, label='cells without POIs')

xz = entropy_avg[cells].values
yz = activity_avg[cells].values
plt.scatter(xz, yz, c='r', alpha=0.8, label='cells with POIs')

plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Average daily activity', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc=4)
plt.yscale('log')

Train a classifier with entropy and activity

In [19]:
from sklearn import svm, grid_search, metrics
from sklearn import cross_validation
In [20]:
df = pd.DataFrame(entropy_avg)
df['activity'] = activity_avg
df['reviews'] = 0
df.loc[cells, 'reviews'] = cell_reviews[cells]
df['npoi'] = 0
df.loc[cells, 'npoi'] = npoi[cells]
df['poi'] = df.npoi > 0
df['lat'] = cell_position.lat
df['lon'] = cell_position.lon
df[df.poi].head(10)
Out[20]:
entropy activity reviews npoi poi lat lon
Square_id
2767 2.740439 7.469481 5 1 True 45.415723 9.209731
2773 2.785204 2.840190 212 1 True 45.415688 9.227751
3454 2.768579 34.803547 8 1 True 45.430595 9.170730
3655 2.673355 61.745651 7 2 True 45.434821 9.173748
3670 2.566126 39.636150 2 1 True 45.434743 9.218814
3753 2.801604 31.504326 7 1 True 45.436945 9.167745
4343 2.658501 48.257473 3 1 True 45.449677 9.137730
4347 3.044334 58.211586 88 1 True 45.449662 9.149751
4355 3.046771 90.429901 2 1 True 45.449628 9.173793
4357 3.110772 82.910629 16 1 True 45.449618 9.179804
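Only a small fraction of the 10,000 cells contain a POI, which is why the SVC below is given a class_weight that up-weights the positive class. A quick illustrative check of the imbalance:

print df.poi.value_counts()
print df.poi.mean()   # fraction of cells containing at least one POI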
In [21]:
model = svm.SVC(class_weight={0: 1, 1: 4.5})   # up-weight the rare POI class

# Features: average daily entropy and log of average daily activity
X = np.array([df.entropy.values, np.log(df.activity.values)]).T

cv = cross_validation.KFold(X.shape[0], n_folds=3, shuffle=True, random_state=3)
gs = grid_search.GridSearchCV(model, {'gamma': np.logspace(-3, 0, 4), 'C': np.logspace(-1, 1, 3)},
                              cv=cv, scoring='precision', n_jobs=4)
gs.fit(X, df.poi.astype(int))

print gs.best_params_
print gs.best_score_
{'C': 1.0, 'gamma': 0.01}
0.444223620224
/usr/local/projects/unveiling-patterns/code_repository/virtualenv/local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
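The warnings come from parameter combinations whose models predict no positive cells in some folds, so precision is undefined there and scored as 0. With the older grid_search API used here, the per-parameter mean scores can be inspected as an illustrative check:

for params, mean_score, fold_scores in gs.grid_scores_:
    print params, round(mean_score, 3)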
In [22]:
ypred = gs.predict(X)

print metrics.classification_report(df.poi.astype(int).values, ypred)
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      9781
          1       0.42      0.43      0.43       219

avg / total       0.97      0.97      0.97     10000
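For the positive class, precision = TP / (TP + FP) and recall = TP / (TP + FN); with 219 POI cells, a recall of 0.43 corresponds to roughly 94 cells recovered. The underlying counts can be checked with a confusion matrix (illustrative):

print metrics.confusion_matrix(df.poi.astype(int).values, ypred)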

In [23]:
y_pred = gs.decision_function(X)
roc_auc = metrics.roc_auc_score(df.poi.astype(int).values, y_pred)
print roc_auc

fpr, tpr, _ = metrics.roc_curve(df.poi.astype(int).values, y_pred)
plt.plot(fpr, tpr, 'b.', label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend();
0.915791215753
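The AUC can also be read as the probability that a randomly chosen POI cell receives a higher decision score than a randomly chosen non-POI cell; an illustrative Monte Carlo check:

pos = y_pred[df.poi.values]
neg = y_pred[~df.poi.values]
idx_p = np.random.randint(0, len(pos), 100000)
idx_n = np.random.randint(0, len(neg), 100000)
print (pos[idx_p] > neg[idx_n]).mean()   # should be close to the AUC above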
In [24]:
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 1))
Z = gs.decision_function(np.c_[xx.ravel(), yy.ravel()])

plt.figure(figsize=(7,7))

Z = Z.reshape(xx.shape)
ax = plt.subplot(111)
norm = plt.cm.colors.Normalize(vmax=abs(Z).max(), vmin=-abs(Z).max())
ax.contourf(xx, yy, Z, 200, cmap=cm.RdBu_r, alpha=.8, norm=norm)

xz = df.entropy[df.entropy.index.delete(cells)].values
yz = log(df.activity[df.activity.index.delete(cells)].values)
plt.scatter(xz, yz, c='b', alpha=0.8, label='cells without POIs')

xz = df.entropy[cells].values
yz = log(df.activity[cells].values)
plt.scatter(xz, yz, c='r', alpha=0.8, label='cells with POIs')

plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Log (average daily activity)', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlim(0, 4)
plt.ylim(y_min, 7.4)
plt.legend(loc=4, scatterpoints=1, markerscale=3, fontsize='x-large')