Classifier of points of interest based on entropy and phone activity.

In [1]:
%pylab inline

import pandas as pd
import geojson as gj
from collections import defaultdict, Counter
from scipy import stats
Populating the interactive namespace from numpy and matplotlib
In [2]:
poi = pd.read_csv('data/poi/pois_milano_tripadvisor.csv')
poi.sort('reviews', ascending=False).head()
Out[2]:
name reviews lat lon
1 Duomo di Milano 9561 45.46467 9.190500
13 Galleria Vittorio Emanuele II 3488 45.46560 9.190000
3 L'Ultima Cena (Cenacolo Vinciano) 3099 45.46596 9.170649
18 Piazza del Duomo 2600 45.46468 9.190770
17 Castello Sforzesco 2398 45.47045 9.180639
In [3]:
with open('data/poi/milano-grid.geojson') as gf:
    grid = gj.load(gf)
In [4]:
# GeoJSON stores coordinates as [lon, lat]; take the first polygon vertex of each cell as its reference position.
cell_position = pd.DataFrame([([cell["properties"]["cellId"]] + cell["geometry"]["coordinates"][0][0])
                              for cell in grid['features']],
                             columns=['cellId', 'lon', 'lat']).set_index('cellId')
cell_position.head()
cell_position.head()
Out[4]:
lon lat
cellId
1 9.011491 45.358801
2 9.014491 45.358801
3 9.017492 45.358801
4 9.020492 45.358800
5 9.023493 45.358799
In [5]:
def poi_in_cell(poi_coords, coords):
    # poi_coords is [lon, lat]; coords is the cell's polygon ring, whose vertices 1 and 3
    # are assumed to be the upper-right and lower-left corners of the axis-aligned cell.
    return not ((poi_coords[0] > coords[1][0]) or (poi_coords[1] > coords[1][1]) or
                (poi_coords[0] < coords[3][0]) or (poi_coords[1] < coords[3][1]))
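As an illustrative sanity check, the midpoint of a cell's two reference corners should test as inside that cell:

ring = grid['features'][0]['geometry']['coordinates'][0]
mid = [(ring[1][0] + ring[3][0]) / 2.0, (ring[1][1] + ring[3][1]) / 2.0]
print poi_in_cell(mid, ring)   # expected: True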
In [6]:
poi_to_cell_dict = {}
cell_reviews = defaultdict(int)

for index, p in poi.iterrows():
    if index % 50 == 0:
        print index, p[1]   # progress: row index and review count
    for cell in grid['features']:
        # poi columns are (name, reviews, lat, lon): pass [lon, lat] to poi_in_cell
        if poi_in_cell([p[3], p[2]], cell['geometry']['coordinates'][0]):
            poi_to_cell_dict[p[0]] = cell['id']
            # accumulate the cell's total number of TripAdvisor reviews
            cell_reviews[cell['id']] += p[1]

cell_reviews = pd.Series(cell_reviews)
0 784
50 30
100 92
150 16
200 3
250 14
300 10
350 2
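The nested loop above scans every grid cell for every POI. A vectorized alternative is possible (a sketch only, assuming the same corner convention as poi_in_cell): precompute one bounding box per cell and filter with pandas.

bounds = pd.DataFrame(
    [(cell['id'],
      cell['geometry']['coordinates'][0][3][0], cell['geometry']['coordinates'][0][3][1],
      cell['geometry']['coordinates'][0][1][0], cell['geometry']['coordinates'][0][1][1])
     for cell in grid['features']],
    columns=['cellId', 'lon_min', 'lat_min', 'lon_max', 'lat_max']).set_index('cellId')

def cell_of(lon, lat):
    # return the id of the cell whose bounding box contains (lon, lat), or None
    hit = bounds[(bounds.lon_min <= lon) & (lon <= bounds.lon_max) &
                 (bounds.lat_min <= lat) & (lat <= bounds.lat_max)]
    return hit.index[0] if len(hit) else None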
In [7]:
npoi = pd.Series(Counter(poi_to_cell_dict.values()))
cells = npoi.keys()

Entropy and POI

In [8]:
store = pd.HDFStore('./stores/aggregated_dataset.h5')
fh = store['entropy_1D'].reset_index()
store.close()

fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['entropy_n', 'n', 'time'], inplace=True, axis=1)
In [9]:
entropy_avg = fh.groupby('Square_id')['entropy'].mean()
entropy_avg.head()
Out[9]:
Square_id
1            2.444591
2            2.430424
3            2.410835
4            2.477077
5            2.502639
Name: entropy, dtype: float64
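The entropy_1D table is read precomputed from the aggregated store; the values are presumably Shannon entropies of each cell's within-day activity distribution. A minimal sketch of that quantity, assuming hourly call counts per cell:

def shannon_entropy(counts):
    # Shannon entropy (in nats) of the distribution defined by raw counts
    p = np.asarray(counts, dtype=float)
    p = p[p > 0] / p.sum()
    return -(p * np.log(p)).sum()

print shannon_entropy([10, 10, 10, 10])   # uniform activity -> log(4) ~ 1.39
print shannon_entropy([37, 1, 1, 1])      # bursty activity  -> lower entropy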
In [10]:
x = npoi[cells].values
y = entropy_avg[cells].values
In [11]:
plt.figure(figsize=(8,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('Average daily entropy', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0,12,1.5,4.0])
plt.scatter(x, y, s=40, c='r', alpha=0.8)

print "Spearman rank coefficient :", stats.spearmanr(x, y)
Spearman rank coefficient : (0.35557979145601343, 6.3085603657327917e-08)
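Spearman's rho is the Pearson correlation computed on the ranks of the two samples; an equivalent illustrative computation:

from scipy.stats import rankdata, pearsonr
print pearsonr(rankdata(x), rankdata(y))[0]   # should match the rho reported above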
In [12]:
y = entropy_avg[cells].values
yy = entropy_avg[entropy_avg.index.delete(cells)].values
In [13]:
stats.ks_2samp(yy, y)
Out[13]:
(0.56183991047782045, 3.4853019940608143e-60)
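The first value is the KS statistic, the maximum vertical distance between the two empirical CDFs visualized by the cumulative histograms below; the second is the p-value. An illustrative re-computation of the statistic:

grid_vals = np.sort(np.concatenate([y, yy]))
cdf_y = np.searchsorted(np.sort(y), grid_vals, side='right') / float(len(y))
cdf_yy = np.searchsorted(np.sort(yy), grid_vals, side='right') / float(len(yy))
print np.abs(cdf_y - cdf_yy).max()   # should match the KS statistic above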
In [14]:
plt.figure(figsize=(8,7))
plt.xlabel('Average Daily Entropy', fontsize=22)
plt.ylabel('P(Avg Daily Entropy)', fontsize=22)
plt.hist(yy, 100, cumulative=1, normed=1, label='cells without POIs')
plt.hist(y, 50, cumulative=1, normed=1, label='cells with POIs', color='r')
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0.0, 3.5, 0, 1.0])
plt.legend(loc=2)
Out[14]:
<matplotlib.legend.Legend at 0x7f5c3e78dbd0>

Activity and POI

In [15]:
store = pd.HDFStore('./stores/aggregated_dataset.h5')
fh = store['intensity_ni_1D'].reset_index()
store.close()

fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['time'], inplace=True, axis=1)
In [16]:
activity_avg = fh.groupby(['Square_id', 'day'], as_index=False).sum().groupby('Square_id')['Call'].mean()
activity_avg.head()
Out[16]:
Square_id
1            1.132792
2            1.156711
3            1.182170
4            1.063513
5            1.002914
Name: Call, dtype: float32
In [17]:
x2 = npoi[cells].values
y2 = activity_avg[cells].values

plt.figure(figsize=(7,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('Average daily activity', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.scatter(x2, y2, s=40, c='r', alpha=0.8)

r = stats.spearmanr(x2, y2)
print "Spearman rank coefficient :", r
Spearman rank coefficient : (0.48611959477097355, 2.1709159849930512e-14)

Entropy vs. activity and POI

In [18]:
plt.figure(figsize=(7,7))

xz = entropy_avg[entropy_avg.index.delete(cells)].values
yz = activity_avg[activity_avg.index.delete(cells)].values
plt.scatter(xz, yz, c='b', alpha=0.8, label='cells without POIs')

xz = entropy_avg[cells].values
yz = activity_avg[cells].values
plt.scatter(xz, yz, c='r', alpha=0.8, label='cells with POIs')

plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Average daily activity', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc=4)
plt.yscale('log')

Train a classifier with entropy and activity

In [19]:
from sklearn import svm, grid_search, metrics
from sklearn import cross_validation
In [20]:
df = pd.DataFrame(entropy_avg)
df['activity'] = activity_avg
df['reviews'] = 0
df.loc[cells, 'reviews'] = cell_reviews[cells]
df['npoi'] = 0
df.loc[cells, 'npoi'] = npoi[cells]
df['poi'] = df.npoi > 0
df['lat'] = cell_position.lat
df['lon'] = cell_position.lon
df[df.poi].head(10)
Out[20]:
entropy activity reviews npoi poi lat lon
Square_id
2767 2.740439 7.469481 5 1 True 45.415723 9.209731
2773 2.785204 2.840190 212 1 True 45.415688 9.227751
3454 2.768579 34.803547 8 1 True 45.430595 9.170730
3655 2.673355 61.745651 7 2 True 45.434821 9.173748
3670 2.566126 39.636150 2 1 True 45.434743 9.218814
3753 2.801604 31.504326 7 1 True 45.436945 9.167745
4343 2.658501 48.257473 3 1 True 45.449677 9.137730
4347 3.044334 58.211586 88 1 True 45.449662 9.149751
4355 3.046771 90.429901 2 1 True 45.449628 9.173793
4357 3.110772 82.910629 16 1 True 45.449618 9.179804
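Only a small fraction of the 10,000 cells contain a POI, which is why the SVC below is given a class_weight that up-weights the positive class. A quick illustrative check of the imbalance:

print df.poi.value_counts()
print df.poi.mean()   # fraction of cells containing at least one POI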
In [21]:
model = svm.SVC(class_weight={0: 1, 1: 4.5})   # up-weight the rare POI class

# Features: average daily entropy and log of average daily activity
X = np.array([df.entropy.values, np.log(df.activity.values)]).T

cv = cross_validation.KFold(X.shape[0], n_folds=3, shuffle=True, random_state=3)
gs = grid_search.GridSearchCV(model, {'gamma': np.logspace(-3, 0, 4), 'C': np.logspace(-1, 1, 3)},
                              cv=cv, scoring='precision', n_jobs=4)
gs.fit(X, df.poi.astype(int))

print gs.best_params_
print gs.best_score_
{'C': 1.0, 'gamma': 0.01}
0.444223620224
/usr/local/projects/unveiling-patterns/code_repository/virtualenv/local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
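The warnings come from parameter combinations whose models predict no positive cells in some folds, so precision is undefined there and scored as 0. With the older grid_search API used here, the per-parameter mean scores can be inspected as an illustrative check:

for params, mean_score, fold_scores in gs.grid_scores_:
    print params, round(mean_score, 3)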
In [22]:
ypred = gs.predict(X)

print metrics.classification_report(df.poi.astype(int).values, ypred)
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      9781
          1       0.42      0.43      0.43       219

avg / total       0.97      0.97      0.97     10000
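For the positive class, precision = TP / (TP + FP) and recall = TP / (TP + FN); with 219 POI cells, a recall of 0.43 corresponds to roughly 94 cells recovered. The underlying counts can be checked with a confusion matrix (illustrative):

print metrics.confusion_matrix(df.poi.astype(int).values, ypred)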

In [23]:
y_pred = gs.decision_function(X)
roc_auc = metrics.roc_auc_score(df.poi.astype(int).values, y_pred)
print roc_auc

fpr, tpr, _ = metrics.roc_curve(df.poi.astype(int).values, y_pred)
plt.plot(fpr, tpr, 'b.', label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend();
0.915791215753
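The AUC can also be read as the probability that a randomly chosen POI cell receives a higher decision score than a randomly chosen non-POI cell; an illustrative Monte Carlo check:

pos = y_pred[df.poi.values]
neg = y_pred[~df.poi.values]
idx_p = np.random.randint(0, len(pos), 100000)
idx_n = np.random.randint(0, len(neg), 100000)
print (pos[idx_p] > neg[idx_n]).mean()   # should be close to the AUC above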
In [24]:
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 1))
Z = gs.decision_function(np.c_[xx.ravel(), yy.ravel()])

plt.figure(figsize=(7,7))

Z = Z.reshape(xx.shape)
ax = plt.subplot(111)
norm = plt.cm.colors.Normalize(vmax=abs(Z).max(), vmin=-abs(Z).max())
ax.contourf(xx, yy, Z, 200, cmap=cm.RdBu_r, alpha=.8, norm=norm)

xz = df.entropy[df.entropy.index.delete(cells)].values
yz = log(df.activity[df.activity.index.delete(cells)].values)
plt.scatter(xz, yz, c='b', alpha=0.8, label='cells without POIs')

xz = df.entropy[cells].values
yz = log(df.activity[cells].values)
plt.scatter(xz, yz, c='r', alpha=0.8, label='cells with POIs')

plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Log (average daily activity)', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlim(0, 4)
plt.ylim(y_min, 7.4)
plt.legend(loc=4, scatterpoints=1, markerscale=3, fontsize='x-large')