#!/usr/bin/env python
# coding: utf-8

# # Searching for Pronto's Power-Users
#
# This is a work-in-progress, analyzing the open data from [Pronto Cycle Share's Data Challenge](http://www.prontocycleshare.com/datachallenge).
#
# See [ProntoData.ipynb](ProntoData.ipynb) for an intro to the data; this notebook uses pieces that were derived there.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()


# In[2]:

# Load the trip table and derive the per-trip time columns used below.
trips = pd.read_csv('2015_trip_data.csv',
                    parse_dates=['starttime', 'stoptime'])

t_start = pd.DatetimeIndex(trips['starttime'])
t_stop = pd.DatetimeIndex(trips['stoptime'])

# Calendar day of the trip start, kept as a midnight timestamp.
# normalize() keeps the parsed datetime dtype instead of round-tripping
# through Python ``date`` objects.
trips['date'] = t_start.normalize()
# From here on, starttime/stoptime hold time-of-day (``datetime.time``) only.
trips['starttime'] = t_start.time
trips['stoptime'] = t_stop.time
trips['minuteofday'] = t_start.hour * 60 + t_start.minute
trips['minutes'] = trips.tripduration / 60.


# In[3]:

stations = pd.read_csv('2015_station_data.csv')
# Add a row for the "Pronto shop" pseudo-station, which is not in the file.
pronto_shop = dict(id=54, name="Pronto shop",
                   terminal="Pronto shop",
                   lat=47.6173156, long=-122.3414776,
                   dockcount=100, online='10/13/2014')
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
stations = pd.concat([stations, pd.DataFrame([pronto_shop])],
                     ignore_index=True)

distances = pd.read_csv('station_distances.csv', index_col='terminal')
distances /= 1609.34  # convert meters to miles


# In[4]:

# Look up the station-to-station distance (row = origin, column = destination)
# for every trip, then derive the speed from it (miles per hour, given the
# meters-to-miles conversion above).
trips['distance'] = [distances.loc[origin, destination]
                     for origin, destination in zip(trips.from_station_id,
                                                    trips.to_station_id)]
trips['speed'] = trips.distance * 60 / trips.minutes


# In[5]:

trips.head()


# ## Distance vs Count

# In[6]:

# One row per ordered (origin, destination) pair.
groups = trips.groupby(['from_station_id', 'to_station_id'])
paired = groups.aggregate({'distance': 'mean',
                           'trip_id': 'count',
                           'from_station_name': 'first',
                           'to_station_name': 'first'})
paired = paired.rename(columns={'trip_id': 'count'})


# In[7]:

# Combine both directions of each pair: total[a, b] = count[a, b] + count[b, a].
countmat = paired['count'].unstack()
total = countmat + countmat.T
# The diagonal (same start and end station) was added to itself, so halve it.
# A mask is used rather than writing through ``total.values``, which is
# read-only under pandas copy-on-write and fails on an integer matrix.
on_diagonal = np.eye(*total.shape, dtype=bool)
total = total.mask(on_diagonal, total / 2)
paired['total'] = total.stack()


# In[8]:

fig = plt.figure()
ax = plt.axes(yscale='log')
# (continuation of cell In[8]: `fig` and `ax` are created immediately above)
ax.plot(paired['distance'], paired['total'], '.k')
ax.set_xlabel('distance between stations')
ax.set_ylabel('number of trips')
# Box around one long-distance, high-count point (presumably the pair
# examined in the next section).
ax.plot(6.83, 95, 's', ms=30, mec='red', mfc='none', mew=1)

fig.savefig('figs/trips_by_distance.png', bbox_inches='tight')


# ## Finding This User

# In[9]:

# The starttime/stoptime columns hold ``datetime.time`` objects. Matplotlib
# has no built-in converter for those, so register pandas' converters
# explicitly; recent pandas versions no longer do this on import.
# NOTE(review): the MultipleLocator spacings below assume the converter maps
# times to seconds since midnight -- confirm against the installed pandas.
pd.plotting.register_matplotlib_converters()

# Station id -> station name, taken from the first trip leaving each station.
station_id_map = trips.groupby('from_station_id')['from_station_name'].first()


def get_group(id1, id2, include_reverse=False):
    """Return the Annual Member trips between two stations.

    Parameters
    ----------
    id1, id2 : station ids
        Origin and destination station ids. They are compared by their string
        form, as the original string-formatted query did.
    include_reverse : bool, default False
        If True, also include trips going from ``id2`` to ``id1``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of the module-level ``trips`` table.
    """
    # Boolean masks instead of a formatted query string, so that station ids
    # containing quote characters cannot break or alter the expression.
    id1, id2 = str(id1), str(id2)
    origin, destination = trips['from_station_id'], trips['to_station_id']
    on_route = (origin == id1) & (destination == id2)
    if include_reverse:
        on_route |= (origin == id2) & (destination == id1)
    return trips[(trips['usertype'] == "Annual Member") & on_route]


# In[10]:

fig, ax = plt.subplots()

# Expects exactly one station pair (i.e. two directed rows) to satisfy this.
id1, id2 = paired.query('distance > 6 & total > 50').reset_index()['from_station_id']

for route in [(id1, id2), (id2, id1)]:
    group = get_group(*route)
    names = station_id_map[route[0]], station_id_map[route[1]]

    # Mark the start and stop time of each ride at the height of its duration.
    lines = ax.plot(group['starttime'], group['minutes'], 'o', ms=5,
                    label="{0} $\\to$ {1}".format(*(' '.join(n.split()[:2])
                                                   for n in names)))
    color = lines[0].get_color()
    ax.plot(group['stoptime'], group['minutes'], 'o', ms=5, color=color)

    # Join each ride's start and stop marker with a faint horizontal line.
    for start, stop, minutes in zip(group['starttime'].values,
                                    group['stoptime'].values,
                                    group['minutes'].values):
        ax.plot([start, stop], [minutes, minutes],
                '-', color=color, alpha=0.3)

# Uses `group` and `names` from the last route plotted.
ax.text(0.98, 0.02, "{0} born in {1}".format(group['gender'].iloc[0],
                                             int(group['birthyear'].iloc[0])),
        ha='right', va='bottom', transform=ax.transAxes, fontsize=12)

ax.xaxis.set_major_locator(plt.MultipleLocator(2 * 60 * 60))
ax.legend(loc='best', fontsize=12)
ax.set_title('{0} \nto\n {1}'.format(*names))
ax.set_xlabel('ride start time')
ax.set_ylabel('ride duration')

fig.savefig('figs/power_user.png', bbox_inches='tight')


# ## Mean-Shift Clustering

# In[11]:

from sklearn.cluster import MeanShift


def compute_compactness(group, min_samples=25, bandwidth=5):
    """Return a measure of the compactness of the group.

    The trips are clustered with mean-shift on (start minute-of-day / 10,
    duration in minutes).

    Parameters
    ----------
    group : pandas.DataFrame
        Trips to score; must provide ``minuteofday`` and ``minutes`` columns.
    min_samples : int, default 25
        Minimum number of trips, and minimum size of the dominant cluster.
    bandwidth : float, default 5
        Mean-shift bandwidth; also the numerator of the returned score.

    Returns
    -------
    float
        0 if the group has fewer than ``min_samples`` trips, or if the largest
        cluster holds fewer than ``min_samples`` trips or less than 90% of
        them. Otherwise ``bandwidth`` divided by the largest row-wise standard
        deviation of the dominant cluster's points.
    """
    # arrange data to cluster: divide minuteofday by 10
    # to increase effective bandwidth by a factor of 10
    X = np.vstack([group.minuteofday / 10, group.minutes]).T

    if X.shape[0] < min_samples:
        return 0

    # compute the meanshift clusters, and count number of points in each
    labels = MeanShift(bandwidth=bandwidth).fit_predict(X)
    _, inverse, counts = np.unique(labels, return_inverse=True,
                                   return_counts=True)

    # Select only the points from the dominant cluster
    # (all tied clusters, if several share the maximum size)
    dominant = counts[inverse] == counts.max()
    n_dominant = dominant.sum()

    if n_dominant < min_samples or n_dominant < 0.9 * len(dominant):
        return 0
    # NOTE(review): std(1) is taken across the two features of each point,
    # not across the points of the cluster (that would be std(0)). Kept as in
    # the original because the hand-picked ranks further down depend on these
    # scores -- confirm which one was intended.
    return bandwidth / X[dominant].std(1).max()


compute_compactness(get_group('BT-03', 'UD-01'))


# In[12]:

subset = trips.query('usertype == "Annual Member"')
groups = subset.groupby(['from_station_id', 'to_station_id'])
# Only hand compute_compactness the two columns it reads; this also avoids
# passing the grouping columns to apply(), which newer pandas deprecates.
compactness = (groups[['minuteofday', 'minutes']]
               .apply(compute_compactness)
               .fillna(0)
               .unstack())
# Flatten back to one score per station pair, most compact first.
ranked = compactness.unstack().sort_values(ascending=False)
ranked.iloc[:10]


# In[13]:

def analyze_pair(id1, id2):
    """Print a summary and draw a two-panel figure for one station pair.

    Takes the Annual Member trips between the two stations in both
    directions and keeps only those with the most common birth year. The left
    panel shows the start and stop time of each trip against its duration;
    the right panel shows duration against date with a regression fit for
    the 'morning' and 'afternoon' directions.

    Parameters
    ----------
    id1, id2 : station ids
        The two stations of the pair.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn.
    """
    subset = get_group(id1, id2, True)
    top_birthyear = subset['birthyear'].value_counts().index[0]
    # Copy after filtering so the column assignments below act on an
    # independent frame rather than on a slice of ``trips``.
    subset = subset[subset['birthyear'] == top_birthyear].copy()

    def mean_time(col):
        """Mean time of day, in seconds since midnight."""
        return np.mean([c.hour * 3600 + c.minute * 60 + c.second
                        for c in col])

    # Mean departure time per origin station, earliest first. The direction
    # that departs earlier on average is labelled the 'morning' one.
    mean_start = (subset.groupby('from_station_id')['starttime']
                  .aggregate(mean_time)
                  .sort_values())
    # idxmin() gives the station *label*; np.argmin() on a Series returns a
    # position in current pandas, which would never match a station id.
    morning_station = mean_start.idxmin()
    subset['AMPM'] = np.where(subset['from_station_id'] == morning_station,
                              'morning', 'afternoon')

    print('{0} -> {1}'.format(id1, id2))
    print(subset['birthyear'].value_counts())
    print(subset['gender'].value_counts())
    print("distance:", distances.loc[id1, id2])
    print("Date Range:", subset['date'].min(), "to", subset['date'].max())
    print("-----------------------------")

    fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
    fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)

    fig.suptitle(r'{0} $\longrightarrow$ {1} '
                 ''.format(station_id_map[id1], station_id_map[id2]),
                 size=14)

    ax[0].text(0.02, 0.98,
               "{1:d} total trips\ndistance = {0:.2f} mi".format(
                   distances.loc[id1, id2], len(subset)),
               ha='left', va='top', transform=ax[0].transAxes, fontsize=14)

    # Destination for each origin. Built from the arguments so that a pair
    # with trips in only one direction still gets a label.
    destination_of = {id1: id2, id2: id1}
    # 'axes.color_cycle' was removed from matplotlib; the property cycle
    # holds the same list of colors.
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

    for from_station, color in zip(mean_start.index, colors):
        half = subset[subset['from_station_id'] == from_station]
        order = (station_id_map[from_station],
                 station_id_map[destination_of[from_station]])

        ax[0].scatter(half['starttime'].values, half['minutes'].values,
                      color=color,
                      label="{0} $\\to$ {1}".format(
                          *(' '.join(n.split()[:2]) for n in order)))
        ax[0].scatter(half['stoptime'].values, half['minutes'].values,
                      color=color)

        # Join each trip's start and stop marker with a faint line.
        for start, stop, minutes in zip(half['starttime'].values,
                                        half['stoptime'].values,
                                        half['minutes'].values):
            ax[0].plot([start, stop], [minutes, minutes],
                       '-', color=color, alpha=0.3)

    ax[0].legend(loc='lower right', fontsize=14)
    ax[0].xaxis.set_major_locator(plt.MultipleLocator(4 * 60 * 60))
    ax[0].set_ylabel('trip duration (minutes)')

    # Days elapsed since the first row's date. Dividing by a one-day
    # Timedelta does not depend on the datetime resolution of the column.
    first_date = subset['date'].iloc[0]
    subset['daynumber'] = (subset['date'] - first_date) / pd.Timedelta(days=1)

    def dateformat(x, *args):
        """Tick formatter: turn a day number back into a YYYY-MM-DD date."""
        return (first_date + pd.Timedelta(days=int(x))).date().isoformat()

    ax[1].xaxis.set_major_locator(plt.MaxNLocator(6))
    ax[1].xaxis.set_major_formatter(plt.FuncFormatter(dateformat))

    for period in ['morning', 'afternoon']:
        period_trips = subset[subset['AMPM'] == period]
        if period_trips.empty:
            # Nothing to fit when the pair has trips in one direction only.
            continue
        sns.regplot(x='daynumber', y='minutes', data=period_trips, ax=ax[1])
    ax[1].set_ylabel('')
    ax[1].set_xlabel('Date')

    ax[1].text(0.98, 0.98,
               "{0} born in {1}".format(subset['gender'].iloc[0],
                                        int(subset['birthyear'].iloc[0])),
               ha='right', va='top', transform=ax[1].transAxes, fontsize=14)

    return fig


# In[14]:

# Hand-picked positions in the compactness ranking.
for i, ind in enumerate([0, 1, 3, 4, 7], start=1):
    fig = analyze_pair(*ranked.index[ind])
    fig.savefig('figs/extreme-user-{0}.png'.format(i))