# Notebook setup: inline plotting plus the core analysis stack (dask, numpy,
# pandas, geopandas, matplotlib).
%matplotlib inline
# %config InlineBackend.figure_format='retina'
import dask.dataframe as dd
import numpy as np
import pandas as pd
# import seaborn as sgn
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
# Higher DPI for saved figures.
plt.rcParams['savefig.dpi'] = 125
# One-time repartitioning of the raw parquet, indexed by start_time (kept for reference):
# df = dd.read_parquet('/bigdata/citibike.parquet')
# df.set_index('start_time', npartitions='auto').to_parquet('/bigdata/citibike_repartitioned.parquet')
# Lazy dask frame of all Citi Bike trips; index is start_time (see the
# repartitioning step above and the output tables below).
df = dd.read_parquet('/bigdata/citibike_repartitioned.parquet')
def get_is_bus_day(x):
    """Return a boolean array, True where the frame's index falls on a business day."""
    dates = x.index.values.astype('<M8[D]')
    return np.is_busday(dates)
def get_day(x):
    """Floor each timestamp in the frame's index to midnight of its calendar day.

    Truncates via numpy day precision ('<M8[D]') and returns a DatetimeIndex.
    The original implementation round-tripped through strings and passed the
    deprecated (now removed) ``infer_datetime_format`` keyword; ``pd.to_datetime``
    accepts datetime64 arrays directly.
    """
    return pd.to_datetime(x.index.values.astype('<M8[D]'))
def get_week(x):
    """Floor each timestamp in the frame's index to the start of its numpy week.

    Numpy week precision ('<M8[W]') counts weeks from the 1970-01-01 epoch
    (a Thursday), so weeks start on Thursdays — e.g. 2013-07-06 maps to
    2013-07-04, as seen in the output tables. The original implementation
    round-tripped through strings and passed the deprecated (now removed)
    ``infer_datetime_format`` keyword; ``pd.to_datetime`` accepts datetime64
    arrays directly.
    """
    return pd.to_datetime(x.index.values.astype('<M8[W]'))
# Add per-trip calendar columns computed partition-wise on the start_time index.
df['year_month_day'] = df.map_partitions(get_day, meta=('year_month_day', '<M8[ns]'))
df['year_month_week'] = df.map_partitions(get_week, meta=('year_month_week', '<M8[ns]'))
# np.bool was deprecated in NumPy 1.20 and removed in 1.24; the builtin bool
# is the correct meta dtype here.
df['is_bus_day'] = df.map_partitions(get_is_bus_day, meta=('is_bus_day', bool))
# Sanity check: peek at non-business-day trips (the column is pure bool,
# so `~` is equivalent to `== False`).
df[~df.is_bus_day].head()
trip_duration | stop_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | birth_year | gender | start_taxizone_id | end_taxizone_id | year_month_day | year_month_week | is_bus_day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
start_time | |||||||||||||||||||
2013-07-06 00:00:43 | 1593 | 2013-07-06 00:27:16 | 224 | Spruce St & Nassau St | 40.711464 | -74.005524 | 232 | Cadman Plaza E & Tillary St | 40.695977 | -73.990149 | 15355 | Customer | NaN | 0 | 209.0 | 65.0 | 2013-07-06 | 2013-07-04 | False |
2013-07-06 00:00:49 | 173 | 2013-07-06 00:03:42 | 482 | W 15 St & 7 Ave | 40.739355 | -73.999318 | 482 | W 15 St & 7 Ave | 40.739355 | -73.999318 | 14850 | Subscriber | 1958.0 | 1 | 90.0 | 90.0 | 2013-07-06 | 2013-07-04 | False |
2013-07-06 00:00:56 | 1494 | 2013-07-06 00:25:50 | 282 | Kent Ave & S 11 St | 40.708273 | -73.968341 | 539 | Metropolitan Ave & Bedford Ave | 40.715348 | -73.960241 | 16233 | Customer | NaN | 0 | 256.0 | 255.0 | 2013-07-06 | 2013-07-04 | False |
2013-07-06 00:01:08 | 938 | 2013-07-06 00:16:46 | 164 | E 47 St & 2 Ave | 40.753231 | -73.970325 | 2022 | E 59 St & Sutton Pl | 40.758491 | -73.959206 | 20186 | Customer | NaN | 0 | 233.0 | 140.0 | 2013-07-06 | 2013-07-04 | False |
2013-07-06 00:01:10 | 349 | 2013-07-06 00:06:59 | 229 | Great Jones St | 40.727434 | -73.993790 | 428 | E 3 St & 1 Ave | 40.724677 | -73.987834 | 15735 | Subscriber | 1986.0 | 1 | 114.0 | 79.0 | 2013-07-06 | 2013-07-04 | False |
# Daily and weekly trip counts: overall, Customers ("guests"), and Subscribers.
day_cols = ['trip_duration', 'year_month_day']
week_cols = ['trip_duration', 'year_month_week']
timedf = df[day_cols].groupby('year_month_day').count().compute()
timedf_wk = df[week_cols].groupby('year_month_week').count().compute()
timedf2 = df[day_cols][df.user_type == 'Customer'].groupby('year_month_day').count().compute()
timedf2_wk = df[week_cols][df.user_type == 'Customer'].groupby('year_month_week').count().compute()
timedf3 = df[day_cols][df.user_type == 'Subscriber'].groupby('year_month_day').count().compute()
timedf3_wk = df[week_cols][df.user_type == 'Subscriber'].groupby('year_month_week').count().compute()
import matplotlib.dates as mdates

# Plot daily trip counts (faint) with a one-week rolling-average overlay
# (solid) for each rider segment.
segments = [
    (timedf, timedf_wk, 'C0', ''),
    (timedf2, timedf2_wk, 'C1', 'Guests '),
    (timedf3, timedf3_wk, 'C2', 'Subscriber '),
]
for daily, weekly, color, prefix in segments:
    plt.plot(daily.index.values, daily.trip_duration,
             alpha=0.4, color=color, label=prefix + 'Trips per day')
    plt.plot(weekly.index.values, weekly.trip_duration / 7.,
             color=color, alpha=1, label=prefix + 'One week average of trips per day')

plt.xlabel('Date')
plt.ylabel("Trips Per Day")
plt.gcf().set_size_inches(10, 5)
months = mdates.MonthLocator()  # every month
yearsFmt = mdates.DateFormatter('%b %Y')
ax = plt.gca()
# Major ticks quarterly, minor ticks monthly on the date axis.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(yearsFmt)
ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(10000))
ax.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(2500))
plt.legend(loc='upper left')
plt.xlim('2013-07-01', '2017-01-01')
plt.ylim(0, 75000)
plt.gcf().autofmt_xdate()
# Central Park daily weather; DATE arrives as yyyymmdd integers.
weather_df = pd.read_csv('../central_park_weather.csv.gz')
weather_df['DATE'] = pd.to_datetime(weather_df.DATE, format='%Y%m%d')
# Left-join daily trip counts onto the weather columns by date, then rename
# the count column (trip_duration) to N.
weather_cols = ['DATE', 'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN', 'AWND']
merged_counts = timedf.merge(weather_df[weather_cols],
                             left_index=True, right_on='DATE', how='left')
merged_counts.columns = ['N'] + weather_cols
import statsmodels.api
import statsmodels.formula.api
from patsy import standardize as st
# OLS of daily trip count on standardized weather covariates (precipitation,
# snow depth, snowfall, max/min temperature). Standardizing puts the
# coefficients on a comparable scale.
model = statsmodels.formula.api.ols(formula="N ~ st(PRCP) + st(SNWD) + st(SNOW) + st(TMAX) + st(TMIN)", data=merged_counts)
model.fit().summary()
Dep. Variable: | N | R-squared: | 0.549 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.547 |
Method: | Least Squares | F-statistic: | 308.9 |
Date: | Fri, 19 May 2017 | Prob (F-statistic): | 1.81e-216 |
Time: | 15:09:19 | Log-Likelihood: | -13493. |
No. Observations: | 1276 | AIC: | 2.700e+04 |
Df Residuals: | 1270 | BIC: | 2.703e+04 |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 2.892e+04 | 265.571 | 108.898 | 0.000 | 2.84e+04 2.94e+04 |
st(PRCP) | -3210.2392 | 273.517 | -11.737 | 0.000 | -3746.835 -2673.644 |
st(SNWD) | -2102.8777 | 299.407 | -7.023 | 0.000 | -2690.265 -1515.490 |
st(SNOW) | -182.5400 | 284.662 | -0.641 | 0.521 | -741.000 375.920 |
st(TMAX) | 8357.3333 | 1012.308 | 8.256 | 0.000 | 6371.353 1.03e+04 |
st(TMIN) | 422.5314 | 1020.685 | 0.414 | 0.679 | -1579.882 2424.945 |
Omnibus: | 126.581 | Durbin-Watson: | 0.431 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 168.763 |
Skew: | 0.797 | Prob(JB): | 2.26e-37 |
Kurtosis: | 3.798 | Cond. No. | 8.23 |
# Verbatim re-run of the identical model fit above; the summaries match
# apart from the timestamp.
model = statsmodels.formula.api.ols(formula="N ~ st(PRCP) + st(SNWD) + st(SNOW) + st(TMAX) + st(TMIN)", data=merged_counts)
model.fit().summary()
Dep. Variable: | N | R-squared: | 0.549 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.547 |
Method: | Least Squares | F-statistic: | 308.9 |
Date: | Fri, 19 May 2017 | Prob (F-statistic): | 1.81e-216 |
Time: | 15:10:17 | Log-Likelihood: | -13493. |
No. Observations: | 1276 | AIC: | 2.700e+04 |
Df Residuals: | 1270 | BIC: | 2.703e+04 |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 2.892e+04 | 265.571 | 108.898 | 0.000 | 2.84e+04 2.94e+04 |
st(PRCP) | -3210.2392 | 273.517 | -11.737 | 0.000 | -3746.835 -2673.644 |
st(SNWD) | -2102.8777 | 299.407 | -7.023 | 0.000 | -2690.265 -1515.490 |
st(SNOW) | -182.5400 | 284.662 | -0.641 | 0.521 | -741.000 375.920 |
st(TMAX) | 8357.3333 | 1012.308 | 8.256 | 0.000 | 6371.353 1.03e+04 |
st(TMIN) | 422.5314 | 1020.685 | 0.414 | 0.679 | -1579.882 2424.945 |
Omnibus: | 126.581 | Durbin-Watson: | 0.431 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 168.763 |
Skew: | 0.797 | Prob(JB): | 2.26e-37 |
Kurtosis: | 3.798 | Cond. No. | 8.23 |
# Inspect the OLS class documentation (output captured below).
help(statsmodels.api.OLS)
Help on class OLS in module statsmodels.regression.linear_model: class OLS(WLS) | A simple ordinary least squares model. | | | Parameters | ---------- | endog : array-like | 1-d endogenous response variable. The dependent variable. | exog : array-like | A nobs x k array where `nobs` is the number of observations and `k` | is the number of regressors. An intercept is not included by default | and should be added by the user. See | :func:`statsmodels.tools.add_constant`. | missing : str | Available options are 'none', 'drop', and 'raise'. If 'none', no nan | checking is done. If 'drop', any observations with nans are dropped. | If 'raise', an error is raised. Default is 'none.' | hasconst : None or bool | Indicates whether the RHS includes a user-supplied constant. If True, | a constant is not checked for and k_constant is set to 1 and all | result statistics are calculated as if a constant is present. If | False, a constant is not checked for and k_constant is set to 0. | | | Attributes | ---------- | weights : scalar | Has an attribute weights = array(1.0) due to inheritance from WLS. | | See Also | -------- | GLS | | Examples | -------- | >>> import numpy as np | >>> | >>> import statsmodels.api as sm | >>> | >>> Y = [1,3,4,5,2,3,4] | >>> X = range(1,8) | >>> X = sm.add_constant(X) | >>> | >>> model = sm.OLS(Y,X) | >>> results = model.fit() | >>> results.params | array([ 2.14285714, 0.25 ]) | >>> results.tvalues | array([ 1.87867287, 0.98019606]) | >>> print(results.t_test([1, 0]))) | <T test: effect=array([ 2.14285714]), sd=array([[ 1.14062282]]), t=array([[ 1.87867287]]), p=array([[ 0.05953974]]), df_denom=5> | >>> print(results.f_test(np.identity(2))) | <F test: F=array([[ 19.46078431]]), p=[[ 0.00437251]], df_denom=5, df_num=2> | | Notes | ----- | No constant is added by the model unless you are using formulas. 
| | Method resolution order: | OLS | WLS | RegressionModel | statsmodels.base.model.LikelihoodModel | statsmodels.base.model.Model | builtins.object | | Methods defined here: | | __init__(self, endog, exog=None, missing='none', hasconst=None, **kwargs) | Initialize self. See help(type(self)) for accurate signature. | | loglike(self, params) | The likelihood function for the clasical OLS model. | | Parameters | ---------- | params : array-like | The coefficients with which to estimate the log-likelihood. | | Returns | ------- | The concentrated likelihood function evaluated at params. | | whiten(self, Y) | OLS model whitener does nothing: returns Y. | | ---------------------------------------------------------------------- | Methods inherited from RegressionModel: | | fit(self, method='pinv', cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs) | Full fit of the model. | | The results include an estimate of covariance matrix, (whitened) | residuals and an estimate of scale. | | Parameters | ---------- | method : str | Can be "pinv", "qr". "pinv" uses the Moore-Penrose pseudoinverse | to solve the least squares problem. "qr" uses the QR | factorization. | | Returns | ------- | A RegressionResults class instance. | | See Also | --------- | regression.RegressionResults | | Notes | ----- | The fit method uses the pseudoinverse of the design/exogenous variables | to solve the least squares minimization. | | fit_regularized(self, method='coord_descent', maxiter=1000, alpha=0.0, L1_wt=1.0, start_params=None, cnvrg_tol=1e-08, zero_tol=1e-08, **kwargs) | Return a regularized fit to a linear regression model. | | Parameters | ---------- | method : string | Only the coordinate descent algorithm is implemented. | maxiter : integer | The maximum number of iteration cycles (an iteration cycle | involves running coordinate descent on all variables). | alpha : scalar or array-like | The penalty weight. 
If a scalar, the same penalty weight | applies to all variables in the model. If a vector, it | must have the same length as `params`, and contains a | penalty weight for each coefficient. | L1_wt : scalar | The fraction of the penalty given to the L1 penalty term. | Must be between 0 and 1 (inclusive). If 0, the fit is | ridge regression. If 1, the fit is the lasso. | start_params : array-like | Starting values for ``params``. | cnvrg_tol : scalar | If ``params`` changes by less than this amount (in sup-norm) | in once iteration cycle, the algorithm terminates with | convergence. | zero_tol : scalar | Any estimated coefficient smaller than this value is | replaced with zero. | | Returns | ------- | A RegressionResults object, of the same type returned by | ``fit``. | | Notes | ----- | The approach closely follows that implemented in the glmnet | package in R. The penalty is the "elastic net" penalty, which | is a convex combination of L1 and L2 penalties. | | The function that is minimized is: ..math:: | | 0.5*RSS/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1) | | where RSS is the usual regression sum of squares, n is the | sample size, and :math:`|*|_1` and :math:`|*|_2` are the L1 and L2 | norms. | | Post-estimation results are based on the same data used to | select variables, hence may be subject to overfitting biases. | | References | ---------- | Friedman, Hastie, Tibshirani (2008). Regularization paths for | generalized linear models via coordinate descent. Journal of | Statistical Software 33(1), 1-22 Feb 2010. | | initialize(self) | Initialize (possibly re-initialize) a Model instance. For | instance, the design matrix of a linear model may change | and some things must be recomputed. | | predict(self, params, exog=None) | Return linear predicted values from a design matrix. | | Parameters | ---------- | params : array-like | Parameters of a linear model | exog : array-like, optional. | Design / exogenous data. Model exog is used if None. 
| | Returns | ------- | An array of fitted values | | Notes | ----- | If the model has not yet been fit, params is not optional. | | ---------------------------------------------------------------------- | Data descriptors inherited from RegressionModel: | | df_model | The model degree of freedom, defined as the rank of the regressor | matrix minus 1 if a constant is included. | | df_resid | The residual degree of freedom, defined as the number of observations | minus the rank of the regressor matrix. | | ---------------------------------------------------------------------- | Methods inherited from statsmodels.base.model.LikelihoodModel: | | hessian(self, params) | The Hessian matrix of the model | | information(self, params) | Fisher information matrix of model | | Returns -Hessian of loglike evaluated at params. | | score(self, params) | Score vector of model. | | The gradient of logL with respect to each parameter. | | ---------------------------------------------------------------------- | Class methods inherited from statsmodels.base.model.Model: | | from_formula(formula, data, subset=None, *args, **kwargs) from builtins.type | Create a Model from a formula and dataframe. | | Parameters | ---------- | formula : str or generic Formula object | The formula specifying the model | data : array-like | The data for the model. See Notes. | subset : array-like | An array-like object of booleans, integers, or index values that | indicate the subset of df to use in the model. Assumes df is a | `pandas.DataFrame` | args : extra arguments | These are passed to the model | kwargs : extra keyword arguments | These are passed to the model with one exception. The | ``eval_env`` keyword is passed to patsy. It can be either a | :class:`patsy:patsy.EvalEnvironment` object or an integer | indicating the depth of the namespace to use. For example, the | default ``eval_env=0`` uses the calling namespace. If you wish | to use a "clean" environment set ``eval_env=-1``. 
| | | Returns | ------- | model : Model instance | | Notes | ------ | data must define __getitem__ with the keys in the formula terms | args and kwargs are passed on to the model instantiation. E.g., | a numpy structured or rec array, a dictionary, or a pandas DataFrame. | | ---------------------------------------------------------------------- | Data descriptors inherited from statsmodels.base.model.Model: | | __dict__ | dictionary for instance variables (if defined) | | __weakref__ | list of weak references to the object (if defined) | | endog_names | | exog_names
# Business-day trip counts per start station, busiest first.
# NOTE(review): df2 is defined in an earlier (unseen) cell — presumably a
# filtered copy of df; confirm upstream.
(df2[df2.is_bus_day == True][['is_bus_day', 'start_station_id']]
 .groupby('start_station_id')
 .count()
 .compute()
 .sort_values('is_bus_day', ascending=False))
is_bus_day | |
---|---|
start_station_id | |
519 | 356749 |
497 | 235838 |
521 | 233325 |
435 | 228568 |
293 | 214210 |
402 | 208014 |
490 | 206323 |
477 | 199371 |
426 | 196368 |
285 | 196145 |
151 | 178778 |
444 | 177460 |
284 | 176299 |
379 | 174838 |
523 | 173904 |
459 | 170351 |
382 | 166108 |
327 | 164709 |
168 | 161661 |
368 | 160471 |
358 | 153488 |
359 | 152552 |
472 | 152160 |
127 | 149701 |
492 | 148322 |
417 | 146737 |
499 | 146562 |
517 | 145338 |
537 | 144539 |
446 | 140362 |
... | ... |
3302 | 825 |
3332 | 754 |
3401 | 713 |
3424 | 698 |
3348 | 667 |
3229 | 658 |
3338 | 645 |
3340 | 561 |
3333 | 509 |
3440 | 478 |
3395 | 453 |
3330 | 420 |
3393 | 393 |
3394 | 325 |
3342 | 303 |
3326 | 284 |
3130 | 201 |
3245 | 177 |
3219 | 170 |
3014 | 100 |
3371 | 40 |
3250 | 25 |
3432 | 24 |
3239 | 18 |
3017 | 10 |
3036 | 10 |
3040 | 8 |
3266 | 7 |
3240 | 3 |
3385 | 1 |
659 rows × 1 columns
# Non-null counts of several columns per start station (all days),
# sorted by the is_bus_day count, busiest first.
count_cols = ['start_station_id', 'is_bus_day', 'start_time', 'start_station_name']
zz = (
    df2[count_cols]
    .groupby('start_station_id')
    .count()
    .compute()
    .sort_values('is_bus_day', ascending=False)
)
zz
is_bus_day | start_time | start_station_name | |
---|---|---|---|
start_station_id | |||
519 | 397813 | 397813 | 397813 |
497 | 315510 | 315510 | 315510 |
435 | 299698 | 299698 | 299698 |
426 | 280868 | 280868 | 280868 |
293 | 279981 | 279981 | 279981 |
521 | 268807 | 268807 | 268807 |
285 | 266162 | 266162 | 266162 |
402 | 260880 | 260880 | 260880 |
151 | 247791 | 247791 | 247791 |
490 | 245801 | 245801 | 245801 |
284 | 236954 | 236954 | 236954 |
477 | 229820 | 229820 | 229820 |
444 | 229799 | 229799 | 229799 |
459 | 227690 | 227690 | 227690 |
368 | 227610 | 227610 | 227610 |
382 | 219669 | 219669 | 219669 |
327 | 215974 | 215974 | 215974 |
499 | 215419 | 215419 | 215419 |
358 | 213026 | 213026 | 213026 |
168 | 204413 | 204413 | 204413 |
379 | 200189 | 200189 | 200189 |
523 | 196410 | 196410 | 196410 |
387 | 191457 | 191457 | 191457 |
2006 | 189364 | 189364 | 189364 |
127 | 187794 | 187794 | 187794 |
514 | 186694 | 186694 | 186694 |
3002 | 185525 | 185525 | 185525 |
446 | 181832 | 181832 | 181832 |
472 | 180320 | 180320 | 180320 |
504 | 179485 | 179485 | 179485 |
... | ... | ... | ... |
3424 | 929 | 929 | 929 |
3229 | 916 | 916 | 916 |
3338 | 900 | 900 | 900 |
255 | 897 | 897 | 897 |
3340 | 794 | 794 | 794 |
3333 | 669 | 669 | 669 |
3330 | 610 | 610 | 610 |
3440 | 588 | 588 | 588 |
3395 | 567 | 567 | 567 |
3393 | 562 | 562 | 562 |
3394 | 450 | 450 | 450 |
3326 | 429 | 429 | 429 |
3342 | 410 | 410 | 410 |
3130 | 351 | 351 | 351 |
3014 | 324 | 324 | 324 |
3219 | 180 | 180 | 180 |
3257 | 178 | 178 | 178 |
3245 | 177 | 177 | 177 |
3253 | 62 | 62 | 62 |
3371 | 53 | 53 | 53 |
3250 | 25 | 25 | 25 |
3432 | 24 | 24 | 24 |
3252 | 20 | 20 | 20 |
3239 | 18 | 18 | 18 |
3017 | 12 | 12 | 12 |
3040 | 11 | 11 | 11 |
3036 | 10 | 10 | 10 |
3266 | 7 | 7 | 7 |
3240 | 3 | 3 | 3 |
3385 | 1 | 1 | 1 |
662 rows × 3 columns
# Station ids with more than 500 recorded trips. After groupby().count(),
# the is_bus_day column holds non-null counts, not booleans.
zz[zz.is_bus_day>500].index
Int64Index([ 519, 497, 435, 426, 293, 521, 285, 402, 151, 490, ... 3424, 3229, 3338, 255, 3340, 3333, 3330, 3440, 3395, 3393], dtype='int64', name='start_station_id', length=642)
# Restrict to trips starting at stations with more than 500 recorded trips,
# then count non-null values per column (output below).
busy_stations = zz[zz.is_bus_day > 500].index
df3 = df2[df2.start_station_id.isin(busy_stations)]
df3.count().compute()
start_time 36899280 trip_duration 36899280 stop_time 36899280 start_station_id 36899280 start_station_name 36899280 start_station_latitude 36899280 start_station_longitude 36899280 end_station_id 36899280 end_station_name 36899280 end_station_latitude 36899280 end_station_longitude 36899280 bike_id 36899280 user_type 36863421 birth_year 32541201 gender 36899280 start_taxizone_id 36899266 end_taxizone_id 36898966 is_bus_day 36899280 dtype: int64
# Baseline non-null counts on the unfiltered frame, for comparison with df3.
df.count().compute()
trip_duration 36902025 stop_time 36902025 start_station_id 36902025 start_station_name 36902025 start_station_latitude 36902025 start_station_longitude 36902025 end_station_id 36902025 end_station_name 36902025 end_station_latitude 36902025 end_station_longitude 36902025 bike_id 36902025 user_type 36866154 birth_year 32543206 gender 36902025 start_taxizone_id 36902006 end_taxizone_id 36901708 dtype: int64
import json

# Station metadata snapshot from the Citi Bike station feed.
# Use a context manager: the original json.load(open(...)) left the file
# handle to be closed by the garbage collector.
with open("../15_dataframe_analysis/stations.2017.04.20.09.43.json") as f:
    df_stations = pd.DataFrame(json.load(f)['stationBeanList'])