%matplotlib inline
import os
# Names of every entry in the trips parquet directory; each one is opened
# further down as an individual parquet file.
files = os.listdir('/bigdata/all_trips.parquet/')
import fastparquet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Number of directory entries found (displayed as the cell result).
len(files)
137
# Count pickups before 2016-07-01 per taxi zone, one parquet file at a time,
# reading only the two columns that are needed from each file.
#
# The per-file counts are collected in a fresh list and combined once at the
# end. This replaces the earlier "merge into alldf if it already exists"
# approach, which (a) double-counted when the cell was re-run, because the
# previous alldf was still in scope, and (b) merged frames that all shared the
# column name 'pickup_datetime', so the automatic _x/_y suffixes collided from
# the third file onward.
per_file_counts = []
for f in files:
    d = fastparquet.ParquetFile(os.path.join('/bigdata/all_trips.parquet/', f))
    df = d.to_pandas(columns=['pickup_taxizone_id', 'pickup_datetime'])
    df = df[df.pickup_datetime < '2016-07-01']
    print("{} {}".format(f, df.shape[0]))
    # Name each Series after its source file so every column of the combined
    # frame gets a unique label.
    per_file_counts.append(
        df.groupby('pickup_taxizone_id')['pickup_datetime'].count().rename(f)
    )

# One column per file, one row per zone id. Zones absent from a file are NaN
# in that file's column (outer alignment, as with the original outer merge).
alldf = pd.concat(per_file_counts, axis=1)
part.1175.parquet 0 part.1186.parquet 0 part.1185.parquet 0 part.57.parquet 1046251 part.1187.parquet 0 part.154.parquet 1047525 part.102.parquet 1046692 part.536.parquet 1536647 part.854.parquet 1063416 part.53.parquet 1046076 part.975.parquet 1044141 part.953.parquet 919854 part.649.parquet 1515670 part.1190.parquet 0 part.451.parquet 1065633 part.871.parquet 1062598 part.833.parquet 1126408 part.298.parquet 1044872 part.933.parquet 1059200 part.60.parquet 1046165 part.714.parquet 1127611 part.387.parquet 1039197 part.421.parquet 964766 part.1189.parquet 0 part.932.parquet 1059105 part.690.parquet 1150433 part.117.parquet 1046535 part.214.parquet 1044625 part.873.parquet 1062900 part.523.parquet 1145113 part.327.parquet 1047417 part.880.parquet 1062197 part.362.parquet 1044763 part.376.parquet 1043217 part.175.parquet 1045327 part.1188.parquet 0 part.279.parquet 1044917 part.332.parquet 1047064 part.1174.parquet 0 part.587.parquet 1119988 part.589.parquet 1130463 part.963.parquet 1058092 part.800.parquet 853366 part.316.parquet 1047564 part.875.parquet 1416262 part.195.parquet 1045707 part.580.parquet 851135 part.1182.parquet 0 part.1183.parquet 0 part.176.parquet 1045081 part.990.parquet 1043932 part.584.parquet 1137330 part.621.parquet 1520685 part.1195.parquet 0 part.482.parquet 1035764 part.1177.parquet 0 part.839.parquet 1066037 part.1198.parquet 0 part.577.parquet 1181700 part.632.parquet 1153558 part.1191.parquet 0 part.850.parquet 1063834 part.1199.parquet 0 part.779.parquet 1150022 part.1180.parquet 0 part.815.parquet 1132645 part.500.parquet 1151165 part.811.parquet 1121636 part.951.parquet 1059292 part.1172.parquet 0 part.1178.parquet 0 part.313.parquet 1046221 part.507.parquet 1146225 part.1179.parquet 0 part.846.parquet 1065887 part.677.parquet 1517221 part.832.parquet 1500146 part.1184.parquet 0 part.385.parquet 1042883 part.182.parquet 1045056 part.423.parquet 1379472 part.637.parquet 1136437 part.149.parquet 1048600 part.885.parquet 1062343 
part.1171.parquet 0 part.410.parquet 1035630 part.438.parquet 1035839 part.516.parquet 1144889 part.790.parquet 1543551 part.383.parquet 1042420 part.924.parquet 1059215 part.101.parquet 836949 part.646.parquet 1131789 part.257.parquet 1044872 part.673.parquet 1146149 part.408.parquet 1035413 part.323.parquet 1047518 part.1176.parquet 0 part.159.parquet 1045658 part.501.parquet 1117679 part.293.parquet 1043630 part.795.parquet 1153782 part.669.parquet 1133559 part.1173.parquet 0 part.126.parquet 1046315 part.751.parquet 1117892 part.72.parquet 1043933 part.80.parquet 1039675 part.254.parquet 1394375 part.1196.parquet 0 part.772.parquet 1170587 part.1192.parquet 0 part.901.parquet 991140 part.868.parquet 1062533 part.121.parquet 1046444 part.736.parquet 1140262 part.623.parquet 1137202 part.1197.parquet 0 part.1194.parquet 0 part.237.parquet 1046889 part.188.parquet 1046802 part.59.parquet 1046028 part.1193.parquet 0 part.111.parquet 1046759 part.1181.parquet 0 part.995.parquet 1074380 part.709.parquet 1124230 part.726.parquet 1130153 part.867.parquet 1062534 part.172.parquet 1044909 part.688.parquet 1118640 part.218.parquet 1044265 part.261.parquet 1044750 part.481.parquet 1035881 part.666.parquet 1124922 part.206.parquet 1045585 part.888.parquet 839113
# Total trips per taxi zone: sum the per-file counts across each row of alldf,
# with NaN entries (zone missing from a file) contributing nothing.
trip_totals = np.nansum(alldf.values, axis=1)
zz = pd.DataFrame({'N': trip_totals}, index=alldf.index)
# Base-10 logarithm of the totals, used for the histogram and map colouring.
zz['logN'] = np.log10(zz['N'])
import seaborn
# Normalised histogram of log10 trip counts per zone, with bin edges every 0.5
# from 0 to 6.5. Missing logN values are plotted as 0.
log_counts = zz.logN.fillna(0)
bin_edges = np.arange(0, 7., 0.5)
seaborn.distplot(log_counts, bins=bin_edges, norm_hist=True)
# Axis/figure cosmetics, currently disabled:
# plt.xticks(np.linspace(0, 8, 17));
# plt.xlabel("Log10(Taxi Trips)")
# plt.ylabel("Frequency")
# plt.gcf().set_size_inches(8, 4)
/home/shekhar/anaconda3/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
<matplotlib.axes._subplots.AxesSubplot at 0x7fd6be9026d8>
import geopandas as gpd
import matplotlib.pyplot as plt
# Load the taxi-zone shapefile and attach the per-zone totals by matching
# LocationID against the index of zz. The merge uses the default join type,
# so only zones present in both tables are kept.
tz = gpd.read_file('../shapefiles/taxi_zones.shp').merge(
    zz, left_on='LocationID', right_index=True)
# Sum of N over the merged zones (displayed as the cell result).
tz['N'].sum()
116003744.0
# Styling shared by both maps: colour each zone by logN with the viridis
# colormap on a fixed 2 to 6.5 scale.
map_style = dict(column='logN', cmap=plt.cm.viridis, linewidth=0.5,
                 vmin=2, vmax=6.5)

# Map 1: every zone in tz.
tz.plot(**map_style)
plt.gcf().set_size_inches(12, 9)

# Map 2: the same map with the 'Staten Island' and 'EWR' boroughs removed.
keep = (tz.borough != 'Staten Island') & (tz.borough != 'EWR')
z = tz[keep]
z.plot(**map_style)
fig = plt.gcf()
fig.set_size_inches(12, 9)
plt.tight_layout()