"""Exploratory analysis of NYC taxi trips: weekly ride counts and average tips.

Reads ``data/nyc_data.csv`` (trips) and ``data/nyc_fare.csv`` (fares), plots
the number of rides per calendar week, plots a histogram of the average
non-zero tip per medallion, and left-joins that average tip onto the trips.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): `seaborn` is never referenced below; presumably imported for
# its effect on matplotlib styling -- confirm before removing.
import seaborn

# Notebook magic `%matplotlib inline`, spelled as a regular call so the file
# is also valid Python outside IPython/Jupyter (where `get_ipython` is absent).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# --- Load the data --------------------------------------------------------
data = pd.read_csv('data/nyc_data.csv',
                   parse_dates=['pickup_datetime', 'dropoff_datetime'])
fare = pd.read_csv('data/nyc_fare.csv',
                   parse_dates=['pickup_datetime'])

# --- Rides per week -------------------------------------------------------
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
# yields the same ISO-8601 week numbers.
week_number = data.pickup_datetime.dt.isocalendar().week
weekly = data.groupby(week_number)
len(weekly)

# Number of rides in each week.
y = weekly.size()
y.head(3)

# First pickup timestamp encountered in each week, used as the x coordinate.
# NOTE(review): `first()` follows row order, so this is the earliest pickup
# only if the CSV is sorted by time -- confirm.
x = weekly.pickup_datetime.first()
x.head(3)

pd.Series(y.values, index=x).plot()
plt.ylim(0)  # Set the lower y value to 0.
plt.xlabel('Week')  # Label of the x axis.
plt.ylabel('Taxi rides')  # Label of the y axis.

# --- Average tip per taxi -------------------------------------------------
# Keep only rides with a strictly positive tip, then average per medallion.
tip = (
    fare[['medallion', 'tip_amount']]
    .loc[fare.tip_amount > 0]
    .groupby('medallion')
    .mean()
)
print(len(tip))
tip.head(3)

tip.hist(bins=np.linspace(0., 6., 100))
plt.xlabel('Average tip')
plt.ylabel('Number of taxis')

# --- Attach the average tip to every trip ---------------------------------
# Left join: trips whose medallion has no positive tip get NaN.
data_merged = pd.merge(data, tip, how='left',
                       left_on='medallion', right_index=True)
data_merged.head(3)