#!/usr/bin/env python # coding: utf-8 # # Most Time Spent # # Let's say we have data from [NYC City Bike Data](https://s3.amazonaws.com/tripdata/index.html) # # We have a `DataFrame` with `start` and `end` time. We'd like to know for each ride where it spent most of the time - `morning`, `noon`, `evening` or `night`. # # We're going to convert time of day to minutes since midnight, then we can look at `(start, end)` as line and find overlap between then. # # --- # Credits to Pietro Battiston who gave the basis for this [on stackoverflow](https://stackoverflow.com/questions/46314769/label-rows-by-most-time-spent-in-a-time-interval/46372005#46372005). # In[1]: import numpy as np import pandas as pd # ### Load Data # In[2]: start_col, end_col = 'starttime', 'stoptime' # Loading just first 10,000 rows df = pd.read_csv('201501-citibike-tripdata.csv', parse_dates=[start_col, end_col], nrows=10000) # ### Parts of Day # In[3]: @np.vectorize def minutes(time): """Convert time to minutes since 00:00 >>> minutes('3:17') 197 """ # Work on strings as well if isinstance(time, str): time = pd.Timestamp(time) return (time.hour * 60) + time.minute # Parts of day, we can have overlaps here parts_of_day = [ ('morning', minutes('05:00am'), minutes('11:00')), ('noon', minutes('11:00am'), minutes('14:00')), ('evening', minutes('14:00'), minutes('19:00')), # Night is two continuous parts ('night_a', minutes('18:30'), minutes('23:59')), ('night_b', minutes('00:00'), minutes('6:00')), ] # Auxiliary DataFrame parts_df = pd.DataFrame(index=df.index) for col in [start_col, end_col]: parts_df[col] = minutes(df[col].dt.time) for name, start, end in parts_of_day: parts_df['start_p'] = start parts_df['end_p'] = end ends = parts_df[[end_col, 'end_p']].min(axis=1) starts = parts_df[[start_col, 'start_p']].max(axis=1) duration = ends - starts parts_df.loc[duration >= 0, name] = duration[duration >= 0] parts_df['night'] = parts_df['night_a'] + parts_df['night_b'] parts_df.fillna(0, inplace=True) # Assign part of day where most time was spent df['day_part'] = parts_df[['morning', 'noon', 'evening', 'night']].idxmax(axis=1) # ### Sample Output # In[4]: df['day_part'].sample(20) # In[ ]: