Let's say we have data from NYC City Bike Data
We have a
end time. We'd like to know for each ride where it spent most of the time -
We're going to convert time of day to minutes since midnight, then we can look at
(start, end) as line and find overlap between then.
Credits to Pietro Battiston who gave the basis for this on stackoverflow.
import numpy as np import pandas as pd
start_col, end_col = 'starttime', 'stoptime' # Loading just first 10,000 rows df = pd.read_csv('201501-citibike-tripdata.csv', parse_dates=[start_col, end_col], nrows=10000)
@np.vectorize def minutes(time): """Convert time to minutes since 00:00 >>> minutes('3:17') 197 """ # Work on strings as well if isinstance(time, str): time = pd.Timestamp(time) return (time.hour * 60) + time.minute # Parts of day, we can have overlaps here parts_of_day = [ ('morning', minutes('05:00am'), minutes('11:00')), ('noon', minutes('11:00am'), minutes('14:00')), ('evening', minutes('14:00'), minutes('19:00')), # Night is two continuous parts ('night_a', minutes('18:30'), minutes('23:59')), ('night_b', minutes('00:00'), minutes('6:00')), ] # Auxiliary DataFrame parts_df = pd.DataFrame(index=df.index) for col in [start_col, end_col]: parts_df[col] = minutes(df[col].dt.time) for name, start, end in parts_of_day: parts_df['start_p'] = start parts_df['end_p'] = end ends = parts_df[[end_col, 'end_p']].min(axis=1) starts = parts_df[[start_col, 'start_p']].max(axis=1) duration = ends - starts parts_df.loc[duration >= 0, name] = duration[duration >= 0] parts_df['night'] = parts_df['night_a'] + parts_df['night_b'] parts_df.fillna(0, inplace=True) # Assign part of day where most time was spent df['day_part'] = parts_df[['morning', 'noon', 'evening', 'night']].idxmax(axis=1)
4074 evening 512 morning 7975 morning 9854 noon 1557 noon 7836 noon 1814 noon 2367 evening 1764 noon 7332 morning 397 morning 4019 evening 657 morning 8399 noon 1909 noon 2154 evening 8687 noon 229 morning 4103 evening 9966 noon Name: day_part, dtype: object