Let's say we have data from NYC City Bike Data
We have a DataFrame
with start
and end
time. We'd like to know for each ride where it spent most of the time - morning
, noon
, evening
or night
.
We're going to convert time of day to minutes since midnight, then we can look at (start, end)
as line and find overlap between then.
Credits to Pietro Battiston who gave the basis for this on stackoverflow.
import numpy as np
import pandas as pd
start_col, end_col = 'starttime', 'stoptime'
# Loading just first 10,000 rows
df = pd.read_csv('201501-citibike-tripdata.csv', parse_dates=[start_col, end_col], nrows=10000)
@np.vectorize
def minutes(time):
"""Convert time to minutes since 00:00
>>> minutes('3:17')
197
"""
# Work on strings as well
if isinstance(time, str):
time = pd.Timestamp(time)
return (time.hour * 60) + time.minute
# Parts of day, we can have overlaps here
parts_of_day = [
('morning', minutes('05:00am'), minutes('11:00')),
('noon', minutes('11:00am'), minutes('14:00')),
('evening', minutes('14:00'), minutes('19:00')),
# Night is two continuous parts
('night_a', minutes('18:30'), minutes('23:59')),
('night_b', minutes('00:00'), minutes('6:00')),
]
# Auxiliary DataFrame
parts_df = pd.DataFrame(index=df.index)
for col in [start_col, end_col]:
parts_df[col] = minutes(df[col].dt.time)
for name, start, end in parts_of_day:
parts_df['start_p'] = start
parts_df['end_p'] = end
ends = parts_df[[end_col, 'end_p']].min(axis=1)
starts = parts_df[[start_col, 'start_p']].max(axis=1)
duration = ends - starts
parts_df.loc[duration >= 0, name] = duration[duration >= 0]
parts_df['night'] = parts_df['night_a'] + parts_df['night_b']
parts_df.fillna(0, inplace=True)
# Assign part of day where most time was spent
df['day_part'] = parts_df[['morning', 'noon', 'evening', 'night']].idxmax(axis=1)
df['day_part'].sample(20)
4074 evening 512 morning 7975 morning 9854 noon 1557 noon 7836 noon 1814 noon 2367 evening 1764 noon 7332 morning 397 morning 4019 evening 657 morning 8399 noon 1909 noon 2154 evening 8687 noon 229 morning 4103 evening 9966 noon Name: day_part, dtype: object