import pandas as pd
from io import StringIO
import timeit
# This is a series of flight itinerary information with leg-by-leg pairs.
# For example the first line is MEL to GUM, then GUM to SEL, then SEL to MEL —
# that is Melbourne to Guam, then Guam to Seoul, then Seoul to Melbourne.
# Each airport code is 3 chars, each pair is 8 chars ("XXX, YYY"), and pairs
# are joined by ", ", so an n-leg itinerary is exactly 10*n - 2 chars long.
# The row-processing functions below rely on this fixed-width layout.
base_data = '''MultiCityItinerary
MEL, GUM, GUM, SEL, SEL, MEL
MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, SHA, SHA, MEL
MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MEL
ADL, LON, LON, LOS, LOS, ADL
MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MEL
WEH, WEH, SZX, CAN, SGN, SGN
BNE, NYC, LAX, SAT, SAT, BNE
BNE, DXB, DXB, DUB, DUB, DXB, DXB, BNE
SYD, LAX, LAX, SFO, SFO, STL, STL, SYD
PER, MEL, MEL, OOL, OOL, PER
MEL, LON, LON, HKG, HKG, MEL
MEL, HKG, HKG, LON, LON, HKG, HKG, MEL
DRW, DRW, AKL, AKL, LON
ROM, DXB, DXB, SYD, SYD, DXB, DXB, ROM
SYD, HNL, HNL, NYC, NYC, HNL, HNL, SYD
PER, BKK, BKK, BNE, BNE, PER
BNE, LAX, LAX, ATL, ATL, OAX, OAX, LAX, LAX, BNE
AKL, NYC, NYC, LAS, LAS, SFO, SFO, AKL
SYD, HKG, HKG, SEL, SEL, HKG, HKG, SYD
AKL, NYC, NYC, SJO, SJO, AKL
ZQN, SYD, SYD, TYO, TYO, ZQN
RAR, LAX, LAX, YTO, NYC, LAS, LAS, LAX, LAX, RAR
PER, MEL, MEL, OOL, OOL, PER
SYD, DAD, HAN, SYD
ADL, BOD, BOD, BER, BER, SIN, SIN, ADL'''
As you will see below, for the 25600-row case the naive version takes 14.7 s (i.e. 14700 ms) while the vectorized version takes 73.1 ms, which is approximately 200 times faster. So although both are O(n), the coefficient is drastically different, such that it makes far more sense to mutate the data (by padding with spaces) so that it can be processed more efficiently, rather than trying to go through it row by row.
df = "" #The dataframe we'll be reusing to test on
runs = 10 #how long you want to wait to see how different the co-effecient is, wouldn't recommend going higher than 15
maxLegs = 6 #maximum number of legs
# Creating the dataframe with the columns for each leg that we want to fill.
def create_df(data=None, legs=6):
    """Build the itinerary DataFrame plus empty Leg1..LegN columns.

    Parameters
    ----------
    data : str, optional
        Raw text whose first line is the 'MultiCityItinerary' header.
        Defaults to the module-level ``raw_data`` so existing call sites
        keep working unchanged.
    legs : int, optional
        Number of empty ``LegN`` columns to add (default 6).

    Returns
    -------
    pandas.DataFrame
    """
    if data is None:
        data = raw_data  # fall back to the global used by the profiling runs
    # Nonsense pipe separator: we want the source data kept in a single column.
    df = pd.read_csv(StringIO(data), sep="|")
    print(f"Total rows {len(df)}")
    # Adding in the extra leg columns for the row processors to fill.
    for x in range(legs):
        df[f"Leg{x + 1}"] = ''
    return df
# Add leg data in. WAY slow — deliberately kept as the per-cell baseline.
def naive_row():
    """Fill the Leg columns of the global ``df`` one cell at a time."""
    for idx, itinerary in enumerate(df['MultiCityItinerary']):
        # An n-leg itinerary is 10*n - 2 chars long ("XXX, YYY" pairs joined
        # by ", "), so the leg count falls straight out of the string length.
        leg_count = (len(itinerary) + 2) // 10
        for leg in range(leg_count):
            start = leg * 10
            # Place each 8-char city pair into its dedicated leg column.
            df.loc[idx, 'Leg' + str(leg + 1)] = itinerary[start:start + 8]
# Reset the data and start our profiling run.
# NOTE(review): ``%time`` is an IPython magic — this cell runs only in a
# notebook/IPython session, not as a plain Python script.
raw_data = base_data
for x in range(runs):
    raw_data = raw_data + raw_data  # doubling the size of the data we're processing on each run
    df = create_df()  # rebuild the frame (with empty leg columns) at the new size
    %time naive_row()  # time the slow per-cell fill at this size
Total rows 50 Wall time: 35 ms Total rows 100 Wall time: 60 ms Total rows 200 Wall time: 130 ms Total rows 400 Wall time: 240 ms Total rows 800 Wall time: 468 ms Total rows 1600 Wall time: 952 ms Total rows 3200 Wall time: 1.89 s Total rows 6400 Wall time: 3.67 s Total rows 12800 Wall time: 7.36 s Total rows 25600 Wall time: 14.7 s
def smarter_row():
    """Fill the Leg columns of the global ``df`` with vectorized slicing.

    We can't just filter per-row because of the chained-assignment issue
    (is it a view or a copy?).  The trick is to pad every itinerary out to
    the same maximum length, so each leg sits at a fixed offset and can be
    sliced out column-wise in one shot, then strip the padding off again.
    """
    widest = maxLegs * 10 - 2  # the longest a string can be, i.e. maxLegs legs
    padded = df['MultiCityItinerary'].str.ljust(widest)
    # Every leg now starts at a fixed 10-char stride; each city pair is 8 chars.
    for leg in range(1, maxLegs + 1):
        offset = (leg - 1) * 10
        df[f'Leg{leg}'] = padded.str[offset:offset + 8]
    df['MultiCityItinerary'] = padded.str.strip()  # return it to normal
# Reset the data and start our profiling run (same doubling schedule as above,
# so the two timing series are directly comparable).
# NOTE(review): ``%time`` is an IPython magic — notebook/IPython only.
raw_data = base_data
for x in range(runs):
    raw_data = raw_data + raw_data  # doubling the size of the data on each run
    df = create_df()
    %time smarter_row()  # time the vectorized fill at this size
Total rows 50 Wall time: 4.02 ms Total rows 100 Wall time: 3 ms Total rows 200 Wall time: 6.02 ms Total rows 400 Wall time: 4 ms Total rows 800 Wall time: 6 ms Total rows 1600 Wall time: 7 ms Total rows 3200 Wall time: 11 ms Total rows 6400 Wall time: 18 ms Total rows 12800 Wall time: 34 ms Total rows 25600 Wall time: 73.1 ms
# What the dataframe actually ends up looking like after the leg columns
# have been filled in (first five rows).
df.head()
MultiCityItinerary | Leg1 | Leg2 | Leg3 | Leg4 | Leg5 | Leg6 | |
---|---|---|---|---|---|---|---|
0 | MEL, GUM, GUM, SEL, SEL, MEL | MEL, GUM | GUM, SEL | SEL, MEL | |||
1 | MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, S... | MEL, SHA | SHA, TPE | TPE, SHA | SHA, DEL | DEL, SHA | SHA, MEL |
2 | MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MEL | MEL, SHA | SHA, TPE | TPE, SHA | SHA, DEL | DEL, MEL | |
3 | ADL, LON, LON, LOS, LOS, ADL | ADL, LON | LON, LOS | LOS, ADL | |||
4 | MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MEL | MEL, CAN | CAN, TPE | TPE, SHA | SHA, DEL | DEL, MEL |