In [1]:
import pandas as pd
from io import StringIO
import timeit
#This is a series of flight itinerary information with leg-by-leg city pairs.
#Each leg is a fixed-width 8-character "XXX, YYY" pair, legs joined by ", ",
#so an n-leg itinerary string is exactly n*10 - 2 characters long.
#For example the first line is MEL to GUM then GUM to SEL then SEL to MEL.
#That is Melbourne to Guam then Guam to Seoul then Seoul to Melbourne.
base_data = '''MultiCityItinerary
MEL, GUM, GUM, SEL, SEL, MEL
MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, SHA, SHA, MEL
MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MEL
ADL, LON, LON, LOS, LOS, ADL
MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MEL
WEH, WEH, SZX, CAN, SGN, SGN
BNE, NYC, LAX, SAT, SAT, BNE
BNE, DXB, DXB, DUB, DUB, DXB, DXB, BNE
SYD, LAX, LAX, SFO, SFO, STL, STL, SYD
PER, MEL, MEL, OOL, OOL, PER
MEL, LON, LON, HKG, HKG, MEL
MEL, HKG, HKG, LON, LON, HKG, HKG, MEL
DRW, DRW, AKL, AKL, LON
ROM, DXB, DXB, SYD, SYD, DXB, DXB, ROM
SYD, HNL, HNL, NYC, NYC, HNL, HNL, SYD
PER, BKK, BKK, BNE, BNE, PER
BNE, LAX, LAX, ATL, ATL, OAX, OAX, LAX, LAX, BNE
AKL, NYC, NYC, LAS, LAS, SFO, SFO, AKL
SYD, HKG, HKG, SEL, SEL, HKG, HKG, SYD
AKL, NYC, NYC, SJO, SJO, AKL
ZQN, SYD, SYD, TYO, TYO, ZQN
RAR, LAX, LAX, YTO, NYC, LAS, LAS, LAX, LAX, RAR
PER, MEL, MEL, OOL, OOL, PER
SYD, DAD, HAN, SYD
ADL, BOD, BOD, BER, BER, SIN, SIN, ADL'''

As you will see below, for the 25600-row case the naive version takes 14.7 s (i.e. 14,700 ms) while the vectorized version takes 73.1 ms — approximately 200 times faster. So although both are O(n), the coefficients are drastically different. It therefore makes far more sense to mutate the data (by padding with spaces) so that it can be processed more efficiently, rather than going through it row by row.

In [2]:
df = "" #Global dataframe we'll be reusing to test on; rebuilt by create_df() before each timing run
runs = 10 #Number of doubling iterations — controls how long you wait to see how different the coefficient is; wouldn't recommend going higher than 15
maxLegs = 6 #Maximum number of legs an itinerary can have (determines how many LegN columns are created)
# Creating the dataframe with the columns for each leg that we want to fill
# Creating the dataframe with the columns for each leg that we want to fill
def create_df(data=None, n_legs=None):
    """Build the test dataframe: one 'MultiCityItinerary' column plus empty LegN columns.

    Parameters (both optional, defaulting to the notebook globals for
    backward compatibility with the original zero-argument call sites):
        data   -- raw CSV text with a 'MultiCityItinerary' header; defaults to global raw_data
        n_legs -- number of empty LegN columns to add; defaults to global maxLegs
    Returns the new DataFrame.
    """
    if data is None:
        data = raw_data
    if n_legs is None:
        n_legs = maxLegs  # was a hard-coded 6; now consistent with smarter_row()
    df = pd.read_csv(StringIO(data), sep="|") #nonsense pipe separator as I want the source data to be in a single column
    print('Total rows '+str(len(df)))
    #adding in the extra leg columns, initially empty strings
    for x in range(n_legs):
        col = ('Leg'+str(x+1))
        df[col] = ''
    return df
In [3]:
#Add leg data in, one cell at a time. WAY slow — the O(n) baseline with a huge constant.
def naive_row():
    """Split each itinerary string into LegN columns via per-cell .loc writes.

    Relies on the fixed-width layout: every leg is 8 chars ("XXX, YYY")
    followed by a 2-char ", " join, so an n-leg string is n*10 - 2 long.
    Mutates the global df in place.
    """
    for row_idx, itinerary in enumerate(df['MultiCityItinerary']):
        n_legs = (len(itinerary) + 2) // 10 #the number of legs in this string
        for leg_idx in range(n_legs):
            start = leg_idx * 10
            #place each individual city pair into its dedicated leg column
            df.loc[row_idx, 'Leg' + str(leg_idx + 1)] = itinerary[start:start + 8]

#reset the data and start our profiling run for the naive version
raw_data = base_data
for x in range(runs):
    raw_data = raw_data + raw_data #doubling the size of the data we're processing on each run (geometric growth: 50, 100, 200, ... rows)
    df = create_df()
    %time naive_row()
Total rows 50
Wall time: 35 ms
Total rows 100
Wall time: 60 ms
Total rows 200
Wall time: 130 ms
Total rows 400
Wall time: 240 ms
Total rows 800
Wall time: 468 ms
Total rows 1600
Wall time: 952 ms
Total rows 3200
Wall time: 1.89 s
Total rows 6400
Wall time: 3.67 s
Total rows 12800
Wall time: 7.36 s
Total rows 25600
Wall time: 14.7 s
In [4]:
def smarter_row():
    """Vectorized version of naive_row: same LegN output, ~200x faster.

    Can't just filter row-by-row because of the chained-indexing issue
    (view vs. copy). Instead, right-pad every itinerary string to the
    maximum possible width so all rows share one fixed layout, then slice
    each leg column out in one vectorized .str operation. Mutates the
    global df in place and strips the padding afterwards.
    """
    #longest possible string: maxLegs legs of 8 chars joined by ", " = maxLegs*10 - 2
    padded_width = maxLegs * 10 - 2
    df['MultiCityItinerary'] = df['MultiCityItinerary'].str.ljust(padded_width)

    #every row now has identical layout, so one slice fills each leg column
    for leg_no in range(1, maxLegs + 1):
        start = (leg_no - 1) * 10
        df['Leg' + str(leg_no)] = df['MultiCityItinerary'].str[start:start + 8]

    df['MultiCityItinerary'] = df['MultiCityItinerary'].str.strip() #return it to normal


#reset the data and start our profiling run for the vectorized version (same doubling schedule as the naive run, for a fair comparison)
raw_data = base_data   
for x in range(runs):
    raw_data = raw_data + raw_data
    df = create_df()
    %time smarter_row()
Total rows 50
Wall time: 4.02 ms
Total rows 100
Wall time: 3 ms
Total rows 200
Wall time: 6.02 ms
Total rows 400
Wall time: 4 ms
Total rows 800
Wall time: 6 ms
Total rows 1600
Wall time: 7 ms
Total rows 3200
Wall time: 11 ms
Total rows 6400
Wall time: 18 ms
Total rows 12800
Wall time: 34 ms
Total rows 25600
Wall time: 73.1 ms
In [5]:
#What the dataframe actually ends up looking like: one LegN column per city pair,
#with blank Leg columns where an itinerary has fewer than the maximum number of legs
df.head()
Out[5]:
MultiCityItinerary Leg1 Leg2 Leg3 Leg4 Leg5 Leg6
0 MEL, GUM, GUM, SEL, SEL, MEL MEL, GUM GUM, SEL SEL, MEL
1 MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, S... MEL, SHA SHA, TPE TPE, SHA SHA, DEL DEL, SHA SHA, MEL
2 MEL, SHA, SHA, TPE, TPE, SHA, SHA, DEL, DEL, MEL MEL, SHA SHA, TPE TPE, SHA SHA, DEL DEL, MEL
3 ADL, LON, LON, LOS, LOS, ADL ADL, LON LON, LOS LOS, ADL
4 MEL, CAN, CAN, TPE, TPE, SHA, SHA, DEL, DEL, MEL MEL, CAN CAN, TPE TPE, SHA SHA, DEL DEL, MEL