Example Race Charts

Quick sketches using the race data from https://prize.tatacommunications.com/home

The aim of the competition (I think) is to work with live timing feeds, but I'm more interested in being able to do analyses across the history of the session, which means building up time series and getting them represented usefully somehow.

The top half of this notebook is a bit all over the place... My improved second thoughts are in the "Starting Over..." section.

In [ ]:
#Grab the data as parsed XML elements in a list

race='data/F1 Race.txt'

from lxml import etree

#Each non-blank line of the feed file is parsed as one standalone XML element.
#The with-block closes the file handle as soon as the feed has been read.
with open(race, 'r') as feed:
    pl=[etree.fromstring(el) for el in feed if el.strip()]
In [ ]:
#Column mappings for the timing screen:
#feed column number (as a string) -> name of the field shown in that column
raceMap = {
    "1": "classpos",
    "2": "racingNumber",
    "3": "name",
    "4": "gap",
    "5": "interval",
    "6": "laptime",
    "7": "sector1",
    "9": "sector2",
    "11": "sector3",
    "12": "pitlap",
    "13": "pitcount",
}

#Reverse lookup: field name -> feed column number
raceUnMap = dict((field, column) for column, field in raceMap.items())
raceUnMap
In [ ]:
#Make use of row semantics (row is classification position)
#Generate a classification position/time datastructure for each driverNumber

def parse_race_pos_all(r,pos,c):
    '''Add a timing-screen cell update for column c to a history dataframe.

    r is data element; pos is dataframe; c is column (a key of raceMap).

    The element is used only if its identifier is '101', its first child has
    no 'sessionstate' attribute and that child's 'column' attribute equals c.
    In that case one row is appended to pos, indexed by the element timestamp
    as a datetime.time, holding the cell value (under the mapped column name),
    its colour (under '<name>_colour') and the screen row (under 'pos').

    Returns pos, either extended by one row or unchanged.
    '''
    #Imported here so the function does not depend on some other cell having
    #brought the datetime module into the notebook namespace.
    import datetime

    if r.attrib['identifier']!='101':
        return pos
    cell=r[0]
    if 'sessionstate' in cell.attrib:
        return pos
    cn=raceMap[c]
    if cell.attrib['column']!=c:
        return pos

    #Timestamps look like '14:34:21.668': hours, minutes, seconds, milliseconds
    tt=r.attrib['timestamp'].replace('.',':').split(':')
    #datetime.time takes microseconds, hence the factor of 1000
    ttx=datetime.time(int(tt[0]),int(tt[1]),int(tt[2]),int(tt[3])*1000)
    update=pd.DataFrame( {
        cn:cell.attrib['value'],
        cn+"_colour":cell.attrib['colour'],
        "pos":cell.attrib['row']
    },index=[ttx])
    return pd.concat([ pos,update ])

#This is a bit quirkier - driverNumber over time for a particular classification number
def parse_race_pos_num(r,pos,c,p):
    '''Add a cell update for column c on screen row p to a history dataframe.

    r is data element; pos is dataframe; c is column (a key of raceMap);
    p is position (screen row, as a string) we're interested in.

    The element is used only if its identifier is '101', its first child has
    no 'sessionstate' attribute and that child's 'column' and 'row' attributes
    equal c and p. In that case one row is appended to pos, indexed by the
    element timestamp as a datetime.time, holding the cell value (under the
    mapped column name) and its colour (under '<name>_colour').

    Returns pos, either extended by one row or unchanged.
    '''
    #Imported here so the function does not depend on some other cell having
    #brought the datetime module into the notebook namespace.
    import datetime

    if r.attrib['identifier']!='101':
        return pos
    cell=r[0]
    if 'sessionstate' in cell.attrib:
        return pos
    cn=raceMap[c]
    if cell.attrib['column']!=c or cell.attrib['row']!=p:
        return pos

    #Timestamps look like '14:34:21.668': hours, minutes, seconds, milliseconds
    tt=r.attrib['timestamp'].replace('.',':').split(':')
    #datetime.time takes microseconds, hence the factor of 1000
    ttx=datetime.time(int(tt[0]),int(tt[1]),int(tt[2]),int(tt[3])*1000)
    update=pd.DataFrame( {cn:cell.attrib['value'], cn+"_colour":cell.attrib['colour']},index=[ttx])
    return pd.concat([ pos,update ])
In [ ]:
import pandas as pd

#Build the racing-number history: one row per update of the racingNumber column
pnum=pd.DataFrame()
for element in pl:
    pnum=parse_race_pos_all(element,pnum,raceUnMap['racingNumber'])

#Expose the timestamp index as an ordinary column so it can be plotted
pnum['time']=pnum.index
#Screen rows arrive as strings; make them numeric
pnum['pos']=pnum['pos'].astype(int)
#Drop updates that blank the racing number cell
pnum=pnum[pnum['racingNumber']!='']
pnum[:3]
In [ ]:
from ggplot import *
#Classification position against time, one coloured line per racing number;
#only rows with pos below 23 are plotted
ggplot(pnum[pnum.pos<23],aes(x='time',y='pos',colour='racingNumber'))+geom_line()+ylim(0,23)

When it comes to working out laptimes, we maybe need to do some grouping. The pandas group by operation will let us group on a particular value, which would be one way. Or maybe we can chuck data into MongoDB and just set on a compound time/row key?

Let's simplify the data stream to 101s as dicts and generate a unique key.

In [ ]:
#Flatten each timing-screen (identifier 101) cell update into a dict keyed on
#the mapped column name, with an id built from the row and the timestamp
plj=[]
with open(race, 'r') as feed:
    for el in feed:
        if not el.strip():
            #Nothing to parse on a blank line
            continue
        r=etree.fromstring(el)
        #Merge the element attributes with those of its first child
        d=dict(r.attrib)
        d.update(r[0].attrib)
        #raceMap holds exactly the columns of interest
        if d['identifier']=='101' and 'row' in d and d['column'] in raceMap:
            cn=raceMap[d['column']]
            did= '_'.join([ d['row'],d['timestamp'].replace(':','_').replace('.','_') ])
            dd={cn:d['value'], cn+"_colour":d['colour'],'id':did,'row': d['row'],'strtime':d['timestamp']}
            plj.append(dd)
plj[:3]
In [ ]:
import pymongo
from pymongo import MongoClient
import json
In [ ]:
#Connect to the MongoDB server
conn = MongoClient('localhost', 27017)
In [ ]:
#Create a new database
db = conn.tataracedb
In [ ]:
db.drop_collection('test1')
In [ ]:
#Create a new collection
collection = db.test1
collection
In [ ]:
#Upsert each flattened record under its row/timestamp id; records sharing an
#id have their fields merged into one document by $set
#NOTE(review): Collection.update() is deprecated from PyMongo 3.0 and removed
#in 4.0; update_one(filter, update, upsert=True) is the replacement - confirm
#against the installed PyMongo version
for i in plj:
    collection.update({'_id':i['id']},{"$set":i},upsert=True)
In [ ]:
collection.find_one()
In [ ]:
results=collection.find({'$and': [ { 'racingNumber': '3' }, { 'gap': { '$exists': True } } ]})
In [ ]:
results.count()
In [ ]:
df=pd.DataFrame(list(results))
df[:3]
In [ ]:
def timeify(x):
    '''Convert a feed timestamp string into a datetime.time.

    x holds hours, minutes, seconds and milliseconds separated by ':', '.'
    or '_', e.g. '14:34:21.668' or '14_34_21_668'.

    Returns the matching datetime.time, with the milliseconds scaled to the
    microseconds that datetime.time expects.
    '''
    #Imported here so the function does not depend on some other cell having
    #brought the datetime module into the notebook namespace.
    import datetime

    tt=x.replace(':','_').replace('.','_').split('_')
    return datetime.time(int(tt[0]),int(tt[1]),int(tt[2]),int(tt[3])*1000)

df['time']=df['strtime'].apply(lambda x: timeify(x))
ggplot(df,aes(x='time',y='gap'))+geom_line()
In [ ]:
#Gap history for screen rows 2 to 5
results=collection.find({'$and': [ 
            #{ 'racingNumber': { '$exists': True } },
            { 'gap': { '$exists': True } },
            { '$or': [ {'row':'2'}, {'row':'3'},{'row':'4'},{'row':'5'}]}
        ]})
df=pd.DataFrame(list(results))
df['time']=df['strtime'].apply(timeify)
#Gap values are stored as strings: coerce them to numbers, with anything
#non-numeric becoming NaN (pd.to_numeric replaces Series.convert_objects,
#which has been removed from pandas)
df['gap']=pd.to_numeric(df['gap'],errors='coerce')
#df.dropna(subset=['gap','racingNumber'],inplace=True)
ggplot(df,aes(x='time',y='gap',colour='row'))+geom_line()
In [ ]:
df.gap.unique()

Grouping changes to a row that happen close in time to each other

What happens if we just look at a particular row? If changes to a row happen close in time (say, within half a second), can we group this as an atomic change to a row?

Let's grab the records for a particular row:

In [ ]:
#results=collection.find({'$and': [{'row': '3'}, {'racingNumber': '3'}] },{'row':1,'strtime':1} )
results=collection.find({'row': '3' },{'row':1,'strtime':1} )
results.count()
In [ ]:
df=pd.DataFrame(list(results))
df[:3]
In [ ]:
def itimeify(x):
    '''Convert a timestamp string into seconds since midnight.

    x holds hours, minutes, seconds and milliseconds separated by '_', ':'
    or '.', e.g. '14_34_21_668' or '14:34:21.668'.

    Returns the time of day in seconds as a float.
    '''
    tt=x.replace(':','_').replace('.','_').split('_')
    milliseconds=int(tt[0])*3600000+int(tt[1])*60000+int(tt[2])*1000+int(tt[3])
    #Divide by a float so the millisecond part survives integer division
    #under Python 2 as well
    return milliseconds/1000.0

#The query for this row only returns _id, row and strtime, so both time
#columns are derived from strtime (normalised to '_' separators for itimeify)
df['itime']=df['strtime'].apply(lambda x: itimeify(x.replace(':','_').replace('.','_')))
df['time']=df['strtime'].apply(timeify)

df[:3]
In [ ]:
ggplot(df[11:-5], aes(x='itime',y=1))+geom_point()
In [ ]:
#Is there anything we can do based around events on the same row that are within a short time of each other?

#If there is a position change, details will refer to what in the row? Just the new driver?

#Let's try to group close in time items...
#Based on http://stackoverflow.com/a/10017017/454773
d=list(df['itime'][11:-5])

#Differences within consecutive non-overlapping pairs (d[1]-d[0], d[3]-d[2], ...);
#only needed by the commented-out average-based threshold
diff = [y - x for x, y in zip(*[iter(d)] * 2)]
#avg = sum(diff) / len(diff)


def group_close_times(times,threshold=0.5):
    '''Split an ordered list of times into runs of near-simultaneous values.

    A value joins the current group when it lies less than threshold after
    the FIRST value of that group; otherwise it starts a new group.
    Returns a list of lists; an empty input gives an empty list.
    '''
    groups=[]
    for t in times:
        if groups and t - groups[-1][0] < threshold:
            groups[-1].append(t)
        else:
            groups.append([t])
    return groups


m = group_close_times(d) #0.5 is the default threshold

print(m)

Now what do we do??? Make a common index for grouped elements and add them to a new MongoDB collection?

Taking the wrong approach - if a row updates, it does so in column order? - so we can use this as a crib...?

It doesn't... eg col 13 may update before col 2 within a given row...

How about we start thinking about:

  • useful dynamic, live state tracker objects
  • history objects?

Starting Over...

In [1]:
from numpy import nan as NA

race='data/F1 Race.txt'

from lxml import etree
In [ ]:
#not used
class Driver:
    '''Latest, previous and historical sector times for one driver.

    sectors holds the most recent time for each of the three sectors,
    prev_sectors the time that preceded it, and history_sectors every time
    recorded for each sector, in order of arrival.
    '''

    def __init__(self, name,driverNum):
        self.name = name
        self.driverNum = driverNum
        #Created per instance so that drivers never share their sector lists
        self.sectors=[NA,NA,NA]
        self.prev_sectors=[NA,NA,NA]
        self.history_sectors=[[],[],[]]

    def update_sector(self,sectorNum,sectorTime):
        '''Record sectorTime as the newest time for sector sectorNum (1-based).'''
        idx=sectorNum-1
        #Keep the value being replaced before overwriting it
        self.prev_sectors[idx]=self.sectors[idx]
        self.sectors[idx]=sectorTime
        self.history_sectors[idx].append(sectorTime)
        
        
ham=Driver("hamilton","44")
ham.name
In [ ]:
ham.update_sector(1,33.12)
print(ham.sectors,ham.prev_sectors,ham.history_sectors)
In [ ]:
#not used
class Classification:
    '''Placeholder for classification data; marked as not used in this notebook.'''
    def __init__(self):
        #Starts out as an empty dict
        self.rank={}

ranks=Classification()
ranks
In [2]:
class Pits:
    '''Assemble pit stop records from individual timing-screen cell updates.

    A record is built up per driver number from three kinds of update and is
    committed once all of them have been seen (see updateRacePits).
    '''
    
    def __init__(self):
        #Pit record currently being assembled, keyed by driver number
        self._items={}
        #Committed pit records, in commit order
        self.history=[]
        #Raw cell updates that contributed to a pit record
        self._logger=[]
        #self._driverNum=NA
        #self._lap=NA
        #self._strtime=None
        #self._count=NA
        
    def updateRacePits(self,raceObj,data):
        '''Fold one cell update into the pit record of the driver on that row.

        raceObj supplies getDriverNumFromPos(), the current lap and the
        drivers lookup. data is a cell-update dict read for its 'column',
        'value', 'colour' and 'strtime' keys (the row is used by the
        position lookup).

        A record gains 'driverNum' and 'strtime' from a RED column '2'
        update, 'status' from an 'IN PIT' value in column '6', and 'lap' and
        'count' from a non-empty column '13' value. Once it holds six keys
        (those five plus 'out') and 'out' is True, a copy goes to
        self.history and to the driver's own pit list, and the pending record
        is reset with 'out' False. An 'OUT' value in column '6' resets the
        pending record with 'out' True, which re-arms the commit.
        '''
        #Need 2, 6 and 13 then commit
        #This doesn't work if there is a position change while someone is IN PIT?
        #SO: car has to go OUT before it can come back in?
        #Also need to catch if car IN PIT then goes to RETIRED
        driverNum=raceObj.getDriverNumFromPos(data)
        #First relevant update for this driver: start an armed, empty record
        if data['column'] in ['2','6','13']:
            if driverNum not in self._items:
                self._items[driverNum]={'out':True}
        if data['column']=='2' and data['colour']=='RED':
            self._logger.append(data)
            #Stored as '<number from the position lookup>_<number shown in the cell>'
            self._items[driverNum]['driverNum']=str(driverNum)+'_'+data['value']
            self._items[driverNum]['strtime']=data['strtime']
            #Note: driverNum should == int(data['value'])
        elif data['column']=='6':
            if data['value']=='IN PIT':
                self._logger.append(data)
                self._items[driverNum]['status']=data['value']
            elif data['value']=='OUT':
                #Discard anything gathered so far and re-arm the record
                self._items[driverNum]={'out':True}
        elif data['column']=='13' and data['value']!='':
            self._logger.append(data)
            self._items[driverNum]['lap']=raceObj.lap
            self._items[driverNum]['count']=int(data['value'])
        #This lookup raises KeyError for a driver with no pending record, which
        #can only happen when data['column'] is not one of '2', '6' or '13'
        if len ( self._items[driverNum].keys()) ==6 and self._items[driverNum]['out'] :
            #print('dd')
            self.history.append(self._items[driverNum].copy())
            raceObj.drivers[driverNum].appendPit(self._items[driverNum].copy())
            #Disarmed until an 'OUT' update is seen for this driver
            self._items[driverNum]={'out':False}
        
In [3]:
class Purples:
    '''Collect lap records from the cell updates routed to updateLapPurples.

    A record is filled in cell by cell (driver number, name, lap time) and
    is snapshotted into self.lap and self.history when the lap time arrives.
    '''

    def __init__(self):
        #Most recently completed record for the lap and for each sector
        self.lap={}
        self.sector1={}
        self.sector2={}
        self.sector3={}

        #Records under construction, all starting with every field blank
        blank=dict.fromkeys(('driverNum','name','laptime','classRank','lap','strtime'),'')
        self._lap=dict(blank)
        self._sector1=dict(blank)
        self._sector2=dict(blank)
        self._sector3=dict(blank)

        #Every completed lap record, in order of completion
        self.history=[]

    def updateLapPurples(self,raceObj,data):
        '''Fold one cell update into the lap record under construction.

        Column '2' supplies driver number, timestamp and screen row; column
        '3' the name; column '6' the lap time, which also stamps the record
        with raceObj.lap-1 and commits a copy to self.lap and self.history.
        Any other column is ignored.
        '''
        column=data['column']
        pending=self._lap

        if column=='6':
            pending['laptime']=data['value']
            pending['lap']=raceObj.lap-1
            #self.lap and the newest history entry are the same snapshot object
            self.lap=pending.copy()
            self.history.append(self.lap)
        elif column=='3':
            pending['name']=data['value']
        elif column=='2':
            pending['driverNum']=data['value']
            pending['strtime']=data['strtime']
            pending['classRank']=int(data['row'])

    def updateSectorPurples(self,raceObj,sector,data):
        '''Placeholder: does nothing.'''
        return None
  
In [4]:
class DriverHistory:
    '''Per-driver history: pit stops and completed lap records.

    Lap records are assembled from individual timing-screen cell updates
    (lap time, three sector times, gap and interval) and appended to
    self.lapdata once every field has a value.
    '''

    #Timing-screen column -> attribute holding the latest value of that field
    _COLUMN_ATTRS={
        '7':'_sector1time',
        '9':'_sector2time',
        '11':'_sector3time',
        '4':'_gap',
        '5':'_interval',
    }

    def __init__(self,driverNum=''):
        self.driverNum=driverNum
        #Pit records handed over through appendPit
        self.pits=[]

        #Fields of the lap record under construction; '' means "not seen yet"
        self._laptime=''
        self._timeofday=''
        self._sector1time=''
        self._sector2time=''
        self._sector3time=''
        self._gap=''
        self._interval=''
        #Completed lap records, oldest first
        self.lapdata=[]

    def setname(self,name):
        '''Set the driver's name.'''
        self.name=name

    def appendPit(self,data):
        '''Append one pit record to self.pits.'''
        self.pits.append(data)

    def trackLaptime(self,data,race=None):
        '''Fold one cell update into the lap record under construction.

        data is a cell-update dict with 'row', 'column', 'value' and
        'strtime' keys. race is the object whose .lap attribute gives the
        current lap; when omitted, the notebook-level raceObj is used, so
        callers that pass only data keep working.

        Any update on row '1' sets gap and interval to '0' before the column
        is applied. When all six fields are non-empty and race.lap exceeds
        len(self.lapdata)+1, a record for lap race.lap-1 is appended to
        self.lapdata and the six fields are cleared; otherwise the fields are
        kept and later updates overwrite them.
        '''
        if data['row']=='1':
            self._gap='0'
            self._interval='0'

        column=data['column']
        if column=='6':
            self._laptime=data['value']
            #Timestamp of the lap time update identifies the record
            self._timeofday=data['strtime']
        elif column in self._COLUMN_ATTRS:
            setattr(self,self._COLUMN_ATTRS[column],data['value'])

        pending=(self._laptime,self._sector1time,self._sector2time,
                 self._sector3time,self._gap,self._interval)
        if '' in pending:
            return

        #Resolved only now, so the global is not needed until a record is complete
        if race is None:
            race=raceObj

        if race.lap > (len(self.lapdata)+1):
        #if len(self.lapdata)==0 or (self._timeofday != self.lapdata[-1]['timeofday'] ):
            self.lapdata.append({
                    'timeofday':self._timeofday,
                    'pos':int(data['row']),
                    'lap':race.lap-1,
                    'laptime':self._laptime,
                    's1':self._sector1time,
                    's2':self._sector2time,
                    's3':self._sector3time,
                    'gap':self._gap,
                    'interval':self._interval})
            self._laptime=''
            self._sector1time=''
            self._sector2time=''
            self._sector3time=''
            self._gap=''
            self._interval=''
            
In [5]:
class Race:
    '''Running state of the race, built up from timing-screen cell updates.'''

    def __init__(self):
        #Lap counter; stays at -1 until trackLapCount sees a value
        self.lap=-1
        #Contains the driver number for the current classification position
        self.pos={}
        self.lapPurples=Purples()
        self.pits=Pits()
        #Driver number -> DriverHistory
        self.drivers={}
        #Driver name -> driver number
        self.driverNames={}
        self._namesSet=False

    def setDriverNumForPos(self,data):
        '''Record data's value as the driver number shown on data's row.'''
        row=data['row']
        self.pos[row]=data['value']

    def getDriverNumFromPos(self,data):
        '''Return the driver number last recorded for data's row.'''
        row=data['row']
        return self.pos[row]

    def setupDriverDetails(self,data):
        '''Register drivers and their names from column '2' and '3' updates.

        A non-blank column '2' value not seen before records the row's
        driver number and creates that driver's DriverHistory. A non-blank
        column '3' value names the driver currently on that row.
        '''
        column=data['column']
        if column not in ('2','3'):
            return
        value=data['value']
        if value.strip()=='':
            return

        if column=='2':
            if value not in self.drivers:
                self.setDriverNumForPos(data)
                self.drivers[value]=DriverHistory(value)
            return

        driverNum=self.getDriverNumFromPos(data)
        self.drivers[driverNum].setname(value)
        self.driverNames[value]=driverNum

    def trackDriverNumForPos(self,data):
        '''Keep the row -> driver number lookup in step with column '2' updates.'''
        if data['column']!='2':
            return
        self.setDriverNumForPos(data)

    def trackLapCount(self,data):
        '''Take the lap count from a non-empty row '1', column '5' value.'''
        if data['row']!='1' or data['column']!='5':
            return
        if data['value']!='':
            self.lap=int(data['value'])

    def trackPurples(self,raceObj,data):
        '''Pass PURPLE updates in columns '2', '3' and '6' to the lap purple tracker.'''
        if data['colour']!='PURPLE':
            return
        if data['column'] in ('2','3','6'):
            self.lapPurples.updateLapPurples(raceObj,data)

    def trackPits(self,raceObj,data):
        '''Pass updates in columns '2', '6' and '13' to the pit tracker.'''
        if data['column'] not in ('2','6','13'):
            return
        self.pits.updateRacePits(raceObj,data)

    def trackLaptimes(self,raceObj,data):
        '''Pass updates in columns '4', '5', '6', '7', '9' and '11' to the driver on that row.'''
        if data['column'] not in ('4','5','6','7','9','11'):
            return
        driverNum=self.getDriverNumFromPos(data)
        self.drivers[driverNum].trackLaptime(data)
 
In [6]:
class Weather:
    '''Weather readings grouped by the timestamp they arrived with.'''

    #Feed row -> name of the reading carried on that row
    _ROW_FIELDS={
        '1':'trackTemp',
        '2':'airTemp',
        '3':'rainfall',
        '4':'windSpeed',
        '5':'humidity',
        '6':'airpressure',
        '7':'windDir',
    }

    def __init__(self):
        #Timestamp string -> dict of the readings seen at that timestamp
        self.stamps={}

    def setWeather(self,data):
        '''Store data's value under the reading named by data's row.

        An entry for data's timestamp is created if needed, even when the
        row is not one of the known readings (it then stays as it was).
        '''
        readings=self.stamps.setdefault(data['strtime'],{})
        field=self._ROW_FIELDS.get(data['row'])
        if field is not None:
            readings[field]=data['value']
        
In [7]:
#plf collects timing-screen cell updates and session state messages (both
#identifier 101); wlf collects column 1 updates from identifier 103, which
#are later fed to Weather.setWeather
plf=[]
wlf=[]
#Timing-screen columns that are kept
timingColumns=('1','2','3','4','5','6','7','9','11','12','13')
with open(race, 'r') as feed:
    for el in feed:
        if not el.strip():
            #Nothing to parse on a blank line
            continue
        r=etree.fromstring(el)
        #Merge the element attributes with those of its first child
        d=dict(r.attrib)
        d.update(r[0].attrib)
        if d['identifier']=='101' and 'row' in d and d['column'] in timingColumns:
            dd={"value":d['value'], "colour":d['colour'],'row': d['row'],'column': d['column'],'strtime':d['timestamp']}
            #did= '_'.join([ d['row'],d['timestamp'].replace(':','_').replace('.','_') ])
            #dd={"value":d['value'], "colour":d['colour'],'id':did,'row': d['row'],'column': d['column'],'strtime':d['timestamp']}
            plf.append(dd)
        elif d['identifier']=='101' and 'sessionstate' in d:
            #Session state messages are kept whole
            plf.append(d)
        elif d['identifier']=='103' and d['column']=='1':
            dd={'strtime':d['timestamp'],'row': d['row'],"value":d['value']}
            wlf.append(dd)
In [8]:
plf[:3]
Out[8]:
[{'messagecount': '107488',
  'informationvalid': 'true',
  'identifier': '101',
  'timestamp': '14:34:21.668',
  'sessionstate': 'inactive'},
 {'column': '1',
  'value': '1',
  'colour': 'CYAN',
  'strtime': '14:34:21.669',
  'row': '1'},
 {'column': '1',
  'value': '2',
  'colour': 'CYAN',
  'strtime': '14:34:21.669',
  'row': '2'}]
In [9]:
def checkSessionState(data):
    '''Return data's 'sessionstate' value, or None when it has no such key.'''
    return data.get('sessionstate')
In [10]:
#Replay the feed: driver details are collected before the session starts,
#and lap count, positions, purples, pits and lap times are tracked while it
#is running (from "started" until the next "inactive")
raceObj=Race()
startedRunning=False
finishedRunning=False
#raceHistory=RaceHistory()
for x in plf:
    sessionstate=checkSessionState(x)
    if sessionstate is not None:
        #Session state message: report it and update the running flags only
        print(sessionstate)
        if sessionstate=="started":
            startedRunning=True
        elif startedRunning and sessionstate=="inactive":
            finishedRunning=True
        continue

    #Ordinary cell update
    if not startedRunning:
        raceObj.setupDriverDetails(x)
    elif not finishedRunning:
        raceObj.trackLapCount(x)
        raceObj.trackDriverNumForPos(x)
        raceObj.trackPurples(raceObj,x)
        raceObj.trackPits(raceObj,x)
        raceObj.trackLaptimes(raceObj,x)
#raceHistory.lapPurples.history
inactive
started
finished
inactive
inactive
In [11]:
weather=Weather()
for w in wlf:
    weather.setWeather(w)
In [15]:
weather.stamps['14:34:21.752']
Out[15]:
{'windDir': '129',
 'airpressure': '1002.3',
 'airTemp': '23',
 'trackTemp': '37',
 'windSpeed': '3.2',
 'rainfall': '0',
 'humidity': '45'}
In [17]:
raceObj.drivers['1'].lapdata[-3:]
Out[17]:
[{'interval': '25.2',
  'pos': 4,
  's2': '34.6',
  's3': '31.1',
  'laptime': '1:31.017',
  's1': '25.2',
  'gap': '73.8',
  'lap': 63,
  'timeofday': '16:17:43.612'},
 {'interval': '26.0',
  'pos': 4,
  's2': '34.1',
  's3': '31.3',
  'laptime': '1:30.801',
  's1': '25.3',
  'gap': '74.2',
  'lap': 64,
  'timeofday': '16:19:14.387'},
 {'interval': '27.6',
  'pos': 4,
  's2': '34.7',
  's3': '33.2',
  'laptime': '1:33.392',
  's1': '25.4',
  'gap': '76.7',
  'lap': 65,
  'timeofday': '16:20:47.843'}]
In [18]:
raceObj.drivers['1'].lapdata[:5]
Out[18]:
[{'interval': '0.4',
  'pos': 14,
  's2': '36.0',
  's3': '33.0',
  'laptime': '1:35.224',
  's1': '26.1',
  'gap': '12.4',
  'lap': 1,
  'timeofday': '14:41:46.378'},
 {'interval': '0.3',
  'pos': 14,
  's2': '35.9',
  's3': '32.9',
  'laptime': '1:34.674',
  's1': '25.7',
  'gap': '18.0',
  'lap': 3,
  'timeofday': '14:44:55.894'},
 {'interval': '0.3',
  'pos': 14,
  's2': '35.8',
  's3': '32.7',
  'laptime': '1:34.306',
  's1': '25.7',
  'gap': '20.3',
  'lap': 4,
  'timeofday': '14:46:30.222'},
 {'interval': '0.4',
  'pos': 14,
  's2': '35.7',
  's3': '32.7',
  'laptime': '1:34.449',
  's1': '25.8',
  'gap': '22.7',
  'lap': 5,
  'timeofday': '14:48:04.589'},
 {'interval': '0.6',
  'pos': 14,
  's2': '35.9',
  's3': '32.7',
  'laptime': '1:34.356',
  's1': '25.6',
  'gap': '24.9',
  'lap': 6,
  'timeofday': '14:49:39.101'}]
In [21]:
raceObj.lapPurples.history[:3]
Out[21]:
[{'classRank': 1,
  'lap': 1,
  'laptime': '1:32.010',
  'name': 'L. HAMILTON',
  'strtime': '14:41:34.205',
  'driverNum': '44'},
 {'classRank': 1,
  'lap': 3,
  'laptime': '1:31.913',
  'name': 'L. HAMILTON',
  'strtime': '14:44:38.478',
  'driverNum': '44'},
 {'classRank': 1,
  'lap': 8,
  'laptime': '1:31.876',
  'name': 'L. HAMILTON',
  'strtime': '14:52:18.197',
  'driverNum': '44'}]
In [22]:
raceObj.drivers['1'].pits
Out[22]:
[{'lap': 12,
  'status': 'IN PIT',
  'count': 1,
  'out': True,
  'strtime': '14:57:29.852',
  'driverNum': '1_1'},
 {'lap': 33,
  'status': 'IN PIT',
  'count': 2,
  'out': True,
  'strtime': '15:30:19.373',
  'driverNum': '1_1'},
 {'lap': 52,
  'status': 'IN PIT',
  'count': 3,
  'out': True,
  'strtime': '15:59:25.171',
  'driverNum': '1_1'}]
In [23]:
raceObj.drivers['1'].name
Out[23]:
'S. VETTEL'
In this example, row 17 starts to update high columns before the lower ones
In [26]:
raceObj.pits._logger[:3]
Out[26]:
[{'column': '2',
  'value': '44',
  'colour': 'RED',
  'strtime': '14:42:56.839',
  'row': '1'},
 {'column': '6',
  'value': 'IN PIT',
  'colour': 'YELLOW',
  'strtime': '14:42:56.839',
  'row': '1'},
 {'column': '13',
  'value': '1',
  'colour': 'YELLOW',
  'strtime': '14:57:29.847',
  'row': '13'}]
In [28]:
raceObj.pits.history[:3]
Out[28]:
[{'lap': 12,
  'status': 'IN PIT',
  'count': 1,
  'out': True,
  'strtime': '14:57:29.852',
  'driverNum': '1_1'},
 {'lap': 14,
  'status': 'IN PIT',
  'count': 1,
  'out': True,
  'strtime': '15:00:18.871',
  'driverNum': '3_3'},
 {'lap': 15,
  'status': 'IN PIT',
  'count': 1,
  'out': True,
  'strtime': '15:01:59.009',
  'driverNum': '8_8'}]
In [29]:
raceObj.pos
Out[29]:
{'2': '6',
 '1': '44',
 '4': '1',
 '8': '8',
 '16': '21',
 '6': '14',
 '18': '17',
 '19': '4',
 '12': '20',
 '10': '27',
 '14': '26',
 '20': '9',
 '21': '10',
 '9': '11',
 '3': '3',
 '17': '99',
 '15': '13',
 '7': '7',
 '5': '77',
 '13': '19',
 '11': '22',
 '22': '25'}

gource

In [ ]:
#Convert tab-separated log rows into pipe-delimited "time|name|A|path" lines
#(presumably the custom log format read by gource, going by the heading - confirm)
import csv
from time import *
#f=open('openurlgource.csv', 'rb')
 
#reader = csv.reader(f, delimiter='\t')
#NOTE(review): the 'wb' mode here and reader.next() below are Python 2 idioms;
#the output file handle is also never closed explicitly
writer = csv.writer(open('openurlgource.txt','wb'),delimiter='|')
#NOTE(review): `reader` is only defined in the commented-out lines above, so
#as written this line raises NameError
headerline = reader.next()
for row in reader:
    #Rows whose fifth field is blank are skipped
    if row[4].strip() !='':
        #row[0] and row[1] hold the date and time; convert them to epoch seconds
        t=int(mktime(strptime(row[0]+" "+row[1], "%Y-%m-%d %H:%M:%S")))
        #Fifth field becomes a '/'-separated path: trailing ':' dropped, other ':' replaced
        writer.writerow([t,row[3],'A',row[4].rstrip(':').replace(':','/')])