Quick sketches using the race data from https://prize.tatacommunications.com/home
The aim of the competition (I think) is to work with live timing feeds, but I'm more interested in being able to do analyses across the history of the session, which means building up time series and getting them represented usefully somehow.
The top half of this notebook is a bit all over the place... My improved second thoughts are in the "Starting Over" section.
#Grab the data as parsed XML elements in a list
race='data/F1 Race.txt'
from lxml import etree
pl=[]
#Read through a context manager so the file handle is closed once the feed has been consumed
with open(race, 'r') as feed:
    for el in feed:
        #Every line of the feed file is parsed on its own as an XML fragment
        pl.append(etree.fromstring(el))
#Column mappings for the timing screen
#Keys are the column identifiers used by the feed (as strings), values the field names used in this notebook
raceMap=dict([
    ('1','classpos'),
    ('2','racingNumber'),
    ('3','name'),
    ('4','gap'),
    ('5','interval'),
    ('6','laptime'),
    ('7','sector1'),
    ('9','sector2'),
    ('11','sector3'),
    ('12','pitlap'),
    ('13','pitcount'),
])
#Reverse lookup: field name -> column identifier
raceUnMap = dict((fieldName, columnId) for columnId, fieldName in raceMap.items())
raceUnMap
#Make use of row semantics (row is classification position)
#Generate a classification position/time datastructure for each driverNumber
def parse_race_pos_all(r,pos,c):
    '''Append one timing-screen cell update to a time-indexed dataframe.

    r is data element; pos is dataframe; c is column (a key of raceMap).

    An element contributes a row only when its identifier is '101', its first child has no
    'sessionstate' attribute and that child's 'column' attribute equals c. The new row holds
    the cell value, its colour and the row (classification position) it was shown on, indexed
    by the element timestamp as a datetime.time.

    Returns pos, extended by one row on a match and unchanged otherwise.
    '''
    #Imported locally because nothing at notebook level imports datetime before this is called
    import datetime
    if r.attrib['identifier']=='101' and 'sessionstate' not in r[0].attrib:
        cn=raceMap[c]
        cell=r[0].attrib
        if cell['column']==c:
            #Timestamps look like HH:MM:SS.mmm - split them into four integer fields
            tt=r.attrib['timestamp'].replace('.',':').split(':')
            #The last field is in milliseconds; datetime.time expects microseconds
            ttx=datetime.time(int(tt[0]),int(tt[1]),int(tt[2]),int(tt[3])*1000)
            update=pd.DataFrame( {
                cn:cell['value'],
                cn+"_colour":cell['colour'],
                "pos":cell['row']
            },index=[ttx])
            pos=pd.concat([ pos,update ])
    return pos
#This is a bit quirkier - driverNumber over time for a particular classification number
def parse_race_pos_num(r,pos,c,p):
    '''Append one cell update for a single classification row to a time-indexed dataframe.

    r is data element; pos is dataframe; c is column (a key of raceMap); p is position we're
    interested in (compared against the cell's 'row' attribute, so it is a string).

    An element contributes a row only when its identifier is '101', its first child has no
    'sessionstate' attribute and that child's 'column' and 'row' attributes equal c and p.
    The new row holds the cell value and its colour, indexed by the element timestamp as a
    datetime.time.

    Returns pos, extended by one row on a match and unchanged otherwise.
    '''
    #Imported locally because nothing at notebook level imports datetime before this is called
    import datetime
    if r.attrib['identifier']=='101' and 'sessionstate' not in r[0].attrib:
        cn=raceMap[c]
        cell=r[0].attrib
        if cell['column'] ==c and cell['row'] ==p:
            #Timestamps look like HH:MM:SS.mmm - split them into four integer fields
            tt=r.attrib['timestamp'].replace('.',':').split(':')
            #The last field is in milliseconds; datetime.time expects microseconds
            ttx=datetime.time(int(tt[0]),int(tt[1]),int(tt[2]),int(tt[3])*1000)
            update=pd.DataFrame( {cn:cell['value'], cn+"_colour":cell['colour']},index=[ttx])
            pos=pd.concat([ pos,update ])
    return pos
import pandas as pd
#Build a dataframe of racingNumber cell updates, one row per update, indexed by time of day
pnum=pd.DataFrame()
for l in pl:
    pnum=parse_race_pos_all(l,pnum,raceUnMap['racingNumber'])
#Copy the index into a column so it can be used as a plotting aesthetic
pnum['time'] = pnum.index
#The row attribute is read from the XML as text; cast it so it can be compared numerically
pnum['pos']=pnum['pos'].astype(int)
#Drop updates whose racing number is the empty string
pnum=pnum[pnum.racingNumber!='']
pnum[:3]
from ggplot import *
#Classification row against time, one line per racing number, for rows below 23
ggplot(pnum[pnum.pos<23],aes(x='time',y='pos',colour='racingNumber'))+geom_line()+ylim(0,23)
When it comes to working out laptimes, we maybe need to do some grouping. The pandas group by operation will let us group on a particular value, which would be one way. Or maybe we can chuck data into MongoDB and just set on a compound time/row key?
Let's simplify the data stream to 101s as dicts and generate a unique key.
plj=[]
#Read through a context manager so the file handle is closed once the feed has been consumed
with open(race, 'r') as feed:
    for el in feed:
        r=etree.fromstring(el)
        #Flatten the element's own attributes and those of its first child into one dict
        #(child attributes win if a name appears in both)
        d=dict(r.attrib)
        d.update(dict(r[0].attrib))
        #Keep only timing-screen cells (identifier 101, with a row) in a column we have a name for;
        #the raceMap keys are exactly the columns of interest
        if d['identifier']=='101' and 'row' in d and d['column'] in raceMap:
            cn=raceMap[d['column']]
            #Unique-ish key: the row plus the timestamp with its separators replaced by underscores
            did= '_'.join([ d['row'],d['timestamp'].replace(':','_').replace('.','_') ])
            dd={cn:d['value'], cn+"_colour":d['colour'],'id':did,'row': d['row'],'strtime':d['timestamp']}
            plj.append(dd)
plj[:3]
import pymongo
from pymongo import MongoClient
import json
#Connect to the MongoDB server
conn = MongoClient('localhost', 27017)
#Create a new database
db = conn.tataracedb
#Drop any earlier copy of the collection so each run starts from empty
db.drop_collection('test1')
#Create a new collection
collection = db.test1
collection
#Upsert every flattened cell under its row/timestamp id; cells sharing an id are $set onto the same document
for i in plj:
    collection.update({'_id':i['id']},{"$set":i},upsert=True)
collection.find_one()
#Documents that have racingNumber '3' and also carry a gap field
results=collection.find({'$and': [ { 'racingNumber': '3' }, { 'gap': { '$exists': True } } ]})
results.count()
#Materialise the cursor into a dataframe
df=pd.DataFrame(list(results))
df[:3]
def timeify(x):
    '''Convert a timestamp string into a datetime.time.

    x holds hour, minute, second and millisecond fields separated by ':', '.' or '_'
    (for example '14:34:21.668'). The millisecond field is scaled to the microseconds
    that datetime.time expects.
    '''
    #Imported locally because nothing at notebook level imports datetime
    import datetime
    tt=x.replace(':','_').replace('.','_').split('_')
    return datetime.time(int(tt[0]),int(tt[1]),int(tt[2]),int(tt[3])*1000)
#Add a datetime.time column derived from the stored timestamp string, then plot gap against it
df['time']=df['strtime'].apply(lambda x: timeify(x))
ggplot(df,aes(x='time',y='gap'))+geom_line()
#Same idea by classification row: documents that carry a gap and sit on rows 2 to 5
results=collection.find({'$and': [
#{ 'racingNumber': { '$exists': True } },
{ 'gap': { '$exists': True } },
{ '$or': [ {'row':'2'}, {'row':'3'},{'row':'4'},{'row':'5'}]}
]})
df=pd.DataFrame(list(results))
df['time']=df['strtime'].apply(lambda x: timeify(x))
#Coerce the gap strings to numbers
#NOTE(review): convert_objects is deprecated in later pandas releases (pd.to_numeric is the replacement) - confirm against the installed version
df.gap=df.gap.convert_objects(convert_numeric=True)
#df.dropna(subset=['gap','racingNumber'],inplace=True)
#One line per row, coloured by row
ggplot(df,aes(x='time',y='gap',colour='row'))+geom_line()
#Eyeball the distinct gap values that came back
df.gap.unique()
What happens if we just look at a particular row? If changes to a row happen close together in time (say, within half a second), can we group them as an atomic change to that row?
Let's grab the records for a particular row:
#results=collection.find({'$and': [{'row': '3'}, {'racingNumber': '3'}] },{'row':1,'strtime':1} )
#Every document for row 3, projected down to the row and timestamp fields (plus _id, which is not excluded)
results=collection.find({'row': '3' },{'row':1,'strtime':1} )
results.count()
#Materialise the cursor into a dataframe
df=pd.DataFrame(list(results))
df[:3]
def itimeify(x):
    '''Convert a timestamp string into seconds since midnight, as a float.

    x holds hour, minute, second and millisecond fields separated by '_', ':' or '.'
    (for example '14_34_21_668' or '14:34:21.668'), the same forms timeify accepts.
    '''
    tt=x.replace(':','_').replace('.','_').split('_')
    millis=int(tt[0])*3600000+int(tt[1])*60000+int(tt[2])*1000+int(tt[3])
    #Divide by a float so the millisecond part is kept even where / is integer division
    return millis/1000.0
#The row query only projects 'row' and 'strtime', so there is no 'time' column to read from:
#derive both time columns from 'strtime' instead
#(separators are turned into '_' first, which is the form itimeify splits on)
df['itime']=df['strtime'].apply(lambda x: itimeify(x.replace(':','_').replace('.','_')))
df['time']=df['strtime'].apply(lambda x: timeify(x))
df[:3]
#Strip plot of update times, skipping the first 11 and last 5 records
ggplot(df[11:-5], aes(x='itime',y=1))+geom_point()
#Is there anything we can do based around events on the same row that are within a short time of each other?
#If there is a position change, details will refer to what in the row? Just the new driver?
#Let's try to group close in time items...
#Based on http://stackoverflow.com/a/10017017/454773
#Update times in seconds, skipping the first 11 and last 5 records
d=list(df['itime'][11:-5])
#Differences within non-overlapping consecutive pairs (d[1]-d[0], d[3]-d[2], ...);
#only the commented-out average below would have used them
diff = [y - x for x, y in zip(*[iter(d)] * 2)]
#avg = sum(diff) / len(diff)
#Walk the times in order, starting a new group whenever a time is 0.5 or more
#after the FIRST time of the current group
m = [[d[0]]]
for x in d[1:]:
    #if x - m[-1][0] < avg:
    if x - m[-1][0] < 0.5: #this is the threshold
        m[-1].append(x)
    else:
        m.append([x])
print(m)
Now what do we do??? Make a common index for grouped elements and add them to a new MongoDB collection?
Taking the wrong approach - if a row updates, it does so in column order? - so we can use this as a crib...?
It doesn't... eg col 13 may update before col 2 within a given row...
How about we start thinking about:
from numpy import nan as NA
race='data/F1 Race.txt'
from lxml import etree
#not used
class Driver:
    '''Sector times for one driver: the latest, the previous and the full history per sector.'''
    def __init__(self, name,driverNum):
        self.name = name
        self.driverNum = driverNum
        #These lists are created per instance: as class attributes they would be shared,
        #so an update for one driver would show up on every other driver
        self.sectors=[NA,NA,NA]
        self.prev_sectors=[NA,NA,NA]
        self.history_sectors=[[],[],[]]
    def update_sector(self,sectorNum,sectorTime):
        '''Record sectorTime for sector sectorNum (1, 2 or 3), keeping the value it replaces.'''
        idx=sectorNum-1
        self.prev_sectors[idx]=self.sectors[idx]
        self.sectors[idx]=sectorTime
        self.history_sectors[idx].append(sectorTime)
#Quick check of the class
ham=Driver("hamilton","44")
ham.name
ham.update_sector(1,33.12)
print(ham.sectors,ham.prev_sectors,ham.history_sectors)
#not used
class Classification:
    '''Bare container holding a single rank mapping.'''
    def __init__(self):
        #Starts out empty; nothing in this class fills it in
        self.rank=dict()
#Throwaway instance, echoed for inspection
ranks=Classification()
ranks
class Pits:
    '''Assembles pit-stop records per driver from timing cells in columns 2, 6 and 13.

    _items holds the record being built for each driver number, history the committed
    records and _logger the raw cells that contributed to a record.
    '''
    def __init__(self):
        self._items={}
        self.history=[]
        self._logger=[]
    def updateRacePits(self,raceObj,data):
        '''Fold one timing cell into the pit record of the driver shown on that row.

        raceObj must offer getDriverNumFromPos(data), lap and drivers (whose values have
        appendPit); data is a cell dict with 'column', 'row', 'value', 'colour' and 'strtime'.
        A record is copied to history and to the driver once it holds six fields while its
        'out' flag is set; the working record is then replaced by one with 'out' False.
        '''
        #Need 2, 6 and 13 then commit
        #This doesn't work if there is a position change while someone is IN PIT?
        #SO: car has to go OUT before it can come back in?
        #Also need to catch if car IN PIT then goes to RETIRED
        driverNum=raceObj.getDriverNumFromPos(data)
        col=data['column']
        if col not in ('2','6','13'):
            return
        if driverNum not in self._items:
            self._items[driverNum]={'out':True}
        if col=='2':
            #Only a RED racing number contributes to the record
            if data['colour']=='RED':
                self._logger.append(data)
                self._items[driverNum]['driverNum']=str(driverNum)+'_'+data['value']
                self._items[driverNum]['strtime']=data['strtime']
                #Note: driverNum should == int(data['value'])
        elif col=='6':
            state=data['value']
            if state=='IN PIT':
                self._logger.append(data)
                self._items[driverNum]['status']=state
            elif state=='OUT':
                #Going back out starts a fresh record that is allowed to commit
                self._items[driverNum]={'out':True}
        elif data['value']!='':
            #Column 13: a non-empty pit count
            self._logger.append(data)
            self._items[driverNum]['lap']=raceObj.lap
            self._items[driverNum]['count']=int(data['value'])
        record=self._items[driverNum]
        #Six keys means 'out' plus driverNum, strtime, status, lap and count are all present
        if len(record)==6 and record['out']:
            self.history.append(record.copy())
            raceObj.drivers[driverNum].appendPit(record.copy())
            self._items[driverNum]={'out':False}
class Purples:
    '''Collects lap-record details from the timing cells handed to it.

    lap is the most recent completed record and history the list of all of them; the
    _lap/_sectorN dicts are working records, and sector1..sector3 are never filled in.
    '''
    def __init__(self):
        blankFields=('driverNum','name','laptime','classRank','lap','strtime')
        self.lap={}
        self.sector1={}
        self.sector2={}
        self.sector3={}
        #Working records all start with every field set to the empty string
        self._lap=dict.fromkeys(blankFields,'')
        self._sector1=dict.fromkeys(blankFields,'')
        self._sector2=dict.fromkeys(blankFields,'')
        self._sector3=dict.fromkeys(blankFields,'')
        self.history=[]
    def updateLapPurples(self,raceObj,data):
        '''Update the working lap record from one cell; a column 6 cell completes and stores it.'''
        col=data['column']
        if col=='2':
            #Racing number cell: also gives the time stamp and the classification row
            self._lap['driverNum']=data['value']
            self._lap['strtime']=data['strtime']
            self._lap['classRank']=int(data['row'])
        elif col=='3':
            self._lap['name']=data['value']
        elif col=='6':
            self._lap['laptime']=data['value']
            #The lap just completed is one behind the race lap counter
            self._lap['lap']=raceObj.lap-1
            #Snapshot the working record so later updates do not alter stored history
            self.lap=self._lap.copy()
            self.history.append(self.lap)
    def updateSectorPurples(self,raceObj,sector,data):
        '''Placeholder; sector records are not tracked.'''
        pass
class DriverHistory:
    '''Per-driver record of pit stops and completed-lap timing data.'''
    def __init__(self,driverNum=''):
        self.driverNum=driverNum
        self.pits=[]
        #Pending values for the lap being assembled; '' means "not seen yet"
        self._laptime=''
        self._sector1time=''
        self._sector2time=''
        self._sector3time=''
        self._gap=''
        self._interval=''
        self._timeofday=''
        self.lapdata=[]
    def setname(self,name):
        self.name=name
    def appendPit(self,data):
        self.pits.append(data)
    def trackLaptime(self,data,raceObj=None):
        '''Fold one timing cell into the pending lap and store the lap once it is complete.

        data is a cell dict with 'row', 'column', 'value' and 'strtime'. raceObj is the race
        object whose lap counter is consulted; it used to be read from a module-level variable
        of that name, which is still the fallback when it is not passed in.

        A lap is appended to lapdata once laptime, all three sectors, gap and interval have
        been seen and the race lap counter is ahead of the number of laps already stored.
        '''
        #The car on row 1 has no gap or interval to anyone
        if data['row']=='1':
            self._gap='0'
            self._interval='0'
        column=data['column']
        if column == '6':
            self._laptime=data['value']
            self._timeofday=data['strtime']
        elif column == '7': self._sector1time=data['value']
        elif column =='9': self._sector2time=data['value']
        elif column =='11': self._sector3time=data['value']
        elif column =='4': self._gap=data['value']
        elif column =='5': self._interval=data['value']
        pending=(self._laptime,self._sector1time,self._sector2time,
                 self._sector3time,self._gap,self._interval)
        if all(v != '' for v in pending):
            if raceObj is None:
                #Legacy call style: fall back to the notebook-level race object
                raceObj=globals().get('raceObj')
                if raceObj is None:
                    raise NameError("trackLaptime needs a race object: pass raceObj or define a module-level 'raceObj'")
            if raceObj.lap > (len(self.lapdata)+1):
                #if len(self.lapdata)==0 or (self._timeofday != self.lapdata[-1]['timeofday'] ):
                self.lapdata.append({
                    'timeofday':self._timeofday,
                    'pos':int(data['row']),
                    'lap':raceObj.lap-1,
                    'laptime':self._laptime,
                    's1':self._sector1time,
                    's2':self._sector2time,
                    's3':self._sector3time,
                    'gap':self._gap,
                    'interval':self._interval})
            #A full set has been seen, so start collecting the next one whether or not it was stored
            self._laptime=''
            self._sector1time=''
            self._sector2time=''
            self._sector3time=''
            self._gap=''
            self._interval=''
class Race:
    '''Race state rebuilt from the timing cells: lap counter, who is on which row, and per-driver histories.

    The track* methods that take raceObj pass it on to the collaborating objects; the replay
    loop in this notebook passes the Race instance itself.
    '''
    def __init__(self):
        self.lap=-1
        self.pos={} #Contains the driver number for the current classification position
        self.lapPurples=Purples()
        self.pits=Pits()
        self.drivers={}
        self.driverNames={}
        self._namesSet=False
    def setDriverNumForPos(self,data):
        self.pos[data['row']]=data['value']
    def getDriverNumFromPos(self,data):
        return self.pos[data['row']]
    def setupDriverDetails(self,data):
        '''Register drivers (column 2) and their names (column 3) from cells seen before the start.'''
        if data['column']=='2' and data['value'].strip()!='' and data['value'] not in self.drivers:
            self.setDriverNumForPos(data)
            self.drivers[data['value']]=DriverHistory(data['value'])
        elif data['column']=='3' and data['value'].strip()!='':
            driverNum=self.getDriverNumFromPos(data)
            self.drivers[driverNum].setname(data['value'])
            self.driverNames[data['value']]=driverNum
    def trackDriverNumForPos(self,data):
        if data['column']=='2':
            self.setDriverNumForPos(data)
    def trackLapCount(self,data):
        #The lap counter is read from the interval column of row 1
        if data['row']=='1' and data['column']=='5' and data['value']!='':
            self.lap=int(data['value'])
    def trackPurples(self,raceObj,data):
        if data['colour']=='PURPLE' and data['column'] in ['2','3','6']:
            self.lapPurples.updateLapPurples(raceObj,data)
    def trackPits(self,raceObj,data):
        if data['column'] in ['2','6','13']:
            self.pits.updateRacePits(raceObj,data)
    def trackLaptimes(self,raceObj,data):
        if data['column'] in ['4','5','6','7','9','11']:
            driverNum=self.getDriverNumFromPos(data)
            #Hand the race object over explicitly rather than relying on a global of the same name
            self.drivers[driverNum].trackLaptime(data,raceObj)
class Weather:
    '''Weather readings grouped by the timestamp string they were reported under.'''
    def __init__(self):
        #Maps a timestamp string to a dict of the readings seen for it
        self.stamps={}
    def setWeather(self,data):
        '''Store one reading; data is a dict with 'strtime', 'row' and 'value'.

        The row selects which reading the value is. A row outside 1-7 stores nothing,
        although an (empty) entry for the timestamp is still created.
        '''
        rowFields={
            '1':'trackTemp',
            '2':'airTemp',
            '3':'rainfall',
            '4':'windSpeed',
            '5':'humidity',
            '6':'airpressure',
            '7':'windDir',
        }
        reading=self.stamps.setdefault(data['strtime'],{})
        field=rowFields.get(data['row'])
        if field is not None:
            reading[field]=data['value']
#plf collects timing-screen cells and session-state records, wlf collects weather cells
plf=[]
wlf=[]
#Read through a context manager so the file handle is closed once the feed has been consumed
with open(race, 'r') as feed:
    for el in feed:
        r=etree.fromstring(el)
        #Flatten the element's own attributes and those of its first child into one dict
        #(child attributes win if a name appears in both)
        d=dict(r.attrib)
        d.update(dict(r[0].attrib))
        if d['identifier']=='101' and 'row' in d and d['column'] in ['1','2','3','4','5','6','7','9','11','12','13']:
            #Timing-screen cell: keep just the fields the race model reads
            dd={"value":d['value'], "colour":d['colour'],'row': d['row'],'column': d['column'],'strtime':d['timestamp']}
            #did= '_'.join([ d['row'],d['timestamp'].replace(':','_').replace('.','_') ])
            #dd={"value":d['value'], "colour":d['colour'],'id':did,'row': d['row'],'column': d['column'],'strtime':d['timestamp']}
            plf.append(dd)
        elif d['identifier']=='101' and 'sessionstate' in d:
            #Session-state record: kept whole, in stream order with the cells
            plf.append(d)
        elif d['identifier']=='103' and d['column']=='1':
            #Identifier 103, column 1 is what gets treated as weather data
            dd={'strtime':d['timestamp'],'row': d['row'],"value":d['value']}
            wlf.append(dd)
plf[:3]
[{'messagecount': '107488', 'informationvalid': 'true', 'identifier': '101', 'timestamp': '14:34:21.668', 'sessionstate': 'inactive'}, {'column': '1', 'value': '1', 'colour': 'CYAN', 'strtime': '14:34:21.669', 'row': '1'}, {'column': '1', 'value': '2', 'colour': 'CYAN', 'strtime': '14:34:21.669', 'row': '2'}]
def checkSessionState(data):
    '''Return the 'sessionstate' value of a feed record, or None if the record has none.'''
    return data.get('sessionstate')
#Replay the stream through the race model
raceObj=Race()
startedRunning=False
finishedRunning=False
#raceHistory=RaceHistory()
for x in plf:
    sessionstate=checkSessionState(x)
    if sessionstate is not None:
        #Session-state records only move the phase flags along; echo them as they go by
        print(sessionstate)
        if sessionstate=="started":
            startedRunning=True
        elif startedRunning and sessionstate=="inactive":
            finishedRunning=True
        continue
    #Everything else is a timing cell
    if not startedRunning:
        #Before the start: learn which drivers exist and what they are called
        raceObj.setupDriverDetails(x)
    elif not finishedRunning:
        #From "started" until the next "inactive": feed every tracker
        raceObj.trackLapCount(x)
        raceObj.trackDriverNumForPos(x)
        raceObj.trackPurples(raceObj,x)
        raceObj.trackPits(raceObj,x)
        raceObj.trackLaptimes(raceObj,x)
#raceHistory.lapPurples.history
inactive started finished inactive inactive
#Group the weather cells by the timestamp they were reported under
weather=Weather()
for w in wlf:
    weather.setWeather(w)
#Look at the readings stored for one timestamp
weather.stamps['14:34:21.752']
{'windDir': '129', 'airpressure': '1002.3', 'airTemp': '23', 'trackTemp': '37', 'windSpeed': '3.2', 'rainfall': '0', 'humidity': '45'}
raceObj.drivers['1'].lapdata[-3:]
[{'interval': '25.2', 'pos': 4, 's2': '34.6', 's3': '31.1', 'laptime': '1:31.017', 's1': '25.2', 'gap': '73.8', 'lap': 63, 'timeofday': '16:17:43.612'}, {'interval': '26.0', 'pos': 4, 's2': '34.1', 's3': '31.3', 'laptime': '1:30.801', 's1': '25.3', 'gap': '74.2', 'lap': 64, 'timeofday': '16:19:14.387'}, {'interval': '27.6', 'pos': 4, 's2': '34.7', 's3': '33.2', 'laptime': '1:33.392', 's1': '25.4', 'gap': '76.7', 'lap': 65, 'timeofday': '16:20:47.843'}]
raceObj.drivers['1'].lapdata[:5]
[{'interval': '0.4', 'pos': 14, 's2': '36.0', 's3': '33.0', 'laptime': '1:35.224', 's1': '26.1', 'gap': '12.4', 'lap': 1, 'timeofday': '14:41:46.378'}, {'interval': '0.3', 'pos': 14, 's2': '35.9', 's3': '32.9', 'laptime': '1:34.674', 's1': '25.7', 'gap': '18.0', 'lap': 3, 'timeofday': '14:44:55.894'}, {'interval': '0.3', 'pos': 14, 's2': '35.8', 's3': '32.7', 'laptime': '1:34.306', 's1': '25.7', 'gap': '20.3', 'lap': 4, 'timeofday': '14:46:30.222'}, {'interval': '0.4', 'pos': 14, 's2': '35.7', 's3': '32.7', 'laptime': '1:34.449', 's1': '25.8', 'gap': '22.7', 'lap': 5, 'timeofday': '14:48:04.589'}, {'interval': '0.6', 'pos': 14, 's2': '35.9', 's3': '32.7', 'laptime': '1:34.356', 's1': '25.6', 'gap': '24.9', 'lap': 6, 'timeofday': '14:49:39.101'}]
raceObj.lapPurples.history[:3]
[{'classRank': 1, 'lap': 1, 'laptime': '1:32.010', 'name': 'L. HAMILTON', 'strtime': '14:41:34.205', 'driverNum': '44'}, {'classRank': 1, 'lap': 3, 'laptime': '1:31.913', 'name': 'L. HAMILTON', 'strtime': '14:44:38.478', 'driverNum': '44'}, {'classRank': 1, 'lap': 8, 'laptime': '1:31.876', 'name': 'L. HAMILTON', 'strtime': '14:52:18.197', 'driverNum': '44'}]
raceObj.drivers['1'].pits
[{'lap': 12, 'status': 'IN PIT', 'count': 1, 'out': True, 'strtime': '14:57:29.852', 'driverNum': '1_1'}, {'lap': 33, 'status': 'IN PIT', 'count': 2, 'out': True, 'strtime': '15:30:19.373', 'driverNum': '1_1'}, {'lap': 52, 'status': 'IN PIT', 'count': 3, 'out': True, 'strtime': '15:59:25.171', 'driverNum': '1_1'}]
raceObj.drivers['1'].name
'S. VETTEL'
raceObj.pits._logger[:3]
[{'column': '2', 'value': '44', 'colour': 'RED', 'strtime': '14:42:56.839', 'row': '1'}, {'column': '6', 'value': 'IN PIT', 'colour': 'YELLOW', 'strtime': '14:42:56.839', 'row': '1'}, {'column': '13', 'value': '1', 'colour': 'YELLOW', 'strtime': '14:57:29.847', 'row': '13'}]
raceObj.pits.history[:3]
[{'lap': 12, 'status': 'IN PIT', 'count': 1, 'out': True, 'strtime': '14:57:29.852', 'driverNum': '1_1'}, {'lap': 14, 'status': 'IN PIT', 'count': 1, 'out': True, 'strtime': '15:00:18.871', 'driverNum': '3_3'}, {'lap': 15, 'status': 'IN PIT', 'count': 1, 'out': True, 'strtime': '15:01:59.009', 'driverNum': '8_8'}]
raceObj.pos
{'2': '6', '1': '44', '4': '1', '8': '8', '16': '21', '6': '14', '18': '17', '19': '4', '12': '20', '10': '27', '14': '26', '20': '9', '21': '10', '9': '11', '3': '3', '17': '99', '15': '13', '7': '7', '5': '77', '13': '19', '11': '22', '22': '25'}
import csv
from time import *
#Convert a tab-separated log into pipe-separated lines of epoch time | row[3] | 'A' | path
#The reader is built from the input file named in the previously commented-out lines;
#without it the loop below had no reader to iterate over
import sys
#The csv module wants binary files on Python 2 and newline='' text files on Python 3
if sys.version_info[0] < 3:
    outfile=open('openurlgource.txt','wb')
    infile=open('openurlgource.csv', 'rb')
else:
    outfile=open('openurlgource.txt','w',newline='')
    infile=open('openurlgource.csv', 'r',newline='')
#Both files are closed when the block ends, which also flushes the output
with infile as f, outfile as out:
    reader = csv.reader(f, delimiter='\t')
    writer = csv.writer(out,delimiter='|')
    #Skip the header row (next() works on both Python 2 and 3, unlike reader.next())
    headerline = next(reader)
    for row in reader:
        if row[4].strip() !='':
            #row[0] and row[1] hold the date and the time; combine them into epoch seconds (local time)
            t=int(mktime(strptime(row[0]+" "+row[1], "%Y-%m-%d %H:%M:%S")))
            #Turn the colon-separated value into a slash-separated path
            writer.writerow([t,row[3],'A',row[4].rstrip(':').replace(':','/')])