#!/usr/bin/env python # coding: utf-8 # # IPython notebook for ultimateposeur.com article # This contains the code for an ultimateposeur.com article: Is Carlos Carballo The Most Ruthless Referee Of Euro 2016?. # # (The article has also been republished by an Indian national newspaper The Hindu in their web-exclusive thREAD section.) # # For the github repository click here. # # Each of the code blocks begins with a commented out section explaining what it does. # # To run this notebook, install Jupyter/IPython notebook first. # # Then download this notebook, open it with Jupyter/IPython and press SHIFT+ENTER in each code block to execute it. # In[ ]: #Coming from the journalism world, i've always been surprised by how programming/coding is such a #collaborative effort and how people have no issues building off others' ideas and exchanging info, #in fact they welcome it. #but if i want to sell this as journalism, i'll have to do some 'full disclosure' preliminaries. #so bits and pieces of the code are from stackoverflow, github, udacity, #pythonlearn.com, pycon and scipy videos on youtube, sundry google results etc. #these are all too numerous to recount, so while the individual parts of this aren't original, #the assembly and piecing together of it is original and my own. If anyone sees #a snippet of their code and they want credit for it in the code blocks below #let me know (ultimateposeur@gmail.com) and i'll put a line in. # the code for how i scraped everything won't make sense #unless you're familiar with the soccerway page structure, #but i'm putting this out there in the interest of transparency #and reproducibility. This way you can see that I kind of know #what I'm doing! So i've seen braindumps like this #on the net from others, and while i never really go through #the whole thing, there's usually a piece of code in there that #i can borrow, so hopefully there'll be some google result #that will lead someone to this page and they find #something in it useful! # In[ ]: #so this first part is about getting data from soccerway.com #First I noted down the urls of the referee pages I was interested in #and saved it in 'referee_pages.txt' (check github repo). This is where I #start my crawling from. The pages i'm downloading here have a list of #all the matches of a referee, in some cases going back 10 years #just realised i'm probably not using this notebook right #some notebooks are really gussied up with all sorts of #fancy bold headings etc., I think I'd rather just go #stream of consciousness right now. import re reflist = open('referee_pages.txt') refids = [] for i in reflist: refid = (re.findall(r'\d+', i))[0] refids.append(refid) print refids from random import randint import time import requests a = 'http://us.soccerway.com/a/block_player_matches?block_id=page_player_1_block_player_matches_4&callback_params=%7B%22page%22%3A%22%22%2C%22bookmaker_urls%22%3A%5B%5D%2C%22block_service_id%22%3A%22player_matches_block_playermatches%22%2C%22people_id%22%3A%22' c = '%22%2C%22type%22%3A%22%22%2C%22formats%22%3A%22%22%7D&action=changePage¶ms=%7B%22page%22%3A' e = '%7D' headersx = { 'Host':'us.soccerway.com', 'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 'Accept':'text/javascript, text/html, application/xml, text/xml, */*', 'Accept-Language':'en-US,en;q=0.5', 'Accept-Encoding':'gzip, deflate', 'X-Requested-With':'XMLHttpRequest', 'X-Prototype-Version':'1.7', 'Referer':'http://us.soccerway.com/', 'Cookie':'__utma=254272778.1038598711.1464594149.1464631260.1464644537.7; __utmc=254272778; __utmz=254272778.1464603708.3.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); s_cc=true; s_fid=0B0C5F8BE6B3EC61-1954A0A8D6FC41BE; s_nr=1464644536442-Repeat; s_ppv=-%2C45%2C45%2C671; s_sq=%5B%5BB%5D%5D; ev4=jonas%20erikkson; __utma=58539694.601279174.1464635680.1464635680.1464635680.1; __utmc=58539694; __utmz=58539694.1464635680.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); prevPage=Players%20%3A%20Pavel%20Kralovec%20%3A%20matches; __utmb=254272778.1.10.1464644537; __utmt=1', 'Connection':'keep-alive' } #if i had to do this differently, I'd put everything into a sub-directory, all the files were just downloaded into my working directory! for b in refids: d = 0 while d > -21: urlx = a + b + c + str(d) + e responsex = requests.get(urlx, headers = headersx) if d == 0: dz = '-00' else: dz = str(d).zfill(3) fname = b + dz + '.html' with open(fname, "w") as filex: filex.write(responsex.content) d -= 1 print fname delay = randint(10,20) time.sleep(delay) else: continue # In[ ]: #executing the command above will download a whole lot of files #that are lists of matches related to each referee #they'll get downloaded into your directory #as I mentioned in the code, it probably would have been better #to put it all into a sub-directory #this code block is just about me creating subdirectories for each #referee, so that I can download each match file and put it into #the directory for the respective referee #refdeux below is the list of referee ids in soccerway.com import os, sys refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077'] for i in refdeux: pathx = '/media/C-drive/Users/shijithpk/Downloads/ipython/2016_05_30_referees/' + i + '-full' os.mkdir(pathx) #so the line above is the path to my working directory. You'll have to replace it with your own path. # In[ ]: #this code block is about me scraping the lists of match #documents from soccerway.com #'edict' the dictionary below has the fields that I noticed # each document has #but the only thing that matters in those documents are links #to the actual matches for each referee #what we've gotten so far from soccerway is just 'lists' of matches #and just to experiment with creating json files #all this data was dumped into json files in each #referee directory import json from lxml import html import sys import os refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077'] for i in refdeux: a, b, c, d, e, f, g, h, l = ([] for m in range(9)) edict = dict.fromkeys(['Referee', 'Date', 'Competition', 'Competition_link', 'Home_team', 'Score', 'Match_link', 'Away_team', 'Events']) for j in range(0,21): k = str(j).zfill(2) fnamex = i + '-' + k + '.html' try: fhandle = open(fnamex) sourcex = fhandle.read() jsource = json.loads(str(sourcex)) contentx = jsource['commands'][0]['parameters']['content'] treex = html.document_fromstring(contentx) trs = treex.cssselect('table tbody tr') for tr in trs: try: a.append(tr.cssselect('td:nth-child(2)')[0].text_content()) b.append(tr.cssselect('td:nth-child(3)')[0].text_content()) xy = 'http://us.soccerway.com' + tr.cssselect('td:nth-child(3) a')[0].attrib['href'] c.append(xy) d.append(tr.cssselect('td:nth-child(4)')[0].text_content()) e.append(tr.cssselect('td:nth-child(5)')[0].text_content()) yz = 'http://us.soccerway.com' + tr.cssselect('td:nth-child(5) a')[0].attrib['href'] f.append(yz) g.append(tr.cssselect('td:nth-child(6)')[0].text_content()) h.append(tr.cssselect('td:nth-child(7)')[0].text_content()) l.append(i) except: print "A row wasnt scraped here." except: print fnamex + ' didnt work' print fnamex + ' done' edict['Referee'] = l edict['Date'] = a edict['Competition'] = b edict['Competition_link'] = c edict['Home_team'] = d edict['Score'] = e edict['Match_link'] = f edict['Away_team'] = g edict['Events'] = h filenamey = i + '-full/' + i + '-collated.json' fnamez = os.path.join(os.path.dirname(os.path.realpath('__file__')), filenamey) json.dump(edict, open(fnamez , 'w')) #used json here instead of csv's just so that I could practise creating json files print i + ' done' print 'done, done' # In[ ]: #get the values from each key #download each file, set headers, set timeout between requests #make sure files get downloaded into the right folders #so this is where the bulk of the downloading actuall took #place. Because of the 10-20 second delay i placed between requests, it took #around 20 hours or so to download all the files #so what's happening now is files for each match officiated #by each referee (or at least the matches that are there in #soccerway's database), these files are being downloaded #i decided it would be better to just download all the files #and do my scraping for the information i need on yellow cards #times of booking, etc. , do that scraping on my local archive #so that way even if I get my scraping code wrong, it wont #be an issue because i just start over #if i was doing that trial and error by scraping the information #off the internet directly, i would have just been #requesting files over and over again. This local archive would #avoid that #also note that the output can get pretty verbose for this #so if you want you can comment out the print statement #and make it so so that there's only something printed #if somehting goes wrong #all data downloaded on may 31 & june 1 from random import randint import time import requests import json import sys import os #furl = '/44486-full/44486-collated.json' refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077'] headerz = { 'Host':'us.soccerway.com', 'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language':'en-US,en;q=0.5', 'Accept-Encoding':'gzip, deflate', 'Referer':'http://us.soccerway.com/', 'Cookie':'__utma=254272778.1038598711.1464594149.1464682768.1464701479.10; __utmc=254272778; __utmz=254272778.1464603708.3.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); s_cc=true; s_fid=0B0C5F8BE6B3EC61-1954A0A8D6FC41BE; s_nr=1464701530831-Repeat; s_ppv=-%2C45%2C45%2C671; s_sq=%5B%5BB%5D%5D; ev4=jonas%20erikkson; __utma=58539694.601279174.1464635680.1464635680.1464635680.1; __utmc=58539694; __utmz=58539694.1464635680.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); prevPage=Players%20%3A%20William%20Collum%20%3A%20matches; __utmb=254272778.3.10.1464701480; __utmt=1', 'Connection':'keep-alive' } for i in refdeux: filenamey = i + '-full/' + i + '-collated.json' fnamez = os.path.join(os.path.dirname(os.path.realpath('__file__')), filenamey) with open(fnamez) as fx: a = json.load(fx) urllist = a['Match_link'] print i + '-collated.json ' + ' len-urllist ' + str(len(urllist)) d = 0 for urlx in urllist: print 'The url in question: ' + urlx try: responsex = requests.get(urlx, headers = headerz) d += 1 zrs = str(d).zfill(3) fnad = i + '-full/' + i + '-' + zrs + '.html' fnadr = os.path.join(os.path.dirname(os.path.realpath('__file__')), fnad) with open(fnadr, "w") as filex: filex.write(responsex.content) print fnad + ' successfully written' delay = randint(10,20) time.sleep(delay) except: print "Something went wrong here." print i + ' done' print 'done and done' # In[ ]: #grr, have kind of forgotten what i've done a few things for! #this wasnt' a straightforward a to b to c thing # it was a to b, then back to a, then c, then back to a, #you get the idea, i've done so many things, that now i realize # i should comment out code whenever i do anything #so that if i have to remind myself why i did something #at least there's some note or something to help me out #i'm writing all these comments a week after #so anyway I figured out what this block was for # it was to get the no. of matches for each refeee that soccerwy #has recorded data for, just wanted to have a sense of what i was getting into import json import sys import os refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077'] fdict = dict.fromkeys(['refid', 'matches']) a = [] b = [] for i in refdeux: a.append(i) filenamey = i + '-full/' + i + '-collated.json' fnamez = os.path.join(os.path.dirname(os.path.realpath('__file__')), filenamey) with open(fnamez) as fx: d = json.load(fx) urllist = d['Match_link'] b.append(str(len(urllist))) fdict['refid'] = a fdict['matches'] = b # In[ ]: #so the code below is about creating the main dataset #with info on each card, what type it was - yellow, second yellow #or red, who was booked, what time they were booked, the # team they were playing for at the time etc. #again the output for this is verbose, so if you're #reproducing this, you might want to cut down on the #print statements import re from lxml import html import sys import os import csv reflist = open('referee_pages.txt') refid_list = [] refname_list = [] #am sure there's a shorter way to do this, couldnt figure it out, what i'm doing below is #taking an element out of one list and appending it to another list! for i in reflist: refidx = (re.findall(r'\d+', i))[0] refid_list.append(refidx) refnamex = (re.findall(r'http://us.soccerway.com/referees/(.*)/.*/matches/', i))[0] #extract name without diacritics, there was a turkish referee name that python completely mangled, this is safer refname_list.append(refnamex) headings = ['file_name', 'ref_code', 'ref_name_in_url', 'ref_nationality','match_code', 'match_duration', 'home_team_in_url', 'away_team_in_url', 'date', 'competition','winning_team', 'card_type','minute_card_given', 'player_name_in_url', 'player_nationality_in_url'] #write/stream to a csv file, don't hold this in memory with open("dataset_main.csv", "w") as filex: #after this change mode to a for append, otherwise file gets overwritten wr = csv.writer(filex, delimiter = ',' , quotechar = '"' ) wr.writerow(headings) for i in refid_list: d = range(1,450) for x in d: zrs = str(x).zfill(3) fnad = i + '-full/' + i + '-' + zrs + '.html' fnadr = os.path.join(os.path.dirname(os.path.realpath('__file__')), fnad) try: with open(fnadr) as filey: row_list = [] sourcex = filey.read() treex = html.document_fromstring(sourcex) #check if referee isn't assistant referee or fourth official if i not in treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href']: continue #append to the row list in the same order row_list.append(fnad) ref_code = i row_list.append(ref_code) ref_pos = refid_list.index(i) ref_name_in_url = refname_list[ref_pos] row_list.append(ref_name_in_url) ref_nationality = treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['class'] ref_natx = (re.findall(r'flag_16 left_16 (.+?)_16_left referee', ref_nationality))[0] row_list.append(ref_natx) match_code = treex.cssselect('#submenu > ul:nth-child(1) > li:nth-child(1) > a:nth-child(1)')[0].attrib['href'] match_codex = (re.findall(r'/matches/.*/.*/.*/.*/.*/.*/.*/(.*)/', match_code))[0] row_list.append(match_codex) try: if (treex.cssselect('.scoretime > span:nth-child(1)')[0].text_content() or treex.cssselect('.scoretime > span:nth-child(2)')[0].text_content()) == ('E' or 'P'): match_duration = 120 row_list.append(match_duration) except: match_duration = 90 row_list.append(match_duration) home_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(1) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href'] home_teamx = (re.findall(r'/teams/(.*/.*)/.*/', home_team_in_url))[0] row_list.append(home_teamx) away_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(3) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href'] away_teamx = (re.findall(r'/teams/(.*/.*)/.*/', away_team_in_url))[0] row_list.append(away_teamx) date_match = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(4) > a:nth-child(1)')[0].attrib['href'] date_matchx = (re.findall(r'/.*/(.*/.*/.*)/', date_match))[0] row_list.append(date_matchx) competition = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href'] compx = (re.findall(r'/(.*/.*/.*)/.*/.*/.*/', competition))[0] row_list.append(compx) scorex = treex.cssselect('.bidi')[0].text_content() score_list = scorex.split() if score_list[0] > score_list[2]: winning_team = home_teamx row_list.append(winning_team) elif score_list[0] < score_list[2]: winning_team = away_teamx row_list.append(winning_team) else: winning_team = 'draw' row_list.append(winning_team) cdx = treex.xpath("//img[contains(@src,'YC.png')]|//img[contains(@src,'RC.png')]|//img[contains(@src,'Y2C.png')]") if cdx == []: #this is just a check for matches without any yellow or red cards row_final = list(row_list) row_expand = list() card_type = 'zero' row_expand.append(card_type) minute_card_given = '' row_expand.append(minute_card_given) player_name_in_url = '' row_expand.append(player_name_in_url) player_nationality_in_url = '' row_expand.append(player_nationality_in_url) row_final.extend(row_expand) with open("dataset_main.csv", "a") as filez: wrz = csv.writer(filez, delimiter = ',' , quotechar = '"' ) wrz.writerow(row_final) else: for cd in cdx: row_final = list(row_list) row_expand = [] #make sure youve taken care of all potential slip-ups - mentioned in excel sheet card_type = cd.xpath('@src')[0] card_typex = (re.findall(r'http://.*/.*/.*/.*/.*/(.*).png', card_type))[0] row_expand.append(card_typex) minute_card_given = cd.xpath('../text()')[0] row_expand.append(minute_card_given) #the values in this are weird, some leading white spaces, minute mark at end, 90 + 4 etc., will need further work player_name_in_url = cd.xpath('preceding::td[1]//a[1]/@href')[0] player_name_in_urlx = (re.findall(r'/.*/(.*)/.*/', player_name_in_url))[0] row_expand.append(player_name_in_urlx) player_nationality_in_url = cd.xpath('preceding::td[1]//a[1]/@class')[0] player_nationality_in_urlx = (re.findall(r'flag_16 left_16 (.+?)_16_left', player_nationality_in_url))[0] row_expand.append(player_nationality_in_urlx) row_final.extend(row_expand) with open("dataset_main.csv", "a") as filez: wrz = csv.writer(filez, delimiter = ',' , quotechar = '"' ) wrz.writerow(row_final) except: print 'No file named ', fnadr, ' forget about it.' filez.close() #dont forget to close the csv file once you've finished appending everything print 'done and done' # In[ ]: #now that i have my main dataset, will be doing some cleaning on it # import pandas as pd dfg = pd.read_csv('dataset_main.csv') #so i load my csv into a dataframe dfg['card_type'].unique() dfg_filtered = dfg[dfg['card_type'].isin(['YC', 'zero', 'RC', 'Y2C'])] len(dfg_filtered) len(dfg) # did the commands above to check if everything is kosher #turns out some of the pages werent' formatted right #so the wrong info came into the card type column # import numpy as np np.sort(dfg_filtered['minute_card_given'].unique()) #this command above will show you that again some rows #didn't record the time of booking properly, so will be #filtering those rows out in the line below dfg_for_A = dfg_filtered[dfg_filtered['minute_card_given'] != " '"] dfg_for_A.to_csv('dataset_for_A_v2.csv') dfg_filtered.to_csv('dataset_B_filelist.csv') #this dataset A covers around 50 fewer matches than dataset B #will be using dataset A for time-related analyses and maybe a few other things #And will be using dataset B to generate a list of files with #more or less proper formatting so they can be scraped #not having the proper minute data in dataset B wont matter given my plans for it. # In[ ]: #i ended up not using the data resulting from the code below #this data could help us look at players from which nationalities get penalised more by referees #will be using this data for later article (I call dibs! please dont steal my story idea) #only included this code because i used to create the first graphic in the real article on which #referees are the card happiest import re from lxml import html import sys import os import csv import pandas as pd df = pd.read_csv('dataset_B_filelist.csv') nats = df.player_nationality_in_url.unique().tolist() #if you see the results you'll see that the value between australia and cameroon is some weird value #nats.index('australia') #nats.index('cameroon') #nats[27] del nats[27] df_heads = list(df) df_heads_new = df_heads[:11] headingsx = df_heads_new + nats filenames_deux = df.file_name.unique().tolist() with open("dataset_B_cards.csv", "w") as filex: #after this change mode to a for append, otherwise file gets overwritten wr = csv.writer(filex, delimiter = ',' , quotechar = '"' ) wr.writerow(headingsx) with open("dataset_B_total.csv", "w") as filez: #after this change mode to a for append, otherwise file gets overwritten wr = csv.writer(filez, delimiter = ',' , quotechar = '"' ) wr.writerow(headingsx) for deux in filenames_deux: fnadr = os.path.join(os.path.dirname(os.path.realpath('__file__')), deux) with open(fnadr) as filey: row_listc = [] row_listt = [] sourcex = filey.read() treex = html.document_fromstring(sourcex) row_listc.append(fnad) row_listt.append(fnad) ref_code = fnad[0:5] row_listc.append(ref_code) row_listt.append(ref_code) ref_name_url = treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href'] ref_name_urlx = (re.findall(r'/referees/(.*)/.*/', ref_name_url))[0] row_listc.append(ref_name_urlx) row_listt.append(ref_name_urlx) ref_nationality = treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['class'] ref_natx = (re.findall(r'flag_16 left_16 (.+?)_16_left referee', ref_nationality))[0] row_listc.append(ref_natx) row_listt.append(ref_natx) match_code = treex.cssselect('#submenu > ul:nth-child(1) > li:nth-child(1) > a:nth-child(1)')[0].attrib['href'] match_codex = (re.findall(r'/matches/.*/.*/.*/.*/.*/.*/.*/(.*)/', match_code))[0] row_listc.append(match_codex) row_listt.append(match_codex) try: if (treex.cssselect('.scoretime > span:nth-child(1)')[0].text_content() or treex.cssselect('.scoretime > span:nth-child(2)')[0].text_content()) == ('E' or 'P'): match_duration = 120 row_listc.append(match_duration) row_listt.append(match_duration) except: match_duration = 90 row_listc.append(match_duration) row_listt.append(match_duration) home_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(1) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href'] home_teamx = (re.findall(r'/teams/(.*/.*)/.*/', home_team_in_url))[0] row_listc.append(home_teamx) row_listt.append(home_teamx) away_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(3) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href'] away_teamx = (re.findall(r'/teams/(.*/.*)/.*/', away_team_in_url))[0] row_listc.append(away_teamx) row_listt.append(away_teamx) date_match = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(4) > a:nth-child(1)')[0].attrib['href'] date_matchx = (re.findall(r'/.*/(.*/.*/.*)/', date_match))[0] row_listc.append(date_matchx) row_listt.append(date_matchx) competition = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href'] compx = (re.findall(r'/(.*/.*/.*)/.*/.*/.*/', competition))[0] row_listc.append(compx) row_listt.append(compx) scorex = treex.cssselect('.bidi')[0].text_content() score_list = scorex.split() if score_list[0] > score_list[2]: winning_team = home_teamx row_listc.append(winning_team) row_listt.append(winning_team) elif score_list[0] < score_list[2]: winning_team = away_teamx row_listc.append(winning_team) row_listt.append(winning_team) else: winning_team = 'draw' row_listc.append(winning_team) row_listt.append(winning_team) #create player set using | operator players = treex.xpath("//*[@id='page_match_1_block_match_lineups_8']//a[contains(@href,'players')]|//*[@id='page_match_1_block_match_substitutes_9']//p[@class='substitute substitute-in']/a[contains(@href,'players')]/following::img[contains(@src,'SI.png')][1]/preceding::a[1]") for nat in nats: cards = 0 totalx = 0 for player in players: player_class = player.xpath('@class')[0] if nat == re.findall(r'flag_16 left_16 (.+?)_16_left', player_class)[0]: #set up card finder if player.xpath("following::td[1]//img[contains(@src,'YC.png') or contains(@src,'RC.png') or contains(@src,'Y2C.png')]") != []: cards += 1 totalx += 1 else: totalx += 1 else: continue row_listc.append(cards) row_listt.append(totalx) with open("dataset_B_cards.csv", "a") as filex: wr = csv.writer(filex, delimiter = ',' , quotechar = '"' ) wr.writerow(row_listc) with open("dataset_B_total.csv", "a") as filez: wr = csv.writer(filez, delimiter = ',' , quotechar = '"' ) wr.writerow(row_listt) filex.close() filez.close() print 'done and done' # In[ ]: #code below was written to figure out which referees # are particularly trigger happy when it comes to bookings and cards # df_world_total = pd.read_csv('dataset_B_total.csv') df_world_cards = pd.read_csv('dataset_B_cards.csv') #dont remember why but for some reason, i wasnt happy with #the dataframe loaded from the csv, so i did some slicing #below to create a new one df_world_cards.columns.get_loc("northern-ireland") df_world_cards.columns.get_loc("equatorial-guinea") df_world_cards.columns.get_loc("ref_name_in_url") df_world_cards.columns.get_loc("match_code") df_world_cards.columns.get_loc("competition") card_slices = [df_world_cards.ix[:,2], df_world_cards.ix[:,4],df_world_cards.ix[:,9],df_world_cards.ix[:,11:160]] df_world_cards_new = pd.concat(card_slices, axis=1) total_slices = [df_world_total.ix[:,2], df_world_total.ix[:,4],df_world_total.ix[:,9],df_world_total.ix[:,11:160]] df_world_total_new = pd.concat(total_slices, axis=1) #these four lines below sum up the number of appearances #the 'total new sum' dataframe and cards for all players #in all matches they have appeared in #note that this wont take into cards for players #whey they've been on the subs bench (that happens sometimes) df_world_total_new_sum = df_world_total_new.groupby('ref_name_in_url').sum() df_world_cards_new_sum = df_world_cards_new.groupby('ref_name_in_url').sum() #this is just a simple no. of cards for all players divided by #no of appearances for all players b = df_world_cards_new_sum.sum(axis=1) c = df_world_total_new_sum.sum(axis=1) df_world_ratio_new = (b/c)*100 graphic_A = [e, c, b, df_world_ratio_new] df_for_graphic_A = pd.concat(graphic_A, axis=1) df_for_graphic_A.columns = ['no_of_matches', 'player_appearances', 'cards', 'cards_to_appearance_ratio'] df_for_graphic_A.to_csv('graphic_A.csv') # In[ ]: # In[ ]: import pandas as pd dfg = pd.read_csv('dataset_for_A_v2.csv') del dfg['Unnamed: 0'] #deleting the index that got added on as a column for some reason dfg.minute_card_given.fillna('',inplace=True) dfg.player_name_in_url.fillna('zero',inplace=True) dfg.player_nationality_in_url.fillna('zero',inplace=True) #did the commands above because there were a lot of null #values created because of the matches where there were no #cards, so just filled them up with 'zero' dfg['minute_card_given'] = dfg['minute_card_given'].str.strip(" ") #remove whitespace at beginning dfg['minute_card_given'] = dfg['minute_card_given'].str.strip("'") #remove minute mark (') dfg[dfg['minute_card_given'].str.contains(r"\+")].groupby('minute_card_given').match_code.nunique() #checking how many cards are given in injury time # In[ ]: #so because there are a lot of cards given in injury time # there are a lot of cards where the minutes added on are denoted #by '+', so 45+3, 90+1 etc. #this was messing up with my calculations because they werent #being recognised as numbers, so i wrote the code below #to take care of this by, wait for it, extending each half by #five minutes! so the second half starts at minute 50 instead of #45, if there's extra time, extra time starts at minute 100 #instead of 90 etc. I did all this because i was thinking #of doing something related to mean time between cards #but dropped that idea later fake_list4 = [] for s in dfg['minute_card_given']: if '+' in s: t = s.split("+") if int(t[0]) == 45: if int(t[1]) > 5: v = 50 elif int(t[1]) <= 5: v = 45 + int(t[1]) elif int(t[0]) == 90: if int(t[1]) > 5: v = 100 elif int(t[1]) <= 5: v = 95 + int(t[1]) elif int(t[0]) == 105: v = 115 elif int(t[0]) == 120: v = 130 fake_list4.append(v) else: try: t = int(s) if t > 90: v = t + 10 elif t > 45: v = t + 5 elif t > 0: v = t fake_list4.append(v) except: fake_list4.append('none') fake_list4 df_02= pd.DataFrame({'min_card_new': fake_list4}) #am adding this transformed times column named 'min_card_new' #to the dataframe df_card_analysis = pd.concat([dfg, df_02], axis=1) # In[ ]: #taking out matches where there was no card given df_first_card = df_card_analysis[df_card_analysis['min_card_new'] != 'none'] #now the analysis #this code below is to calculate the mean time of the first yellow and red card df_first_yellow = df_first_card[df_first_card['card_type'] == 'YC'] df_first_red = df_first_card[df_first_card['card_type'] == 'RC'] df_first_yellow_mean = df_first_yellow.groupby(['ref_name_in_url','file_name'], as_index=False)['min_card_new'].min().groupby('ref_name_in_url')['min_card_new'].mean() df_first_red_mean = df_first_red.groupby(['ref_name_in_url','file_name'], as_index=False)['min_card_new'].min().groupby('ref_name_in_url')['min_card_new'].mean() first_combo = [df_first_yellow_mean, df_first_red_mean] df_first_combo = pd.concat(first_combo, axis=1) #putting the two dataframes together df_first_combo.columns = ['first_yellow_mean', 'first_red_mean'] #giving this dataframe new column names above df_first_combo.to_csv('graphic_first_card.csv') #putting the data in a csv so that i can use it in something #like Tableau to create the visualisation. Wasnt happy #with the options within the Python world, every graphic #looks so science-y, which is understandable given their #origins in academia, but all of them -- matplotlib, seaborn, #bokeh, they all look so bad! # In[ ]: #so we are trying to figure out how many times a referee took out a 2nd yellow card for a player and sent #him off. so to do that we first find out how many yellow cards were given by each ref #we then divide the no. of matches where theres a Y2C, ie. 2nd yellow leading to a red, and so we get the #conversion ratio for yellows to reds for each referee #len(df_first_card) = 19559 df_card_04 = df_first_card[df_first_card['card_type'] == 'YC'] #len(df_card_04) = 18459 df_card_05 = df_card_04.groupby('ref_name_in_url')['min_card_new'].count() #this line above gives us total no of yellow cards per referee (excluding second yellows and straight reds) #now to get the no of matches where a second yellow led to a red #you do this by restricting the frame to matches where there are second yellows df_card_06 = df_first_card[df_first_card['card_type'] == 'Y2C'] #this line below gives us total no of second yellows per referee (and excludes straight reds) df_card_07 = df_card_06.groupby('ref_name_in_url')['min_card_new'].count() df_card_08 = (df_card_07/df_card_05)*100 #above is the calculation of the percentage of #yellow cards converted into second yellows #this is all possible because soccerway helpfully #has a second yellow category of card in addition #to the normal yellow and red card #the lines below concatenate the frames together and saves #everything in a csv file card_conversion = [df_card_05, df_card_07, df_card_08] df_card_convert = pd.concat(card_conversion, axis=1) df_card_convert_v2 = df_card_convert.reset_index() df_card_convert_v2.columns = ['referee_name', 'total_yellow_cards', 'total_second_yellows', 'ratio'] df_card_convert_v2.to_csv('graphic_y_to_r.csv', index=False) # In[ ]: #for the next graphic, we find out who has historically pulled card after card after card in a match df_card_total_v2 = df_first_card.groupby('ref_name_in_url')['min_card_new'].count() #this gives us no. of cards per referee (yellows, second yellows and reds) #this is the base that we'll be using to calulate percentages of first cards, second cards, thrid cards etc. #df_first_card is the dataframe where the matches without any cards shown have been excluded df_card_percent_first = df_first_card.sort_values('min_card_new').groupby(['ref_name_in_url','file_name']).head(1).groupby('ref_name_in_url')['min_card_new'].count() #code below will help us calculate the percentages and save #the dataframe as a csv #being a newbie, the code below took me a few hours to figure out #probably the first time I defined my own function too #outside of an online tutorial def card(t): return df_first_card.sort_values('min_card_new').groupby(['ref_name_in_url','file_name']).head(t).groupby('ref_name_in_url')['min_card_new'].count() tot = df_card_total_v2 t = 1 sr_concat = [] label_list = [] sr_tot = pd.Series(index=df_card_total_v2.index, dtype='float64') #create empty series sr_tot = sr_tot.fillna(0) # fill with 0s rather than NaNs , thank you stackoverlow! sr_100v = pd.Series(data=100,index=df_card_total_v2.index, dtype='float64') #sr_100v = sr_tot.fillna(100.0) this wasnt working for some reason, could be because of some inplace nonsense print sr_100v while t < 6: print "t= ", t z = ((card(t) - card(t-1))/tot)*100 if z.isnull().any(): xy = (card(t)/tot)*100 else: xy = z print 'xy' print xy sr_concat.append(xy) sr_tot += xy #print 'sr_tot' #print sr_tot #print 'sr_concat' #print sr_concat label_name = 'card_' + str(t) label_list.append(label_name) t += 1 print sr_tot xy = sr_100v - sr_tot print "t=6", 6 #print 'xy' print xy sr_concat.append(xy) #print 'sr_concat' #print sr_concat label_name = 'the_rest' label_list.append(label_name) df_card_percent = pd.concat(sr_concat, axis=1) #putting the two dataframes together #df_card_convert_v2 = df_card_convert.reset_index() df_card_percent.columns = label_list df_card_percent.to_csv('graphic_card_percent.csv') #sometimes ", index = False" works better when saving to a csv and the first column doesnt have a label print 'done and done' # In[ ]: