Abstract: In the soccer world, there is a lot of money involved in transfer of players in the premier leagues around the world. Focussing on the English Premier League, our project - “The League Of Champions” aims to analyze the return on investment on soccer transfer done by teams in the English premier league. It aims to measure club return on each dollar spent on their acquired players for a season , on parameters like Goals scored, active time on the field , assists etc. In addition , we also look to analyze how big a factor player age is , in commanding a high transfer fee , and if clubs prefer to pay large amounts for specialist players in specific field positions. from IPython.core.display import Image Image(filename='/Users/natarajan/Dropbox/Soccer - WWOD/Player_positions.png') Note: Each player position performs an important role , depending on the game strategy. The overall purpose is to win matches by scoring goals and defending to prevent the opponents from scoring any. Image('http://www.soccer-training-guide.com/images/wikipedia_positions.png') #all player positions on the field Image('http://www.soccer-training-guide.com/images/4-4-2-new.jpg') def get_players_data(players, rows): '''helper function to parse player data on who were transferred in the English premier league ''' for row in rows: players_data = { } for (i,item) in enumerate(row.find_all('td')): #print i, item.get_text() if i == 3: players_data['p_name'] = item.get_text() parts = item.find('a').get('href').split('/') players_data['handle'] = parts[2] players_data['id'] = parts[4].split('_')[1].split('.')[0] if i == 4: players_data['club'] = item.get_text() if i == 5: players_data['age'] = item.get_text() if i == 7: players_data['position'] = item.get_text() if i == 9: players_data['from'] = item.get_text() if i == 11: players_data['to'] = item.get_text() if i == 12: fee = item.get_text().split(' ') players_data['transfer_fee_pounds'] = fee[0] players_data['transfer_fee_euros'] = fee[1][1:] players.append(players_data) def fetch_data(players, year): ''' given a particular year , fetch transferred player data from English premier league''' url = 'http://www.transfermarkt.co.uk/en/premier-league/transferrekorde/wettbewerb_GB1_%s_default_default_default_alle.html' % (year) r = requests.get(url) html_doc = r.text soup = BeautifulSoup(html_doc) table= soup.find('table' , {'class':"tabelle_grafik"}) rows = table.find_all('tr' , {'class':"hell"}) get_players_data(players, rows) rows = table.find_all('tr' , {'class':"dunkel"}) get_players_data(players, rows[1:]) def extend_data(players, new_players, year): for player in players: url = 'http://www.transfermarkt.co.uk/en/%s/leistungsdaten/spieler_%s_%s.html' % (player['handle'], player['id'], year) r = requests.get(url) html_doc = r.text soup = BeautifulSoup(html_doc) table = soup.find('table' , {'class':"standard_tabelle"}) row = table.find('tr' , {'class':"hell"}) #Assuming its in first row for (i,item) in enumerate(row.find_all('td')): #print i, item.get_text() if i == 2: if item.get_text() != '-': player['matches'] = item.get_text() else: player['matches'] = 0 if i == 3: if item.get_text() != '-': player['goals'] = item.get_text() else: player['goals'] = 0 if i == 5: if item.get_text() != '-': player['assists'] = item.get_text() else: player['assists'] = 0 if i == 11: if item.get_text() != '-': player['minutes_per_goal'] = item.get_text() else: player['minutes_per_goal'] = 0 if i == 12: if item.get_text() != '-': player['minutes'] = item.get_text() else: player['minutes'] = 0 new_players.append(player) # dump data from source to pkl files - to be run just once import pickle,requests from bs4 import BeautifulSoup import requests , re players = [] years = ['2010', '2011' , '2012'] for year in years : #year = '2012' fetch_data(players, year) new_players = [] extend_data(players, new_players, year) f_name = "players_data_" + year +".pkl" pickle.dump( new_players, open( f_name, "wb" ) ) #print new_players[0] #pickle.dump( new_players, open( "players_data_2012.pkl", "wb" ) ) # load data from pkl files to a dataframe import numpy as np import pandas as pd import pickle,matplotlib.pyplot as plt from pandas import Series, DataFrame f = open('players_data_2010.pkl') #f = open(cloud.files.get('players_data_2010.pkl') ) p = pickle.load(f) player_df_2010 = DataFrame(p) player_df_2010['season']='10/11' f = open('players_data_2011.pkl') p = pickle.load(f) player_df_2011 = DataFrame(p) player_df_2011['season']='11/12' f = open('players_data_2012.pkl') p = pickle.load(f) player_df_2012 = DataFrame(p) player_df_2012['season']='12/13' player_df = pd.concat([player_df_2010,player_df_2011,player_df_2012]) print player_df.columns player_df.set_index('id',inplace=True,drop=False) print player_df[:1] # data cleanup - converting strings to numerics for the appropriate fields player_df['transfer_fee_euros'] = player_df.transfer_fee_euros.str.replace('.','') player_df['transfer_fee_euros'] =player_df.transfer_fee_euros.astype(float64) player_df['minutes'] = player_df.minutes.astype(float64) player_df['goals'] = player_df.goals.astype(float64) player_df['age'] = player_df.age.astype(float64) player_df['assists'] = player_df.assists.astype(float64) player_df.set_index(['id'], inplace=True, drop=False) player_df['money_by_minutes']= player_df['transfer_fee_euros']/player_df['minutes'].astype(float64) player_df['money_by_goal_minutes'] = player_df['transfer_fee_euros']/player_df['minutes_per_goal'].astype(float64) player_df['money_by_assists'] = player_df['transfer_fee_euros']/player_df['assists'].astype(float64) player_df['money_by_goals'] = player_df['transfer_fee_euros']/player_df['goals'].astype(float64) player_df['money_by_matches'] = player_df['transfer_fee_euros']/player_df['matches'].astype(float64) # only consider player purchases done by teams from English premier league epl_teams = [ u'Southampton', u'Aston Villa', u'Man Utd', u'Swansea', u'Liverpool', u'Blackburn', u'Everton', u'Fulham', u'Newcastle', u'West Ham', u'QPR', u'Wolves', u'Chelsea', u'Spurs', u'Man City', u'Stoke City', u'Arsenal', u'Sunderland', u'Wigan', u'Birmingham', u'West Brom'] epl_to=[] for team in epl_teams: epl_to.append( player_df[player_df.to == team]) player_df = pd.concat(epl_to) #print len(player_df) player_df = player_df.sort_index(by='season') #print player_df # consider only players transferred to EPL teams . Disregard players transferred to non EPL teams. epl_teams = [ u'Southampton', u'Aston Villa', u'Man Utd', u'Swansea', u'Liverpool', u'Blackburn', u'Everton', u'Fulham', u'Newcastle', u'West Ham', u'QPR', u'Wolves', u'Chelsea', u'Spurs', u'Man City', u'Stoke City', u'Arsenal', u'Sunderland', u'Wigan', u'Birmingham', u'West Brom'] defence = ['SW' , 'CB' , 'LB' , 'RB'] midfield = ['DM', 'CM', 'RM','LM', 'AM', 'LW', 'RW'] striker = ['LW', 'RW', 'SS', 'CF'] attack ={} def plot_acquisitions_by_positions(pos_list,label): d = DataFrame(index=epl_teams) for season in player_df.season.unique(): for team in epl_teams: if label=='Midfield': attack[team]=len(player_df[(player_df.season==season) & (player_df['position'].isin(pos_list) | ( player_df['id'].isin(['7858','14086','4063','15921','50202','15921','35251'])) ) & (player_df.to ==team)]) elif label== 'Attack': attack[team]=len(player_df[(player_df.season==season) & ((player_df['position'].isin(pos_list)) | ( player_df['id'].isin(['31552','15185'])))& (player_df.to ==team)]) elif label =='Defence': #print team,season, len(player_df[(player_df.season==season) & (player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])) & (player_df.to ==team)][['p_name','position']]) attack[team]= len(player_df[(player_df.season==season) & (player_df['position'].isin(pos_list)) & (player_df.to ==team)]) g=Series(attack) g.name= label +"_"+ season d=d.join(g) print d.plot(kind='barh', figsize=(10,10),title='trend of ' + label + ' player acquistions') plot_acquisitions_by_positions(defence,'Defence') plot_acquisitions_by_positions(midfield,'Midfield') plot_acquisitions_by_positions(striker,'Attack') #.plot(kind='barh',figsize=(5,10)) #Series().plot(kind='barh') #print len(player_df[(player_df.season=='11/12') & (player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])) & (player_df.to =='Arsenal')][['p_name','position']]) #print player_df[player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])][['pos_type']] #pieces = DataFrame() #print player_df.groupby(['to','season'])['transfer_fee_euros'].mean() for season in player_df.season.unique(): # print season p1 = player_df[player_df.season==season].groupby(['to'])['transfer_fee_euros'].mean() p1.name = season #print p1 #print p1 if season == '10/11': club_purchase = DataFrame(p1) else: club_purchase = club_purchase.join(p1) club_purchase = club_purchase.fillna(value=0) #position_compare.sort_index( #position_compare = position_compare.transpose() print club_purchase.sort_index(ascending=False).plot(kind='barh') # get unique players who have been transferred more than once across seasons from 2010 to 2013 transfr_more_than_once = player_df.ix[player_df.id.value_counts() > 1 ].sort_index(by=['id','season'])['id'].unique() print "Players who were transferred between the EPL teams more than once" # list of all players who have been transferred more than once from 2010 -2013 print player_df.ix[transfr_more_than_once].sort_index(by=['id','season'])[['p_name','from','to','transfer_fee_euros','position','season','goals']] # graph player transfer amount rise/fall through the 3 seasons. def transfr_amt_by_season(p_id): return player_df[(player_df.id==p_id)][['season','transfer_fee_euros']].set_index(keys='season') name_df = DataFrame(index=['10/11','11/12','12/13']) for item in np.unique(transfr_more_than_once): #print item #print np.unique(player_df.ix[item]['name']) name_df[np.unique(player_df.ix[item]['p_name'])] = transfr_amt_by_season(item) #name_df= name_df.where(pd.notnull(name_df),0) name_df= name_df.fillna(method='ffill') # assuming the cost to club is same , if club retains a player for successive seasons print " \n\n Tabulate player acquisition costs for players who were transferred more than once in the last 3 EPL seasons" print name_df print "\n\n plot of player acquisition costs for players who were transferred more than once in the last 3 EPL seasons" print name_df.plot(figsize=(10,7)) import pickle import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series, DataFrame #player_df.transfer_fee_euros.type #player_df.position player_df.sort_index(by='transfer_fee_euros')[::][["p_name", "club",'transfer_fee_euros','from','to']] grouped= player_df.groupby('club').size() #grouped.order(ascending=True).plot() club_counts = player_df['club'].value_counts() #print club_counts #club_counts.plot(kind='barh', rot=0) pos_counts = player_df['position'].value_counts() # players acquired by position - raw count print " Distribution of players purchased by position" print pos_counts print "Total players purchased " , len(player_df) # plot positions in demand print player_df.position.value_counts().plot(kind='bar', figsize=(5,5), title = 'positions in demand') #player_df.groupby('position')['transfer_fee_euros'].sum().plot(title='total price by position') position_compare = DataFrame() #print player_df.season.unique() #pieces = DataFrame() for season in player_df.season.unique(): p1 = player_df[player_df.season==season].groupby(['position'])['transfer_fee_euros'].mean() p1.name = season if season == '10/11': position_compare = DataFrame(p1) else: position_compare = position_compare.join(p1) position_compare= position_compare.where(pd.notnull(position_compare),0) print position_compare print "\n\n\n" print position_compare.plot(kind='barh', title='seasonal trend for mean price paid by position') #player_df[player_df.season=='10/11'].groupby(['position'])['transfer_fee_euros'].mean().plot(title='avg price by position' , kind='barh', rot=0) # shows high price paid for Strikers CF #pos_counts.plot(kind='barh', rot=0) #player_df['transfer_fee_euros'].hist() # distribution of money spent #t1[seasons].sort_index(ascending=False).plot(figsize(5,5),kind='barh', title='season '+ seasons+ ':fee by ' + col , rot=0) # grouped = player_df[player_df.season=='10/11'].groupby(['to']) position_compare = DataFrame() #print player_df.season.unique() #pieces = DataFrame() stats_for_money = ['goals', 'minutes','assists' ] def return_per_euro(stat): for season in player_df.season.unique(): grouped = player_df[player_df.season==season].groupby(['to']) #print grouped #p1= grouped['transfer_fee_euros'].sum()/grouped['minutes'].sum() p1= grouped['transfer_fee_euros'].sum()/grouped[stat].sum() p1.name = season if season == '10/11': euros_per_stat = DataFrame(p1) else: euros_per_stat = euros_per_stat.join(p1) euros_per_stat = euros_per_stat.fillna(value=0) print euros_per_stat print euros_per_stat.plot(kind='barh', title='euros spent by ' + stat) return_per_euro('goals') #for item in stats_for_money: # return_per_euro(item) return_per_euro('assists') Observations : Newcastle spent a lot of money per minute of game time of purchased players - suggesting that they sat on the bench a lot of the time return_per_euro('minutes') #### An alternate view of comparing the clubs by season on various returns def compare_stat(col): ''' takes column name from players_df dataframe and computes transfr_amount/col name ''' #col = 'minutes' #Money spent each season by clubs on player acquisitions grouped = player_df[player_df.season=='10/11'].groupby(['to']) # money paid per minute on field time t1 = (grouped['transfer_fee_euros'].sum()/ grouped[col].sum()) t1.name = '10/11' t1 = DataFrame(t1) grouped = player_df[player_df.season=='11/12'].groupby(['to']) t2 = grouped['transfer_fee_euros'].sum()/ grouped[col].sum() t2.name='11/12' grouped = player_df[player_df.season=='12/13'].groupby(['to']) t3 = grouped['transfer_fee_euros'].sum()/ grouped[col].sum() t3.name='12/13' t1= t1.join(t2) t1= t1.join(t3) t1= t1.where(pd.notnull(t1),0) #print t1 fig = plt.figure() fig.set_size_inches(20,20) ax1 = fig.add_subplot(2, 2, 1) ax2 = fig.add_subplot(2, 2, 2) ax3 = fig.add_subplot(2, 2, 3) i=1 for seasons in t1.columns: #print seasons x = fig.add_subplot(2,2,i) t1[seasons].sort_index(ascending=False).plot(figsize(5,5),kind='barh', title='season '+ seasons+ ':fee by ' + col , rot=0) i+=1 fig t1= t1.transpose() t1.plot(figsize(15,15)) # other parameters that can be passed - minutes , assists compare_stat('goals') stat = 'money_by_goals' x = player_df.groupby('season').sort_index(by=stat)[['p_name','from', 'to', 'position','goals', 'transfer_fee_euros' , stat ]] for season in player_df.season.unique(): print "\n\ntop 5 player acquisitions per goal scored in " + season print x.xs(season)[:5] stat = 'money_by_minutes' x = player_df.groupby('season').sort_index(by=stat)[['p_name','from', 'minutes','to', 'position','goals', 'transfer_fee_euros' , stat ]] for season in player_df.season.unique(): print "\n\ntop 5 player acquisitions per minute on field in " + season print x.xs(season)[:5] stat = 'money_by_goals' x = player_df.groupby(['season']).sort_index(by=stat,ascending=True)[['p_name','age', 'to', 'position','goals', 'transfer_fee_euros' , stat ]] #print x[:5] for season in player_df.season.unique(): print "\n\ntop 5 player acquisitions per goal scored in " + season # make sure you consider players who scored at least 1 goal print x[x.goals > 0 ].xs(season).sort('money_by_goals').sort('age', ascending=True)[:5] for season in player_df.season.unique(): print "\n\nSeason : " + season + "\n" print x[x.goals > 0 ].xs(season).sort('money_by_goals').sort('age', ascending=False)[:5] # some LW and RW positions are ambiguous #print player_df[player_df.position == 'LW'][['p_name','position']] ##position codes categorized by position type ##defence = ['SW' , 'CB' , 'LB' , 'RB'] ##midfield = ['DM', 'CM', 'RM','LM', 'AM', 'LW', 'RW'] ##striker = ['LW', 'RW', 'SS', 'CF'] #del player_df['pos_type'] for season in player_df.season.unique(): # defensive positions print "\n**************************************************************************************" print '\nSeason: '+ season + ' - The top 5 defensive players by money spent per minute on field\n' print player_df[player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])].groupby(['season']).sort_index(by='money_by_minutes').xs(season)[:5][['p_name' ,'to', 'money_by_minutes']] print '\nSeason: '+ season + ' - The top 5 midfield players by money spent per minute on field\n' print player_df[player_df['position'].isin(['DM', 'CM', 'RM','LM', 'AM']) | ( player_df['id'].isin(['7858','14086','4063','15921','50202','15921','35251'])) ].groupby(['season']).sort_index(by='money_by_minutes').xs(season)[:5][['p_name' ,'to', 'money_by_minutes']] print '\nSeason: '+ season + ' - The top 5 Attacking players by money spent by goal scored\n' print player_df[player_df['position'].isin([ 'SS', 'CF']) | ( player_df['id'].isin(['31552','15185'])) ].groupby(['season']).sort_index(by='money_by_goals').xs(season)[:5][['p_name' ,'to', 'money_by_goals']] print "-----------------------------------------------------------------------------------------\n\n\n" #print player_df['pos_type'].value_counts() #player_df.pos_type.value_counts() # dictionary and list for stats stats_list=['age', 'assists', 'goals', 'matches', 'minutes', 'minutes_per_goal', 'p_name', 'position', 'transfer_fee_euros', 'transfer_fee_pounds', 'money_by_minutes', 'money_by_goal_minutes', 'money_by_assists', 'money_by_goals', 'money_by_matches'] stats_dict={'age':'Age', 'assists':'Assists', 'goals':'Goals', 'matches':'Matches', 'minutes':'Minutes', 'minutes_per_goal':'Minutes per Goal', 'p_name':'Player Name', 'position':'Position', 'transfer_fee_euros':'Transfer Fee(Euros)', 'transfer_fee_pounds':'Transfer Fee(Pounds)', 'money_by_minutes':'Money by Minutes', 'money_by_goal_minutes':'Money by Goal Minutes', 'money_by_assists':'Money by Assists', 'money_by_goals':'Money by Goals', 'money_by_matches':'Money By Matches'} import IPython.core.display # start off form dialog_form= '

' #define teams drop down menu g = player_df season_drop="
" dialog_form+=season_drop team_drop="
" dialog_form+=team_drop #start stat dropdown stat_drop="" dialog_form+=stat_drop dialog_form+="

" IPython.core.display.HTML(dialog_form) team=None stat=None season=None def prompt_results(): if not stat or not team or not season: return "No Data Selected, Use Prompt" # grouped = player_df[player_df.season=='11/12'].groupby(['to']) g = player_df[player_df.season==season] #g = player_df[player_df.season=='11/12'] gteam= g[g.club==team] gstat= gteam[stat] eq="" if len(gstat.values)<=0: return "No data for that selection, Try another." for s in gstat: eq+=str(s)+", " eq= eq[:-2] if type(gstat[0]) is numpy.float64: eq+=" (Total: "+str(gstat.sum())+")" print stats_dict[stat]+"(s) by "+team+" [with/of transfered players] = "+eq #Working Non formatted team=None stat=None season=None IPython.core.display.Javascript(""" $( "#dialog" ).dialog({ buttons: [ { text: "Ok", click: function() { var teami,stati,seasoni,py1,py2,py3; teami= $('#team_drop').val(); stati= $('#stat_drop').val(); seasoni= $('#season_drop').val(); py1 = "team = '"+teami+"'"; py2 = "stat = '"+stati+"'"; py3 = "season = '"+seasoni+"'"; IPython.notebook.kernel.execute(py1); IPython.notebook.kernel.execute(py2); IPython.notebook.kernel.execute(py3); $( this ).dialog( "close" ); } } ] }); """) #html formated code team=None stat=None season=None IPython.core.display.Javascript(""" $( '#dialog' ).dialog({ buttons: [ { text: 'Ok', click: function() { var teami,stati,seasoni,py1,py2,pyr3; teami= $('#team_drop').val(); stati= $('#stat_drop').val(); seasoni= $('#season_drop').val() py1 = 'team = "'+teami+'"'; py2 = 'stat = "'+stati+'"'; pyr3 = 'season = "'+seasoni+'"'; IPython.notebook.kernel.execute(py1); IPython.notebook.kernel.execute(py2); IPython.notebook.kernel.execute(pyr3); $( this ).dialog( 'close' ); } } ] }); """) prompt_results()