import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline
links = {'porto': 'http://www.transfermarkt.co.uk/fc-porto/alletransfers/verein/720',
'benfica': 'http://www.transfermarkt.co.uk/benfica-lissabon/alletransfers/verein/294',
'sporting': 'http://www.transfermarkt.co.uk/sporting-lissabon/alletransfers/verein/336',
'man city': 'http://www.transfermarkt.co.uk/manchester-city/alletransfers/verein/281',
'man united': 'http://www.transfermarkt.co.uk/manchester-united/alletransfers/verein/985',
'chelsea': 'http://www.transfermarkt.co.uk/fc-chelsea/alletransfers/verein/631',
}
transfers = []
for club, link in links.items():
response = requests.get(link)
soup = BeautifulSoup(response.text).find_all('div', attrs={'class': 'box'})
for box in soup:
title_soup = box
title = title_soup.find('div', attrs={'class': 'table-header'})
if title:
title = title.text.split('\t')[4]
for row in box.find_all('tr'):
fields = title.split(' ')
fields.append(club)
for field in row.find_all('td'):
if field["class"][0] == 'rechts':
fields.append(field.text)
elif field["class"][0] in ['redtext', 'greentext']:
pass
else:
if field.a:
try:
fields.append(field.a["title"])
except:
print "error:", field["class"][0]
else:
try:
fields.append(field["title"])
except:
print "error:", field["class"][0]
if len(fields)==8:
transfers.append(fields)
print len(transfers)
6802
df = pd.DataFrame(transfers, columns=['type', 'season', 'club', 'position', 'player', 'transfer_club', 'transfer_club_alt', 'fee'])
df.head()
type | season | club | position | player | transfer_club | transfer_club_alt | fee | |
---|---|---|---|---|---|---|---|---|
0 | Arrivals | 14/15 | porto | Right Wing | Adrián | Atlético Madrid | Atlético Madrid | 9,68 Mill. £ |
1 | Arrivals | 14/15 | porto | Centre Back | Bruno Martins Indi | Feyenoord Rotterdam | Feyenoord | 6,78 Mill. £ |
2 | Arrivals | 14/15 | porto | Attacking Midfield | Otávio | Sport Club Internacional | Internacional | 6,16 Mill. £ |
3 | Arrivals | 14/15 | porto | Attacking Midfield | Yacine Brahimi | Granada CF | Granada CF | 5,72 Mill. £ |
4 | Arrivals | 14/15 | porto | Centre Forward | Vincent Aboubakar | FC Lorient | FC Lorient | 2,64 Mill. £ |
5 rows × 8 columns
df.groupby('club').agg(pd.Series.count)
type | season | position | player | transfer_club | transfer_club_alt | fee | |
---|---|---|---|---|---|---|---|
club | |||||||
benfica | 1149 | 1149 | 1149 | 1149 | 1149 | 1149 | 1149 |
chelsea | 1187 | 1187 | 1187 | 1187 | 1187 | 1187 | 1187 |
man city | 1279 | 1279 | 1279 | 1279 | 1279 | 1279 | 1279 |
man united | 1253 | 1253 | 1253 | 1253 | 1253 | 1253 | 1253 |
porto | 1053 | 1053 | 1053 | 1053 | 1053 | 1053 | 1053 |
sporting | 881 | 881 | 881 | 881 | 881 | 881 | 881 |
6 rows × 7 columns
mults = {'Mill.': 1, 'Th.': 0.001}
def translate_fee(fee):
if 'Loan' in fee or 'loan' in fee:
return np.NAN
elif fee in ['Free transfer', '0', '-']:
return 0
elif fee in ['?', 'Tauschgeschäft', 'Spielertausch']:
return np.NAN
else:
try:
value, mult, currency = fee.split(' ')
return float(value.replace(',', '.')) * mults[mult]
except:
return np.NAN
df['fee_value'] = df['fee'].map(translate_fee)
countries = {'porto': 'pt',
'benfica': 'pt',
'sporting': 'pt',
'man city': 'en',
'man united': 'en',
'chelsea': 'en',
}
df['country'] = df['club'].map(countries.get)
df.groupby('club').agg(pd.Series.count)
-c:8: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
type | season | position | player | transfer_club | transfer_club_alt | fee | fee_value | country | |
---|---|---|---|---|---|---|---|---|---|
club | |||||||||
benfica | 1149 | 1149 | 1149 | 1149 | 1149 | 1149 | 1149 | 439 | 1149 |
chelsea | 1187 | 1187 | 1187 | 1187 | 1187 | 1187 | 1187 | 510 | 1187 |
man city | 1279 | 1279 | 1279 | 1279 | 1279 | 1279 | 1279 | 547 | 1279 |
man united | 1253 | 1253 | 1253 | 1253 | 1253 | 1253 | 1253 | 564 | 1253 |
porto | 1053 | 1053 | 1053 | 1053 | 1053 | 1053 | 1053 | 396 | 1053 |
sporting | 881 | 881 | 881 | 881 | 881 | 881 | 881 | 358 | 881 |
6 rows × 9 columns
pivot = pd.pivot_table(df, values='fee_value', rows=['player', 'club', 'country'], cols='type', aggfunc=np.mean).dropna().reset_index()
pivotPT = pivot[pivot.country=='pt']
pivotEN = pivot[pivot.country=='en']
pivot.describe()
type | Arrivals | Departures |
---|---|---|
count | 1048.000000 | 1048.000000 |
mean | 2.128931 | 2.245259 |
std | 4.964249 | 6.073548 |
min | 0.000000 | 0.000000 |
25% | 0.000000 | 0.000000 |
50% | 0.000000 | 0.000000 |
75% | 1.760000 | 1.580000 |
max | 40.480000 | 82.720000 |
8 rows × 2 columns
dfClubs = pivot.groupby('club').agg(np.sum)
dfClubs['Profit %']=(dfClubs['Departures']/dfClubs['Arrivals']-1)*100
dfClubs
type | Arrivals | Departures | Profit % |
---|---|---|---|
club | |||
benfica | 173.825667 | 408.708 | 135.125231 |
chelsea | 767.142000 | 453.531 | -40.880437 |
man city | 539.957000 | 251.616 | -53.400734 |
man united | 436.694000 | 400.773 | -8.225668 |
porto | 206.609000 | 641.992 | 210.727993 |
sporting | 106.892000 | 196.411 | 83.747147 |
6 rows × 3 columns
from matplotlib import style
print style.available
[u'dark_background', u'bmh', u'grayscale', u'ggplot', u'fivethirtyeight']
style.use('ggplot')
fig, ax = plt.subplots(figsize = (7.5,7.5))
plt.title('Transfer Fee of players when arrived vs. left the club')
plt.scatter(pivotPT.Arrivals, pivotPT.Departures, color = "#8EBA42", alpha=0.5)
plt.scatter(pivotEN.Arrivals, pivotEN.Departures, color = "#E24A33", alpha=0.5)
ax.set_ylim(-1, 90)
ax.set_xlim(90, -1)
ax.set_ylabel(u'Departure Fee (£M)')
ax.set_xlabel(u'Arrival Fee (£M)')
ax.arrow(70, 70, -4, 4, head_width=1, head_length=1, color="black")
ax.arrow(70, 70, 4, -4, head_width=1, head_length=1, color="black")
ax.annotate("increase in value", (67, 75.5))
ax.annotate("decrease in value", (85, 63.5), )
ax.legend(['Portuguese Clubs', 'English Clubs'], loc='lower left')
ax.add_line(Line2D((0,90), (0,90), color="gray", alpha=0.3))
<matplotlib.lines.Line2D at 0x112190b8>
pivot[pivot.Arrivals == 0].groupby('club').agg({'player': pd.Series.count,
'Departures': np.mean})
player | Departures | |
---|---|---|
club | ||
benfica | 82 | 0.874476 |
chelsea | 91 | 0.677357 |
man city | 110 | 0.411364 |
man united | 132 | 0.892356 |
porto | 50 | 2.031920 |
sporting | 75 | 1.635387 |
6 rows × 2 columns
pivot['Profit'] = pivot['Departures']-pivot['Arrivals']
pivot.sort('Profit').head(10)
type | player | club | country | Arrivals | Departures | Profit |
---|---|---|---|---|---|---|
52 | Andriy Shevchenko | chelsea | en | 40.48 | 0.00 | -40.48 |
854 | Rio Ferdinand | man united | en | 40.48 | 0.00 | -40.48 |
661 | Michael Essien | chelsea | en | 33.44 | 0.00 | -33.44 |
254 | Dimitar Berbatov | man united | en | 33.44 | 4.40 | -29.04 |
471 | Joleon Lescott | man city | en | 24.20 | 0.00 | -24.20 |
394 | Hernán Crespo | chelsea | en | 22.88 | 0.00 | -22.88 |
866 | Robinho | man city | en | 37.84 | 15.84 | -22.00 |
745 | Owen Hargreaves | man united | en | 22.00 | 0.00 | -22.00 |
517 | Jô | man city | en | 21.12 | 0.00 | -21.12 |
290 | Emmanuel Adebayor | man city | en | 25.52 | 5.63 | -19.89 |
10 rows × 6 columns
pivot.sort('Profit', ascending=False).head(10)
type | player | club | country | Arrivals | Departures | Profit |
---|---|---|---|---|---|---|
172 | Cristiano Ronaldo | man united | en | 15.400 | 82.72 | 67.320 |
284 | Eliaquim Mangala | porto | pt | 5.720 | 47.36 | 41.640 |
309 | Falcao | porto | pt | 4.780 | 41.36 | 36.580 |
427 | James Rodríguez | porto | pt | 6.470 | 39.60 | 33.130 |
213 | David Beckham | man united | en | 0.000 | 33.00 | 33.000 |
401 | Hulk | porto | pt | 16.720 | 48.40 | 31.680 |
81 | Axel Witsel | benfica | pt | 7.920 | 35.20 | 27.280 |
844 | Ricardo Carvalho | porto | pt | 0.000 | 26.40 | 26.400 |
336 | Fábio Coentrão | benfica | pt | 0.792 | 26.40 | 25.608 |
799 | Pepe | porto | pt | 1.760 | 26.40 | 24.640 |
10 rows × 6 columns