# NOTE: Jupyter-notebook export. The original IPython magic lines are kept
# as comments because they are not valid Python outside a notebook:
#   %load_ext watermark
#   %watermark -a 'Sebastian Raschka' -v
"""Scrape player statistics from dreamteamfc.com into a dated CSV snapshot.

Combines three pages of the site -- the full player table, the
injuries-and-cards table, and the month/week form guide -- into a single
pandas DataFrame and writes it to ../data/dreamteamfc_<YYYYMMDD>.csv,
where the date is the site's own "points last updated" stamp.
"""
from datetime import datetime

import pandas as pd

# requests and bs4 are imported lazily inside the fetch helpers below so
# the pure parsing code (e.g. parse_update_date) stays importable and
# testable without the scraping dependencies installed.

REQUEST_TIMEOUT = 30  # seconds; without it requests.get can hang forever


def _fetch_soup(url):
    """Download *url* and return it parsed with html5lib.

    html5lib deals better with the site's broken HTML than lxml.
    """
    import requests
    from bs4 import BeautifulSoup
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(r.text, 'html5lib')


def _sibling_texts(td):
    """Return the text of every tag sibling after *td* (rest of its row)."""
    import bs4
    return [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]


def scrape_players():
    """Scrape the main player table.

    Returns (player_dict, name_list) where player_dict maps a player's
    name to [name, position, team, vfm, value, points]; ``value`` has its
    trailing 'm' (millions) stripped so it can be cast to float later.
    """
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/players/ALL/')
    player_dict = {}
    name_list = []
    for td in soup.findAll("td", {"class": "tabName"}):
        # The cell text looks like '<name>Statistics...'; keep the name only.
        name = td.text.split('Statistics')[-1].strip()
        if not name:
            continue
        name_list.append(name)
        position, team, vfm, value, points = _sibling_texts(td)
        value = value.strip('m')
        player_dict[name] = [name, position, team, vfm, value, points]
    return player_dict, name_list


def add_injury_status(df):
    """Fill the status/description/returns columns from the injuries page.

    Mutates *df* (indexed by player name) in place; returns the list of
    names found, for logging.
    """
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/')
    name_list = []
    for td in soup.findAll("td", {"class": "tabName2"}):
        name = td.text.split('stats')[-1].strip()
        if not name:
            continue
        name_list.append(name)
        position, team, status, description, returns = _sibling_texts(td)
        df.loc[df.index == name, ['status', 'description', 'returns']] = (
            status, description, returns)
    return name_list


def add_form_guide(df):
    """Fill the month_points/week_points columns from the form guide.

    Mutates *df* in place; returns the list of names found.
    """
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/form-guide/all')
    name_list = []
    for td in soup.findAll("td", {"class": "tabName"}):
        name = td.text.strip()
        if not name:
            continue
        name_list.append(name)
        res = _sibling_texts(td)
        try:
            month_pts, week_pts = float(res[-2]), float(res[-1])
        except ValueError:
            # Some rows carry non-numeric placeholders; skip them.
            continue
        df.loc[df.index == name, ['month_points', 'week_points']] = (
            month_pts, week_pts)
    return name_list


def parse_update_date(raw_date):
    """Convert the site's update stamp into 'YYYYMMDD'.

    *raw_date* is free text whose last whitespace-separated token is a
    'dd/mm/yyyy' date, e.g. 'Points last updated: 15/08/2015'.
    """
    token = raw_date.split()[-1].replace('/', '').strip()
    return datetime.strptime(token, '%d%m%Y').date().strftime('%Y%m%d')


def fetch_update_date():
    """Scrape the 'points last updated' date stamp from the players page."""
    # The original re-parsed this page with BeautifulSoup(data) and no
    # explicit parser; use html5lib consistently with the other fetches.
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/players/ALL/')
    return parse_update_date(soup.find('li', {'class': 'pointsupdateinfo'}).text)


def main():
    # 1) Base player table -> DataFrame.
    player_dict, name_list = scrape_players()
    print('Found: %s' % len(name_list))
    print(name_list[-1])

    df = pd.DataFrame.from_dict(player_dict, orient='index')
    df.columns = ['name', 'position', 'team', 'vfm', 'value', 'points']
    df[['vfm', 'value']] = df[['vfm', 'value']].astype(float)
    df[['points']] = df[['points']].astype(int)

    # 2) Injury / card status (empty placeholders, filled in place).
    df['status'] = pd.Series('', index=df.index)
    df['description'] = pd.Series('', index=df.index)
    df['returns'] = pd.Series('', index=df.index)
    name_list = add_injury_status(df)
    print('Found: %s' % len(name_list))
    print(name_list[-1])

    # 3) Month / week form points.
    df['month_points'] = pd.Series(0, index=df.index)
    df['week_points'] = pd.Series(0, index=df.index)
    name_list = add_form_guide(df)
    print('Found: %s' % len(name_list))
    print(name_list[-1])

    # Reorder columns and normalize player names to lowercase.
    df = df[['name', 'position', 'team', 'vfm', 'value', 'points',
             'month_points', 'week_points', 'status', 'description', 'returns']]
    df['name'] = df['name'].apply(lambda x: x.lower())

    # 4) Stamp the snapshot with the site's own update date and save.
    date = fetch_update_date()
    print(date)
    df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)


if __name__ == '__main__':
    main()