# NOTE: Jupyter-notebook export. The original IPython magic lines are kept
# as comments because they are not valid Python outside a notebook:
#   %load_ext watermark
#   %watermark -a 'Sebastian Raschka' -v
"""Scrape player statistics from dreamteamfc.com into a dated CSV snapshot.

Combines three pages of the site -- the full player table, the
injuries-and-cards table, and the month/week form guide -- into a single
pandas DataFrame and writes it to ../data/dreamteamfc_<YYYYMMDD>.csv,
where the date is the site's own "points last updated" stamp.
"""
from datetime import datetime

import pandas as pd

# requests and bs4 are imported lazily inside the fetch helpers below so
# the pure parsing code (e.g. parse_update_date) stays importable and
# testable without the scraping dependencies installed.

REQUEST_TIMEOUT = 30  # seconds; without it requests.get can hang forever


def _fetch_soup(url):
    """Download *url* and return it parsed with html5lib.

    html5lib deals better with the site's broken HTML than lxml.
    """
    import requests
    from bs4 import BeautifulSoup
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(r.text, 'html5lib')


def _sibling_texts(td):
    """Return the text of every tag sibling after *td* (rest of its row)."""
    import bs4
    return [i.text for i in td.next_siblings if isinstance(i, bs4.element.Tag)]


def scrape_players():
    """Scrape the main player table.

    Returns (player_dict, name_list) where player_dict maps a player's
    name to [name, position, team, vfm, value, points]; ``value`` has its
    trailing 'm' (millions) stripped so it can be cast to float later.
    """
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/players/ALL/')
    player_dict = {}
    name_list = []
    for td in soup.findAll("td", {"class": "tabName"}):
        # The cell text looks like '<name>Statistics...'; keep the name only.
        name = td.text.split('Statistics')[-1].strip()
        if not name:
            continue
        name_list.append(name)
        position, team, vfm, value, points = _sibling_texts(td)
        value = value.strip('m')
        player_dict[name] = [name, position, team, vfm, value, points]
    return player_dict, name_list


def add_injury_status(df):
    """Fill the status/description/returns columns from the injuries page.

    Mutates *df* (indexed by player name) in place; returns the list of
    names found, for logging.
    """
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/')
    name_list = []
    for td in soup.findAll("td", {"class": "tabName2"}):
        name = td.text.split('stats')[-1].strip()
        if not name:
            continue
        name_list.append(name)
        position, team, status, description, returns = _sibling_texts(td)
        df.loc[df.index == name, ['status', 'description', 'returns']] = (
            status, description, returns)
    return name_list


def add_form_guide(df):
    """Fill the month_points/week_points columns from the form guide.

    Mutates *df* in place; returns the list of names found.
    """
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/form-guide/all')
    name_list = []
    for td in soup.findAll("td", {"class": "tabName"}):
        name = td.text.strip()
        if not name:
            continue
        name_list.append(name)
        res = _sibling_texts(td)
        try:
            month_pts, week_pts = float(res[-2]), float(res[-1])
        except ValueError:
            # Some rows carry non-numeric placeholders; skip them.
            continue
        df.loc[df.index == name, ['month_points', 'week_points']] = (
            month_pts, week_pts)
    return name_list


def parse_update_date(raw_date):
    """Convert the site's update stamp into 'YYYYMMDD'.

    *raw_date* is free text whose last whitespace-separated token is a
    'dd/mm/yyyy' date, e.g. 'Points last updated: 15/08/2015'.
    """
    token = raw_date.split()[-1].replace('/', '').strip()
    return datetime.strptime(token, '%d%m%Y').date().strftime('%Y%m%d')


def fetch_update_date():
    """Scrape the 'points last updated' date stamp from the players page."""
    # The original re-parsed this page with BeautifulSoup(data) and no
    # explicit parser; use html5lib consistently with the other fetches.
    soup = _fetch_soup('https://www.dreamteamfc.com/statistics/players/ALL/')
    return parse_update_date(soup.find('li', {'class': 'pointsupdateinfo'}).text)


def main():
    # 1) Base player table -> DataFrame.
    player_dict, name_list = scrape_players()
    print('Found: %s' % len(name_list))
    print(name_list[-1])

    df = pd.DataFrame.from_dict(player_dict, orient='index')
    df.columns = ['name', 'position', 'team', 'vfm', 'value', 'points']
    df[['vfm', 'value']] = df[['vfm', 'value']].astype(float)
    df[['points']] = df[['points']].astype(int)

    # 2) Injury / card status (empty placeholders, filled in place).
    df['status'] = pd.Series('', index=df.index)
    df['description'] = pd.Series('', index=df.index)
    df['returns'] = pd.Series('', index=df.index)
    name_list = add_injury_status(df)
    print('Found: %s' % len(name_list))
    print(name_list[-1])

    # 3) Month / week form points.
    df['month_points'] = pd.Series(0, index=df.index)
    df['week_points'] = pd.Series(0, index=df.index)
    name_list = add_form_guide(df)
    print('Found: %s' % len(name_list))
    print(name_list[-1])

    # Reorder columns and normalize player names to lowercase.
    df = df[['name', 'position', 'team', 'vfm', 'value', 'points',
             'month_points', 'week_points', 'status', 'description', 'returns']]
    df['name'] = df['name'].apply(lambda x: x.lower())

    # 4) Stamp the snapshot with the site's own update date and save.
    date = fetch_update_date()
    print(date)
    df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)


if __name__ == '__main__':
    main()