from oauth2client.gce import AppAssertionCredentials
from bigquery_client import BigqueryClient
from pandas.io import gbq

def GetMetadata(path):
  """Fetch one value from the metadata server.

  Args:
    path: Key relative to the 'computeMetadata/v1/' root, for example
      'project/project-id'.

  Returns:
    The response body exactly as read from the server.

  Any error raised by urllib2 while opening or reading the URL propagates to
  the caller; nothing is caught here.
  """
  import contextlib
  import urllib2
  BASE_PATH = 'http://metadata/computeMetadata/v1/'
  # The request is sent with the 'Metadata-Flavor: Google' header.
  request = urllib2.Request(BASE_PATH + path, headers={'Metadata-Flavor': 'Google'})
  # Python 2 urllib2 responses are not context managers, so use
  # contextlib.closing to guarantee the connection is released even when
  # read() raises.
  with contextlib.closing(urllib2.urlopen(request)) as response:
    return response.read()

# Credentials restricted to the BigQuery scope, built with oauth2client's
# GCE AppAssertionCredentials.
credentials = AppAssertionCredentials(scope='https://www.googleapis.com/auth/bigquery')

# BigQuery v2 client. The project id is looked up at import time through
# GetMetadata('project/project-id'), so this line performs a network request.
client = BigqueryClient(credentials=credentials,
                        api='https://www.googleapis.com',
                        api_version='v2',
                        project_id=GetMetadata('project/project-id'))

# Replace pandas gbq's private _authenticate attribute with a function that
# returns the client built above.
# NOTE(review): presumably this makes gbq.read_gbq reuse these credentials
# instead of running its own auth flow -- confirm against the installed pandas.
gbq._authenticate = lambda: client

from pandas.io import gbq

# Import the four python modules that we use.
import match_stats
import features
import world_cup
import power
# Wrap the team game summary query in an outer SELECT limited to one row and
# run it through gbq. The returned frame is not bound to a name.
# NOTE(review): presumably a quick connectivity/schema check -- confirm.
query = "SELECT * FROM (%(summary_query)s) LIMIT 1" % {
    'summary_query': match_stats.team_game_summary_query()}
gbq.read_gbq(query)

import features
reload(features)

# Number of prior games aggregated per team when building the features for
# its next game. With a history size of 5, for instance, the previous 5 games
# a team played are summarised to predict the following one.
history_size = 6

game_summaries = features.get_game_summaries()
data = features.get_features(history_size)

# Split off the club games (every competition except id 4, the world cup).
# Only the club games are used for training the model.
club_data = data[data['competitionid'] != 4]

# Show the features of the first world cup row (competition id 4).
data[data['competitionid'] == 4].iloc[0]


import pandas as pd

# Cross-tabulate goals scored against the match outcome, with the numeric
# points values relabelled as lose / tie / win for readability.
pd.crosstab(
    club_data['goals'],
    club_data['points'].replace({0: 'lose', 1: 'tie', 3: 'win'}))

import world_cup
reload(world_cup)
import match_stats

# Widen the pandas display limits so large frames are shown in full.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Draws (1 point) carry less signal, so leave them out of the training set.
# To train on every club game instead, bind train to club_data directly.
train = club_data.loc[club_data['points'] != 1]

model, test = world_cup.train_model(
    train, match_stats.get_non_feature_columns())
print("\nRsquared: %0.03g" % model.prsquared)

def print_params(model, limit=None):
    """Print the model's coefficients grouped as positive, dropped and negative.

    The 'intercept' entry is left out when present. Positive and negative
    coefficients are printed as exp(coefficient) - 1; dropped coefficients
    (exactly 0.0) are printed as they are. Coefficients with an absolute
    value of at most 0.001 that are not exactly zero appear in no group.

    Args:
        model: Object exposing a pandas Series of coefficients as `params`.
            The Series is not modified.
        limit: Maximum number of entries printed per group. A falsy value
            (None or 0) prints every entry.
    """
    # Local import: the file never imports numpy at module level, so the
    # bare `np` name would otherwise raise NameError here.
    import numpy as np

    params = model.params.copy()
    if 'intercept' in params.index:
        params = params.drop('intercept')

    if not limit:
        limit = len(params)

    # Order by position instead of the in-place Series.sort(), which later
    # pandas releases removed; argsort + iloc works across versions.
    ascending_order = np.argsort(params.values)
    ascending = params.iloc[ascending_order]
    descending = params.iloc[ascending_order[::-1]]

    print("Positive features")
    print(np.exp(descending[descending > 0.001]).sub(1).iloc[:limit])

    print("\nDropped features")
    print(descending[descending == 0.0].iloc[:limit])

    print("\nNegative features")
    print(np.exp(ascending[ascending < -0.001]).sub(1).iloc[:limit])

# Show the ten strongest coefficients in each group.
print_params(model, 10)


reload(world_cup)

# Score the held-out test rows with the trained model.
results = world_cup.predict_model(
    model, test, match_stats.get_non_feature_columns())

predictions = world_cup.extract_predictions(
    results.copy(), results['predicted'])

# First five rows predicted above 50 that did end with 3 points.
print('Correct predictions:')
predictions[(predictions['predicted'] > 50) & (predictions['points'] == 3)][:5]

# First five rows predicted above 50 that ended with fewer than 3 points.
print('\nIncorrect predictions:')
predictions[(predictions['predicted'] > 50) & (predictions['points'] < 3)][:5]

import pylab as pl

# Baseline: the fraction of all club games that ended in a win (3 points).
# Soccer games can also end in a draw, so this is not simply one half.
baseline = (sum(yval == 3 for yval in club_data['points'])
            / float(len(club_data)))

# Boolean win labels for the test rows, in the same order as the predictions.
y = [yval == 3 for yval in test['points']]
world_cup.validate(
    3, y, results['predicted'], baseline, compute_auc=True)
pl.show()

import power
# reload() re-executes each module's code, so edits made to power.py or
# world_cup.py since they were first imported take effect for the code below.
reload(power)
reload(world_cup)
def points_to_sgn(p):
  """Collapse p to a sign: 1.0 above 0.1, -1.0 below -0.1, otherwise 0.0."""
  if p > 0.1:
    return 1.0
  if p < -0.1:
    return -1.0
  # Everything inside the [-0.1, 0.1] dead band (and any value for which both
  # comparisons are false) counts as zero.
  return 0.0
# math.exp is used below, but the file never imports math at module level;
# import it here so the Rsquared/Power Coef line does not raise NameError.
import math

# Columns handed to power.add_power; each entry pairs a column name with the
# points_to_sgn transform and a second name.
# NOTE(review): presumably (source column, transform, output suffix), given
# that the fitted coefficient is read back as 'power_points' -- confirm in power.py.
power_cols = [
  ('points', points_to_sgn, 'points'),
]

power_data = power.add_power(club_data, game_summaries, power_cols)
# As with the first model, leave draws (1 point) out of the training set.
# To train on every game instead, bind power_train to power_data directly.
power_train = power_data.loc[power_data['points'] != 1]

(power_model, power_test) = world_cup.train_model(
    power_train, match_stats.get_non_feature_columns())
print("\nRsquared: %0.03g, Power Coef %0.03g" % (
    power_model.prsquared,
    math.exp(power_model.params['power_points'])))

power_results = world_cup.predict_model(power_model, power_test,
    match_stats.get_non_feature_columns())
power_y = [yval == 3 for yval in power_test['points']]
world_cup.validate(3, power_y, power_results['predicted'], baseline,
                   compute_auc=True, quiet=False)

# Diagonal reference line labelled 'Luck'.
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
# Add the old model to the graph
world_cup.validate('old', y, results['predicted'], baseline,
                   compute_auc=True, quiet=True)
pl.legend(loc="lower right")
pl.show()

print_params(power_model, 8)

import world_cup
import features
# Re-execute the project modules so the calls below use their current source.
reload(match_stats)
reload(features)
reload(world_cup)

# World cup feature rows, built with the same history size as the club data.
wc_data = world_cup.prepare_data(features.get_wc_features(history_size))
# The general feature set put through the same preparation, then restricted to
# competition id 4 (the world cup). Its 'points' column is used further down
# to label the predictions.
wc_labeled = world_cup.prepare_data(features.get_features(history_size))
wc_labeled = wc_labeled[wc_labeled['competitionid'] == 4]
# Independent copy of the world cup game summaries, so the is_home overwrite
# applied to it later does not touch game_summaries.
wc_power_train = game_summaries[game_summaries['competitionid'] == 4].copy()

import pandas as pd
# Per-team home flags; the 'teamid' and 'is_home' columns are read from it below.
wc_home = pd.read_csv('wc_home.csv')

def add_home_override(df, home_map):
  """Overwrite df['is_home'] in place from a team id -> is_home mapping.

  Args:
    df: DataFrame with a 'teamid' column; its 'is_home' column is replaced.
    home_map: Dict mapping team id to the is_home value to use for that team.

  Returns:
    None. df is modified in place.
  """
  # Assign the whole column at once. The previous per-row
  # df['is_home'].iloc[ii] = ... form is chained indexing, which can write to
  # a temporary copy and leave df unchanged.
  # Teams missing from home_map are assumed not to be at home (0.0).
  df['is_home'] = [home_map.get(team, 0.0) for team in df['teamid']]
        
# Map each team id listed in wc_home.csv to its is_home value. Pairing the two
# columns directly avoids rebuilding every row with .iloc inside an index
# loop; if a team id is repeated, the last row wins, as before.
home_override = dict(zip(wc_home['teamid'], wc_home['is_home']))

# Add home team overrides.
add_home_override(wc_data, home_override)

# When training power data, since the games span multiple competitions, just set is_home to 0.5
# Otherwise when we looked at games from the 2010 world cup, we'd think Brazil was still at
# home instead of South Africa.
wc_power_train['is_home'] = 0.5
wc_power_data = power.add_power(wc_data, wc_power_train, power_cols)

# Score the world cup rows with the power model trained on club data.
wc_results = world_cup.predict_model(power_model, wc_power_data,
    match_stats.get_non_feature_columns())

# Widen the pandas display limits so large frames are shown in full.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Key both frames by (matchid, teamid) so that copying the 'points' column
# across aligns each labelled row with its matching prediction row. Rows with
# no labelled counterpart end up with missing points.
wc_with_points = wc_power_data.copy()
wc_with_points.index = pd.Index(
    list(zip(wc_with_points['matchid'], wc_with_points['teamid'])))
wc_labeled.index = pd.Index(
    list(zip(wc_labeled['matchid'], wc_labeled['teamid'])))
wc_with_points['points'] = wc_labeled['points']

wc_pred = world_cup.extract_predictions(wc_with_points,
                                        wc_results['predicted'])

# Reverse our predictions to show the most recent first. reindex returns a new
# frame rather than reordering in place, so the result has to be bound back to
# wc_pred for the reversal to take effect.
wc_pred = wc_pred.reindex(index=wc_pred.index[::-1])
# Show our predictions for the games that have already happened.
wc_pred[wc_pred['points'] >= 0.0]


# Rows of the world cup power data whose match id equals '731830'; the id is
# compared as a string.
# NOTE(review): the variable name suggests this is the final -- confirm the id.
final = wc_power_data[wc_power_data['matchid'] == '731830']
final

# Short alias for the game summaries, used by the per-country lookups below.
op = game_summaries

def countryStats(d, name):
  """Return the rows of d whose 'team_name' column equals name."""
  return d[d['team_name'] == name]

# Per-country slices of the game summaries.
fr, ge, ar, br, ne = (
    countryStats(op, country)
    for country in ('France', 'Germany', 'Argentina', 'Brazil', 'Netherlands'))
# First six Germany rows.
ge[:6]

# Predictions for the rows that do not have points of zero or more, which
# includes rows whose points are missing.
wc_pred[~(wc_pred['points'] >= 0)][[
    'team_name', 'op_team_name', 'predicted']]