from oauth2client.gce import AppAssertionCredentials
from bigquery_client import BigqueryClient
from pandas.io import gbq


def GetMetadata(path):
    """Return the raw body of a metadata-server entry.

    Issues a GET for ``BASE_PATH + path`` with the ``Metadata-Flavor: Google``
    header and returns whatever the server sends back, unparsed.

    Args:
        path: Path relative to ``computeMetadata/v1/``, for example
            ``'project/project-id'``.

    Returns:
        The response body as returned by ``read()``.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the request
            fails.
    """
    import contextlib
    import urllib2
    BASE_PATH = 'http://metadata/computeMetadata/v1/'
    request = urllib2.Request(BASE_PATH + path,
                              headers={'Metadata-Flavor': 'Google'})
    # closing() releases the HTTP response even if read() raises; the
    # original left the response object open.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return response.read()


credentials = AppAssertionCredentials(
    scope='https://www.googleapis.com/auth/bigquery')
client = BigqueryClient(credentials=credentials,
                        api='https://www.googleapis.com',
                        api_version='v2',
                        project_id=GetMetadata('project/project-id'))
# Replace gbq's private _authenticate hook so that it hands back the client
# built above.
gbq._authenticate = lambda: client

from pandas.io import gbq

# Import the four python modules that we use.
import match_stats
import features
import world_cup
import power

query = "SELECT * FROM (%(summary_query)s) LIMIT 1" % {
    'summary_query': match_stats.team_game_summary_query()}
# Bare expression: the result is only displayed in an interactive session.
gbq.read_gbq(query)

import features
reload(features)

# Sets the history size. This is how far back we will look before each game
# to aggregate statistics to predict the next game. For example, a history
# size of 5 will look at the previous 5 games played by a particular team in
# order to predict the next game.
history_size = 6

game_summaries = features.get_game_summaries()
data = features.get_features(history_size)

# Partition the world cup data and the club data. We're only going to train
# our model using club data. (`!=` replaces the obsolete `<>` spelling.)
club_data = data[data['competitionid'] != 4]

# Show the features of the latest game in competition id 4, which is the
# world cup.
data[data['competitionid'] == 4].iloc[0]

import pandas as pd

# Cross-tabulate goals scored against the match outcome, with the numeric
# points relabelled for readability.
pd.crosstab(
    club_data['goals'],
    club_data.replace(
        {'points': {0: 'lose', 1: 'tie', 3: 'win'}})['points'])

import world_cup
reload(world_cup)
import match_stats

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Don't train on games that ended in a draw, since they have less signal.
train = club_data.loc[club_data['points'] != 1]
# train = club_data

(model, test) = world_cup.train_model(
    train, match_stats.get_non_feature_columns())
print("\nRsquared: %0.03g" % model.prsquared)


def print_params(model, limit=None):
    """Print the model's coefficients grouped by sign.

    The ``'intercept'`` entry is removed first. Positive (> 0.001) and
    negative (< -0.001) coefficients are printed as ``exp(coef) - 1``;
    coefficients that are exactly 0.0 are printed unchanged under
    "Dropped features".

    Args:
        model: Object exposing ``params``, a pandas Series of coefficients
            that contains an ``'intercept'`` entry.
        limit: Maximum number of rows printed per group. A falsy value
            (``None`` or ``0``) prints every row.

    Raises:
        KeyError: If ``model.params`` has no ``'intercept'`` entry.
    """
    # `np` was used here without ever being imported in this file; import it
    # locally so the function does not depend on the interactive namespace.
    import numpy as np

    params = model.params.copy()
    del params['intercept']

    if not limit:
        limit = len(params)

    print("Positive features")
    # In-place Series.sort is kept as in the original (old-pandas API).
    params.sort(ascending=False)
    print(np.exp(params[params > 0.001]).sub(1)[:limit])

    print("\nDropped features")
    print(params[params == 0.0][:limit])

    print("\nNegative features")
    params.sort(ascending=True)
    print(np.exp(params[params < -0.001]).sub(1)[:limit])


print_params(model, 10)

reload(world_cup)
results = world_cup.predict_model(
    model, test, match_stats.get_non_feature_columns())

predictions = world_cup.extract_predictions(
    results.copy(), results['predicted'])

# The two bare expressions below are only displayed in an interactive
# session; run as a plain script they are evaluated and discarded.
print('Correct predictions:')
predictions[(predictions['predicted'] > 50) & (predictions['points'] == 3)][:5]

print('\nIncorrect predictions:')
predictions[(predictions['predicted'] > 50) & (predictions['points'] < 3)][:5]

import pylab as pl

# Compute a baseline, which is the percentage of overall outcomes that are
# actually wins (remember in soccer we can have draws too).
baseline = (sum([yval == 3 for yval in club_data['points']]) * 1.0 /
            len(club_data))
y = [yval == 3 for yval in test['points']]
world_cup.validate(3, y, results['predicted'], baseline, compute_auc=True)
pl.show()

# `math.exp` is used below but `math` was never imported in this file.
import math
import power
reload(power)
reload(world_cup)


def points_to_sgn(p):
    """Collapse a value to its sign, with a dead band of +/-0.1.

    Returns 1.0 when ``p > 0.1``, -1.0 when ``p < -0.1`` and 0.0 otherwise.
    """
    if p > 0.1:
        return 1.0
    elif p < -0.1:
        return -1.0
    else:
        return 0.0


power_cols = [
    ('points', points_to_sgn, 'points'),
]

power_data = power.add_power(club_data, game_summaries, power_cols)
# Same draw filter as the first model (`!=` replaces the obsolete `<>`).
power_train = power_data.loc[power_data['points'] != 1]
# power_train = power_data
(power_model, power_test) = world_cup.train_model(
    power_train, match_stats.get_non_feature_columns())
print("\nRsquared: %0.03g, Power Coef %0.03g" % (
    power_model.prsquared,
    math.exp(power_model.params['power_points'])))

power_results = world_cup.predict_model(
    power_model, power_test, match_stats.get_non_feature_columns())
power_y = [yval == 3 for yval in power_test['points']]
world_cup.validate(3, power_y, power_results['predicted'], baseline,
                   compute_auc=True, quiet=False)

pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
# Add the old model to the graph
world_cup.validate('old', y, results['predicted'], baseline,
                   compute_auc=True, quiet=True)
pl.legend(loc="lower right")
pl.show()

print_params(power_model, 8)

import world_cup
import features
reload(match_stats)
reload(features)
reload(world_cup)

wc_data = world_cup.prepare_data(features.get_wc_features(history_size))
wc_labeled = world_cup.prepare_data(features.get_features(history_size))
wc_labeled = wc_labeled[wc_labeled['competitionid'] == 4]
wc_power_train = game_summaries[game_summaries['competitionid'] == 4].copy()

import pandas as pd
wc_home = pd.read_csv('wc_home.csv')


def add_home_override(df, home_map):
    """Overwrite ``df['is_home']`` from a teamid -> is_home mapping.

    Each row receives ``home_map[teamid]`` when its ``teamid`` is a key of
    ``home_map``; any other row is set to 0.0 (if we don't know, assume not
    at home). ``df`` is modified in place and nothing is returned.

    Args:
        df: DataFrame with a ``'teamid'`` column.
        home_map: dict mapping team id to the ``is_home`` value to store.
    """
    # Assign the whole column in one statement. The original wrote through
    # chained indexing (df['is_home'].iloc[ii] = ...), which pandas does not
    # guarantee to propagate back to `df`.
    df['is_home'] = [home_map.get(team, 0.0) for team in df['teamid']]


# Map each team id listed in wc_home.csv to its is_home value.
home_override = dict(zip(wc_home['teamid'], wc_home['is_home']))

# Add home team overrides.
add_home_override(wc_data, home_override)

# When training power data, since the games span multiple competitions, just
# set is_home to 0.5. Otherwise when we looked at games from the 2010 world
# cup, we'd think Brazil was still at home instead of South Africa.
wc_power_train['is_home'] = 0.5
wc_power_data = power.add_power(wc_data, wc_power_train, power_cols)

wc_results = world_cup.predict_model(
    power_model, wc_power_data, match_stats.get_non_feature_columns())

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Index both frames by (matchid, teamid) so that the known points line up
# row-for-row when copied across.
wc_with_points = wc_power_data.copy()
wc_with_points.index = pd.Index(
    zip(wc_with_points['matchid'], wc_with_points['teamid']))
wc_labeled.index = pd.Index(
    zip(wc_labeled['matchid'], wc_labeled['teamid']))
wc_with_points['points'] = wc_labeled['points']
wc_pred = world_cup.extract_predictions(wc_with_points,
                                        wc_results['predicted'])

# Reverse our predictions to show the most recent first.
# NOTE(review): the reversed frame is not assigned, so wc_pred itself keeps
# its original order; this line only has a visible effect interactively.
wc_pred.reindex(index=wc_pred.index[::-1])
# Show our predictions for the games that have already happened.
wc_pred[wc_pred['points'] >= 0.0]

final = wc_power_data[wc_power_data['matchid'] == '731830']
final

op = game_summaries


def countryStats(d, name):
    """Return the rows of ``d`` whose ``'team_name'`` equals ``name``."""
    pred = d['team_name'] == name
    return d[pred]


fr = countryStats(op, 'France')
ge = countryStats(op, 'Germany')
ar = countryStats(op, 'Argentina')
br = countryStats(op, 'Brazil')
ne = countryStats(op, 'Netherlands')

ge[:6]

# Rows whose points are not >= 0 (this includes rows where points is NaN).
wc_pred[~(wc_pred['points'] >= 0)][[
    'team_name', 'op_team_name', 'predicted']]