from random import random

from IPython.display import SVG
import pygal
from pybrain.structure import SigmoidLayer
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import percentError

from utils import (get_matches, get_team_stats, extract_samples, normalize,
                   split_samples, graph_teams_stat_bars,
                   graph_matches_results_scatter)

# the features I will feed to the classifier as input data.
# Names ending in '_2' refer to the second team of the match; the others
# (except 'year') refer to the first team.
input_features = ['year',
                  'matches_won_percent',
                  'podium_score_yearly',
                  'matches_won_percent_2',
                  'podium_score_yearly_2',]

# the feature giving the result the classifier must learn to predict
# (I recommend always using 'winner')
output_feature = 'winner'

# used to avoid including tied matches in the learning process. I found this
# greatly improves the classifier accuracy.
# I know there will be some ties, but I'm willing to fail on those and have
# better accuracy with all the rest.
# at this point, this code will break if you set it to False, because the
# network uses a sigmoid function with a threshold for output, so it is able
# to distinguish only 2 kinds of results.
exclude_ties = True

# used to duplicate matches data, reversing the teams (team1->team2, and
# vice versa).
# This helps on visualizations, and also improves precision of the predictions
# avoiding a dependence on the order of the teams from the input.
duplicate_with_reversed = True


def show(graph):
    """Small utility to display pygal graphs.

    Renders ``graph`` and wraps the rendered output in an IPython ``SVG``
    object so a notebook can display it inline.
    """
    return SVG(graph.render())


team_stats = get_team_stats()
team_stats  # bare expression: only has a visible effect as a notebook cell output

show(graph_teams_stat_bars(team_stats, 'matches_won_percent'))
show(graph_teams_stat_bars(team_stats, 'podium_score_yearly'))

matches = get_matches(with_team_stats=True,
                      duplicate_with_reversed=duplicate_with_reversed,
                      exclude_ties=exclude_ties)
matches  # bare expression: only has a visible effect as a notebook cell output

show(graph_matches_results_scatter(matches,
                                   'matches_won_percent',
                                   'matches_won_percent_2'))
show(graph_matches_results_scatter(matches,
                                   'podium_score_yearly',
                                   'podium_score_yearly_2'))

inputs, outputs = extract_samples(matches, input_features, output_feature)
normalizer, inputs = normalize(inputs)
train_inputs, train_outputs, test_inputs, test_outputs = split_samples(inputs, outputs)

# one input unit per feature, two hidden layers ten times as wide as the
# input layer, and a single sigmoid output unit.
hidden_units = 10 * len(input_features)
n = buildNetwork(len(input_features),
                 hidden_units,
                 hidden_units,
                 1,
                 outclass=SigmoidLayer,
                 bias=True)


def neural_result(input):
    """Call the neural network, and translate its output to a match result.

    ``input`` is one (already normalized) sample with a value per entry of
    ``input_features``.

    Returns 2 when the network output is at least 0.5 and 1 otherwise;
    ``predict`` maps 1 to the first team and 2 to the second one.
    """
    # the network was built with exactly one output unit, so read that value
    # explicitly instead of relying on the truthiness of a one-element array.
    n_output = n.activate(input)[0]
    return 2 if n_output >= 0.5 else 1


def test_network():
    """Print the accuracy (100 - percent error) on the train and test sets."""
    # build real lists: on Python 3 ``map`` returns a lazy iterator, which
    # percentError cannot compare element-wise against the expected outputs.
    train_predictions = [neural_result(sample) for sample in train_inputs]
    test_predictions = [neural_result(sample) for sample in test_inputs]
    print(100 - percentError(train_predictions, train_outputs),
          100 - percentError(test_predictions, test_outputs))


train_set = ClassificationDataSet(len(input_features))

# targets are shifted down by one so they match what the sigmoid output can
# produce (presumably winner labels 1/2 become 0/1, since ties are excluded).
for sample_input, sample_output in zip(train_inputs, train_outputs):
    train_set.addSample(sample_input, [sample_output - 1])

trainer = BackpropTrainer(n, dataset=train_set, momentum=0.5, weightdecay=0.0)
train_set.assignClasses()

# accuracy before any training, then after each of the 20 training epochs
test_network()
for _epoch in range(20):
    trainer.train()
    test_network()


def predict(year, team1, team2):
    """Predict the result of a match between ``team1`` and ``team2``.

    Builds one input sample following the order of ``input_features``:
    features ending in '_2' are read from ``team2``'s row of ``team_stats``,
    the remaining stats from ``team1``'s row, and 'year' is taken from the
    ``year`` argument. The sample is normalized with ``normalizer`` and fed
    to ``neural_result``.

    Returns ``team1`` or ``team2`` (whichever is predicted to win), 'tie'
    for a result of 0, or an 'Unknown result: ...' string for anything else.

    Raises ValueError when a feature is neither 'year' nor a column of
    ``team_stats``.
    """
    second_team_suffix = '_2'
    inputs = []

    for feature in input_features:
        # only a trailing '_2' marks a second-team feature; strip just that
        # suffix so names containing '_2' elsewhere are left untouched.
        from_team_2 = feature.endswith(second_team_suffix)
        if from_team_2:
            feature = feature[:-len(second_team_suffix)]

        if feature in team_stats.columns.values:
            team = team2 if from_team_2 else team1
            value = team_stats.loc[team, feature]
        elif feature == 'year':
            value = year
        else:
            raise ValueError("Don't know where to get feature: " + feature)

        inputs.append(value)

    # NOTE(review): a flat list is handed to the normalizer; confirm that its
    # transform() accepts a single 1-D sample.
    inputs = normalizer.transform(inputs)
    result = neural_result(inputs)

    # NOTE(review): neural_result currently returns only 1 or 2, so the 'tie'
    # and 'Unknown result' branches are not reachable as the code stands.
    if result == 0:
        return 'tie'
    elif result == 1:
        return team1
    elif result == 2:
        return team2
    else:
        return 'Unknown result: ' + str(result)


predict(1950, 'Mexico', 'Brazil')  # real result: 4-0 wins Brazil
predict(1990, 'United Arab Emirates', 'Colombia')  # real result: 2-0 wins Colombia
predict(2002, 'South Africa', 'Spain')  # real result: 2-3 wins Spain
predict(2010, 'Japan', 'Cameroon')  # real result: 1-0 wins Japan

predict(2014, 'Argentina', 'Brazil')
predict(2014, 'Spain', 'Haiti')
predict(2014, 'Russia', 'Germany')
predict(2014, 'Russia', 'Russia')