#!/usr/bin/env python
# coding: utf-8

# In[86]:

import tensorflow as tf
import pandas as pd
import numpy as np


# In[87]:

def read_csv(file):
    """Load the CSV at ``file`` into a DataFrame using ``pd.read_csv``."""
    return pd.read_csv(file)


def cleanup_data(data):
    """Return a cleaned copy of a raw season frame.

    Rows whose ``G`` value is null are dropped, ``Age`` is reduced to the
    integer before its first ``-``, ``MP`` to the integer before its first
    ``:``, and ``PTS`` is cast to ``int``.
    """
    # remove games when he didn't play
    data = data.drop(data[data.G.isnull()].index)
    # normalize age to years; ``n`` must be passed by keyword because
    # pandas 2.x made it keyword-only in ``Series.str.split``
    data['Age'] = data['Age'].str.split('-', n=1).str[0].astype(int)
    # normalize minutes to exclude seconds
    data['MP'] = data["MP"].str.split(":").str[0].astype(int)
    data['PTS'] = data['PTS'].astype(int)
    return data


def read_and_clean_csv(file, columns=None):
    """Read ``file``, clean it with ``cleanup_data`` and select ``columns``.

    ``columns`` is the list of column names to keep. When omitted, no
    columns are selected (same result as the previous ``[]`` default); the
    ``None`` sentinel only avoids a shared mutable default argument.
    """
    if columns is None:
        columns = []
    data = read_csv(file)
    clean_data = cleanup_data(data)
    return clean_data[columns]


csv_label_name = "PTS"  # points per game "+/-"
feature_names = ["G", "MP", "Age"]
csv_column_names = feature_names + [csv_label_name]
feature_columns = [
    tf.feature_column.numeric_column(feature) for feature in feature_names
]


def read_seasons(seasons=range(2002, 2018)):
    """Concatenate the cleaned ``<season>.csv`` files for ``seasons``.

    Each file is looked up relative to the current working directory and
    reduced to ``csv_column_names`` before concatenation.
    """
    stats = []
    for season in seasons:
        file = str(season) + ".csv"
        season_stat = read_and_clean_csv(file, columns=csv_column_names)
        stats.append(season_stat)
    return pd.concat(stats)


def _feature_frame(frame):
    """Build the feature DataFrame (Age, MP, G) fed to the estimator.

    ``.values`` drops the index of ``frame``, so the result carries a fresh
    default index -- presumably so it lines up with the label Series that
    is built the same way; confirm against ``pandas_input_fn``.
    """
    return pd.DataFrame({
        "Age": frame["Age"].values,
        "MP": frame["MP"].values,
        "G": frame["G"].values,
    })


stats = read_seasons()

# Positional split in concatenation order: the first ``train_factor`` share
# of rows is used for training, the remainder for testing.
train_factor = 0.8
split_at = int(len(stats) * train_factor)
train, test = stats[:split_at], stats[split_at:]


# In[88]:

print("test:" + str(test.shape) + " train: " + str(train.shape) + " stats: " + str(stats.shape))


# In[89]:

input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x=_feature_frame(train),
    y=pd.Series(train.PTS.values),
    shuffle=False,
    num_epochs=20000
)


# In[90]:

estimator = tf.estimator.DNNRegressor(hidden_units=[5, 5], feature_columns=feature_columns)
# estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns)


# In[91]:

# train
estimator.train(input_fn=input_fn_train, steps=20000)


# In[92]:

input_fn_test = tf.estimator.inputs.pandas_input_fn(
    x=_feature_frame(test),
    y=pd.Series(test.PTS.values),
    shuffle=False
)

evaluation = estimator.evaluate(input_fn=input_fn_test)
loss_score = evaluation["loss"]
print("Loss: {0:f}".format(loss_score))


# In[93]:

import itertools

input_fn_predict = tf.estimator.inputs.pandas_input_fn(
    x=_feature_frame(test),
    y=None,  # we are predicting
    shuffle=False
)

predictions = estimator.predict(input_fn=input_fn_predict)
predictions = list(p["predictions"][0] for p in predictions)


# In[94]:

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython (plain ``python`` run): there is no
    # ``get_ipython`` builtin, so keep matplotlib's default backend.
    pass

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
plt.plot(test.PTS.values)
plt.plot(predictions)
# Show this figure explicitly so the later plots start on a fresh one when
# the file runs outside an inline notebook.
plt.show()


# In[95]:

delta = test.PTS - predictions
# NOTE(review): this is the absolute value of the *mean* signed error, so
# positive and negative errors cancel; it is not the mean absolute error.
# The value is only displayed when run as a notebook cell.
np.abs(np.mean(delta)).round()  # mean prediction error


# In[96]:

plt.plot(stats.Age, stats.PTS)
plt.show()


# In[100]:

plt.scatter(stats["Age"], stats["PTS"])
plt.show()