import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# suppress warning messages
import warnings
warnings.filterwarnings('ignore')
# read in the csv file
train = pd.read_csv("football_train.csv", index_col = 0)
# check that the file was read in correctly
train.head()
train.shape
# standardize the numeric columns (z-score: subtract the mean, divide by the standard deviation)
n = train.shape[1]
for i in range(n - 1):  # skip the last column, which is the response (Wins)
    train.iloc[:, i] = (train.iloc[:, i] - np.mean(train.iloc[:, i])) / np.std(train.iloc[:, i])
train.head()
# correlation of each variable with the response (the last column, Wins)
train.corr().iloc[:, n-1]
plt.figure(figsize=(16,16))
sns.heatmap(train.corr(), annot = True)
plt.xlim([0, n])
plt.ylim([0, n])
plt.title("correlation heatmap: numeric variables")
plt.show()
# keep the predictors whose absolute correlation with the response exceeds 0.35
correlations = train.corr().iloc[:, n-1]
indices = []
for i in range(n - 1):
    if abs(correlations.iloc[i]) > 0.35:
        indices.append(i)
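As a quick sanity check, we can print the names of the selected predictors:
# names of the predictors whose |correlation| with the response exceeds 0.35
print(train.columns[indices])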
Suppose that we want to predict the variable Wins of each team
from the other predictors in the dataframe.
We could train the model on the entire dataset, but then we would have no measure of how well it predicts on future/unseen data.
Therefore, to estimate the model's predictive performance on unseen data, we split the dataset into a training set and a testing set:
# import the train_test_split() function from sklearn
from sklearn.model_selection import train_test_split
X = train.iloc[:, range(0, n-1)]
y = train.iloc[:, n-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print("size of the training set is %d, size of the testing set is %d." %(X_train.shape[0], X_test.shape[0]))
# read in the held-out test set and its true outcomes
test_x = pd.read_csv("football_test.csv", index_col = 0)
test_y = pd.read_csv("football_answers.csv")
test_y = test_y.loc[:, "Wins"]
# standardize the features of the test set; every column here is a predictor,
# so (unlike the training loop above) no column is skipped
for i in range(test_x.shape[1]):
    test_x.iloc[:, i] = (test_x.iloc[:, i] - np.mean(test_x.iloc[:, i])) / np.std(test_x.iloc[:, i])
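Note that the test features above are standardized with their own statistics. A common alternative is to reuse the training set's means and standard deviations, e.g. via sklearn's StandardScaler; a minimal sketch, assuming the raw (unstandardized) files are re-read and the feature columns of the two files line up:
# sketch: scale the test set using statistics learned from the training set
from sklearn.preprocessing import StandardScaler
raw_train = pd.read_csv("football_train.csv", index_col = 0)
raw_test = pd.read_csv("football_test.csv", index_col = 0)
scaler = StandardScaler().fit(raw_train.iloc[:, :-1])  # fit on the training features only
test_scaled = scaler.transform(raw_test.values)        # apply the same transform to the test features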
# dictionary mapping each model name to its list of prediction vectors
predictions = {}
def createTrees(model, train_x, train_y, test_x):
    '''
    This function takes in a model, fits it on the training set, and
    returns the predicted values on the testing set.
    model: regression model built with the selected parameters (not yet
           fitted to data)
    train_x: training set of features
    train_y: training set of true outcomes
    test_x: testing set of features
    return value: predictions - fitted values on the testing set
    '''
    model = model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    return predictions
from sklearn.tree import DecisionTreeRegressor
# fit regression trees of increasing depth and collect their test-set predictions
predictions_tree = []
for i in range(4, len(indices) + 1):
    model_tree = DecisionTreeRegressor(max_depth = i)
    predictions_tree.append(createTrees(model_tree, X_train, y_train, test_x))
predictions["Regression Tree"] = predictions_tree
import xgboost as xgb
# align the test set's column names with the training set's, since XGBoost
# requires matching feature names between fit and predict
test_x.columns = X_train.columns
# vary the learning rate (eta) and collect the predictions for each setting
eta = list(np.arange(0.1, 1.6, 0.1))
predictions_xgb = []
for param in eta:
    xgb_model = xgb.XGBRegressor(objective = "reg:squarederror", random_state = 7,
                                 max_depth = 2, learning_rate = param, n_estimators = 80)
    predictions_xgb.append(createTrees(xgb_model, X_train, y_train, test_x))
predictions["xgboost"] = predictions_xgb
# fit one XGBoost model with a mid-range learning rate and visualize one of its trees
xgb_model = xgb.XGBRegressor(objective = "reg:squarederror", random_state = 7,
                             max_depth = 2, learning_rate = 0.5, n_estimators = 80)
model_xgb = xgb_model.fit(X_train, y_train)
fig, ax = plt.subplots(figsize = (30, 30))
xgb.plot_tree(model_xgb, num_trees = 4, ax = ax)  # plot the fifth tree in the ensemble
plt.show()
import lightgbm as lgb
train_set = lgb.Dataset(X_train, label = y_train)
# note: 'num_iterations' inside the parameter dictionary takes precedence over
# the num_boost_round argument passed to lgb.train()
num_round = 10
# vary the fraction of features sampled per tree (feature_fraction)
features = list(np.arange(0.1, 1, 0.1))
predictions_lgb = []
for i in features:
    param = {'objective': 'regression', 'num_iterations': 500, 'max_depth': 3,
             'learning_rate': 0.8, 'max_bin': 100, 'feature_fraction': i,
             'bagging_freq': 20, 'metric': 'rmse'}
    # note: bagging_freq has no effect unless bagging_fraction is also set below 1
    model_lgb = lgb.train(param, train_set, num_round)
    predictions_lgb.append(model_lgb.predict(test_x))
# refit with feature_fraction = 0.5 and visualize one of the fitted trees
param = {'objective': 'regression', 'num_iterations': 500, 'max_depth': 3,
         'learning_rate': 0.8, 'max_bin': 100, 'feature_fraction': 0.5,
         'bagging_freq': 20, 'metric': 'rmse'}
model_lgb = lgb.train(param, train_set, num_round)
lgb.plot_tree(model_lgb, figsize = (100, 100))
plt.show()
predictions['LightGBM'] = predictions_lgb
from sklearn.ensemble import RandomForestRegressor
# vary the number of features considered at each split (max_features)
predictions_RF = []
for i in range(4, len(indices)):
    model_RF = RandomForestRegressor(max_depth = 2, random_state = 0, max_features = i)
    predictions_RF.append(createTrees(model_RF, X_train, y_train, test_x))
predictions["Random Forest"] = predictions_RF
# for each model: average the predictions across hyperparameter settings,
# measure the spread of all its predictions (variance), and measure the mean
# absolute deviation of the averaged predictions from the truth (bias)
averages = {key: np.mean(predictions[key], axis = 0) for key in predictions.keys()}
variances = {key: np.var(predictions[key]) for key in predictions.keys()}
biases = {key: np.mean(np.abs(value - test_y)) for key, value in averages.items()}
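To read the two quantities side by side, the dictionaries can be collected into a small summary table (a sketch using pandas):
# sketch: bias/variance summary, one row per model
summary = pd.DataFrame({"bias": biases, "variance": variances})
print(summary)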
# scatter plots of true vs. averaged predicted values, one panel per model
colors = {1: "blue", 2: "red", 3: "orange", 4: "pink"}
plt.figure(figsize = (10, 10))
i = 1
for key, value in averages.items():
    plt.subplot(2, 2, i)
    plt.scatter(test_y, value, color = colors[i])
    plt.xlabel("true values")
    plt.ylabel("predicted values")
    plt.title(key)
    i += 1
plt.show()
# compare bias and variance across the four tree-based models, side by side
plt.figure(figsize = (12, 5))
plt.subplot(1, 2, 1)
plt.plot(list(biases.keys()), list(biases.values()), "-o")
plt.xlabel("type of tree")
plt.ylabel("bias")
plt.subplot(1, 2, 2)
plt.plot(list(variances.keys()), list(variances.values()), "-o")
plt.xlabel("type of tree")
plt.ylabel("variance")
plt.show()
# overlay bias and variance on a single set of axes
plt.plot(list(biases.keys()), list(biases.values()), "-o")
plt.plot(list(variances.keys()), list(variances.values()), "-o")
plt.xlabel("type of tree")
plt.ylabel("quantity")
plt.legend(["bias", "variance"])
plt.show()
© Kaixin Wang, updated February 2020