import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# read in the csv file
# (index_col=0 uses the file's first column as the row index)
train = pd.read_csv("football_train.csv", index_col = 0)
# check if the file has been correctly read in
# (bare expressions below display their value in a notebook cell)
train.head()
train.shape
# standardize the numeric columns
# All columns except the last are z-scored; the last column is the outcome
# (Wins, used as the regression target below) and is left on its original scale.
n = train.shape[1]
feature_cols = train.columns[:n - 1]
# Vectorized z-score in one pass instead of a per-column Python loop.
# ddof=0 matches np.std (population standard deviation), which the
# original loop used.
train[feature_cols] = (train[feature_cols] - train[feature_cols].mean()) / train[feature_cols].std(ddof=0)
train.head()
# correlations of every numeric column with the last column (Wins)
train.corr().iloc[:, n-1]
plt.figure(figsize=(16,16))
sns.heatmap(train.corr(), annot = True)
# NOTE(review): forcing the axis limits looks like the workaround for the
# matplotlib release that clipped the top/bottom rows of seaborn heatmaps;
# it also reverses the default y-axis direction — confirm this is intended
plt.xlim([0, n])
plt.ylim([0, n])
plt.title("correlation heatmap: numeric variables")
plt.show()
# select the positions of the predictors whose absolute correlation with the
# outcome (last column, Wins) exceeds 0.35
# The correlation matrix is computed once, not once per loop iteration as in
# the original (train.corr() is O(columns^2) and was inside the loop body).
corr_with_wins = train.corr().iloc[:, n-1]
indices = [i for i in range(n-1) if abs(corr_with_wins.iloc[i]) > 0.35]
Suppose that we want to predict the variable Wins of each team based on the predictors in the dataframe.
We could use the entire dataset for fitting, but then we would have no measure of how well the model predicts on future/unseen data.
Therefore, to assess the predictive ability of the model, we split the dataset into a training set and a testing set.
Since there are 380 observations in the data frame train, we can use a 70%/30% split, where 70% of the entries serve as the training set and the other 30% as the testing set.
There are two methods to split the dataset:
1. the train_test_split function in the scikit-learn module
2. the sample function in the random module
# import the function from module
from sklearn.model_selection import train_test_split
# predictors are every column but the last; the outcome is the last column (Wins)
X = train.iloc[:, :n - 1]
y = train.iloc[:, n - 1]
# hold out 30% of the rows as the testing set; the fixed seed makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print("size of the training set is %d, size of the testing set is %d." %(X_train.shape[0], X_test.shape[0]))
Note: the alternative method below takes relatively more steps than using the train_test_split() function.
import random
# sample 70% of the row positions (without replacement) for the training set
start = 0
end = X.shape[0]
size = X.shape[0]
# obtain the indices of training set
train_indices = random.sample(range(start, end), int(size * 0.7))
# to obtain the indices for testing set: every position not sampled above
test_indices = list(set(range(start, end)) - set(train_indices))
train_X = X.iloc[train_indices, :]
train_y = y.iloc[train_indices]
test_X = X.iloc[test_indices, :]
test_y = y.iloc[test_indices]
print("size of the training set is %d, size of the testing set is %d." %(train_X.shape[0], test_X.shape[0]))
# check if there is any duplicates in two dataframes
# (the original used `test_indices in train_indices`, which asks whether the
# whole list is an *element* of the other list — always False here regardless
# of overlap; the real check is whether the index sets intersect)
print(bool(set(test_indices) & set(train_indices)))
import statsmodels.api as sm # another package for linear model
# NOTE(review): `sm` is never used in the visible code; only `smf` is
import statsmodels.formula.api as smf
# re-attach the outcome column to the training predictors so the formula
# interface can reference "Wins" by name
df_train = pd.concat([X_train, y_train], axis = 1)
# linear regression model using statsmodel
# model 1: five predictors
model1 = smf.ols("Wins ~ Yards + TurnOversLost + FirstDowns + InterceptionsThrown + OppYardsGainedRushing", data = df_train)
results = model1.fit()
print(results.summary())
# model 2: a smaller model keeping only three of the predictors
model2 = smf.ols("Wins ~ Yards + TurnOversLost + FirstDowns", data = df_train)
results2 = model2.fit()
print(results2.summary())
# true test set
test_x = pd.read_csv("football_test.csv", index_col = 0)
test_y = pd.read_csv("football_answers.csv")
# standardize the features
# NOTE(review): this standardizes the test set with its *own* column
# means/stds; for honest evaluation the training set's statistics are
# usually applied to the test set — confirm this is intended
n = test_x.shape[1]
# NOTE(review): range(n - 1) mirrors the training loop (which skipped the
# outcome column), but test_x appears to contain predictors only, so the
# last feature column may be left unstandardized — verify against the csv
for i in range(n - 1):
    test_x.iloc[:, i] = (test_x.iloc[:, i] - np.mean(test_x.iloc[:, i])) / np.std(test_x.iloc[:, i])
from sklearn.linear_model import LinearRegression
# hard-coded column positions of the predictors used by the two models
# (presumably matching the five / three variables named in the statsmodels
# formulas above — verify the position-to-name mapping)
model1_indices = [0, 2, 4, 8, 25]
model2_indices = [0, 2, 4]
model1 = LinearRegression().fit(X_train.iloc[:, model1_indices], y_train)
# NOTE(review): predictions are made on X, the *full* standardized training
# dataframe (both splits), not on the held-out split or the true test set —
# confirm this is intended
pred1 = model1.predict(X.iloc[:, model1_indices])
model2 = LinearRegression().fit(X_train.iloc[:, model2_indices], y_train)
pred2 = model2.predict(X.iloc[:, model2_indices])
# fitted coefficients (displayed in a notebook cell)
model1.coef_
model2.coef_
# side-by-side scatter plots of true vs. predicted Wins for the two models
# (the original also computed `nrows = X.shape[0]`, which was never read
# anywhere in the file, so it is dropped here)
plt.figure(figsize = (16, 6))
plt.subplot(1, 2, 1)
plt.scatter(y, pred1)
plt.xlabel("true values")
plt.ylabel("predicted values")
plt.subplot(1, 2, 2)
plt.scatter(y, pred2)
plt.xlabel("true values")
plt.ylabel("predicted values")
plt.show()
Now we will fit multiple models on the training set and evaluate their performance on the true test set:
# display the positions of the predictors selected by the correlation filter
indices
# to generate all possible combinations of the indices
import itertools
def createMLRModel(train_X, train_y, test_X, indices, complexity):
    '''
    Fit one Multiple Linear Regression model for every way of choosing
    `complexity` predictors out of `indices`, and collect each model's
    predictions on the testing set.

    train_X: training set of the predictors
    train_y: training set of the outcome
    test_X:  testing set of the predictors
    indices: candidate column positions of the predictors
    complexity: number of predictors used in each model

    return: list with one (1, n_test) prediction array per fitted model
    '''
    # each prediction vector is reshaped to a row so that later code can
    # stack and reduce the collection along axis 0
    return [
        LinearRegression()
        .fit(train_X.iloc[:, list(chosen)], train_y)
        .predict(test_X.iloc[:, list(chosen)])
        .reshape(1, -1)
        for chosen in itertools.combinations(indices, complexity)
    ]
Create regression models and obtain the predictions
# fit every candidate model at complexity levels 1 through 9 and keep the
# resulting prediction lists keyed by complexity
predictions = {}
for complexity in range(1, 10):
    preds = createMLRModel(X_train, y_train, test_x, indices, complexity)
    predictions[complexity] = preds
    print("%d MLR models built with complexity = %d" %(len(preds), complexity))
Calculate the average value of predictions at each complexity level
# average the per-model predictions at each complexity level
# (axis=0 averages over models; [0] drops the leading length-1 axis left by
# the reshape in createMLRModel)
averages = {}
for level in range(1, 10):
    mean_prediction = np.mean(predictions[level], axis = 0)
    averages[level] = list(mean_prediction[0])
Calculate the bias of predictions at each complexity level
# keep only the Wins column of the answers as a Series
test_y = test_y.loc[:, "Wins"]
# bias at each complexity level: mean absolute deviation of the averaged
# predictions from the true Wins values
biases = {level: np.mean(np.abs(averages[level] - test_y)) for level in range(1, 10)}
Calculate the average variance of the predictions at each complexity level
# model variance at each complexity level
variances = {}
for i in range(1, 10):
    # variance across the ensemble of models at each test point (axis=0),
    # averaged over test points. The original np.var(predictions[i]) had no
    # axis argument, so it flattened across models AND test points, folding
    # the spread between test points into the "variance" — that spread is
    # not model variance.
    variances[i] = np.mean(np.var(predictions[i], axis = 0))
Visualize the decrease in bias as the complexity level increases
# 3x3 grid of scatter plots: true Wins vs. average predictions, one panel
# per complexity level
# (the original's `nrows = test_y.shape[0]` was never read and is dropped)
plt.figure(figsize = (16, 9))
plt.subplots_adjust(hspace = 0.5)
for i in range(9):
    plt.subplot(3, 3, i+1)
    plt.scatter(test_y, averages[i+1])
    plt.xlabel("true values")
    plt.ylabel("average predictions")
    if i == 0:
        plt.title("SLR model (1 feature)")
    else:
        plt.title("MLR with %d features" %(i+1))
# plt.show() was missing here: when run as a script, the next plt.plot call
# would otherwise draw its line into the 9th subplot of this figure
plt.show()
Bias vs. model complexity
# bias as a function of the number of features
complexity_levels = list(biases.keys())
bias_values = list(biases.values())
plt.plot(complexity_levels, bias_values, "-o")
plt.xlabel("number of features")
plt.ylabel("bias in the predictions")
plt.show()
Variance vs. model complexity
# variance as a function of the number of features
complexity_levels = list(variances.keys())
variance_values = list(variances.values())
plt.plot(complexity_levels, variance_values, "-o", color = "orange")
plt.xlabel("number of features")
plt.ylabel("variance in the predictions")
plt.show()
Bias vs. Variance as model complexity increases
# overlay the bias and variance curves to show the tradeoff as model
# complexity grows (default color cycle: first line bias, second variance)
for curve in (biases, variances):
    plt.plot(list(curve.keys()), list(curve.values()), "-o")
plt.xlabel("number of features")
plt.ylabel("bias or variance")
plt.legend(["bias", "variance"])
plt.title("bias vs. variance tradeoff")
plt.show()
© Kaixin Wang, updated February 2020