Module/Package import
import numpy as np # numpy module for linear algebra
import pandas as pd # pandas module for data manipulation
import matplotlib.pyplot as plt # module for plotting
import seaborn as sns # another module for plotting
import warnings # to handle warning messages
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression # package for linear model
import statsmodels.api as sm # another package for linear model
import statsmodels.formula.api as smf
import scipy as sp
import scipy.stats # load the stats submodule explicitly (sp.stats is used later for qqplots)
from sklearn.model_selection import train_test_split # split data into training and testing sets
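A note on the warnings.filterwarnings('ignore') call above: it silences every warning, including deprecation notices from the libraries used here. If that is too blunt, a narrower filter is possible (a sketch):
# narrower alternative: suppress only FutureWarnings instead of all warnings
warnings.filterwarnings('ignore', category = FutureWarning)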
Dataset import
The dataset that we will be using is the meuse dataset.
As described by the author of the data: "This data set gives locations and topsoil heavy metal concentrations, along with a number of soil and landscape variables at the observation locations, collected in a flood plain of the river Meuse, near the village of Stein (NL). Heavy metal concentrations are from composite samples of an area of approximately 15 m $\times$ 15 m."
soil = pd.read_csv("soil.csv") # data import
soil.head() # check if read in correctly
soil.shape # rows x columns
print(soil.describe())
index = pd.isnull(soil).any(axis = 1) # flag rows with any missing value
soil = soil[~index]                   # drop those rows (~ negates the boolean mask)
soil = soil.reset_index(drop = True)
soil.shape
n = soil.shape[1] # number of columns
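A quick sanity check on the cleaning step, using the mask computed above (not part of the original analysis):
print("rows dropped:", index.sum())  # rows removed by the missing-value filter
print(soil.isnull().sum())           # per-column NaN count after cleaning (all zero)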
1. Correlation Heatmap
plt.figure(figsize = (10, 8))
# exclude the categorical landuse column; numeric_only is required on pandas >= 2.0
sns.heatmap(soil.corr(numeric_only = True), annot = True, square = True, linewidths = 0.1)
plt.ylim(n-1, 0) # keep all rows fully visible (works around a heatmap cropping issue in matplotlib 3.1.1)
plt.xlim(0, n-1)
plt.title("Pearson Correlation Heatmap between variables")
plt.show()
plt.figure(figsize = (10, 8))
corr = soil.corr(numeric_only = True)
mask = np.zeros_like(corr, dtype = bool) # boolean mask for the upper triangle
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    sns.heatmap(corr, mask = mask, linewidths = 0.1, vmax = .3, square = True)
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("correlation heatmap without redundant (symmetric) information")
plt.show()
fig, axs = plt.subplots(figsize = (8, 12))
plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0.2)
plt.subplot(2, 1, 1)
# correlation heatmap without annotation
sns.heatmap(soil.corr(numeric_only = True), linewidths = 0.1, square = True, cmap = "YlGnBu")
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("correlation heatmap without annotation")
plt.subplot(2, 1, 2)
# correlation heatmap with annotation
sns.heatmap(soil.corr(numeric_only = True), linewidths = 0.1, square = True, annot = True, cmap = "YlGnBu")
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("correlation heatmap with annotation")
plt.show()
2. Boxplots
plt.figure(figsize = (20, 10))
soil.iloc[:, 2:].boxplot()
# soil.iloc[:, 2:].plot(kind = "box") # 2nd method
plt.title("boxplot for each variable", fontsize = "xx-large")
plt.show()
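The heavy-metal concentrations are strongly right-skewed, which compresses most boxes toward the bottom of the plot. A log-scale version (not in the original analysis) can make the spread easier to read:
plt.figure(figsize = (20, 10))
soil.iloc[:, 2:].boxplot()   # non-numeric columns such as landuse are skipped automatically
plt.yscale("log")            # log scale spreads out the right-skewed concentrations
plt.title("boxplot for each variable (log scale)", fontsize = "xx-large")
plt.show()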
fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'pink'] # to set color
for i, var in enumerate(soil.columns.values):
    if var == "landuse": # categorical variable: label the panel but skip the boxplot
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        continue
    if i < int(n/2): # first row of panels
        axs[0, i].boxplot(soil[var])
        axs[0, i].set_xlabel(var, fontsize = 'large')
        axs[0, i].scatter(x = np.tile(1, soil.shape[0]), y = soil[var], color = colors[i])
    else: # second row of panels
        axs[1, i-int(n/2)].boxplot(soil[var])
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        axs[1, i-int(n/2)].scatter(x = np.tile(1, soil.shape[0]), y = soil[var],
                                   color = colors[i-int(n/2)])
plt.suptitle("boxplot of variables", fontsize = 'xx-large')
plt.show()
3. Scatterplots
fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] # to set color
for i, var in enumerate(soil.columns.values):
    if var == "landuse": # categorical variable: label the panel but skip the scatterplot
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        continue
    if i < int(n/2):
        axs[0, i].scatter(soil[var], soil["lead"], color = colors[i])
        axs[0, i].set_xlabel(var, fontsize = 'large')
    else:
        axs[1, i-int(n/2)].scatter(soil[var], soil["lead"], color = colors[i-int(n/2)])
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
plt.suptitle("scatterplot of lead vs. predictors", fontsize = 'xx-large')
plt.show()
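The visual impressions from these panels can be cross-checked numerically by sorting each variable's Pearson correlation with lead (a quick check, not in the original notebook):
# correlation of every numeric variable with the response, strongest positive first
print(soil.corr(numeric_only = True)["lead"].sort_values(ascending = False))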
4. Histograms and Density Plots
fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'pink'] # to set color
for i, var in enumerate(soil.columns.values):
    if var == "landuse": # categorical variable: label the panel but skip the plot
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        continue
    if i < int(n/2):
        # histogram with an overlaid kernel density estimate
        # (equivalently: axs[0, i].hist(...) plus a scipy gaussian_kde curve)
        sns.distplot(soil[var], color = colors[i], ax = axs[0, i])
    else:
        sns.distplot(soil[var], color = colors[i-int(n/2)], ax = axs[1, i-int(n/2)])
plt.suptitle("histogram and density plot of each variable", fontsize = 'xx-large')
plt.show()
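Note that sns.distplot was deprecated in seaborn 0.11 and later removed; on current versions the equivalent panel can be drawn with histplot (a sketch, assuming seaborn >= 0.11):
# modern replacement for the distplot calls above, shown for a single variable
sns.histplot(soil["zinc"], kde = True, stat = "density", color = "m")
plt.show()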
Variable Selection and Modeling
Based on the scatterplot of lead vs. predictors:
- cadmium, copper, zinc and om seem to have a strong positive correlation with lead
- elev, dist and dist.m seem to have a strong negative correlation with lead
- lime seems to be a good indicator (dummy) predictor variable

Therefore, we now build a linear model using 6 predictors: cadmium, copper, zinc, elev, dist and lime.
# variables = ["cadmium", "copper", "zinc", "elev", "dist", "dist.m", "lime"]
variables = ["cadmium", "copper", "zinc", "elev", "dist", "lime"]
x = soil[variables]
y = soil["lead"]
split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 42)
build a linear model using the sklearn package
model = LinearRegression().fit(X_train, y_train)
# R^2 on training set
R2_train = model.score(X_train, y_train)
R2_train
# R^2 on testing set
R2_test = model.score(X_test, y_test)
R2_test
# coefficient of each predictor (these are slopes, not intercepts)
for var, coef in zip(variables, model.coef_):
    print(var, coef)
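sklearn stores the intercept separately from the coefficient array; it can be printed directly:
print("intercept:", model.intercept_) # intercept of the fitted linear model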
build a linear model using the statsmodels package
df_train = pd.concat([X_train, y_train], axis = 1) # build a dataframe for training set
fullModel = smf.ols("lead ~ cadmium + copper + zinc + elev + dist + lime", data = df_train) # all 6 predictors
fullModel = fullModel.fit()
print(fullModel.summary())
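The adjusted R-squared that the later model comparison relies on is available directly on the fitted results object:
print("adjusted R-squared:", fullModel.rsquared_adj) # penalizes R^2 for the number of predictors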
diagnostics plots
# diagnosticsPlots function definition
def diagnosticsPlots(predictions, truevalues):
    '''
    This function plots the following two diagnostics plots:
    1. residuals vs. fitted values plot (plot on the left)
    2. qqplot of the residuals (plot on the right)
    parameters required:
    - predictions: predicted values using the linear regression model
    - truevalues: true values of the response variable
    required modules/packages:
    - matplotlib.pyplot (plt)
    - scipy (sp)
    Author: Kaixin Wang
    Created: October 2019
    '''
    residuals = truevalues - predictions
    fig, axs = plt.subplots(figsize = (12, 2.5))
    plt.subplot(1, 2, 1)
    # residuals vs. fitted values
    plt.scatter(predictions, residuals)
    plt.plot(predictions, np.tile(np.mean(residuals), residuals.shape[0]), "r-")
    plt.xlim([np.min(predictions) - 2.5, np.max(predictions) + 2.5])
    plt.title("residuals vs. fitted values")
    plt.ylabel("residuals")
    plt.xlabel("fitted values")
    plt.legend(["E[residuals]"])
    print("Average of residuals: ", np.mean(residuals))
    # qqplot
    plot = plt.subplot(1, 2, 2)
    sp.stats.probplot(residuals, plot = plot, fit = True)
    plt.show()
pred_val = fullModel.fittedvalues.copy()
true_val = y_train.copy()
diagnosticsPlots(pred_val, true_val)
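The formula-based results object can also score the held-out data; since X_test carries the same column names used in the formula, predictions come straight from predict (a quick check, not in the original):
test_pred = fullModel.predict(X_test) # out-of-sample predictions from the statsmodels fit
print(test_pred.head())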
training and testing Root-Mean-Square Error (RMSE)
The root-mean-square error of a prediction is calculated as follows:
$$\text{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^n (y_i - \hat{y}_i)^2}$$
where $y_i$ is the true value of the response, $\hat{y}_i$ is the fitted value of the response, and $n$ is the size of the input.
def RMSETable(train_y, train_fitted, train_size, train_R2, test_y, test_fitted, test_size, test_R2):
    '''
    This function returns a dataframe in the following format:
    -------------------------------------------------
                  RMSE        R-squared  size
    training set  train_RMSE  train_R2   n_training
    testing set   test_RMSE   test_R2    n_testing
    -------------------------------------------------
    parameters required:
    - train_y: true values of the response in the training set
    - train_fitted: fitted values of the response in the training set
    - train_size: size of the training set
    - train_R2: R-squared on the training set
    - test_y: true values of the response in the testing set
    - test_fitted: fitted values of the response in the testing set
    - test_size: size of the testing set
    - test_R2: R-squared on the testing set
    Author: Kaixin Wang
    Created: October 2019
    '''
    # RMSE = square root of the mean squared error
    train_RMSE = np.sqrt(np.mean(np.power(train_y - train_fitted, 2)))
    test_RMSE = np.sqrt(np.mean(np.power(test_y - test_fitted, 2)))
    train = [train_RMSE, train_R2, train_size] # order matches the column labels below
    test = [test_RMSE, test_R2, test_size]
    return pd.DataFrame([train, test], index = ["training set", "testing set"], columns = ["RMSE", "R-squared", "size"])
table1 = RMSETable(y_train.values, model.predict(X_train), len(y_train), R2_train,
                   y_test.values, model.predict(X_test), len(y_test), R2_test)
print(table1)
Since we are using 6 predictor variables for 152 entries, we can consider using fewer predictors to improve the $R_{adj}^2$, which is a version of $R^2$ that penalizes model complexity. Selecting fewer predictors also helps prevent overfitting.
Therefore, observing that cadmium and zinc have relatively stronger correlations with lead, and that lime works well as an indicator variable, we now build a model using only these three predictors.
using the sklearn linear regression module
variables = ["cadmium", "lime", "zinc"]
x = soil[variables]
y = soil["lead"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 42)
model2 = LinearRegression().fit(X_train, y_train)
# R^2 on training set
R2_train = model2.score(X_train, y_train)
print("Training set R-squared", R2_train)
# R^2 on testing set
R2_test = model2.score(X_test, y_test)
print("Testing set R-squared", R2_test)
# coefficient of each predictor
for var, coef in zip(variables, model2.coef_):
    print(var, coef)
using the statsmodels ordinary least squares (OLS) regression module
df_train = pd.concat([X_train, y_train], axis = 1) # build a dataframe for training set
reducedModel = smf.ols("lead ~ cadmium + C(lime) + zinc", data = df_train)
reducedModel = reducedModel.fit()
print(reducedModel.summary())
diagnostics plots
pred_val = reducedModel.fittedvalues.copy()
true_val = y_train.copy()
diagnosticsPlots(pred_val, true_val)
table2 = RMSETable(y_train.values, model2.predict(X_train), len(y_train), R2_train,
                   y_test.values, model2.predict(X_test), len(y_test), R2_test)
print(table2)
print(table1)
print(table2)
table = sm.stats.anova_lm(reducedModel, fullModel)
table
Comparing the tables obtained above, the two models perform similarly even though the reduced model uses half as many predictors. Recall that the adjusted R-squared is defined as
$$R^2_{adj} = 1 - (1 - R^2)\,\frac{n-1}{n-p-1}$$
where $n$ is the sample size and $p$ is the number of predictors used in the model.
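A small helper to compute this quantity from the values reported in the tables (a sketch; the function name is ours):
def adjusted_r2(r2, n, p):
    # R^2_adj = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(adjusted_r2(R2_train, len(y_train), 3)) # reduced model: 3 predictors, training set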
By building two linear regression models, using 6 and 3 predictors respectively, we observe that the predictor variables cadmium, copper and zinc have a very strong positive correlation with the response variable lead. In addition, adding three more predictors (copper, dist and elev) did not improve the $R^2_{adj}$ much compared to the model with only 3 predictors.
Therefore, the final model that we decided to adopt is the reduced model with 3 predictors:
$$\text{lead} \sim \text{cadmium} + \text{lime} + \text{zinc}$$
Reference:
Dataset: the meuse dataset.
To access this dataset in R:
install.packages("sp")
library(sp)
data(meuse)
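The soil.csv file read at the top of this notebook can then be produced from the same R session (an assumption about how the file was originally generated):
write.csv(meuse, "soil.csv", row.names = FALSE) # export for use with pandas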
© Kaixin Wang, updated October 2019