#!/usr/bin/env python # coding: utf-8 # ### OSCR Machine Learning in Python # # **Linear Regression Module** # # **© Kaixin Wang**, Fall 2019 # # # Module/Package import # In[1]: import numpy as np # numpy module for linear algebra import pandas as pd # pandas module for data manipulation import matplotlib.pyplot as plt # module for plotting import seaborn as sns # another module for plotting # In[2]: import warnings # to handle warning messages warnings.filterwarnings('ignore') # In[3]: from sklearn.linear_model import LinearRegression # package for linear model import statsmodels.api as sm # another package for linear model import statsmodels.formula.api as smf import scipy as sp # In[4]: from sklearn.model_selection import train_test_split # split data into training and testing sets # Dataset import # The dataset that we will be using is the `meuse` dataset. # # As described by the author of the data: "This data set gives locations and topsoil heavy metal concentrations, along with a number of soil and landscape variables at the observation locations, collected in a flood plain of the river Meuse, near the village of Stein (NL). Heavy metal concentrations are from composite samples of an area of approximately 15 m $\times$ 15 m." # In[5]: soil = pd.read_csv("soil.csv") # data import soil.head() # check if read in correctly # In[6]: soil.shape # rows x columns # In[7]: print(soil.describe()) # In[8]: index = pd.isnull(soil).any(axis = 1) soil = soil[-index] soil = soil.reset_index(drop = True) # In[9]: soil.shape # In[10]: n = soil.shape[1] # #### Exploratory Data Analysis (EDA) # 1. Correlation Heatmap # In[11]: plt.figure(figsize = (10, 8)) sns.heatmap(soil.corr(), annot = True, square = True, linewidths = 0.1) plt.ylim(n-1, 0) plt.xlim(0, n-1) plt.title("Pearson Correlation Heatmap between variables") plt.show() # In[12]: plt.figure(figsize = (10, 8)) corr = soil.corr() mask = np.zeros_like(corr) mask[np.triu_indices_from(mask)] = True with sns.axes_style("white"): sns.heatmap(corr, mask = mask, linewidths = 0.1, vmax = .3, square = True) plt.ylim(n-1, 0) plt.xlim(0, n-1) plt.title("correlation heatmap without symmetric information") plt.show() # In[13]: fig, axs = plt.subplots(figsize = (8, 12)) plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0.2) plt.subplot(2, 1, 1) # correlation heatmap without annotation sns.heatmap(soil.corr(), linewidths = 0.1, square = True, cmap = "YlGnBu") plt.ylim(n-1, 0) plt.xlim(0, n-1) plt.title("correlation heatmap without annotation") plt.subplot(2, 1, 2) # correlation heatmap with annotation sns.heatmap(soil.corr(), linewidths = 0.1, square = True, annot = True, cmap = "YlGnBu") plt.ylim(n-1, 0) plt.xlim(0, n-1) plt.title("correlation heatmap with annotation") plt.show() # 2. Boxplots # In[14]: plt.figure(figsize = (20, 10)) soil.iloc[:, 2:].boxplot() # soil.iloc[:, 2:].plot(kind = "box") # 2nd method plt.title("boxplot for each variable", fontsize = "xx-large") plt.show() # In[15]: fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10)) colors = ['b', 'g', 'r', 'c', 'm', 'y', 'pink'] # to set color for i, var in enumerate(soil.columns.values): if var == "landuse": axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large') continue if i < int(n/2): axs[0, i].boxplot(soil[var]) axs[0, i].set_xlabel(var, fontsize = 'large') axs[0, i].scatter(x = np.tile(1, soil.shape[0]), y = soil[var], color = colors[i]) else: axs[1, i-int(n/2)].boxplot(soil[var]) axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large') axs[1, i-int(n/2)].scatter(x = np.tile(1, soil.shape[0]), y = soil[var], color = colors[i-int(n/2)]) plt.suptitle("boxplot of variables", fontsize = 'xx-large') plt.show() # 3. scatterplots # In[16]: fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10)) colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] # to set color for i, var in enumerate(soil.columns.values): if var == "landuse": axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large') continue if i < int(n/2): axs[0, i].scatter(soil[var], soil["lead"], color = colors[i]) axs[0, i].set_xlabel(var, fontsize = 'large') else: axs[1, i-int(n/2)].scatter(soil[var], soil["lead"], color = colors[i-int(n/2)]) axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large') plt.suptitle("scatterplot of lead vs. predictors", fontsize = 'xx-large') plt.show() # 4. histograms and density plots # In[17]: fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10)) colors = ['b', 'g', 'r', 'c', 'm', 'y', 'pink'] # to set color for i, var in enumerate(soil.columns.values): if var == "landuse": axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large') continue if i < int(n/2): sns.distplot(soil[var], color = colors[i], ax = axs[0, i]) else: sns.distplot(soil[var], color = colors[i-int(n/2)], ax = axs[1, i-int(n/2)]) plt.suptitle("histogram and density plot of each variable", fontsize = 'xx-large') plt.show() # Reference: # # Dataset: `meuse` dataset # # - P.A. Burrough, R.A. McDonnell, 1998. Principles of Geographical Information Systems. Oxford University Press. # - Stichting voor Bodemkartering (STIBOKA), 1970. Bodemkaart van Nederland : Blad 59 Peer, Blad 60 West en 60 Oost Sittard: schaal 1 : 50 000. Wageningen, STIBOKA. # # To access this dataset in `R`: # # ``` # install.packages("sp") # library(sp) # data(meuse) # ``` # **© Kaixin Wang**, updated October 2019