#!/usr/bin/env python
# coding: utf-8

# ### OSCR Machine Learning in Python
# 
# **Linear Regression Module**
# 
# **&copy; Kaixin Wang**, Fall 2019
# 
# 

# Module/Package import

# In[1]:


import numpy as np # numpy module for linear algebra
import pandas as pd # pandas module for data manipulation
import matplotlib.pyplot as plt # module for plotting
import seaborn as sns # another module for plotting


# In[2]:


import warnings # to handle warning messages
warnings.filterwarnings('ignore')


# In[3]:


from sklearn.linear_model import LinearRegression # package for linear model
import statsmodels.api as sm # another package for linear model
import statsmodels.formula.api as smf
import scipy as sp


# In[4]:


from sklearn.model_selection import train_test_split # split data into training and testing sets


# Dataset import

# The dataset that we will be using is the `meuse` dataset.
# 
# As described by the author of the data: "This data set gives locations and topsoil heavy metal concentrations, along with a number of soil and landscape variables at the observation locations, collected in a flood plain of the river Meuse, near the village of Stein (NL). Heavy metal concentrations are from composite samples of an area of approximately 15 m $\times$ 15 m."

# In[5]:


soil = pd.read_csv("soil.csv")  # data import
soil.head() # check if read in correctly


# In[6]:


soil.shape # rows x columns


# In[7]:


print(soil.describe())


# In[8]:


index = pd.isnull(soil).any(axis = 1)
soil = soil[-index]
soil = soil.reset_index(drop = True)


# In[9]:


soil.shape


# In[10]:


n = soil.shape[1]


# #### Exploratory Data Analysis (EDA)

#     1. Correlation Heatmap

# In[11]:


plt.figure(figsize = (10, 8))
sns.heatmap(soil.corr(), annot = True, square = True, linewidths = 0.1)
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("Pearson Correlation Heatmap between variables")
plt.show()


# In[12]:


plt.figure(figsize = (10, 8))
corr = soil.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    sns.heatmap(corr, mask = mask, linewidths = 0.1, vmax = .3, square = True)
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("correlation heatmap without symmetric information")
plt.show()


# In[13]:


fig, axs = plt.subplots(figsize = (8, 12))
plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0.2)
plt.subplot(2, 1, 1)
# correlation heatmap without annotation
sns.heatmap(soil.corr(), linewidths = 0.1, square = True, cmap = "YlGnBu")
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("correlation heatmap without annotation")
plt.subplot(2, 1, 2)
# correlation heatmap with annotation
sns.heatmap(soil.corr(), linewidths = 0.1, square = True, annot = True, cmap = "YlGnBu")
plt.ylim(n-1, 0)
plt.xlim(0, n-1)
plt.title("correlation heatmap with annotation")
plt.show()


#      2. Boxplots

# In[14]:


plt.figure(figsize = (20, 10))
soil.iloc[:, 2:].boxplot()
# soil.iloc[:, 2:].plot(kind = "box")  # 2nd method
plt.title("boxplot for each variable", fontsize = "xx-large")
plt.show()


# In[15]:


fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'pink']  # to set color

for i, var in enumerate(soil.columns.values):
    if var == "landuse":
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        continue
    if i < int(n/2):
        axs[0, i].boxplot(soil[var])
        axs[0, i].set_xlabel(var, fontsize = 'large')
        axs[0, i].scatter(x = np.tile(1, soil.shape[0]), y = soil[var], color = colors[i])
    else:
        axs[1, i-int(n/2)].boxplot(soil[var])
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        axs[1, i-int(n/2)].scatter(x = np.tile(1, soil.shape[0]), y = soil[var], 
                                   color = colors[i-int(n/2)])
        
plt.suptitle("boxplot of variables", fontsize = 'xx-large')
plt.show()


#     3. scatterplots

# In[16]:


fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']  # to set color

for i, var in enumerate(soil.columns.values):
    if var == "landuse":
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        continue
    if i < int(n/2):
        axs[0, i].scatter(soil[var], soil["lead"], color = colors[i])
        axs[0, i].set_xlabel(var, fontsize = 'large')
    else:
        axs[1, i-int(n/2)].scatter(soil[var], soil["lead"], color = colors[i-int(n/2)])
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        
plt.suptitle("scatterplot of lead vs. predictors", fontsize = 'xx-large')
plt.show()


#     4. histograms and density plots

# In[17]:


fig, axs = plt.subplots(2, int(n/2), figsize = (20, 10))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'pink']  # to set color

for i, var in enumerate(soil.columns.values):
    if var == "landuse":
        axs[1, i-int(n/2)].set_xlabel(var, fontsize = 'large')
        continue
    if i < int(n/2):
        sns.distplot(soil[var], color = colors[i], ax = axs[0, i])
    else:
        sns.distplot(soil[var], color = colors[i-int(n/2)], ax = axs[1, i-int(n/2)])
        
plt.suptitle("histogram and density plot of each variable", fontsize = 'xx-large')
plt.show()


# Reference:
# 
# Dataset: `meuse` dataset
# 
# - P.A. Burrough, R.A. McDonnell, 1998. Principles of Geographical Information Systems. Oxford University Press.
# - Stichting voor Bodemkartering (STIBOKA), 1970. Bodemkaart van Nederland : Blad 59 Peer, Blad 60 West en 60 Oost Sittard: schaal 1 : 50 000. Wageningen, STIBOKA.
# 
# To access this dataset in `R`:
# 
# ```
# install.packages("sp")
# library(sp)
# data(meuse)
# ```

# **&copy; Kaixin Wang**, updated October 2019