#!/usr/bin/env python
# coding: utf-8

# # The scikit-learn interface

# In[1]:


# Import the addutils table-of-contents helper, then invoke it in notebook mode
# (presumably this injects the TOC into the rendered notebook — confirm in addutils).
import addutils.toc
addutils.toc.js(ipy_notebook=True)


# In[2]:


import scipy.io
import numpy as np
import pandas as pd
import warnings
from addutils import css_notebook
css_notebook()


# In[3]:


# Install a blanket 'ignore' filter: no category or message pattern is given,
# so every warning raised from here on is suppressed (presumably to keep the
# notebook output uncluttered).
warnings.filterwarnings('ignore')


# ## 1 Defining the estimator object

# In scikit-learn, almost all operations are done through an **estimator object**. For example, a linear regression estimator can be instantiated as follows:

# In[4]:


from sklearn import linear_model
# `normalize` is deliberately not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# In this file the estimator is only printed and reconfigured, never fitted,
# so dropping the argument only affects the printed representation.
model = linear_model.LinearRegression(fit_intercept=True)
print(model)


# In brackets are displayed the current values for the “hyperparameters” of the estimator. To learn about the specific “hyperparameters” check the documentation:

# In[5]:


# Try: model?


# Hyperparameters can be changed after the model has been created:

# In[6]:


# Switch off the intercept on the already-created estimator, then display the
# estimator again to show the updated hyperparameter.
model.set_params(fit_intercept=False)
print(model)


# Given a scikit-learn *estimator* object named `model`, the following methods are available:
# 
# * *Available in all Estimators*
#   * `model.fit()` : fit training data.
#       * For supervised learning, this accepts two arguments: `model.fit(X, y)`.
#       * For unsupervised learning, this accepts only a single argument `model.fit(X)`.
# * *Available in supervised estimators*
#   * `model.predict()` : predict the label of a new set of data. This accepts one argument: `model.predict(X_new)`.
#   * `model.predict_proba()`: Returns the probability of a categorical label. The label itself is returned by `model.predict()`.
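#   * `model.score()`: Returns a goodness-of-fit measure where a larger score indicates a better fit. For classifiers this is the mean accuracy (between 0 and 1); for regressors it is the R² coefficient, which is at most 1 and can be negative for a poor model.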
# * *Available in unsupervised estimators*
#   * `model.transform()`: Transform new data into the new basis. This accepts one argument `X_new`, and returns the new representation of the data.
#   * `model.fit_transform()`: some estimators implement this method, which more efficiently performs a fit and a transform on the same input data.

# ## 2 Simple estimator example: fit a linear regression model

# In[7]:


import bokeh.plotting as bk
# Direct Bokeh output to the notebook so that later bk.show() calls render inline.
bk.output_notebook()


# In[8]:


from sklearn import datasets, preprocessing, metrics

# Generate a synthetic one-feature regression problem. random_state is fixed,
# so the same 70 samples are produced on every run.
# make_regression is called on `sklearn.datasets` directly: the
# `datasets.samples_generator` submodule was deprecated in scikit-learn 0.22
# and removed in 0.24, while `datasets.make_regression` works in both old and
# new releases.
X, y = datasets.make_regression(n_samples=70,
                                n_features=1, n_informative=1,
                                random_state=0, noise=5)

# Rescale the feature with MinMaxScaler (default output range is [0, 1]).
scaler = preprocessing.MinMaxScaler()
X_sc = scaler.fit_transform(X)

# Fit an ordinary least-squares line on the scaled feature.
lin = linear_model.LinearRegression(fit_intercept=True)
lin.fit(X_sc, y)

print(lin)
# With a single feature and a 1-D target, coef_ is a length-1 array: take its
# only element so that '%.5f' formats a real scalar instead of relying on the
# implicit array-to-float conversion.
print("Model coefficient: %.5f, and intercept: %.5f" % (lin.coef_[0], lin.intercept_))
# mean_squared_error takes (y_true, y_pred) in that order.
err = metrics.mean_squared_error(y, lin.predict(X_sc))
print("Mean Squared Error: %.2f" % err)

# Plot the data and the model prediction.
# Two points (x = 0 and x = 1, the ends of the scaled range) are enough to
# draw the fitted straight line; np.newaxis makes X_p a column vector.
X_p = np.linspace(0, 1, 2)[:, np.newaxis]
y_p = lin.predict(X_p)

# NOTE(review): plot_width/plot_height and legend= are the older Bokeh keyword
# spellings; newer Bokeh releases use width/height and legend_label= — confirm
# against the Bokeh version this notebook is pinned to before changing them.
fig = bk.figure(title='Simple Regression',
                x_axis_label='X scaled',
                y_axis_label='y',
                plot_width=600, plot_height=300)
fig.circle(X_sc.squeeze(), y, line_color='darkgreen', size=10,
           fill_color='green', fill_alpha=0.5, legend='Measured Data')
fig.line(X_p.ravel(), y_p, line_color='blue', legend='Predicted Values')
fig.legend.location = 'bottom_right'
bk.show(fig)


# ## 3 Working with MATLAB files

# v4 (Level 1.0), v6 and v7 to 7.2 matfiles are supported.
# To read matlab 7.3 format mat files an HDF5 python library is required. Please check the scipy documentation for more information.

# The data can be generated with the following MATLAB code:
# 
#     % Generate Regression Test Data
#     X = [1 2 3
#          4 5 6
#          7 8 9
#          0 1 2] + 0.1;
#     y = sum(X,2);
#     feat_names = strvcat('Feature One', 'Feature Two', 'Feature Three');
#     save ('matlab_test_data_01', 'X','y', 'feat_names')

# In[9]:


# Read the MATLAB file (path is relative to the current working directory).
# The result is indexed by variable name further down ('X', 'y', 'feat_names').
mat_data = scipy.io.loadmat('example_data/matlab_test_data_01.mat')


# Variables names included in the `.mat` file are keys of the `mat_data` dictionary. Moreover the key `'__header__'` contains the mat-file information.
# Below, the variables `X` and `y` are loaded into pandas DataFrames:

# In[10]:


# Bare expression: in a notebook its value is shown as the cell output; when
# this file is executed as a plain script the value is simply discarded.
mat_data.keys()


# In the following code the `.strip()` method is used to remove the trailing white spaces added by MATLAB to make all the feature names the same length:

# In[11]:


# Feature matrix: column labels come from 'feat_names', with whitespace
# stripped from both ends of each name.
X = pd.DataFrame(
    mat_data['X'],
    columns=[name.strip() for name in mat_data['feat_names']],
)
# Target values, stored in a single column called 'measured'.
y = pd.DataFrame(mat_data['y'], columns=['measured'])
print(X, '\n\n', y)


# ## 4 Datasets available in scikit-learn

# A typical *scikit-learn* dataset is a dictionary-like object that holds all the data and metadata.
# 
# * **Features** are usually stored in the `.data` field in the form of a 2D array `[n_samples, n_features]`.
# * **Target variables (the values to predict)** are usually stored in the `.target` field in the form of a 1D array.
# 
# Scikit-learn makes available a host of datasets for testing learning algorithms:
# 
# - **Packaged Data:** these small datasets ship with scikit-learn and can be loaded with ``sklearn.datasets.load_*``
# - **Downloadable Data:** larger datasets that can be fetched from the web with ``sklearn.datasets.fetch_*``
# - **Generated Data:** can be created with `sklearn.datasets.make_*`
# 
# Try by yourself:
# 
# * `datasets.load_<TAB>`
# * `datasets.fetch_<TAB>`
# * `datasets.make_<TAB>`

# In[12]:


#datasets.make_


# ### 4.1 Example: the "Iris" Packaged Dataset

# - Features in the Iris dataset:
# 
#   1. sepal length in cm
#   2. sepal width in cm
#   3. petal length in cm
#   4. petal width in cm
# 
# - Target classes to predict:
# 
#   1. Iris Setosa
#   2. Iris Versicolour
#   3. Iris Virginica

# In[13]:


# Load the packaged Iris dataset; `d` is inspected further down through
# keys(), target_names and feature_names.
d = datasets.load_iris()


# Try by yourself one of the following commands where *'d'* is the variable containing the dataset:
# 
#     print(d.keys())          # Structure of the contained data
#     print(d.DESCR)           # A complete description of the dataset
#     print(d.data.shape)      # [n_samples, n_features]
#     print(d.target.shape)    # [n_samples,]
#     print(d.feature_names)
#     datasets.get_data_home() # This is where the datasets are stored

# In[14]:


# Show the available fields, the class names and the feature names of the
# dataset, one per line.
print(d.keys(), d.target_names, d.feature_names, sep='\n')


# ### 4.2 Example: the "Digits" Packaged Dataset

# The *Digits* dataset contains 1797 samples with 64 features each: every feature is the grey-scale value of one pixel of an 8x8 digit image:

# In[15]:


# bokeh.plotting was already imported as `bk` earlier in this file; the import
# and the output_notebook() call are repeated here, presumably so that this
# section can be run on its own.
import bokeh.plotting as bk
bk.output_notebook()


# In[16]:


from bokeh.palettes import Greys9
from bokeh.models.ranges import Range1d
import addutils.palette as pal
import addutils.imagegrid as ig

digits = datasets.load_digits()

# Plot the first 40 digits: each image is 8x8 pixels.
# The row order of every image is reversed (vertical flip) before plotting —
# presumably because the image grid draws rows bottom-up; confirm in
# addutils.imagegrid.
images = [img[::-1, :] for img in digits.images[:40]]
# Label every image with its true class taken from the dataset itself. The
# samples are not stored in a strict 0-9 cycle, so a hard-coded repeating
# '0'..'9' pattern mislabels some of the displayed digits.
txt = [str(label) for label in digits.target[:40]]

fig = ig.imagegrid_figure(figure_plot_width=760, figure_plot_height=100,
                          figure_title=None,
                          images=images, grid_size=(20, 2),
                          text=txt, text_font_size='9pt', text_color='red',
                          palette=Greys9[::-1], padding=0.2)
bk.show(fig)


# ### 4.3 Example: the "Blob" Generated Dataset

# In[17]:


import seaborn as sns
# Seven colours from seaborn's 'Paired' palette, each passed through
# pal.to_hex to build the categorical colour list.
cat_colors = [pal.to_hex(rgb) for rgb in sns.color_palette('Paired', 7)]


# In[18]:


# Generate 2000 two-dimensional points spread over 7 blobs; the second return
# value is used below to pick each point's fill colour. No random_state is
# passed, so every run draws a different sample.
data, color_indices = datasets.make_blobs(
    n_samples=2000,
    n_features=2,
    centers=7,
    center_box=(-4.0, 6.0),
    cluster_std=0.5,
)

fig = bk.figure(title=None)
# First column goes on the x axis, second on the y axis. The fill colour is
# derived from color_indices through pal.linear_map over the range 0..6
# (presumably one palette entry per blob — confirm in addutils.palette).
fig.circle(
    x=data[:, 0],
    y=data[:, 1],
    line_color='black',
    line_alpha=0.5,
    size=8,
    fill_color=pal.linear_map(color_indices, cat_colors, low=0, high=6),
)
bk.show(fig)


# ---
# 
# Visit [www.add-for.com](<http://www.add-for.com/IT>) for more tutorials and updates.
# 
# This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.