#!/usr/bin/env python
# coding: utf-8

# # The scikit-learn interface

# In[1]:


import addutils.toc
addutils.toc.js(ipy_notebook=True)


# In[2]:


import warnings

import numpy as np
import pandas as pd
import scipy.io

from addutils import css_notebook
css_notebook()


# In[3]:


# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')


# ## 1 Defining the estimator object

# In scikit-learn, almost all operations are done through an **estimator object**. For example, a linear regression estimator can be instantiated as follows:

# In[4]:


from sklearn import linear_model

# `normalize` is deliberately not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
model = linear_model.LinearRegression(fit_intercept=True)
print(model)


# The current values of the estimator's “hyperparameters” are displayed in brackets. To learn about the specific “hyperparameters” check the documentation:

# In[5]:


# Try: model?


# Hyperparameters can be changed after the model has been created:

# In[6]:


model.fit_intercept = False
print(model)


# Given a scikit-learn *estimator* object named `model`, the following methods are available:
#
# * *Available in all Estimators*
#     * `model.fit()` : fit training data.
#         * For supervised learning, this accepts two arguments: `model.fit(X, y)`.
#         * For unsupervised learning, this accepts only a single argument `model.fit(X)`.
# * *Available in supervised estimators*
#     * `model.predict()` : predict the label of a new set of data. This accepts one argument `model.predict(X_new)`.
#     * `model.predict_proba()`: Returns the probability of a categorical label. The label itself is returned by `model.predict()`.
#     * `model.score()`: Scores are typically between 0 and 1, with a larger score indicating a better fit.
# * *Available in unsupervised estimators*
#     * `model.transform()`: Transform new data into the new basis. This accepts one argument `X_new`, and returns the new representation of the data.
#     * `model.fit_transform()`: some estimators implement this method, which more efficiently performs a fit and a transform on the same input data.
# ## 2 Simple estimator example: fit a linear regression model

# In[7]:


import bokeh.plotting as bk
bk.output_notebook()


# In[8]:


from sklearn import datasets, preprocessing, metrics

# `make_regression` is taken from `sklearn.datasets` directly: the
# `samples_generator` submodule was removed in scikit-learn 0.24.
X, y = datasets.make_regression(n_samples=70, n_features=1, n_informative=1,
                                random_state=0, noise=5)

# Rescale the single feature to the [0, 1] range before fitting.
scaler = preprocessing.MinMaxScaler()
X_sc = scaler.fit_transform(X)

lin = linear_model.LinearRegression(fit_intercept=True)
lin.fit(X_sc, y)
print(lin)
# With one feature `coef_` is a one-element array; index it so that `%f`
# receives a scalar instead of relying on implicit array-to-scalar conversion.
print("Model coefficient: %.5f, and intercept: %.5f" % (lin.coef_[0], lin.intercept_))
# `mean_squared_error` expects the ground truth first: (y_true, y_pred).
err = metrics.mean_squared_error(y, lin.predict(X_sc))
print("Mean Squared Error: %.2f" % err)

# Plot the data and the model prediction.
# Two points (X = 0 and X = 1) are enough to draw the fitted straight line.
X_p = np.linspace(0, 1, 2)[:, np.newaxis]
y_p = lin.predict(X_p)

fig = bk.figure(title='Simple Regression',
                x_axis_label='X scaled',
                y_axis_label='y',
                plot_width=600, plot_height=300)
fig.circle(X_sc.squeeze(), y,
           line_color='darkgreen', size=10,
           fill_color='green', fill_alpha=0.5,
           legend='Measured Data')
fig.line(X_p.ravel(), y_p, line_color='blue', legend='Predicted Values')
fig.legend.location = 'bottom_right'
bk.show(fig)


# ## 3 Working with MATLAB files

# v4 (Level 1.0), v6 and v7 to 7.2 matfiles are supported.
# To read MATLAB 7.3 format mat files an HDF5 Python library is required. Please check the scipy documentation for more information.

# The data can be generated with the following MATLAB code:
#
#     % Generate Regression Test Data
#     X = [1 2 3
#          4 5 6
#          7 8 9
#          0 1 2] + 0.1;
#     y = sum(X,2);
#     feat_names = strvcat('Feature One', 'Feature Two', 'Feature Three');
#     save ('matlab_test_data_01', 'X','y', 'feat_names')

# In[9]:


mat_data = scipy.io.loadmat('example_data/matlab_test_data_01.mat')


# The names of the variables stored in the `.mat` file are keys of the `mat_data` dictionary. Moreover, the key `'__header__'` contains the mat-file information.
# Here we load the two variables into Pandas objects:

# In[10]:


mat_data.keys()


# In the following code the `.strip()` method is used to remove the trailing white spaces added by MATLAB to make all the feature names the same length:

# In[11]:


X = pd.DataFrame(
    data=mat_data['X'],
    columns=[name.strip() for name in mat_data['feat_names']],
)
y = pd.DataFrame(data=mat_data['y'], columns=['measured'])
print(X, '\n\n', y)


# ## 4 Datasets available in scikit-learn

# A typical *scikit-learn* dataset is a dictionary-like object that holds all the data and metadata.
#
# * **Features** are usually stored in the `.data` field in the form of a 2D array `[n_samples, n_features]`.
# * **Response variables (targets)** are usually stored in the `.target` field in the form of a 1D array.
#
# Scikit-learn makes available a host of datasets for testing learning algorithms:
#
# - **Packaged Data:** these small datasets can be loaded with ``sklearn.datasets.load_*``
# - **Downloadable Data:** larger datasets that can be fetched from the web with ``sklearn.datasets.fetch_*``
# - **Generated Data:** can be created with `sklearn.datasets.make_*`
#
# Try by yourself:
#
# * `datasets.load_`
# * `datasets.fetch_`
# * `datasets.make_`

# In[12]:


#datasets.make_


# ### 4.1 Example: the "Iris" Packaged Dataset

# - Features in the Iris dataset:
#
#     1. sepal length in cm
#     2. sepal width in cm
#     3. petal length in cm
#     4. petal width in cm
#
# - Target classes to predict:
#
#     1. Iris Setosa
#     2. Iris Versicolour
#     3.
#        Iris Virginica

# In[13]:


d = datasets.load_iris()


# Try by yourself one of the following commands where *'d'* is the variable containing the dataset:
#
#     print(d.keys())           # Structure of the contained data
#     print(d.DESCR)            # A complete description of the dataset
#     print(d.data.shape)       # [n_samples, n_features]
#     print(d.target.shape)     # [n_samples,]
#     print(d.feature_names)
#     datasets.get_data_home()  # This is where the datasets are stored

# In[14]:


for summary in (d.keys(), d.target_names, d.feature_names):
    print(summary)


# ### 4.2 Example: the "Digits" Packaged Dataset

# The *Digits* dataset contains 1797 samples of 64 features each: every feature is the grey-scale value of one pixel of an 8x8 digit image:

# In[15]:


import bokeh.plotting as bk
bk.output_notebook()


# In[16]:


from bokeh.palettes import Greys9
from bokeh.models.ranges import Range1d
import addutils.palette as pal
import addutils.imagegrid as ig

digits = datasets.load_digits()

# Plot the first 40 digits: each image is 8x8 pixels and is flipped
# vertically (rows reversed) before being handed to the image grid.
images = [picture[::-1, :] for picture in digits.images[:40]]
# One caption per image: the sequence '0'..'9' repeated four times.
txt = [str(digit) for _ in range(4) for digit in range(10)]
fig = ig.imagegrid_figure(
    figure_plot_width=760,
    figure_plot_height=100,
    figure_title=None,
    images=images,
    grid_size=(20, 2),
    text=txt,
    text_font_size='9pt',
    text_color='red',
    palette=Greys9[::-1],
    padding=0.2,
)
bk.show(fig)


# ### 4.3 Example: the "Blob" Generated Dataset

# In[17]:


import seaborn as sns

# Seven colours from the 'Paired' palette, each converted with `pal.to_hex`.
cat_colors = [pal.to_hex(shade) for shade in sns.color_palette('Paired', 7)]


# In[18]:


data, color_indices = datasets.make_blobs(
    n_samples=2000,
    n_features=2,
    centers=7,
    center_box=(-4.0, 6.0),
    cluster_std=0.5,
)

fig = bk.figure(title=None)
# Colour every point by the index (0..6) of the blob it belongs to.
fig.circle(
    data[:, 0],
    data[:, 1],
    line_color='black',
    line_alpha=0.5,
    size=8,
    fill_color=pal.linear_map(color_indices, cat_colors, low=0, high=6),
)
bk.show(fig)


# ---
#
# Visit [www.add-for.com](http://www.add-for.com) for more tutorials and updates.
#
# This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.