SGD is an incremental gradient descent algorithm which modifies its weights in an effort to reach a local minimum. The cuML implementation can take array-like objects, either in host memory as NumPy arrays or in device memory (as Numba arrays or any object compliant with `__cuda_array_interface__`), as well as cuDF DataFrames. In order to convert your dataset into the cuDF dataframe format, please refer to the documentation at https://rapidsai.github.io/projects/cudf/en/latest/. The SGD algorithm implemented in cuML can accept the following parameters:
For additional information on the SGD model, please refer to the documentation at https://rapidsai.github.io/projects/cuml/en/latest/index.html
import numpy as np
import pandas as pd
import cudf
import os
from cuml.solvers import SGD as cumlSGD
from sklearn.linear_model import SGDRegressor
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for sgd
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached='data/mortgage.npy.gz'):
    """Load the mortgage dataset (if cached on disk) or generate random data.

    Parameters
    ----------
    nrows : int
        Number of samples to draw.
    ncols : int
        Number of feature columns to keep.
    cached : str
        Path to the gzipped mortgage dataset; change this if you saved it elsewhere.

    Returns
    -------
    (df_X_train, df_X_test, df_y_train, df_y_test) : tuple of pandas.DataFrame
        An 80/20 train/test split of the features and labels.
    """
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity', used as the label.
        # BUG FIX: extract the label BEFORE dropping the column — the original
        # code dropped column 4 first, so y actually held original column 5.
        y = X[:, 4:5]
        X = X[:, [i for i in range(X.shape[1]) if i != 4]]
        # sample `nrows` rows (with replacement) and keep the first `ncols` features
        rindices = np.random.randint(0, X.shape[0] - 1, nrows)
        X = X[rindices, :ncols]
        y = y[rindices]
    else:
        # create a random dataset when the cached file is absent
        print('use random data')
        X = np.random.rand(nrows, ncols)
        y = np.random.randint(0, 10, size=(nrows, 1))
    # 80/20 train/test split
    train_rows = int(nrows * 0.8)
    df_X_train = pd.DataFrame({'fea%d' % i: X[0:train_rows, i] for i in range(X.shape[1])})
    df_X_test = pd.DataFrame({'fea%d' % i: X[train_rows:, i] for i in range(X.shape[1])})
    df_y_train = pd.DataFrame({'fea%d' % i: y[0:train_rows, i] for i in range(y.shape[1])})
    df_y_test = pd.DataFrame({'fea%d' % i: y[train_rows:, i] for i in range(y.shape[1])})
    return df_X_train, df_X_test, df_y_train, df_y_test
# this function checks if the results obtained from two different methods (sklearn and cuml) are the same
from sklearn.metrics import mean_squared_error
def array_equal(a, b, threshold=2e-3, with_sign=True):
    """Check whether two result sets agree within a tolerance.

    Both inputs are converted to flat numpy arrays; when ``with_sign`` is
    False the comparison ignores sign. Returns True when the mean squared
    error between the two is below ``threshold``.
    """
    a_flat = to_nparray(a).ravel()
    b_flat = to_nparray(b).ravel()
    if not with_sign:
        a_flat, b_flat = np.abs(a_flat), np.abs(b_flat)
    return mean_squared_error(a_flat, b_flat) < threshold
# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
return np.array(x)
elif isinstance(x,np.float64):
return np.array([x])
elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
return x.to_pandas().values
return x
%%time
# nrows = number of samples
# ncols = number of features of each sample
nrows = 2**20
ncols = 399
# dataset is split into a ratio of 80:20,
# 80% is used as the training data and the remaining 20% is used as the test data
X_train, X_test, y_train, y_test = load_data(nrows,ncols)
# pull the single label column out as a pandas Series — sklearn's fit()
# expects a 1-D y, while the DataFrame form is kept for shape reporting
y_train_ser = y_train['fea0']
# report the split sizes so the 80:20 partition can be sanity-checked
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)
Here we set the parameters used by both libraries. You can change the number of iterations by changing the `iterations`
variable. Please note that making this too high can cause the functions to take a long time to complete.
# set parameters shared by the sklearn and cuML models
# learning-rate schedule; 'adaptive' shrinks eta0 when progress stalls
learning_rate = 'adaptive'
# dtype used for the cuDF/cuML inputs
datatype = np.float32
# elasticnet = combined L1 + L2 regularization
penalty = 'elasticnet'
# squared-loss => ordinary least-squares regression objective
loss = 'squared_loss'
# maximum number of passes over the training data (epochs)
iterations = 10
The max_iter
parameter controls the maximum number of iterations the model can run for, but it doesn't guarantee that the model will run for all those epochs; therefore the sklearn model might run for fewer epochs than the cuML model.
%%time
# use the sklearn SGD Regressor model to fit the dataset
# tol=0.0 disables early stopping so the run length is governed by max_iter
sk_sgd = SGDRegressor(learning_rate=learning_rate, eta0=0.07,
max_iter=iterations, tol=0.0, fit_intercept=True,
penalty=penalty, loss=loss)
# y_train_ser is the 1-D label Series extracted earlier
sk_sgd.fit(X_train, y_train_ser)
%%time
# test the model by predicting its results for the unseen test set
y_sk = sk_sgd.predict(X_test)
# calculate the Mean Squared Error for the model's predictions
# (compared later against the cuML model's error)
error_sk = mean_squared_error(y_test,y_sk)
%%time
# convert the pandas dataframe to cuDF dataframe and series
# (copies the host data to the GPU so cuML can consume it)
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
# labels go in as a cuDF Series built from the 1-D pandas Series
y_cudf = cudf.Series(y_train_ser)
%%time
# fit the training data on cuML's implementation of SGD
# (cuML names the iteration-count parameter `epochs`, sklearn calls it `max_iter`)
cu_sgd = cumlSGD(learning_rate=learning_rate, eta0=0.07, epochs=iterations, #epochs == n_iter
batch_size=512,
tol=0.0, penalty=penalty, loss=loss)
cu_sgd.fit(X_cudf, y_cudf)
%%time
# test the model by predicting its values for the test set
y_pred = cu_sgd.predict(X_cudf_test)
# bring the device-side predictions back to a flat numpy array
# so they can be compared against the pandas y_test
y_pred = to_nparray(y_pred).ravel()
# calculate the Mean Squared Error for the model's predictions
error_cu = mean_squared_error(y_test,y_pred)
# print the MSE of the sklearn and cuML models to compare them
print("SKL MSE(y):")
print(error_sk)
print("CUML MSE(y):")
print(error_cu)