Ridge extends LinearRegression by providing L2 regularization on the coefficients when predicting response y with a linear combination of the predictors in X. It can reduce the variance of the predictors, and improves the conditioning of the problem.
The Ridge Regression function implemented in the cuml library allows the user to change the fit_intercept, normalize, solver and alpha parameters. Here is a brief overview of the parameters accepted by RAPIDS' Ridge Regression:
The methods that can be used with the Ridge Regression are:
The model can take array-like objects, either in host as NumPy arrays or in device (as Numba arrays or objects compliant with `__cuda_array_interface__`), as well as cuDF DataFrames. In order to convert your dataset to cudf format please read the cudf documentation on https://rapidsai.github.io/projects/cudf/en/latest/. It is important to understand that the 'svd' solver will run slower than the 'eig' solver; however, the 'svd' solver is more stable and robust. Therefore, we would recommend that you use the 'eig' solver when a slight error is acceptable. For additional information please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html
import numpy as np
import pandas as pd
import cudf
import os
from cuml import Ridge as cuRidge
from sklearn.linear_model import Ridge as skRidge
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
# check if mortgage dataset is present and then extract the data from it, else just create a random dataset for regression
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Load the mortgage dataset (or generate a random regression set) and
    return an 80:20 train/test split as pandas DataFrames.

    Parameters
    ----------
    nrows : int
        Number of samples to draw.
    ncols : int
        Number of feature columns to keep.
    cached : str
        Path to the gzipped ``.npy`` mortgage file; when the file does not
        exist a random regression dataset is generated instead.

    Returns
    -------
    df_X_train, df_X_test, df_y_train, df_y_test : pandas.DataFrame
        Feature and label frames, split 80:20 into train and test.
    """
    train_rows = int(nrows*0.8)
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity', used as the label.
        # BUG FIX: extract the label BEFORE dropping column 4 — the original code
        # removed the column first, so y silently picked up original column 5.
        y = X[:,4:5]
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        # sample nrows row indices at random (with replacement);
        # NOTE: randint's upper bound is exclusive, so the last row is never drawn
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
        df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})
        df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})
    else:
        print('use random data')
        # create a random regression dataset
        X,y = make_regression(n_samples=nrows,n_features=ncols,n_informative=ncols, random_state=0)
        df_y_train = pd.DataFrame({'fea0':y[0:train_rows,]})
        df_y_test = pd.DataFrame({'fea0':y[train_rows:,]})
    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})
    return df_X_train, df_X_test, df_y_train, df_y_test
%%time
# nrows = number of samples
# ncols = number of features of each sample
nrows = 2**20
ncols = 399
#split the dataset into training and testing sets, in the ratio of 80:20 respectively
X_train, X_test, y_train, y_test = load_data(nrows,ncols)
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)
%%time
# use the sklearn ridge regression model to fit the training dataset
skridge = skRidge(fit_intercept=False,
normalize=True, alpha=0.1)
skridge.fit(X_train, y_train)
%%time
# calculate the mean squared error of the sklearn ridge regression model on the testing dataset
# error_sk is compared against the cuml model's MSE in the final cell
sk_predict = skridge.predict(X_test)
error_sk = mean_squared_error(y_test,sk_predict)
%%time
# convert the pandas dataframe to cudf format
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf = y_train.values
y_cudf = y_cudf[:,0]
y_cudf = cudf.Series(y_cudf)
%%time
# run the cuml ridge regression model to fit the training dataset. Eig is the faster algorithm, but svd is more accurate
# hyperparameters mirror the sklearn model above (alpha=0.1, no intercept,
# normalization on) so the resulting MSEs are directly comparable
curidge = cuRidge(fit_intercept=False,
normalize=True,
solver='svd', alpha=0.1)
curidge.fit(X_cudf, y_cudf)
%%time
# calculate the mean squared error of the testing dataset using the cuml ridge regression model
cu_predict = curidge.predict(X_cudf_test).to_array()
error_cu = mean_squared_error(y_test,cu_predict)
# print the mean squared error of the sklearn and cuml model to analyse them
print("SKL MSE(y):")
print(error_sk)
print("CUML MSE(y):")
print(error_cu)