import numpy as np
import pandas as pd
import cudf
import os
from cuml import LinearRegression as cuLinearRegression
from sklearn.linear_model import LinearRegression as skLinearRegression
from sklearn.datasets import make_regression
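# Select a particular GPU to run the notebook.
# NOTE(review): CUDA_VISIBLE_DEVICES is normally read when the CUDA runtime
# initializes; cudf/cuml are already imported above, so confirm that this
# assignment still takes effect at this point.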
os.environ["CUDA_VISIBLE_DEVICES"]="2"
from timeit import default_timer
class Timer(object):
    """Reusable wall-clock timer, usable directly or as a context manager.

    Example::

        with Timer() as t:
            do_work()
        print(t.interval)

    After ``stop()`` the instance exposes:

    * ``end``      - the timer reading taken by ``stop()``
    * ``interval`` - seconds elapsed between the last ``start()`` and ``stop()``
    """

    def __init__(self):
        self._timer = default_timer
        # Reading taken by start(). Kept in a private attribute so it does
        # not overwrite the start() method, which would make the instance
        # unusable for a second measurement.
        self._start = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        # Exceptions raised inside the with-block are not suppressed
        # (the method returns None).
        self.stop()

    def start(self):
        """Start the timer."""
        self._start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: if ``start()`` has not been called yet.
        """
        if self._start is None:
            raise RuntimeError("Timer.stop() called before Timer.start()")
        self.end = self._timer()
        self.interval = self.end - self._start
import gzip
def _feature_frame(values):
    """Wrap a 2-D array in a DataFrame whose columns are named fea0, fea1, ..."""
    return pd.DataFrame({'fea%d' % i: values[:, i] for i in range(values.shape[1])})


def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Build an 80:20 train/test regression dataset as pandas DataFrames.

    If ``cached`` exists it is read as a gzip-compressed ``.npy`` matrix.
    Column index 4 is used as the label and removed from the features,
    ``nrows`` rows are drawn at random with replacement (global NumPy RNG),
    and the first ``ncols`` remaining columns are kept as features.
    Otherwise a synthetic problem is generated with ``make_regression``.

    Args:
        nrows: total number of samples (train + test).
        ncols: number of feature columns.
        cached: path of the gzip-compressed ``.npy`` file to prefer.

    Returns:
        Tuple ``(df_X_train, df_X_test, df_y_train, df_y_test)``; the label
        frames have a single column named ``fea0``.

    Raises:
        ValueError: if the cached matrix has no column at index 4.
    """
    # split the dataset in a 80:20 split
    train_rows = int(nrows * 0.8)

    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            data = np.load(f)

        # Column index 4 is 'adj_remaining_months_to_maturity', used as the
        # label. It must be sliced out BEFORE it is dropped from the feature
        # matrix; slicing afterwards would pick the neighbouring column and
        # leave that column among the features (label leakage).
        label_col = 4
        if data.ndim != 2 or data.shape[1] <= label_col:
            raise ValueError(
                'cached data must be 2-D with more than %d columns' % label_col)
        y = data[:, label_col:label_col + 1]
        X = np.delete(data, label_col, axis=1)

        # randint's upper bound is exclusive, so pass the row count itself
        # to make every row (including the last one) eligible.
        rindices = np.random.randint(0, X.shape[0], nrows)
        X = X[rindices, :ncols]
        y = y[rindices]
    else:
        print('use random data')
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=ncols, random_state=0)
        # make_regression returns a 1-D target here; use a single column so
        # both branches share the same frame-building code.
        y = y.reshape(-1, 1)

    df_X_train = _feature_frame(X[:train_rows])
    df_X_test = _feature_frame(X[train_rows:])
    df_y_train = _feature_frame(y[:train_rows])
    df_y_test = _feature_frame(y[train_rows:])
    return df_X_train, df_X_test, df_y_train, df_y_test
from sklearn.metrics import mean_squared_error
def array_equal(a, b, threshold=2e-3, with_sign=True):
    """Return whether ``a`` and ``b`` are approximately equal.

    Both inputs are converted with ``to_nparray`` and flattened; they count
    as equal when their mean squared error is strictly below ``threshold``.

    Args:
        a, b: array-like inputs accepted by ``to_nparray``.
        threshold: exclusive upper bound on the mean squared error.
        with_sign: when falsy, compare absolute values so that sign
            differences are ignored.

    Returns:
        The result of ``mean_squared_error(a, b) < threshold``.
    """
    # to_nparray passes unknown types (e.g. lists) through unchanged, so
    # coerce with asarray before flattening.
    a = np.asarray(to_nparray(a)).ravel()
    b = np.asarray(to_nparray(b)).ravel()
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    return mean_squared_error(a, b) < threshold
def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``np.ndarray`` / ``pd.DataFrame`` -> ``np.array(x)``
    * ``np.float64`` scalar            -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of the pandas conversion

    Any other input is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        host_copy = x.to_pandas()
        return host_copy.values
    return x
%%time
# Problem size: 2**20 samples with 399 feature columns.
nrows, ncols = 2**20, 399
X_train, X_test, y_train, y_test = load_data(nrows, ncols)

# Report the shape of every split. The test-label shape is printed twice,
# once under 'testing label' and once under 'label'.
for caption, frame in (
    ('training data', X_train),
    ('training label', y_train),
    ('testing data', X_test),
    ('testing label', y_test),
    ('label', y_test),
):
    print(caption, frame.shape)
use mortgage data training data (838860, 399) training label (838860, 1) testing data (209716, 399) testing label (209716, 1) label (209716, 1) CPU times: user 14.4 s, sys: 3.4 s, total: 17.8 s Wall time: 17.8 s
%%time
# Fit the scikit-learn linear regression on the pandas training frames.
skols = skLinearRegression(fit_intercept=True, normalize=True)
skols.fit(X_train, y_train)
CPU times: user 1min 4s, sys: 16.9 s, total: 1min 20s Wall time: 5.74 s
%%time
# Predict the held-out rows with the scikit-learn model and score the
# predictions with mean squared error.
sk_predict = skols.predict(X_test)
error_sk = mean_squared_error(y_true=y_test, y_pred=sk_predict)
CPU times: user 1.09 s, sys: 44 ms, total: 1.14 s Wall time: 107 ms
%%time
# Convert the pandas feature frames to cudf DataFrames.
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)

# The training label becomes a cudf Series built from the first (only)
# column of the label frame.
y_cudf = cudf.Series(y_train.values[:, 0])
CPU times: user 10 s, sys: 3.08 s, total: 13.1 s Wall time: 6.69 s
%%time
# Fit the cuML linear regression with the 'eig' algorithm on the cudf inputs.
cuols = cuLinearRegression(fit_intercept=True, normalize=True, algorithm='eig')
cuols.fit(X_cudf, y_cudf)
CPU times: user 656 ms, sys: 356 ms, total: 1.01 s Wall time: 1.01 s
%%time
# Predict the held-out rows with the cuML model, convert the result with
# to_array(), and score it against the pandas test labels.
cu_predict = cuols.predict(X_cudf_test).to_array()
error_cu = mean_squared_error(y_true=y_test, y_pred=cu_predict)
CPU times: user 540 ms, sys: 4 ms, total: 544 ms Wall time: 541 ms
# Print each model's test MSE on the line below its heading.
print("SKL MSE(y):", error_sk, sep="\n")
print("CUML MSE(y):", error_cu, sep="\n")
SKL MSE(y): 5.0556037e-07 CUML MSE(y): 1.22318395e-08