# Loading the training and test data from NumPy's native file format is much
# faster than parsing CSV. Load the CSV files once and dump them back out as a
# single compressed .npz file.
from __future__ import print_function
import numpy as np
# Parse every comma-separated source file, in a fixed order, and bind each
# resulting array to its module-level name. The public train/test splits
# come with labels (Y_*); the private test set has features only.
X_train, Y_train, X_test, Y_test, X_comp = (
    np.genfromtxt(csv_name, delimiter=',')
    for csv_name in (
        "X_train_public.csv",
        "Y_train_public.csv",
        "X_test_public.csv",
        "Y_test_public.csv",
        "X_test_private.csv",
    )
)
# Sanity check: print the smallest and largest value of each freshly parsed
# array (one "min max" line per array) so the value range can be inspected
# before choosing a narrower dtype.
for arr in [X_train, Y_train, X_test, Y_test, X_comp]:
    # The print call must be indented to form the loop body; at column 0
    # the file does not parse.
    print(arr.min(), arr.max())
# Output (min and max of each array, in loop order):
# -32768.0 32767.0
# 0.0 1.0
# -32768.0 32767.0
# 0.0 1.0
# -32768.0 32767.0
# Convert every array to 16-bit signed integers in one pass, rebinding each
# name to its converted array. int16 covers -32768..32767.
# NOTE(review): the conversion does not validate the data; confirm the values
# are whole numbers within the int16 range before relying on the result.
X_train, Y_train, X_test, Y_test, X_comp = map(
    np.int16,
    (X_train, Y_train, X_test, Y_test, X_comp),
)
# Re-check after the int16 conversion: print the smallest and largest value
# of each array (one "min max" line per array) so the ranges can be compared
# with the ones seen before the cast.
for arr in [X_train, Y_train, X_test, Y_test, X_comp]:
    # The print call must be indented to form the loop body; at column 0
    # the file does not parse.
    print(arr.min(), arr.max())
# Output (min and max of each array, in loop order):
# -32768 32767
# 0 1
# -32768 32767
# 0 1
# -32768 32767
# Bundle all five arrays into one compressed archive. Each key below becomes
# the name under which that array is stored in (and later read back from)
# "data_files.npz".
np.savez_compressed(
    "data_files.npz",
    **{
        "X_train": X_train,
        "Y_train": Y_train,
        "X_test": X_test,
        "Y_test": Y_test,
        "X_comp": X_comp,
    }
)