Convert Input Files

Loading the training and test data from NumPy's native binary format is much faster than parsing CSV text. This notebook reads the raw CSV files once and writes them back out as a single compressed `.npz` archive.

In [1]:
from __future__ import print_function
import numpy as np

Load raw CSVs

In [2]:
# Parse every raw comma-separated file with identical settings. The files are
# read in the order listed, and each result is bound to the matching name on
# the left-hand side.
X_train, Y_train, X_test, Y_test, X_comp = (
    np.genfromtxt(csv_name, delimiter=',')
    for csv_name in (
        "X_train_public.csv",
        "Y_train_public.csv",
        "X_test_public.csv",
        "Y_test_public.csv",
        "X_test_private.csv",
    )
)

Check Input Ranges

In [3]:
# Show the smallest and largest value of each freshly loaded array, one array
# per output line, so the value range can be inspected before changing dtype.
for data in (X_train, Y_train, X_test, Y_test, X_comp):
    lowest = data.min()
    highest = data.max()
    print(lowest, highest)
-32768.0 32767.0
0.0 1.0
-32768.0 32767.0
0.0 1.0
-32768.0 32767.0

Convert to int16 for Efficiency

In [4]:
def _to_int16(arr, name):
    """Return ``arr`` as an int16 array, refusing any lossy conversion.

    A bare ``np.int16(arr)`` cast never raises: NaN, fractional and
    out-of-range values are silently turned into different integers.
    This helper checks for those cases first so that corrupt input is
    reported instead of being written to disk.

    Parameters
    ----------
    arr : array_like
        Numeric data to convert.
    name : str
        Label for the array, used only in error messages.

    Returns
    -------
    numpy.ndarray
        A copy of ``arr`` with dtype ``int16``.

    Raises
    ------
    ValueError
        If ``arr`` contains NaN/infinity, a value outside the int16
        range, or a value that is not a whole number.
    """
    arr = np.asarray(arr)
    limits = np.iinfo(np.int16)
    if not np.isfinite(arr).all():
        raise ValueError(
            "{0} contains NaN or infinite values".format(name))
    # min()/max() raise on empty arrays, so only range-check when there is data.
    if arr.size and (arr.min() < limits.min or arr.max() > limits.max):
        raise ValueError(
            "{0} has values outside the int16 range [{1}, {2}]".format(
                name, limits.min, limits.max))
    # Any element that differs from its rounded value would be truncated.
    if not np.array_equal(arr, np.rint(arr)):
        raise ValueError(
            "{0} contains non-integer values".format(name))
    return arr.astype(np.int16)


# Re-running this cell is safe: arrays that are already int16 pass every check.
X_train = _to_int16(X_train, "X_train")
Y_train = _to_int16(Y_train, "Y_train")
X_test = _to_int16(X_test, "X_test")
Y_test = _to_int16(Y_test, "Y_test")
X_comp = _to_int16(X_comp, "X_comp")
In [5]:
# Print min and max again after the dtype change; the values should match the
# earlier range check, now shown as integers.
for data in (X_train, Y_train, X_test, Y_test, X_comp):
    lowest = data.min()
    highest = data.max()
    print(lowest, highest)
-32768 32767
0 1
-32768 32767
0 1
-32768 32767

Save to Binary Compressed File

In [6]:
# Write all five arrays into one compressed archive. Each dictionary key is
# the name under which that array is stored in the .npz file.
np.savez_compressed(
    "data_files.npz",
    **{
        "X_train": X_train,
        "Y_train": Y_train,
        "X_test": X_test,
        "Y_test": Y_test,
        "X_comp": X_comp,
    }
)