import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer as skKBinsDiscretizer
class KBinsDiscretizer():
    """Discretize each column of a 2-D array into ordinal bin indices.

    Parameters
    ----------
    n_bins : int, default=5
        Number of bins used for every feature.
    strategy : {"quantile", "uniform"}, default="quantile"
        "uniform" places edges at equal spacing between the column minimum
        and maximum; "quantile" places edges at evenly spaced percentiles
        of the column.

    Attributes (set by ``fit``)
    ---------------------------
    n_bins_ : ndarray of shape (n_features,)
        Number of bins per feature (``n_bins`` repeated).
    bin_edges_ : ndarray of dtype object, shape (n_features,)
        For each feature, an array of ``n_bins + 1`` bin edges.
    """

    def __init__(self, n_bins=5, strategy="quantile"):
        self.n_bins = n_bins
        self.strategy = strategy

    def fit(self, X):
        """Compute the bin edges of every column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``strategy`` is neither "uniform" nor "quantile".
        """
        # Fail fast: otherwise bin_edges_ would be left filled with None and
        # the problem would only surface later, inside transform().
        if self.strategy not in ("uniform", "quantile"):
            raise ValueError(
                f"strategy must be 'uniform' or 'quantile', got {self.strategy!r}"
            )
        X = np.asarray(X)
        n_features = X.shape[1]
        self.n_bins_ = np.full(n_features, self.n_bins)
        # Object dtype: each entry holds that feature's own array of edges.
        self.bin_edges_ = np.empty(n_features, dtype=object)
        for i in range(n_features):
            column = X[:, i]
            if self.strategy == "uniform":
                self.bin_edges_[i] = np.linspace(
                    column.min(), column.max(), self.n_bins_[i] + 1
                )
            else:  # "quantile"
                quantiles = np.linspace(0, 100, self.n_bins_[i] + 1)
                self.bin_edges_[i] = np.percentile(column, quantiles)
        return self

    def transform(self, X):
        """Map every value of ``X`` to the index of the bin it falls into.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray with the same shape and dtype as ``X``, holding bin indices
        in ``[0, n_bins - 1]``.
        """
        X = np.asarray(X)
        Xt = np.empty_like(X)
        for i in range(X.shape[1]):
            # The leftmost edge is dropped so values below the second edge
            # get index 0; a value equal to an inner edge lands in the upper
            # bin. The machine-epsilon nudge is modelled on scikit-learn.
            Xt[:, i] = np.digitize(
                X[:, i] + np.finfo(float).eps, self.bin_edges_[i][1:]
            )
        # Values at or above the last edge would get index n_bins; fold them
        # (and anything below 0) back into the valid range.
        Xt = np.clip(Xt, 0, self.n_bins_ - 1)
        return Xt
# Self-check: on the iris data, the discretizer defined in this file must
# produce the same bin edges and the same ordinal codes as scikit-learn's
# KBinsDiscretizer, first with the default strategy and then with "uniform".
for strategy_kwargs in ({}, {"strategy": "uniform"}):
    X, _ = load_iris(return_X_y=True)
    trans1 = KBinsDiscretizer(n_bins=5, **strategy_kwargs).fit(X)
    trans2 = skKBinsDiscretizer(
        n_bins=5, encode="ordinal", **strategy_kwargs
    ).fit(X)
    # Per-feature edge arrays must agree up to floating-point tolerance.
    for own_edges, sk_edges in zip(trans1.bin_edges_, trans2.bin_edges_):
        assert np.allclose(own_edges, sk_edges)
    # The transformed codes must match exactly.
    Xt1 = trans1.transform(X)
    Xt2 = trans2.transform(X)
    assert np.array_equal(Xt1, Xt2)