import numpy as np
from scipy.sparse import diags
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer as skTfidfTransformer
class TfidfTransformer:
    """Minimal IDF-weighting transformer.

    Reproduces scikit-learn's ``TfidfTransformer`` with its default
    ``smooth_idf=True`` / ``sublinear_tf=False`` settings but with
    ``norm=None`` (no row normalization is applied).
    """

    def fit(self, X):
        """Learn the smoothed IDF vector from a sparse CSR count matrix.

        Parameters
        ----------
        X : scipy.sparse matrix of shape (n_samples, n_features)
            Term-count matrix. Assumes canonical CSR format (no duplicate
            or explicitly-stored zero entries), as produced by
            ``CountVectorizer`` — TODO confirm for other producers, since
            ``bincount`` over ``X.indices`` counts stored entries per column.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape
        # Document frequency: number of documents containing each term.
        # "+1" on both df and n_samples is the smooth_idf trick: it acts as
        # if one extra document containing every term exists, preventing
        # zero divisions and clamping idf for terms seen in every document.
        df = np.bincount(X.indices, minlength=n_features) + 1
        self.idf_ = np.log((n_samples + 1) / df) + 1
        # Diagonal matrix so transform() is a single sparse matmul.
        self._idf_diag = diags(self.idf_, shape=(n_features, n_features),
                               format='csr')
        return self

    def transform(self, X):
        """Scale each column of ``X`` by its learned IDF weight.

        Uses ``@`` (matrix multiplication) explicitly: on scipy sparse
        *arrays* the ``*`` operator means elementwise product, so relying
        on the legacy sparse-matrix ``*``-as-matmul behavior is fragile.
        """
        return X @ self._idf_diag
# Sanity-check the from-scratch transformer against scikit-learn's reference
# implementation on increasingly large slices of the 20newsgroups corpus.
corpus = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data
for n_docs in (10, 100, 1000):
    train_docs = corpus[:n_docs]
    test_docs = corpus[n_docs: 2 * n_docs]
    vectorizer = CountVectorizer().fit(train_docs)
    counts_train = vectorizer.transform(train_docs)
    counts_test = vectorizer.transform(test_docs)
    ours = TfidfTransformer().fit(counts_train)
    # scikit-learn uses l2 norm by default
    reference = skTfidfTransformer(norm=None).fit(counts_train)
    assert np.allclose(ours.idf_, reference.idf_)
    # Both fitted transformers must agree on seen and unseen documents.
    for counts in (counts_train, counts_test):
        assert np.allclose(ours.transform(counts).toarray(),
                           reference.transform(counts).toarray())