import re

from scipy.sparse import csr_matrix


class CountVectorizer:
    def _analyze(self, doc):
        # Lowercase the document, then extract tokens of two or more
        # word characters.
        token_pattern = re.compile(r"\b\w\w+\b")
        return token_pattern.findall(doc.lower())

    def _count_vocab(self, X, fixed_vocabulary):
        if fixed_vocabulary is False:
            # Build the vocabulary on the fly, assigning each new
            # feature the next free column index.
            vocabulary = {}
            vocabulary_cnt = 0
        else:
            # Reuse the vocabulary learned during fit.
            vocabulary = self.vocabulary_
        # CSR buffers: values holds the counts, j_indices the column
        # indices, and indptr the per-document offsets into both.
        values = []
        j_indices = []
        indptr = [0]
        for doc in X:
            feature_counter = {}
            for feature in self._analyze(doc):
                if fixed_vocabulary is False:
                    if feature not in vocabulary:
                        vocabulary[feature] = vocabulary_cnt
                        vocabulary_cnt += 1
                else:
                    # Ignore tokens unseen during fit.
                    if feature not in vocabulary:
                        continue
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1
            values.extend(feature_counter.values())
            j_indices.extend(feature_counter.keys())
            indptr.append(len(j_indices))
        # Assemble the document-term matrix directly in CSR form.
        Xt = csr_matrix((values, j_indices, indptr),
                        shape=(len(indptr) - 1, len(vocabulary)))
        return vocabulary, Xt

    def fit(self, X):
        vocabulary, Xt = self._count_vocab(X, fixed_vocabulary=False)
        # Remap the feature indices so that columns follow the
        # alphabetical order of the terms.
        sorted_features = sorted(vocabulary.items())
        for new_val, (term, old_val) in enumerate(sorted_features):
            vocabulary[term] = new_val
        self.vocabulary_ = vocabulary
        return self

    def transform(self, X):
        _, Xt = self._count_vocab(X, fixed_vocabulary=True)
        return Xt

    def get_feature_names(self):
        # fit stores vocabulary_ in alphabetical order, so the sorted
        # keys line up with the matrix columns.
        return sorted(self.vocabulary_.keys())
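

# A minimal usage sketch (the example documents are hypothetical, not
# from the original source): fit() learns an alphabetically ordered
# vocabulary, and transform() maps documents onto it as a sparse
# document-term count matrix.
if __name__ == "__main__":
    docs = ["the quick brown fox", "the lazy dog"]
    vectorizer = CountVectorizer().fit(docs)
    print(vectorizer.get_feature_names())
    # ['brown', 'dog', 'fox', 'lazy', 'quick', 'the']
    print(vectorizer.transform(docs).toarray())
    # [[1 0 1 0 1 1]
    #  [0 1 0 1 0 1]]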