from biased_stop_words import get_stop_words
from sklearn.feature_extraction.text import CountVectorizer
# Demo: build a bag-of-words count matrix for a tiny corpus, passing the
# 'gendered' word list from biased_stop_words to CountVectorizer as its
# stop-word list so those words are excluded from the vocabulary.
biased_stop_words = get_stop_words('gendered')

corpus = [
    'He is an astronaut, he is on Venus',
    'He is an accountant, he is on Earth',
    'She is an astronaut, she is on Mars',
]

vectorizer = CountVectorizer(stop_words=biased_stop_words)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back to the old
# method on older releases. .tolist() turns the returned ndarray into a plain
# Python list so the printed form stays the same as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

# Dense view of the sparse count matrix: one row per corpus document,
# one column per feature name printed above.
print(X.toarray())
# Expected output:
# ['accountant', 'an', 'astronaut', 'earth', 'is', 'mars', 'on', 'venus']
# [[0 1 1 0 2 0 1 1]
#  [1 1 0 1 2 0 1 0]
#  [0 1 1 0 2 1 1 0]]