# DictVectorizer: turn lists of feature-name -> value mappings into matrices.
from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

vec = DictVectorizer()
vec.fit_transform(measurements).toarray()
vec.get_feature_names()

# DictVectorizer also one-hot encodes categorical NLP features, e.g. a window
# of words and part-of-speech tags around a word of interest.
pos_window = [
    {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
    # in a real application, one dict per extracted window ...
]
pos_window
vec = DictVectorizer()
pos_vectorized = vec.fit_transform(pos_window)
pos_vectorized
pos_vectorized.toarray()
vec.get_feature_names()

# FeatureHasher: stateless, low-memory vectorization via the hashing trick.
from sklearn.feature_extraction import FeatureHasher

def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)

# 'corpus' and 'pos_tagger' are assumed to be supplied by the surrounding code.
raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)
hasher = FeatureHasher(input_type='string')
X = hasher.transform(raw_X)

# CountVectorizer: tokenization and occurrence counting in a single class.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
vectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X

analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.")
vectorizer.get_feature_names()
X.toarray()
vectorizer.vocabulary_.get('document')

# Words not seen during fit are silently ignored at transform time.
vectorizer.transform(['Something completely new.']).toarray()

# Preserve some local ordering information by also extracting word bigrams.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!')
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]

# TfidfTransformer: re-weight raw occurrence counts with tf-idf.
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
transformer

counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf
tfidf.toarray()
transformer.idf_

# TfidfVectorizer combines CountVectorizer and TfidfTransformer in one model.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)

# Character n-grams are robust against misspellings ('words' vs. 'wprds').
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names()
counts.toarray().astype(int)

# 'char_wb' pads n-grams with spaces at word boundaries; 'char' does not.
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])
ngram_vectorizer.get_feature_names()

ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])
ngram_vectorizer.get_feature_names()

# HashingVectorizer: stateless text vectorization using the hashing trick.
from sklearn.feature_extraction.text import HashingVectorizer

hv = HashingVectorizer(n_features=10)
hv.transform(corpus)

hv = HashingVectorizer()  # default n_features is 2 ** 20
hv.transform(corpus)

# Customizing the vectorizer: pass any callable as the tokenizer.
def my_tokenizer(s):
    return s.split()

vectorizer = CountVectorizer(tokenizer=my_tokenizer)
vectorizer.build_analyzer()("Some... punctuation!")
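As a quick illustrative sketch (not part of the examples above), the effect of the custom tokenizer can be seen by running the default analyzer and the whitespace-splitting one on the same string: the default token pattern discards punctuation, while my_tokenizer keeps it attached to the tokens.

from sklearn.feature_extraction.text import CountVectorizer

def my_tokenizer(s):
    return s.split()

default_analyze = CountVectorizer().build_analyzer()
custom_analyze = CountVectorizer(tokenizer=my_tokenizer).build_analyzer()

default_analyze("Some... punctuation!")   # ['some', 'punctuation']
custom_analyze("Some... punctuation!")    # ['some...', 'punctuation!']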
punctuation!") from nltk import word_tokenize from nltk.stem import WordNetLemmatizer class LemmaTokenizer(object): def __init__(self): self.wnl = WordNetLemmatizer() def __call__(self, doc): return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] vect = CountVectorizer(tokenizer=LemmaTokenizer()) vect import numpy as np from sklearn.feature_extraction import image one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3)) one_image[:, :, 0] patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0) patches.shape patches[:, :, :, 0] patches = image.extract_patches_2d(one_image, (2, 2)) patches.shape patches[4, :, :, 0] reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3)) np.testing.assert_array_equal(one_image, reconstructed) five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3) patches = image.PatchExtractor((2, 2)).transform(five_images) patches.shape