from sklearn.feature_extraction import DictVectorizer
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]
vec = DictVectorizer()
vec.fit_transform(measurements).toarray()
array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])
vec.get_feature_names()
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
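# DictVectorizer also supports the reverse mapping, and transform on a
# fitted vectorizer silently drops values it has never seen; a minimal
# sketch reusing `vec` and `measurements` from above ('Moscow' is an
# unseen city, so only the temperature column is filled):
vec.inverse_transform(vec.fit_transform(measurements))
vec.transform([{'city': 'Moscow', 'temperature': -5.}]).toarray()
# -> array([[ 0.,  0.,  0., -5.]])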
pos_window = [
    {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
]
pos_window
[{'pos+1': 'PP', 'pos-1': 'NN', 'pos-2': 'DT', 'word+1': 'on', 'word-1': 'cat', 'word-2': 'the'}]
vec = DictVectorizer()
pos_vectorized = vec.fit_transform(pos_window)
pos_vectorized
<1x6 sparse matrix of type '<type 'numpy.float64'>' with 6 stored elements in Compressed Sparse Row format>
pos_vectorized.toarray()
array([[ 1., 1., 1., 1., 1., 1.]])
vec.get_feature_names()
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']
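# Applying the fitted vectorizer to a second, made-up window works the
# same way: values seen during fit keep their column, unseen ones (the
# 'pos-2=VBD' below) are dropped, so only 'pos-1=NN' and 'word-1=cat'
# light up.
vec.transform([{'word-2': 'sat', 'pos-2': 'VBD',
                'word-1': 'cat', 'pos-1': 'NN'}]).toarray()
# -> array([[ 0.,  1.,  0.,  0.,  1.,  0.]])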
def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)
raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(input_type='string')
X = hasher.transform(raw_X)
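# FeatureHasher can also consume mappings of feature -> value directly
# (input_type='dict', the default); a minimal sketch with made-up
# features, hashing each key into one of 10 columns:
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=10, input_type='dict')
h.transform([{'dog': 1, 'cat': 2}, {'dog': 2, 'run': 5}]).toarray()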
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer
CountVectorizer(analyzer='word', binary=False, charset='utf-8',
        charset_error='strict', dtype=<type 'long'>, input='content',
        lowercase=True, max_df=1.0, max_features=None, max_n=None,
        min_df=1, min_n=None, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None,
        vocabulary=None)
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X
<4x9 sparse matrix of type '<type 'numpy.int64'>' with 19 stored elements in COOrdinate format>
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.")
[u'this', u'is', u'text', u'document', u'to', u'analyze']
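# The default token_pattern keeps only tokens of two or more word
# characters, which is why 'a' disappeared above; a sketch of keeping
# single-letter tokens with a looser pattern:
CountVectorizer(token_pattern=ur'\b\w+\b').build_analyzer()(
    "This is a text document to analyze.")
# -> [u'this', u'is', u'a', u'text', u'document', u'to', u'analyze']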
vectorizer.get_feature_names()
[u'and', u'document', u'first', u'is', u'one', u'second', u'the', u'third', u'this']
X.toarray()
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])
vectorizer.vocabulary_.get('document')
1
vectorizer.transform(['Something completely new.']).toarray()
array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])
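# If the feature space must stay fixed no matter what fit saw, a
# vocabulary can be supplied up front; a sketch (columns follow the
# order of the list given):
cv = CountVectorizer(vocabulary=['document', 'first', 'second'])
cv.transform(corpus).toarray()
# -> array([[1, 1, 0], [1, 0, 2], [0, 0, 0], [1, 1, 0]])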
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=ur'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!')
[u'bi', u'grams', u'are', u'cool', u'bi grams', u'grams are', u'are cool']
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
feature_index = bigram_vectorizer.vocabulary_.get(u'is this')
X_2[:, feature_index]
array([0, 0, 0, 1])
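# The same lookup works for any entry of the fitted vocabulary, e.g. the
# unigram 'this', which occurs once in every document but the third:
X_2[:, bigram_vectorizer.vocabulary_.get(u'this')]
# -> array([1, 1, 0, 1])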
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf
<6x3 sparse matrix of type '<type 'numpy.float64'>' with 9 stored elements in Compressed Sparse Row format>
tfidf.toarray()
array([[ 0.85151335,  0.        ,  0.52433293],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.55422893,  0.83236428,  0.        ],
       [ 0.63035731,  0.        ,  0.77630514]])
transformer.idf_
array([ 1. , 2.25276297, 1.84729786])
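# A hand computation of the first row as a sanity check: with
# smooth_idf=True, idf = ln((1 + n_samples) / (1 + df)) + 1, the raw
# counts are scaled by idf_ and the row is then l2-normalized.
import numpy as np
row = np.array([3, 0, 1]) * transformer.idf_
row / np.sqrt((row ** 2).sum())
# -> array([ 0.85151335,  0.        ,  0.52433293])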
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
<4x9 sparse matrix of type '<type 'numpy.float64'>' with 19 stored elements in Compressed Sparse Row format>
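# TfidfVectorizer behaves like CountVectorizer followed by
# TfidfTransformer; a sketch checking the equivalence on the same corpus:
import numpy as np
two_step = TfidfTransformer().fit_transform(
    CountVectorizer(min_df=1).fit_transform(corpus))
one_step = TfidfVectorizer(min_df=1).fit_transform(corpus)
np.abs(one_step.toarray() - two_step.toarray()).max()   # ~0.0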
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names()
[u' w', u'ds', u'or', u'pr', u'rd', u's ', u'wo', u'wp']
counts.toarray().astype(int)
array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])
<1x4 sparse matrix of type '<type 'numpy.int64'>' with 4 stored elements in COOrdinate format>
ngram_vectorizer.get_feature_names()
[u' fox ', u' jump', u'jumpy', u'umpy ']
ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])
<1x5 sparse matrix of type '<type 'numpy.int64'>' with 5 stored elements in COOrdinate format>
ngram_vectorizer.get_feature_names()
[u'jumpy', u'mpy f', u'py fo', u'umpy ', u'y fox']
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus)
<4x10 sparse matrix of type '<type 'numpy.float64'>' with 16 stored elements in Compressed Sparse Row format>
hv = HashingVectorizer()
hv.transform(corpus)
<4x1048576 sparse matrix of type '<type 'numpy.float64'>' with 19 stored elements in Compressed Sparse Row format>
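# HashingVectorizer keeps no vocabulary, so transform needs no fit and
# every mini-batch maps to the same columns; a sketch of streaming use
# (the flip side is that there is no way to recover feature names):
stream_hv = HashingVectorizer(n_features=2 ** 20)
for batch in (corpus[:2], corpus[2:]):
    X_batch = stream_hv.transform(batch)   # shape (2, 1048576) each time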
def my_tokenizer(s):
    return s.split()
vectorizer = CountVectorizer(tokenizer=my_tokenizer)
vectorizer.build_analyzer()(u"Some... punctuation!")
[u'some...', u'punctuation!']
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
vect = CountVectorizer(tokenizer=LemmaTokenizer())
vect
CountVectorizer(analyzer='word', binary=False, charset='utf-8',
        charset_error='strict', dtype=<type 'long'>, input='content',
        lowercase=True, max_df=1.0, max_features=None, max_n=None,
        min_df=2, min_n=None, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x1111e5290>,
        vocabulary=None)
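# The tokenizer hook accepts any callable, so a stemmer slots in the same
# way; a sketch assuming NLTK and its tokenizer data are installed:
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
class StemTokenizer(object):
    def __init__(self):
        self.st = SnowballStemmer('english')
    def __call__(self, doc):
        return [self.st.stem(t) for t in word_tokenize(doc)]
stem_vect = CountVectorizer(tokenizer=StemTokenizer())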
import numpy as np
from sklearn.feature_extraction import image
one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))
one_image[:, :, 0]
array([[ 0,  3,  6,  9],
       [12, 15, 18, 21],
       [24, 27, 30, 33],
       [36, 39, 42, 45]])
patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0)
patches.shape
(2, 2, 2, 3)
patches[:, :, :, 0]
array([[[ 0,  3],
        [12, 15]],

       [[15, 18],
        [27, 30]]])
patches = image.extract_patches_2d(one_image, (2, 2))
patches.shape
(9, 2, 2, 3)
patches[4, :, :, 0]
array([[15, 18],
       [27, 30]])
reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))
np.testing.assert_array_equal(one_image, reconstructed)
five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3)
patches = image.PatchExtractor((2, 2)).transform(five_images)
patches.shape
(45, 2, 2, 3)
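# PatchExtractor also accepts max_patches and random_state like
# extract_patches_2d, sampling per image; a sketch drawing two random
# patches from each of the five images above:
pe = image.PatchExtractor(patch_size=(2, 2), max_patches=2, random_state=0)
pe.transform(five_images).shape
# -> (10, 2, 2, 3)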