
Loading features from dicts

In [22]:
from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]
In [23]:
vec = DictVectorizer()
In [24]:
vec.fit_transform(measurements).toarray()
Out[24]:
array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])
In [25]:
vec.get_feature_names()
Out[25]:
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
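DictVectorizer can also map a transformed matrix back to feature dicts; a minimal sketch using the vectorizer fitted above:
In [ ]:
vec.inverse_transform(vec.transform(measurements))   # list of {feature name: value} dicts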
In [26]:
pos_window = [
   {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
]
In [27]:
pos_window
Out[27]:
[{'pos+1': 'PP',
  'pos-1': 'NN',
  'pos-2': 'DT',
  'word+1': 'on',
  'word-1': 'cat',
  'word-2': 'the'}]
In [28]:
vec = DictVectorizer()
pos_vectorized = vec.fit_transform(pos_window)
In [29]:
pos_vectorized
Out[29]:
<1x6 sparse matrix of type '<type 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>
In [30]:
pos_vectorized.toarray()
Out[30]:
array([[ 1.,  1.,  1.,  1.,  1.,  1.]])
In [31]:
vec.get_feature_names()
Out[31]:
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']

Feature hashing

In [32]:
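# Yield string feature names for a (token, POS tag) pair, suitable for FeatureHasher(input_type='string')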
def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)
In [33]:
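# corpus and pos_tagger are assumed to be defined elsewhere; they are placeholders in this notebook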
raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)
In [34]:
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(input_type='string')
X = hasher.transform(raw_X)
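
Bag of Words representation
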
In [35]:
from sklearn.feature_extraction.text import CountVectorizer
In [36]:
vectorizer = CountVectorizer(min_df=1)
vectorizer
Out[36]:
CountVectorizer(analyzer='word', binary=False, charset='utf-8',
        charset_error='strict', dtype=<type 'long'>, input='content',
        lowercase=True, max_df=1.0, max_features=None, max_n=None,
        min_df=1, min_n=None, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None,
        vocabulary=None)
In [37]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X
Out[37]:
<4x9 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in COOrdinate format>
In [38]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.")
Out[38]:
[u'this', u'is', u'text', u'document', u'to', u'analyze']
In [39]:
vectorizer.get_feature_names()
Out[39]:
[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']
In [40]:
X.toarray()
Out[40]:
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])
In [41]:
vectorizer.vocabulary_.get('document')
Out[41]:
1
In [42]:
vectorizer.transform(['Something completely new.']).toarray()
Out[42]:
array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])
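Terms not seen when the vocabulary was fit are silently ignored at transform time, hence the all-zero row above; a quick sketch mapping a new document back onto the fitted vocabulary:
In [ ]:
vectorizer.inverse_transform(vectorizer.transform(['Is this a new document?']))   # only terms already in the vocabulary come back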

Bigram vectorizer

In [43]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=ur'\b\w+\b', min_df=1)
In [44]:
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!')
Out[44]:
[u'bi', u'grams', u'are', u'cool', u'bi grams', u'grams are', u'are cool']
In [45]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
Out[45]:
array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
In [46]:
feature_index = bigram_vectorizer.vocabulary_.get(u'is this')
X_2[:, feature_index] 
Out[46]:
array([0, 0, 0, 1])

Tf–idf term weighting

In [47]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer 
Out[47]:
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
In [48]:
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf
tfidf.toarray()               
Out[48]:
array([[ 0.85151335,  0.        ,  0.52433293],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.55422893,  0.83236428,  0.        ],
       [ 0.63035731,  0.        ,  0.77630514]])
In [49]:
transformer.idf_
Out[49]:
array([ 1.        ,  2.25276297,  1.84729786])
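The idf values above are consistent with the smoothed formula idf(t) = ln((1 + n_samples) / (1 + df(t))) + 1 used when smooth_idf=True; a quick numerical check:
In [ ]:
import numpy as np
n_samples, df = 6, np.array([6, 1, 2])        # document frequencies of the three terms in counts
np.log((1. + n_samples) / (1. + df)) + 1      # matches transformer.idf_ above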
In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
Out[50]:
<4x9 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>
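TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single model; a sketch of the roughly equivalent two-step pipeline on the same corpus:
In [ ]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('count', CountVectorizer(min_df=1)),
                     ('tfidf', TfidfTransformer())])
pipeline.fit_transform(corpus)                # same shape as the TfidfVectorizer output above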

Limitations of the Bag of Words representation

In [51]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names()
counts.toarray().astype(int)
Out[51]:
array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])
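The columns above correspond to these character bigrams (note the word-boundary spaces that char_wb pads with):
In [ ]:
ngram_vectorizer.get_feature_names()
# expected: [u' w', u'ds', u'or', u'pr', u'rd', u's ', u'wo', u'wp']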
In [52]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])
Out[52]:
<1x4 sparse matrix of type '<type 'numpy.int64'>'
	with 4 stored elements in COOrdinate format>
In [53]:
ngram_vectorizer.get_feature_names()
Out[53]:
[u' fox ', u' jump', u'jumpy', u'umpy ']
In [54]:
ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)
In [55]:
ngram_vectorizer.fit_transform(['jumpy fox'])
Out[55]:
<1x5 sparse matrix of type '<type 'numpy.int64'>'
	with 5 stored elements in COOrdinate format>
In [56]:
ngram_vectorizer.get_feature_names()
Out[56]:
[u'jumpy', u'mpy f', u'py fo', u'umpy ', u'y fox']

Vectorizing a large text corpus with the hashing trick

In [57]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus)
Out[57]:
<4x10 sparse matrix of type '<type 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>
In [58]:
hv = HashingVectorizer()
hv.transform(corpus)
Out[58]:
<4x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>
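Because HashingVectorizer keeps no vocabulary, it is completely stateless: batches of documents can be vectorized independently and streamed into an out-of-core learner. A minimal sketch, assuming a hypothetical generator text_batches yielding (documents, labels) pairs that is not defined in this notebook:
In [ ]:
from sklearn.linear_model import SGDClassifier
hv = HashingVectorizer()
clf = SGDClassifier()
for docs, labels in text_batches:             # assumed iterator of (list of texts, list of labels)
    clf.partial_fit(hv.transform(docs), labels, classes=[0, 1])   # classes assumed binary here

Customizing the vectorizer classes
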
In [59]:
def my_tokenizer(s):
    return s.split()

vectorizer = CountVectorizer(tokenizer=my_tokenizer)
vectorizer.build_analyzer()(u"Some... punctuation!")
Out[59]:
[u'some...', u'punctuation!']
In [60]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = CountVectorizer(tokenizer=LemmaTokenizer())
In [61]:
vect
Out[61]:
CountVectorizer(analyzer='word', binary=False, charset='utf-8',
        charset_error='strict', dtype=<type 'long'>, input='content',
        lowercase=True, max_df=1.0, max_features=None, max_n=None,
        min_df=2, min_n=None, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x1111e5290>,
        vocabulary=None)

Image feature extraction

In [62]:
import numpy as np
from sklearn.feature_extraction import image

one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))
one_image[:, :, 0]
Out[62]:
array([[ 0,  3,  6,  9],
       [12, 15, 18, 21],
       [24, 27, 30, 33],
       [36, 39, 42, 45]])
In [63]:
patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0)
patches.shape
Out[63]:
(2, 2, 2, 3)
In [64]:
patches[:, :, :, 0]
Out[64]:
array([[[ 0,  3],
        [12, 15]],

       [[15, 18],
        [27, 30]]])
In [65]:
patches = image.extract_patches_2d(one_image, (2, 2))
patches.shape
Out[65]:
(9, 2, 2, 3)
In [66]:
patches[4, :, :, 0]
Out[66]:
array([[15, 18],
       [27, 30]])
In [67]:
reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))
np.testing.assert_array_equal(one_image, reconstructed)
In [68]:
five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3)
patches = image.PatchExtractor((2, 2)).transform(five_images)
patches.shape
Out[68]:
(45, 2, 2, 3)
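The same patch extraction works on real images; a sketch using one of the sample images bundled with scikit-learn:
In [ ]:
from sklearn.datasets import load_sample_image
china = load_sample_image('china.jpg')        # RGB sample image shipped with scikit-learn
patches = image.extract_patches_2d(china, (16, 16), max_patches=100, random_state=0)
patches.shape                                 # (100, 16, 16, 3)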