
Loading features from dicts

In [22]:
from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]
In [23]:
vec = DictVectorizer()
In [24]:
vec.fit_transform(measurements).toarray()
Out[24]:
array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])
In [25]:
vec.get_feature_names()
Out[25]:
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
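DictVectorizer can also map a transformed matrix back to feature dicts; a minimal sketch using the vectorizer fitted above:
In [ ]:
vec.inverse_transform(vec.transform(measurements))   # list of {feature name: value} dicts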
In [26]:
pos_window = [
   {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
]
In [27]:
pos_window
Out[27]:
[{'pos+1': 'PP',
  'pos-1': 'NN',
  'pos-2': 'DT',
  'word+1': 'on',
  'word-1': 'cat',
  'word-2': 'the'}]
In [28]:
vec = DictVectorizer()
pos_vectorized = vec.fit_transform(pos_window)
In [29]:
pos_vectorized
Out[29]:
<1x6 sparse matrix of type '<type 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>
In [30]:
pos_vectorized.toarray()
Out[30]:
array([[ 1.,  1.,  1.,  1.,  1.,  1.]])
In [31]:
vec.get_feature_names()
Out[31]:
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']

Feature hashing

In [32]:
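# Yield string feature names for a (token, POS tag) pair, suitable for FeatureHasher(input_type='string')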
def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)
In [33]:
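# corpus and pos_tagger are assumed to be defined elsewhere; they are placeholders in this notebook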
raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)
In [34]:
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(input_type='string')
X = hasher.transform(raw_X)
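
Bag of Words representation
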
In [35]:
from sklearn.feature_extraction.text import CountVectorizer
In [36]:
vectorizer = CountVectorizer(min_df=1)
vectorizer
Out[36]:
CountVectorizer(analyzer='word', binary=False, charset='utf-8',
        charset_error='strict', dtype=<type 'long'>, input='content',
        lowercase=True, max_df=1.0, max_features=None, max_n=None,
        min_df=1, min_n=None, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None,
        vocabulary=None)
In [37]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X
Out[37]:
<4x9 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in COOrdinate format>
In [38]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.")
Out[38]:
[u'this', u'is', u'text', u'document', u'to', u'analyze']
In [39]:
vectorizer.get_feature_names()
Out[39]:
[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']
In [40]:
X.toarray()
Out[40]:
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])
In [41]:
vectorizer.vocabulary_.get('document')
Out[41]:
1
In [42]:
vectorizer.transform(['Something completely new.']).toarray()
Out[42]:
array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])
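Terms not seen when the vocabulary was fit are silently ignored at transform time, hence the all-zero row above; a quick sketch mapping a new document back onto the fitted vocabulary:
In [ ]:
vectorizer.inverse_transform(vectorizer.transform(['Is this a new document?']))   # only terms already in the vocabulary come back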

Bigram vectorizer

In [43]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=ur'\b\w+\b', min_df=1)
In [44]:
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!')
Out[44]:
[u'bi', u'grams', u'are', u'cool', u'bi grams', u'grams are', u'are cool']
In [45]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
Out[45]:
array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
In [46]:
feature_index = bigram_vectorizer.vocabulary_.get(u'is this')
X_2[:, feature_index] 
Out[46]:
array([0, 0, 0, 1])

Tf–idf term weighting

In [47]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer 
Out[47]:
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
In [48]:
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf
tfidf.toarray()               
Out[48]:
array([[ 0.85151335,  0.        ,  0.52433293],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.55422893,  0.83236428,  0.        ],
       [ 0.63035731,  0.        ,  0.77630514]])
In [49]:
transformer.idf_
Out[49]:
array([ 1.        ,  2.25276297,  1.84729786])
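The idf values above are consistent with the smoothed formula idf(t) = ln((1 + n_samples) / (1 + df(t))) + 1 used when smooth_idf=True; a quick numerical check:
In [ ]:
import numpy as np
n_samples, df = 6, np.array([6, 1, 2])        # document frequencies of the three terms in counts
np.log((1. + n_samples) / (1. + df)) + 1      # matches transformer.idf_ above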
In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
Out[50]:
<4x9 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>
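TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single model; a sketch of the roughly equivalent two-step pipeline on the same corpus:
In [ ]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('count', CountVectorizer(min_df=1)),
                     ('tfidf', TfidfTransformer())])
pipeline.fit_transform(corpus)                # same shape as the TfidfVectorizer output above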

Limitations of the Bag of Words representation

In [51]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names()
counts.toarray().astype(int)
Out[51]:
array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])
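The columns above correspond to these character bigrams (note the word-boundary spaces that char_wb pads with):
In [ ]:
ngram_vectorizer.get_feature_names()
# expected: [u' w', u'ds', u'or', u'pr', u'rd', u's ', u'wo', u'wp']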
In [52]:
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])
Out[52]:
<1x4 sparse matrix of type '<type 'numpy.int64'>'
	with 4 stored elements in COOrdinate format>
In [53]:
ngram_vectorizer.get_feature_names()
Out[53]:
[u' fox ', u' jump', u'jumpy', u'umpy ']
In [54]:
ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)
In [55]:
ngram_vectorizer.fit_transform(['jumpy fox'])
Out[55]:
<1x5 sparse matrix of type '<type 'numpy.int64'>'
	with 5 stored elements in COOrdinate format>
In [56]:
ngram_vectorizer.get_feature_names()
Out[56]:
[u'jumpy', u'mpy f', u'py fo', u'umpy ', u'y fox']

Vectorizing a large text corpus with the hashing trick

In [57]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus)
Out[57]:
<4x10 sparse matrix of type '<type 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>
In [58]:
hv = HashingVectorizer()
hv.transform(corpus)
Out[58]:
<4x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>
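Because HashingVectorizer keeps no vocabulary, it is completely stateless: batches of documents can be vectorized independently and streamed into an out-of-core learner. A minimal sketch, assuming a hypothetical generator text_batches yielding (documents, labels) pairs that is not defined in this notebook:
In [ ]:
from sklearn.linear_model import SGDClassifier
hv = HashingVectorizer()
clf = SGDClassifier()
for docs, labels in text_batches:             # assumed iterator of (list of texts, list of labels)
    clf.partial_fit(hv.transform(docs), labels, classes=[0, 1])   # classes assumed binary here

Customizing the vectorizer classes
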
In [59]:
def my_tokenizer(s):
    return s.split()

vectorizer = CountVectorizer(tokenizer=my_tokenizer)
vectorizer.build_analyzer()(u"Some... punctuation!")
Out[59]:
[u'some...', u'punctuation!']
In [60]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = CountVectorizer(tokenizer=LemmaTokenizer())
In [61]:
vect
Out[61]:
CountVectorizer(analyzer='word', binary=False, charset='utf-8',
        charset_error='strict', dtype=<type 'long'>, input='content',
        lowercase=True, max_df=1.0, max_features=None, max_n=None,
        min_df=2, min_n=None, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x1111e5290>,
        vocabulary=None)

Image feature extraction

In [62]:
import numpy as np
from sklearn.feature_extraction import image

one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))
one_image[:, :, 0]
Out[62]:
array([[ 0,  3,  6,  9],
       [12, 15, 18, 21],
       [24, 27, 30, 33],
       [36, 39, 42, 45]])
In [63]:
patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0)
patches.shape
Out[63]:
(2, 2, 2, 3)
In [64]:
patches[:, :, :, 0]
Out[64]:
array([[[ 0,  3],
        [12, 15]],

       [[15, 18],
        [27, 30]]])
In [65]:
patches = image.extract_patches_2d(one_image, (2, 2))
patches.shape
Out[65]:
(9, 2, 2, 3)
In [66]:
patches[4, :, :, 0]
Out[66]:
array([[15, 18],
       [27, 30]])
In [67]:
reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))
np.testing.assert_array_equal(one_image, reconstructed)
In [68]:
five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3)
patches = image.PatchExtractor((2, 2)).transform(five_images)
patches.shape
Out[68]:
(45, 2, 2, 3)
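The same patch extraction works on real images; a sketch using one of the sample images bundled with scikit-learn:
In [ ]:
from sklearn.datasets import load_sample_image
china = load_sample_image('china.jpg')        # RGB sample image shipped with scikit-learn
patches = image.extract_patches_2d(china, (16, 16), max_patches=100, random_state=0)
patches.shape                                 # (100, 16, 16, 3)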