from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]

vec = DictVectorizer()

vec.fit_transform(measurements).toarray()

vec.get_feature_names()

pos_window = [
   {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
]

pos_window

vec = DictVectorizer()
pos_vectorized = vec.fit_transform(pos_window)

pos_vectorized

pos_vectorized.toarray()

vec.get_feature_names()

def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)

raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)

hasher = FeatureHasher(input_type=string)
X = hasher.transform(raw_X)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
vectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X

analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.")

vectorizer.get_feature_names()

X.toarray()

vectorizer.vocabulary_.get('document')

vectorizer.transform(['Something completely new.']).toarray()

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=ur'\b\w+\b', min_df=1)

analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!')

X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2

feature_index = bigram_vectorizer.vocabulary_.get(u'is this')
X_2[:, feature_index] 

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer 

counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf
tfidf.toarray()               

transformer.idf_

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)

ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names()
counts.toarray().astype(int)

ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
ngram_vectorizer.fit_transform(['jumpy fox'])

ngram_vectorizer.get_feature_names()

ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)

ngram_vectorizer.fit_transform(['jumpy fox'])

ngram_vectorizer.get_feature_names()

from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus)

hv = HashingVectorizer()
hv.transform(corpus)

def my_tokenizer(s):
    return s.split()

vectorizer = CountVectorizer(tokenizer=my_tokenizer)
vectorizer.build_analyzer()(u"Some... punctuation!")

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = CountVectorizer(tokenizer=LemmaTokenizer())

vect

import numpy as np
from sklearn.feature_extraction import image

one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))
one_image[:, :, 0]

patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0)
patches.shape

patches[:, :, :, 0]

patches = image.extract_patches_2d(one_image, (2, 2))
patches.shape

patches[4, :, :, 0]

reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))
np.testing.assert_array_equal(one_image, reconstructed)

five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3)
patches = image.PatchExtractor((2, 2)).transform(five_images)
patches.shape