data = [{'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
        {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
        {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
        {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False, dtype=int)
vec.fit_transform(data)
array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])
vec.get_feature_names()
['neighborhood=Fremont', 'neighborhood=Queen Anne', 'neighborhood=Wallingford', 'price', 'rooms']
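Once fitted, the same vectorizer can encode new records with the column layout learned above (get_feature_names is the older API name; newer scikit-learn releases call it get_feature_names_out). A minimal sketch with a hypothetical new listing:
# Hypothetical new listing; neighborhoods never seen during fitting encode as all zeros
new_listing = [{'price': 725000, 'rooms': 3, 'neighborhood': 'Fremont'}]
vec.transform(new_listing)
# expected: array([[     1,      0,      0, 725000,      3]])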
vec = DictVectorizer(sparse=True, dtype=int)
vec.fit_transform(data)
<4x5 sparse matrix of type '<type 'numpy.int64'>' with 12 stored elements in Compressed Sparse Row format>
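The sparse representation stores only the 12 non-zero entries; a quick sketch to confirm it holds the same values as the dense output above:
vec.fit_transform(data).toarray()   # densify the sparse matrix; same array as before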
sample = ['problem of evil', 'evil queen', 'horizon problem']
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(sample)
X
<3x5 sparse matrix of type '<type 'numpy.int64'>' with 7 stored elements in Compressed Sparse Row format>
import pandas as pd
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
|   | evil | horizon | of | problem | queen |
|---|------|---------|----|---------|-------|
| 0 | 1    | 0       | 1  | 1       | 0     |
| 1 | 1    | 0       | 0  | 0       | 1     |
| 2 | 0    | 1       | 0  | 1       | 0     |
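The fitted vocabulary can be reused on new text; tokens that were not seen during fitting are simply ignored. A sketch with a made-up phrase:
# 'banished' is not in the learned vocabulary, so it is silently dropped
new_phrase = ['evil queen banished']
pd.DataFrame(vec.transform(new_phrase).toarray(), columns=vec.get_feature_names())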
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
|   | evil     | horizon  | of       | problem  | queen    |
|---|----------|----------|----------|----------|----------|
| 0 | 0.517856 | 0.000000 | 0.680919 | 0.517856 | 0.000000 |
| 1 | 0.605349 | 0.000000 | 0.000000 | 0.000000 | 0.795961 |
| 2 | 0.000000 | 0.795961 | 0.000000 | 0.605349 | 0.000000 |
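TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer, which rescales the raw counts by inverse document frequency; a sketch to verify the two routes agree:
from sklearn.feature_extraction.text import TfidfTransformer
counts = CountVectorizer().fit_transform(sample)
tfidf = TfidfTransformer().fit_transform(counts)
pd.DataFrame(tfidf.toarray(), columns=vec.get_feature_names())
# should reproduce the TfidfVectorizer table above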
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
x = np.array([1, 2, 3, 4, 5])
y = np.array([4, 2, 1, 3, 7])
plt.scatter(x, y);
from sklearn.linear_model import LinearRegression
X = x[:, np.newaxis]
model = LinearRegression().fit(X, y)
yfit = model.predict(X)
plt.scatter(x, y)
plt.plot(x, yfit);
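The fitted slope and intercept are stored on the model; printing them makes the straight-line fit explicit (a small sketch; for these five points the slope works out to roughly 0.7 and the intercept to roughly 1.3):
# Slope and intercept of the fitted line
print(model.coef_, model.intercept_)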
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X2 = poly.fit_transform(X)
print(X2)
[[   1.    1.    1.]
 [   2.    4.    8.]
 [   3.    9.   27.]
 [   4.   16.   64.]
 [   5.   25.  125.]]
model = LinearRegression().fit(X2, y)
yfit = model.predict(X2)
plt.scatter(x, y)
plt.plot(x, yfit);
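Plotting the fit only at the five training points makes the cubic look jagged; evaluating the fitted model on a finer grid gives a smooth curve. A sketch reusing the already-fitted poly and model objects:
# Evaluate the degree-3 fit on a dense grid for a smooth curve
xfit = np.linspace(1, 5, 50)
X2fit = poly.transform(xfit[:, np.newaxis])
plt.scatter(x, y)
plt.plot(xfit, model.predict(X2fit));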
from numpy import nan
X = np.array([[nan, 0, 3],
              [3, 7, 9],
              [3, 5, 2],
              [4, nan, 6],
              [8, 8, 1]])
y = np.array([14, 16, -1, 8, -5])
from sklearn.preprocessing import Imputer
imp = Imputer(strategy='mean')
X2 = imp.fit_transform(X)
X2
array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])
model = LinearRegression().fit(X2, y)
model.predict(X2)
array([ 13.14869292, 14.3784627 , -1.15539732, 10.96606197, -5.33782027])
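The per-column means used to fill the missing entries are kept on the imputer after fitting (note that newer scikit-learn releases replace Imputer with SimpleImputer from sklearn.impute, which exposes the same attribute). A quick check, as a sketch:
# Column means substituted for the nan entries above
print(imp.statistics_)
# expected: [ 4.5  5.   4.2]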
from sklearn.pipeline import make_pipeline
model = make_pipeline(Imputer(strategy='mean'),
                      PolynomialFeatures(degree=2),
                      LinearRegression())
model.fit(X, y)
print(y)
print(model.predict(X))
[14 16 -1  8 -5]
[ 14.  16.  -1.   8.  -5.]
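make_pipeline names each step after its lowercased class, so the fitted pieces stay inspectable afterwards; a small sketch, keeping in mind that the perfect predictions above are on the same five training points and say nothing about generalization:
# Look inside the fitted pipeline: imputed column means and regression coefficients
print(model.named_steps['imputer'].statistics_)
print(model.named_steps['linearregression'].coef_)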