data = [{'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
        {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
        {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
        {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False, dtype=int)
vec.fit_transform(data)
array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])
vec.get_feature_names()
['neighborhood=Fremont', 'neighborhood=Queen Anne', 'neighborhood=Wallingford', 'price', 'rooms']
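Once fitted, the same vectorizer can encode new records with the column layout learned above (get_feature_names is the older API name; newer scikit-learn releases call it get_feature_names_out). A minimal sketch with a hypothetical new listing:
# Hypothetical new listing; neighborhoods never seen during fitting encode as all zeros
new_listing = [{'price': 725000, 'rooms': 3, 'neighborhood': 'Fremont'}]
vec.transform(new_listing)
# expected: array([[     1,      0,      0, 725000,      3]])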
vec = DictVectorizer(sparse=True, dtype=int)
vec.fit_transform(data)
<4x5 sparse matrix of type '<type 'numpy.int64'>' with 12 stored elements in Compressed Sparse Row format>
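The sparse representation stores only the 12 non-zero entries; a quick sketch to confirm it holds the same values as the dense output above:
vec.fit_transform(data).toarray()   # densify the sparse matrix; same array as before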
sample = ['problem of evil', 'evil queen', 'horizon problem']
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(sample)
X
<3x5 sparse matrix of type '<type 'numpy.int64'>' with 7 stored elements in Compressed Sparse Row format>
import pandas as pd
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
|   | evil | horizon | of | problem | queen |
|---|------|---------|----|---------|-------|
| 0 | 1    | 0       | 1  | 1       | 0     |
| 1 | 1    | 0       | 0  | 0       | 1     |
| 2 | 0    | 1       | 0  | 1       | 0     |
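The fitted vocabulary can be reused on new text; tokens that were not seen during fitting are simply ignored. A sketch with a made-up phrase:
# 'banished' is not in the learned vocabulary, so it is silently dropped
new_phrase = ['evil queen banished']
pd.DataFrame(vec.transform(new_phrase).toarray(), columns=vec.get_feature_names())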
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
|   | evil     | horizon  | of       | problem  | queen    |
|---|----------|----------|----------|----------|----------|
| 0 | 0.517856 | 0.000000 | 0.680919 | 0.517856 | 0.000000 |
| 1 | 0.605349 | 0.000000 | 0.000000 | 0.000000 | 0.795961 |
| 2 | 0.000000 | 0.795961 | 0.000000 | 0.605349 | 0.000000 |
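TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer, which rescales the raw counts by inverse document frequency; a sketch to verify the two routes agree:
from sklearn.feature_extraction.text import TfidfTransformer
counts = CountVectorizer().fit_transform(sample)
tfidf = TfidfTransformer().fit_transform(counts)
pd.DataFrame(tfidf.toarray(), columns=vec.get_feature_names())
# should reproduce the TfidfVectorizer table above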
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
x = np.array([1, 2, 3, 4, 5])
y = np.array([4, 2, 1, 3, 7])
plt.scatter(x, y);
from sklearn.linear_model import LinearRegression
X = x[:, np.newaxis]
model = LinearRegression().fit(X, y)
yfit = model.predict(X)
plt.scatter(x, y)
plt.plot(x, yfit);
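The fitted slope and intercept are stored on the model; printing them makes the straight-line fit explicit (a small sketch; for these five points the slope works out to roughly 0.7 and the intercept to roughly 1.3):
# Slope and intercept of the fitted line
print(model.coef_, model.intercept_)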
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X2 = poly.fit_transform(X)
print(X2)
[[   1.    1.    1.]
 [   2.    4.    8.]
 [   3.    9.   27.]
 [   4.   16.   64.]
 [   5.   25.  125.]]
model = LinearRegression().fit(X2, y)
yfit = model.predict(X2)
plt.scatter(x, y)
plt.plot(x, yfit);
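Plotting the fit only at the five training points makes the cubic look jagged; evaluating the fitted model on a finer grid gives a smooth curve. A sketch reusing the already-fitted poly and model objects:
# Evaluate the degree-3 fit on a dense grid for a smooth curve
xfit = np.linspace(1, 5, 50)
X2fit = poly.transform(xfit[:, np.newaxis])
plt.scatter(x, y)
plt.plot(xfit, model.predict(X2fit));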
from numpy import nan
X = np.array([[nan, 0, 3],
              [3, 7, 9],
              [3, 5, 2],
              [4, nan, 6],
              [8, 8, 1]])
y = np.array([14, 16, -1, 8, -5])
from sklearn.preprocessing import Imputer
imp = Imputer(strategy='mean')
X2 = imp.fit_transform(X)
X2
array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])
model = LinearRegression().fit(X2, y)
model.predict(X2)
array([ 13.14869292, 14.3784627 , -1.15539732, 10.96606197, -5.33782027])
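The per-column means used to fill the missing entries are kept on the imputer after fitting (note that newer scikit-learn releases replace Imputer with SimpleImputer from sklearn.impute, which exposes the same attribute). A quick check, as a sketch:
# Column means substituted for the nan entries above
print(imp.statistics_)
# expected: [ 4.5  5.   4.2]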
from sklearn.pipeline import make_pipeline
model = make_pipeline(Imputer(strategy='mean'),
                      PolynomialFeatures(degree=2),
                      LinearRegression())
model.fit(X, y)
print(y)
print(model.predict(X))
[14 16 -1  8 -5]
[ 14.  16.  -1.   8.  -5.]
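make_pipeline names each step after its lowercased class, so the fitted pieces stay inspectable afterwards; a small sketch, keeping in mind that the perfect predictions above are on the same five training points and say nothing about generalization:
# Look inside the fitted pipeline: imputed column means and regression coefficients
print(model.named_steps['imputer'].statistics_)
print(model.named_steps['linearregression'].coef_)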