import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
import numpy as np
from sklearn.metrics import r2_score
import seaborn as sb
import pylab as pl
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
## Solubility data obtained from book code 'AppliedPredictiveModeling'
## (train/test CSVs each hold 208 binary fingerprints FP001-FP208,
## 20 numeric descriptors, and the 'Target' response column)
train_data = pd.read_csv('../data/solTrain.csv', )
test_data = pd.read_csv('../data/solTest.csv')
print train_data.shape, test_data.shape
print train_data.columns
(951, 229) (316, 229) Index([u'FP001', u'FP002', u'FP003', u'FP004', u'FP005', u'FP006', u'FP007', u'FP008', u'FP009', u'FP010', u'FP011', u'FP012', u'FP013', u'FP014', u'FP015', u'FP016', u'FP017', u'FP018', u'FP019', u'FP020', u'FP021', u'FP022', u'FP023', u'FP024', u'FP025', u'FP026', u'FP027', u'FP028', u'FP029', u'FP030', u'FP031', u'FP032', u'FP033', u'FP034', u'FP035', u'FP036', u'FP037', u'FP038', u'FP039', u'FP040', u'FP041', u'FP042', u'FP043', u'FP044', u'FP045', u'FP046', u'FP047', u'FP048', u'FP049', u'FP050', u'FP051', u'FP052', u'FP053', u'FP054', u'FP055', u'FP056', u'FP057', u'FP058', u'FP059', u'FP060', u'FP061', u'FP062', u'FP063', u'FP064', u'FP065', u'FP066', u'FP067', u'FP068', u'FP069', u'FP070', u'FP071', u'FP072', u'FP073', u'FP074', u'FP075', u'FP076', u'FP077', u'FP078', u'FP079', u'FP080', u'FP081', u'FP082', u'FP083', u'FP084', u'FP085', u'FP086', u'FP087', u'FP088', u'FP089', u'FP090', u'FP091', u'FP092', u'FP093', u'FP094', u'FP095', u'FP096', u'FP097', u'FP098', u'FP099', u'FP100', u'FP101', u'FP102', u'FP103', u'FP104', u'FP105', u'FP106', u'FP107', u'FP108', u'FP109', u'FP110', u'FP111', u'FP112', u'FP113', u'FP114', u'FP115', u'FP116', u'FP117', u'FP118', u'FP119', u'FP120', u'FP121', u'FP122', u'FP123', u'FP124', u'FP125', u'FP126', u'FP127', u'FP128', u'FP129', u'FP130', u'FP131', u'FP132', u'FP133', u'FP134', u'FP135', u'FP136', u'FP137', u'FP138', u'FP139', u'FP140', u'FP141', u'FP142', u'FP143', u'FP144', u'FP145', u'FP146', u'FP147', u'FP148', u'FP149', u'FP150', u'FP151', u'FP152', u'FP153', u'FP154', u'FP155', u'FP156', u'FP157', u'FP158', u'FP159', u'FP160', u'FP161', u'FP162', u'FP163', u'FP164', u'FP165', u'FP166', u'FP167', u'FP168', u'FP169', u'FP170', u'FP171', u'FP172', u'FP173', u'FP174', u'FP175', u'FP176', u'FP177', u'FP178', u'FP179', u'FP180', u'FP181', u'FP182', u'FP183', u'FP184', u'FP185', u'FP186', u'FP187', u'FP188', u'FP189', u'FP190', u'FP191', u'FP192', u'FP193', u'FP194', u'FP195', u'FP196', u'FP197', 
u'FP198', u'FP199', u'FP200', u'FP201', u'FP202', u'FP203', u'FP204', u'FP205', u'FP206', u'FP207', u'FP208', u'MolWeight', u'NumAtoms', u'NumNonHAtoms', u'NumBonds', u'NumNonHBonds', u'NumMultBonds', u'NumRotBonds', u'NumDblBonds', u'NumAromaticBonds', u'NumHydrogen', u'NumCarbon', u'NumNitrogen', u'NumOxygen', u'NumSulfer', u'NumChlorine', u'NumHalogen', u'NumRings', u'HydrophilicFactor', u'SurfaceArea1', u'SurfaceArea2', u'Target'], dtype=object)
## preprocess data
## leave alone binary data
## box-cox transform, center, scale numeric data (integers and floats)
binary_features = [f for f in train_data.columns if f.startswith('FP')]
target_features = ['Target']
## NOTE(review): '-' set-difference on a pandas Index is long deprecated and
## removed in modern pandas -- switch to Index.difference() when upgrading.
numeric_features = train_data.columns - binary_features - target_features
print numeric_features
Index([u'HydrophilicFactor', u'MolWeight', u'NumAromaticBonds', u'NumAtoms', u'NumBonds', u'NumCarbon', u'NumChlorine', u'NumDblBonds', u'NumHalogen', u'NumHydrogen', u'NumMultBonds', u'NumNitrogen', u'NumNonHAtoms', u'NumNonHBonds', u'NumOxygen', u'NumRings', u'NumRotBonds', u'NumSulfer', u'SurfaceArea1', u'SurfaceArea2'], dtype=object)
## Box-Cox Transformation
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew
class BoxCoxTransformation(BaseEstimator, TransformerMixin):
    """Per-column Box-Cox power transform chosen to minimize skewness.

    For each feature column, ``fit`` picks gamma from a small fixed grid so
    that the transformed column -- ``log(x)`` when gamma == 0, otherwise
    ``(x**gamma - 1) / gamma`` -- has the smallest absolute skewness.
    ``transform`` then applies the stored per-column power.

    NOTE(review): assumes strictly positive feature values -- logs and
    negative powers of non-positive entries yield NaN/inf and those candidate
    gammas are skipped via nanargmin. Confirm against the input data.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Select one Box-Cox gamma per column of X; stores them in gammas_."""
        X = np.asarray(X)
        n_features = X.shape[1]
        self.gammas_ = np.zeros(n_features)
        for i in range(n_features):
            self.gammas_[i] = self._fit_boxcox(X[:, i])
        return self

    def transform(self, X):
        """Apply the fitted per-column transform; returns a float array."""
        X = np.asarray(X)
        assert X.shape[1] == self.gammas_.shape[0]
        # float copy: writing log/power results into an integer X.copy()
        # would silently truncate them
        XX = X.astype(float)
        for i, gamma in enumerate(self.gammas_):
            if gamma == 0:
                XX[:, i] = np.log(X[:, i])
            else:
                XX[:, i] = (X[:, i] ** gamma - 1.) / gamma
        return XX

    def _fit_boxcox(self, x):
        """Return the candidate gamma whose transform of x is least skewed."""
        gammas = np.array([-3., -2., -1., 0., 0.5, 1.])
        # sentinel worse than any plausible |skew| so failed candidates lose
        scores = np.full_like(gammas, 100.)
        for i, gamma in enumerate(gammas):
            with np.errstate(all='ignore'):  # non-positive x yields nan/inf
                xx = np.log(x) if gamma == 0 else (x ** gamma - 1.) / gamma
            try:
                scores[i] = skew(xx)
            # was `except Exception, ex` -- Python-2-only syntax that is a
            # SyntaxError on Python 3
            except Exception:
                pass
        # nanargmin ignores candidates whose skewness came out NaN
        return gammas[np.nanargmin(np.abs(scores))]
## Box-Cox then standardize the numeric descriptors only; the binary
## fingerprint columns pass through untouched and are column-stacked (np.c_)
## back in front of the transformed block.
preprocesser = Pipeline(steps = [
('boxcox', BoxCoxTransformation()),
('standarizer', StandardScaler())
])
train_X = np.c_[np.asarray(train_data.loc[:, binary_features]),
preprocesser.fit_transform(np.asarray(train_data.loc[:, numeric_features]))]
## test set reuses the transform fitted on train (no refit -> no leakage)
test_X = np.c_[np.asarray(test_data.loc[:, binary_features]),
preprocesser.transform(np.asarray(test_data.loc[:, numeric_features]))]
train_y = np.asarray(train_data.loc[:, target_features]).ravel()
test_y = np.asarray(test_data.loc[:, target_features]).ravel()
feature_names = list(binary_features) + list(numeric_features)
print train_X.shape, test_X.shape, train_y.shape, test_y.shape
print preprocesser.named_steps['boxcox'].gammas_
(951, 228) (316, 228) (951,) (316,) [ 1. 0. 0.5 0. 0. 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0. 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
## IN CASE YOU ARE SKEPTICAL
## OUTPUT SKEWNESS and OUTLIERS of the raw target: the selected gamma of 1.0
## (printed below) means the target itself needs no power transform.
print BoxCoxTransformation()._fit_boxcox(train_data[target_features])
sb.kdeplot(train_data[target_features])
pl.figure()
sb.boxplot(train_data[target_features])
1.0
<matplotlib.axes.AxesSubplot at 0x113666890>
## test collinearity in the transformed numeric block (columns after the
## 208 binary fingerprints) by corrplot, PCA scree plot, and boxplots
pl.figure(figsize = (25, 25))
## NOTE(review): sb.corrplot was removed in later seaborn -- use sb.heatmap
## of a correlation matrix when upgrading.
sb.corrplot(train_X[:, len(binary_features):], names = numeric_features)
<matplotlib.axes.AxesSubplot at 0x11365db10>
pca = PCA()
pca.fit(train_X[:, len(binary_features):])
## scree plot: explained variance ratio per principal component
R2 = pd.DataFrame(pca.explained_variance_ratio_,
index=['pca%i' % i for i in range(len(numeric_features))])
R2.plot(kind = 'bar', figsize=(50, 20))
<matplotlib.axes.AxesSubplot at 0x1145e1250>
## any outliers?
pl.figure(figsize=(16, 16))
sb.boxplot(train_X[:, len(binary_features):], names=numeric_features)
<matplotlib.axes.AxesSubplot at 0x1149f9b50>
## Baseline model: PCA dimensionality reduction feeding ordinary least
## squares, with the number of components tuned by 10-fold CV.
clf = Pipeline(steps = [
('pca', PCA()),
('lr', LinearRegression())
])
params = {'pca__n_components': [2, 5, 10, 15, 20, 30, 40, 50]}
gs = GridSearchCV(clf, params, cv = 10, n_jobs=1)
gs.fit(train_X, train_y)
print gs.best_params_
## NOTE(review): the train-set score overstates fit; the test-set score on
## the following line is the honest number.
print gs.score(train_X, train_y)
print gs.score(test_X, test_y)
{'pca__n_components': 50} 0.894187857392 0.859656453287
## visualize target vs the two leading principal components
XX = PCA(n_components=50).fit_transform(train_X)
sb.regplot(XX[:, 0], train_y, )
sb.regplot(XX[:, 1], train_y, )
## PLS DOESN'T SUPPORT A PIPELINE DUE TO ITS ASSUMPTION ON Y SHAPE,
## AND IT IS NOT SUPPOSED TO BE USED THAT WAY - IT'S A LINEAR REGRESSOR ITSELF.
## NOTE(review): this cell fails (see the ValueError traceback below):
## np.asarray([train_y]) has shape (1, n_samples) while GridSearchCV expects
## one row per sample. The working PLS fit is in the next cell.
clf = Pipeline(steps = [
('pls', PLSRegression()),
('lr', LinearRegression())
])
params = {'pls__n_components': [2, 5, 10, 15, 20, 30, 40, 50]}
gs = GridSearchCV(clf, params, cv = 3, n_jobs=1)
gs.fit(train_X, np.asarray([train_y]))
print gs.best_params_
print gs.score(train_X, train_y)
print gs.score(test_X, test_y)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-12-150eaaa37b78> in <module>() 8 9 gs = GridSearchCV(clf, params, cv = 3, n_jobs=1) ---> 10 gs.fit(train_X, np.asarray([train_y])) 11 print gs.best_params_ 12 print gs.score(train_X, train_y) /Library/Python/2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y, **params) 705 " The params argument will be removed in 0.15.", 706 DeprecationWarning) --> 707 return self._fit(X, y, ParameterGrid(self.param_grid)) 708 709 /Library/Python/2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable) 461 462 n_samples = _num_samples(X) --> 463 X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr') 464 465 self.scorer_ = _deprecate_loss_and_score_funcs( /Library/Python/2.7/site-packages/sklearn/utils/validation.pyc in check_arrays(*arrays, **options) 209 if size != n_samples: 210 raise ValueError("Found array with dim %d. Expected %d" --> 211 % (size, n_samples)) 212 213 if not allow_lists or hasattr(array, "shape"): ValueError: Found array with dim 1. Expected 951
## Use PLSRegression directly (it is a regressor by itself); tune the number
## of latent components with 10-fold CV.
clf = PLSRegression()
params = {'n_components': [2, 5, 10, 15, 20, 30, 40, 50]}
gs = GridSearchCV(clf, params, cv = 10, n_jobs=1)
gs.fit(train_X, train_y)
print gs.best_params_
print gs.score(train_X, train_y)
print gs.score(test_X, test_y)
{'n_components': 15} 0.930483998989 0.880719668867
## target vs the first two PLS latent components
XX = gs.transform(train_X)
sb.regplot(XX[:, 0], train_y, )
sb.regplot(XX[:, 1], train_y, )
## Ridge regression with the L2 penalty strength chosen by 5-fold CV.
clf = RidgeCV(alphas = [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10], cv = 5)
clf.fit(train_X, train_y)
print clf.alpha_
print clf.score(test_X, test_y)
3.0 0.885287787989
## NOTE(review): the L2 penalty shrinks but never zeroes coefficients, so
## this "selection" keeps all 228 features (confirmed by the output below).
selected = [i for i, f in enumerate(clf.coef_) if abs(f) > 0]
print len(selected), 'features selected'
feature_importances = pd.DataFrame(clf.coef_[selected],
index = np.array(feature_names)[selected])
feature_importances.plot(kind = 'bar', figsize = (16, 16))
228 features selected
<matplotlib.axes.AxesSubplot at 0x119a7cd90>
## top-19 feature names by absolute ridge coefficient
np.asarray(feature_names)[np.abs(clf.coef_).argsort()[:-20:-1]]
array(['NumAromaticBonds', 'NumNonHBonds', 'NumOxygen', 'NumCarbon', 'SurfaceArea1', 'FP154', 'MolWeight', 'FP142', 'FP111', 'FP109', 'FP127', 'FP164', 'FP173', 'FP076', 'FP130', 'FP171', 'FP184', 'FP176', 'NumChlorine'], dtype='|S17')
## Lasso: the L1 penalty performs genuine feature selection (103 of 228 kept).
clf = LassoCV(cv = 5, )
clf.fit(train_X, train_y)
print clf.alpha_
print clf.score(test_X, test_y)
0.00357773977666 0.885057776694
selected = [i for i, f in enumerate(clf.coef_) if abs(f) > 0]
print len(selected), 'features selected'
feature_importances = pd.DataFrame(clf.coef_[selected],
index = np.array(feature_names)[selected])
feature_importances.plot(kind = 'bar', figsize = (16, 16))
103 features selected
<matplotlib.axes.AxesSubplot at 0x11954df10>
## top-19 feature names by absolute lasso coefficient
np.asarray(feature_names)[np.abs(clf.coef_).argsort()[:-20:-1]]
array(['SurfaceArea1', 'NumNonHBonds', 'NumCarbon', 'FP142', 'MolWeight', 'FP002', 'FP127', 'FP039', 'NumOxygen', 'NumAromaticBonds', 'FP154', 'FP111', 'FP083', 'FP164', 'FP124', 'FP202', 'FP099', 'FP173', 'FP053'], dtype='|S17')
## Nonlinear baseline: extremely randomized trees (100 estimators); also
## yields impurity-based feature importances for comparison with the
## linear models' coefficients.
from sklearn.ensemble import ExtraTreesRegressor
trees = ExtraTreesRegressor(100, max_features='auto', )
trees.fit(train_X, train_y)
print trees.score(test_X, test_y)
0.899405478203
feature_importances = trees.feature_importances_
## top-39 feature names by tree importance
np.asarray(feature_names)[feature_importances.argsort()[:-40:-1]]
array(['FP076', 'NumCarbon', 'MolWeight', 'NumNonHAtoms', 'NumNonHBonds', 'SurfaceArea1', 'SurfaceArea2', 'FP072', 'NumBonds', 'NumAromaticBonds', 'FP009', 'FP092', 'NumHalogen', 'FP044', 'NumChlorine', 'FP015', 'FP075', 'FP116', 'NumAtoms', 'NumOxygen', 'FP059', 'HydrophilicFactor', 'FP063', 'FP070', 'NumRotBonds', 'NumMultBonds', 'FP105', 'FP141', 'FP161', 'FP112', 'NumHydrogen', 'NumNitrogen', 'FP096', 'FP178', 'FP068', 'FP013', 'FP082', 'FP006', 'FP168'], dtype='|S17')
## bar chart of all tree importances, then eyeball the top predictors:
## Target split by the FP076 fingerprint, and Target vs MolWeight
pd.DataFrame(feature_importances, index=feature_names).plot(kind='bar', figsize = (32, 32))
<matplotlib.axes.AxesSubplot at 0x11a81d110>
sb.boxplot(train_data.Target, groupby=train_data.FP076, )
<matplotlib.axes.AxesSubplot at 0x11d0b9910>
sb.lmplot(x = 'MolWeight', y = 'Target', data = train_data)
## slight outliers in y
## RBF-kernel support vector regression; tune gamma by 10-fold CV
## (C and epsilon left at their defaults).
from sklearn.svm import SVR
svr = SVR()
gammas = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]
gs = GridSearchCV(svr, {'gamma': gammas}, cv = 10)
gs.fit(train_X, train_y)
## NOTE(review): best_score_ prints as -1.80 while the refit test R^2 below
## is 0.90 -- verify which scorer this sklearn version applied here.
print gs.best_score_
print gs.best_params_
-1.80315685695 {'gamma': 0.01}
## refit a standalone SVR with the chosen gamma and report test R^2
svr.set_params(gamma = 0.01)
svr.fit(train_X, train_y)
print svr.score(test_X, test_y)
0.903302338807