from sklearn import datasets
import numpy as np
import pandas as pd
import bokeh
from bokeh.plotting import output_notebook
from sklearn.model_selection import train_test_split
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku
output_notebook()
irisDf = pd.read_excel('/home/anand/DataScientist/data/titanic3.xls')  # despite the dataframe name, this is the Titanic passenger data, not iris
irisDf.head()
|   | pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
| 4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
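Before touching the features it is worth checking column dtypes and missing values, since both drive the failures later in this notebook. A minimal inspection sketch on the same dataframe:

irisDf.dtypes          # object columns (name, sex, ticket, cabin, embarked, boat, home.dest) need encoding before modelling
irisDf.isnull().sum()  # cabin, boat and body already show NaN in the preview above; age also has gaps in this dataset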
target = irisDf.survived
copy_df = irisDf.copy(deep=True)
copy_df.drop('survived', axis=1, inplace=True)
copy_df['cabin'].unique()
array(['B5', 'C22 C26', 'E12', 'D7', 'A36', 'C101', nan, 'C62 C64', 'B35', 'A23', 'B58 B60', 'D15', 'C6', 'D35', 'C148', 'C97', 'B49', 'C99', 'C52', 'T', 'A31', 'C7', 'C103', 'D22', 'E33', 'A21', 'B10', 'B4', 'E40', 'B38', 'E24', 'B51 B53 B55', 'B96 B98', 'C46', 'E31', 'E8', 'B61', 'B77', 'A9', 'C89', 'A14', 'E58', 'E49', 'E52', 'E45', 'B22', 'B26', 'C85', 'E17', 'B71', 'B20', 'A34', 'C86', 'A16', 'A20', 'A18', 'C54', 'C45', 'D20', 'A29', 'C95', 'E25', 'C111', 'C23 C25 C27', 'E36', 'D34', 'D40', 'B39', 'B41', 'B102', 'C123', 'E63', 'C130', 'B86', 'C92', 'A5', 'C51', 'B42', 'C91', 'C125', 'D10 D12', 'B82 B84', 'E50', 'D33', 'C83', 'B94', 'D49', 'D45', 'B69', 'B11', 'E46', 'C39', 'B18', 'D11', 'C93', 'B28', 'C49', 'B52 B54 B56', 'E60', 'C132', 'B37', 'D21', 'D19', 'C124', 'D17', 'B101', 'D28', 'D6', 'D9', 'B80', 'C106', 'B79', 'C47', 'D30', 'C90', 'E38', 'C78', 'C30', 'C118', 'D36', 'D48', 'D47', 'C105', 'B36', 'B30', 'D43', 'B24', 'C2', 'C65', 'B73', 'C104', 'C110', 'C50', 'B3', 'A24', 'A32', 'A11', 'A10', 'B57 B59 B63 B66', 'C28', 'E44', 'A26', 'A6', 'A7', 'C31', 'A19', 'B45', 'E34', 'B78', 'B50', 'C87', 'C116', 'C55 C57', 'D50', 'E68', 'E67', 'C126', 'C68', 'C70', 'C53', 'B19', 'D46', 'D37', 'D26', 'C32', 'C80', 'C82', 'C128', 'E39 E41', 'D', 'F4', 'D56', 'F33', 'E101', 'E77', 'F2', 'D38', 'F', 'F G63', 'F E57', 'F E46', 'F G73', 'E121', 'F E69', 'E10', 'G6', 'F38'], dtype=object)
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit_transform(['yes', 'no', 'no', 'yes'])
array([[1], [0], [0], [1]])
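LabelBinarizer covers the two-category case above; for columns with more than two categories, such as embarked, LabelEncoder (or one-hot encoding) is the usual counterpart. A small sketch, where the fillna placeholder is an arbitrary choice:

le = preprocessing.LabelEncoder()
embarked_codes = le.fit_transform(irisDf['embarked'].fillna('missing'))  # C/Q/S (+ placeholder) -> integer codes
le.classes_                                                              # the original labels, in the order of their codes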
## FEATURE ENGINEERING
copy_df['sex'] = sku.encode_labels(copy_df, 'sex') # Encode gender to number
copy_df['cabin'] = copy_df['cabin'].apply(lambda x: str(x))  # stringify so NaN becomes 'nan' and can be split
copy_df['cabin_class'] = copy_df['cabin'].apply(lambda x: [s for s in x.split() if not s.isdigit()])  # list of cabin codes per passenger
copy_df.head(2)
|   | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | cabin_class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Allen, Miss. Elisabeth Walton | 0 | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO | [B5] |
| 1 | 1 | Allison, Master. Hudson Trevor | 1 | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON | [C22, C26] |
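The new cabin_class column holds Python lists, and that is exactly what sklearn's input validation chokes on below ("setting an array element with a sequence"). One hedged way to turn it into numeric features, not applied here so the runs below are unchanged, is MultiLabelBinarizer:

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
cabin_dummies = pd.DataFrame(mlb.fit_transform(copy_df['cabin_class']),
                             columns=['cabin_' + c for c in mlb.classes_],
                             index=copy_df.index)
# cabin_dummies could then replace the raw cabin / cabin_class columns, e.g.
# pd.concat([copy_df.drop(['cabin', 'cabin_class'], axis=1), cabin_dummies], axis=1)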
X_train, X_test, y_train, y_test = train_test_split(copy_df, target)
# Train the model using the training sets
lin_model = pm.train(X_train, y_train, 'LinearRegression')
print('Coefficients: \n', lin_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((lin_model.predict(X_test) - y_test) ** 2))
# score() returns R^2 (coefficient of determination); 1 is perfect prediction
print('R^2 score: %.2f' % lin_model.score(X_test, y_test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-30d4bda23342> in <module>()
      2 # Train the model using the training sets
----> 3 lin_model = pm.train(X_train, y_train, 'LinearRegression')

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs)
     33         else:
---> 34             model.fit(dataframe, target)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    401         if dtype_numeric and array.dtype.kind == "O":
--> 402             array = array.astype(np.float64)

ValueError: setting an array element with a sequence.
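The ValueError comes from check_array trying to cast the whole dataframe to float64: object columns such as name, ticket, cabin, embarked, boat and home.dest, and especially the list-valued cabin_class, cannot be converted (and the NaNs in age and body would be the next obstacle). A minimal sketch of keeping only numeric columns before retraining; the column list and the median imputation are illustrative choices:

numeric_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']       # 'sex' is numeric after encode_labels above
X_num = copy_df[numeric_cols].fillna(copy_df[numeric_cols].median())    # crude median imputation, enough to satisfy check_array
Xn_train, Xn_test, yn_train, yn_test = train_test_split(X_num, target)
lin_num = pm.train(Xn_train, yn_train, 'LinearRegression')              # with an all-numeric, NaN-free matrix the fit should go through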
# Train the model using the training sets
log_model = pm.train(X_train, y_train, 'LogisticRegression')
#print('Coefficients: \n', log_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((log_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % log_model.score(X_test, y_test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-a6973d76ea01> in <module>()
      1 # Train the model using the training sets
----> 2 log_model = pm.train(X_train, y_train, 'LogisticRegression')

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs)
     33         else:
---> 34             model.fit(dataframe, target)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    381         else:
--> 382             array = np.array(array, dtype=dtype, order=order, copy=copy)

ValueError: setting an array element with a sequence.
# Train the model using the training sets
rf_model = pm.train(X_train, y_train, 'randomForest')
#print('Coefficients: \n', rf_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((rf_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % rf_model.score(X_test, y_test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-7851e5fc266c> in <module>()
      1 # Train the model using the training sets
----> 2 rf_model = pm.train(X_train, y_train, 'randomForest')

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs)
     33         else:
---> 34             model.fit(dataframe, target)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)

ValueError: setting an array element with a sequence.
# Train the model using the training sets
sgd_model = pm.train(X_train, y_train, 'sgd')
# (pm.train already fits the model, so a separate fit call is not needed)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((sgd_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % sgd_model.score(X_test, y_test))
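If the feature matrix is made numeric (as sketched earlier), the SGD-based model is the one that benefits most from feature scaling, since stochastic gradient descent is sensitive to feature scale. A minimal sketch with StandardScaler, assuming a numeric X_train/X_test:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on the training split only
X_test_scaled = scaler.transform(X_test)         # reuse the same mean/std for the test split
sgd_scaled = pm.train(X_train_scaled, y_train, 'sgd')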
# Train the model using the training sets
xgb_model = pm.train(X_train, y_train, 'xgboost')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((xgb_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % xgb_model.score(X_test, y_test))
# Train the model using the training sets
svm_model = pm.train(X_train, y_train, 'svm')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((svm_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % svm_model.score(X_test, y_test))
# Train the model using the training sets
bnb_model = pm.train(X_train, y_train, 'bernoulliNB')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((bnb_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % bnb_model.score(X_test, y_test))
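BernoulliNB models binary features; continuous columns such as age and fare are binarized internally using its binarize threshold, which defaults to 0.0, so it is worth being explicit about that choice. A small sketch against sklearn directly, assuming a numeric feature matrix:

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(binarize=0.0)                  # every feature value > 0 is treated as "present"
bnb.fit(X_train, y_train)
print('Accuracy: %.2f' % bnb.score(X_test, y_test))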
# Train the model using the training sets
knn_model = pm.train(X_train, y_train, 'knn')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((knn_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % knn_model.score(X_test, y_test))
# Train the model using the training sets
kde_model = pm.train(X_train, y_train, 'kde')
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((kde_model.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % kde_model.score(X_test, y_test))
# Train the model using the training sets
mnb_model = pm.train(X_train, y_train, 'multinomialNB')
print('Coefficients: \n', mnb_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((mnb_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % mnb_model.score(X_test, y_test))
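All of the cells above repeat the same train / error / score pattern by hand. Once the features are numeric, the comparison can be written as a single loop over the model names this notebook passes to pm.train, reporting accuracy (which is what score() returns for classifiers; multinomialNB additionally expects non-negative feature values). A sketch, assuming a cleaned numeric X_train/X_test:

from sklearn.metrics import accuracy_score
for model_type in ['LogisticRegression', 'randomForest', 'sgd', 'xgboost',
                   'svm', 'bernoulliNB', 'knn', 'multinomialNB']:
    model = pm.train(X_train, y_train, model_type)          # pm.train fits the model before returning it
    preds = model.predict(X_test)
    print('%s accuracy: %.3f' % (model_type, accuracy_score(y_test, preds)))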