from sklearn import datasets
import numpy as np
import pandas as pd
import bokeh
from bokeh.plotting import output_notebook
from sklearn.model_selection import train_test_split
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku
output_notebook()
irisDf = pd.read_excel('/home/anand/DataScientist/data/titanic3.xls')  # despite the dataframe name, this is the Titanic passenger data, not iris
irisDf.head()
|   | pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
| 4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
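Before touching the features it is worth checking column dtypes and missing values, since both drive the failures later in this notebook. A minimal inspection sketch on the same dataframe:

irisDf.dtypes          # object columns (name, sex, ticket, cabin, embarked, boat, home.dest) need encoding before modelling
irisDf.isnull().sum()  # cabin, boat and body already show NaN in the preview above; age also has gaps in this dataset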
target = irisDf.survived
copy_df = irisDf.copy(deep=True)
copy_df.drop('survived', axis=1, inplace=True)
copy_df['cabin'].unique()
array(['B5', 'C22 C26', 'E12', 'D7', 'A36', 'C101', nan, 'C62 C64', 'B35', 'A23', 'B58 B60', 'D15', 'C6', 'D35', 'C148', 'C97', 'B49', 'C99', 'C52', 'T', 'A31', 'C7', 'C103', 'D22', 'E33', 'A21', 'B10', 'B4', 'E40', 'B38', 'E24', 'B51 B53 B55', 'B96 B98', 'C46', 'E31', 'E8', 'B61', 'B77', 'A9', 'C89', 'A14', 'E58', 'E49', 'E52', 'E45', 'B22', 'B26', 'C85', 'E17', 'B71', 'B20', 'A34', 'C86', 'A16', 'A20', 'A18', 'C54', 'C45', 'D20', 'A29', 'C95', 'E25', 'C111', 'C23 C25 C27', 'E36', 'D34', 'D40', 'B39', 'B41', 'B102', 'C123', 'E63', 'C130', 'B86', 'C92', 'A5', 'C51', 'B42', 'C91', 'C125', 'D10 D12', 'B82 B84', 'E50', 'D33', 'C83', 'B94', 'D49', 'D45', 'B69', 'B11', 'E46', 'C39', 'B18', 'D11', 'C93', 'B28', 'C49', 'B52 B54 B56', 'E60', 'C132', 'B37', 'D21', 'D19', 'C124', 'D17', 'B101', 'D28', 'D6', 'D9', 'B80', 'C106', 'B79', 'C47', 'D30', 'C90', 'E38', 'C78', 'C30', 'C118', 'D36', 'D48', 'D47', 'C105', 'B36', 'B30', 'D43', 'B24', 'C2', 'C65', 'B73', 'C104', 'C110', 'C50', 'B3', 'A24', 'A32', 'A11', 'A10', 'B57 B59 B63 B66', 'C28', 'E44', 'A26', 'A6', 'A7', 'C31', 'A19', 'B45', 'E34', 'B78', 'B50', 'C87', 'C116', 'C55 C57', 'D50', 'E68', 'E67', 'C126', 'C68', 'C70', 'C53', 'B19', 'D46', 'D37', 'D26', 'C32', 'C80', 'C82', 'C128', 'E39 E41', 'D', 'F4', 'D56', 'F33', 'E101', 'E77', 'F2', 'D38', 'F', 'F G63', 'F E57', 'F E46', 'F G73', 'E121', 'F E69', 'E10', 'G6', 'F38'], dtype=object)
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit_transform(['yes', 'no', 'no', 'yes'])
array([[1], [0], [0], [1]])
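LabelBinarizer covers the two-category case above; for columns with more than two categories, such as embarked, LabelEncoder (or one-hot encoding) is the usual counterpart. A small sketch, where the fillna placeholder is an arbitrary choice:

le = preprocessing.LabelEncoder()
embarked_codes = le.fit_transform(irisDf['embarked'].fillna('missing'))  # C/Q/S (+ placeholder) -> integer codes
le.classes_                                                              # the original labels, in the order of their codes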
## FEATURE ENGINEERING
copy_df['sex'] = sku.encode_labels(copy_df, 'sex') # Encode gender to number
copy_df['cabin'] = copy_df['cabin'].apply(lambda x: str(x))  # stringify so NaN becomes 'nan' and can be split
copy_df['cabin_class'] = copy_df['cabin'].apply(lambda x: [s for s in x.split() if not s.isdigit()])  # list of cabin codes per passenger
copy_df.head(2)
|   | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | cabin_class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Allen, Miss. Elisabeth Walton | 0 | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO | [B5] |
| 1 | 1 | Allison, Master. Hudson Trevor | 1 | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON | [C22, C26] |
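The new cabin_class column holds Python lists, and that is exactly what sklearn's input validation chokes on below ("setting an array element with a sequence"). One hedged way to turn it into numeric features, not applied here so the runs below are unchanged, is MultiLabelBinarizer:

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
cabin_dummies = pd.DataFrame(mlb.fit_transform(copy_df['cabin_class']),
                             columns=['cabin_' + c for c in mlb.classes_],
                             index=copy_df.index)
# cabin_dummies could then replace the raw cabin / cabin_class columns, e.g.
# pd.concat([copy_df.drop(['cabin', 'cabin_class'], axis=1), cabin_dummies], axis=1)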
X_train, X_test, y_train, y_test = train_test_split(copy_df, target)
# Train the model using the training sets
lin_model = pm.train(X_train, y_train, 'LinearRegression')
print('Coefficients: \n', lin_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((lin_model.predict(X_test) - y_test) ** 2))
# score() returns R^2 (coefficient of determination); 1 is perfect prediction
print('R^2 score: %.2f' % lin_model.score(X_test, y_test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-30d4bda23342> in <module>()
      2 # Train the model using the training sets
----> 3 lin_model = pm.train(X_train, y_train, 'LinearRegression')

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs)
     33         else:
---> 34             model.fit(dataframe, target)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    401         if dtype_numeric and array.dtype.kind == "O":
--> 402             array = array.astype(np.float64)

ValueError: setting an array element with a sequence.
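The ValueError comes from check_array trying to cast the whole dataframe to float64: object columns such as name, ticket, cabin, embarked, boat and home.dest, and especially the list-valued cabin_class, cannot be converted (and the NaNs in age and body would be the next obstacle). A minimal sketch of keeping only numeric columns before retraining; the column list and the median imputation are illustrative choices:

numeric_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']       # 'sex' is numeric after encode_labels above
X_num = copy_df[numeric_cols].fillna(copy_df[numeric_cols].median())    # crude median imputation, enough to satisfy check_array
Xn_train, Xn_test, yn_train, yn_test = train_test_split(X_num, target)
lin_num = pm.train(Xn_train, yn_train, 'LinearRegression')              # with an all-numeric, NaN-free matrix the fit should go through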
# Train the model using the training sets
log_model = pm.train(X_train, y_train, 'LogisticRegression')
#print('Coefficients: \n', log_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((log_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % log_model.score(X_test, y_test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-a6973d76ea01> in <module>()
      1 # Train the model using the training sets
----> 2 log_model = pm.train(X_train, y_train, 'LogisticRegression')

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs)
     33         else:
---> 34             model.fit(dataframe, target)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    381         else:
--> 382             array = np.array(array, dtype=dtype, order=order, copy=copy)

ValueError: setting an array element with a sequence.
# Train the model using the training sets
rf_model = pm.train(X_train, y_train, 'randomForest')
#print('Coefficients: \n', rf_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((rf_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % rf_model.score(X_test, y_test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-7851e5fc266c> in <module>()
      1 # Train the model using the training sets
----> 2 rf_model = pm.train(X_train, y_train, 'randomForest')

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs)
     33         else:
---> 34             model.fit(dataframe, target)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)

ValueError: setting an array element with a sequence.
# Train the model using the training sets
sgd_model = pm.train(X_train, y_train, 'sgd')
# (pm.train already fits the model, so a separate fit call is not needed)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((sgd_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % sgd_model.score(X_test, y_test))
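If the feature matrix is made numeric (as sketched earlier), the SGD-based model is the one that benefits most from feature scaling, since stochastic gradient descent is sensitive to feature scale. A minimal sketch with StandardScaler, assuming a numeric X_train/X_test:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on the training split only
X_test_scaled = scaler.transform(X_test)         # reuse the same mean/std for the test split
sgd_scaled = pm.train(X_train_scaled, y_train, 'sgd')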
# Train the model using the training sets
xgb_model = pm.train(X_train, y_train, 'xgboost')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((xgb_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % xgb_model.score(X_test, y_test))
# Train the model using the training sets
svm_model = pm.train(X_train, y_train, 'svm')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((svm_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % svm_model.score(X_test, y_test))
# Train the model using the training sets
bnb_model = pm.train(X_train, y_train, 'bernoulliNB')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((bnb_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % bnb_model.score(X_test, y_test))
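BernoulliNB models binary features; continuous columns such as age and fare are binarized internally using its binarize threshold, which defaults to 0.0, so it is worth being explicit about that choice. A small sketch against sklearn directly, assuming a numeric feature matrix:

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(binarize=0.0)                  # every feature value > 0 is treated as "present"
bnb.fit(X_train, y_train)
print('Accuracy: %.2f' % bnb.score(X_test, y_test))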
# Train the model using the training sets
knn_model = pm.train(X_train, y_train, 'knn')
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((knn_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % knn_model.score(X_test, y_test))
# Train the model using the training sets
kde_model = pm.train(X_train, y_train, 'kde')
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((kde_model.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % kde_model.score(X_test, y_test))
# Train the model using the training sets
mnb_model = pm.train(X_train, y_train, 'multinomialNB')
print('Coefficients: \n', mnb_model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((mnb_model.predict(X_test) - y_test) ** 2))
# For classifiers, score() returns mean accuracy; 1 is perfect prediction
print('Accuracy: %.2f' % mnb_model.score(X_test, y_test))
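All of the cells above repeat the same train / error / score pattern by hand. Once the features are numeric, the comparison can be written as a single loop over the model names this notebook passes to pm.train, reporting accuracy (which is what score() returns for classifiers; multinomialNB additionally expects non-negative feature values). A sketch, assuming a cleaned numeric X_train/X_test:

from sklearn.metrics import accuracy_score
for model_type in ['LogisticRegression', 'randomForest', 'sgd', 'xgboost',
                   'svm', 'bernoulliNB', 'knn', 'multinomialNB']:
    model = pm.train(X_train, y_train, model_type)          # pm.train fits the model before returning it
    preds = model.predict(X_test)
    print('%s accuracy: %.3f' % (model_type, accuracy_score(y_test, preds)))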