#!/usr/bin/env python
# coding: utf-8

# In[366]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-a "Romell D.Z." -u -d -p numpy,pandas,matplotlib,sklearn')

# # 1. Supervised Learning

# In[367]:

from __future__ import division
import warnings
warnings.simplefilter('ignore')
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12, 6)
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
import numpy as np
import pandas as pd

# In[368]:

df_global = pd.read_csv('global_metrics.csv')
print('Columns in the DataFrame:')
print(df_global.columns)
df_global = df_global.iloc[:, 1:]  # drop the unnamed index column

# ## Variable descriptions of the global statistics

# In[369]:

df_global.info()

# ## Descriptive statistics

# In[370]:

df_global.describe().T.iloc[:, 1:]

# ## Descriptive analysis of life expectancy

# In[371]:

ax = df_global.boxplot('esperanza_vida', by='region', rot=5)
plt.title('Life expectancy by region'); plt.xlabel(''); plt.ylabel('')
plt.savefig('snapshot/global_hope_rate', bbox_inches='tight', dpi=100);

# In[372]:

df_global.boxplot('fertilidad', 'region', rot=5)
plt.title('Fertility by region'); plt.xlabel(''); plt.ylabel('')
plt.savefig('snapshot/global_fertility_rate', bbox_inches='tight', dpi=100);

# In[373]:

y = df_global['esperanza_vida'].values
X = df_global['fertilidad'].values
X.shape, y.shape

# In[374]:

X, y = X.reshape(-1, 1), y.reshape(-1, 1)
X.shape, y.shape

# In[375]:

# Plot the relation between the two variables
plt.scatter(X, y)
plt.title('Relation between fertility & life expectancy')
plt.xlabel('Fertility')
plt.ylabel('Life expectancy')

# In[376]:

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
prediction_space = np.linspace(min(X), max(X)).reshape(-1, 1)
# Train on the full data set
reg.fit(X, y)
y_pred = reg.predict(prediction_space)
print('R^2 score:', reg.score(X, y))
print('Slope: %.4f & Intercept: %.4f' % (reg.coef_[0][0], reg.intercept_[0]))

plt.scatter(X, y, marker='*')
plt.plot(prediction_space, y_pred, marker='+', color='red', linewidth=3)
plt.legend(['%.4f + (%.4f) * F' % (reg.intercept_[0], reg.coef_[0][0])])
plt.margins(.05)
plt.title('Relation between fertility & life expectancy')
plt.xlabel('Fertility')
plt.ylabel('Life expectancy')
plt.savefig('snapshot/global_fertility_hopelife', bbox_inches='tight', dpi=100);

# In[377]:

X = df_global.drop(['fertilidad', 'region'], axis=1)
y = df_global['fertilidad']

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("R^2: %.4f" % reg.score(X_test, y_test))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %.4f" % rmse)

# In[378]:

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(reg, X, y, cv=7)
print('Cross-validation scores using 7-fold:\n', cv_scores)
print("Average 7-fold CV score: %.4f" % np.mean(cv_scores))

# In[379]:

print("Average 10-fold CV score: %.4f" % np.mean(cross_val_score(reg, X, y, cv=10)))

# In[380]:

from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, test_size=0.3)
scores = cross_val_score(reg, X, y, cv=cv)
print('Cross-validation scores using 10-split ShuffleSplit:\n', scores)
print('R^2: %0.2f (+/- %.3f)' % (scores.mean(), scores.std()))
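# The cross-validation cells above report R^2; as a complementary error measure,
# a minimal sketch of a cross-validated RMSE using scikit-learn's
# 'neg_mean_squared_error' scorer (exact values depend on the fold assignment):

neg_mse = cross_val_score(reg, X, y, cv=10, scoring='neg_mean_squared_error')
print("Average 10-fold CV RMSE: %.4f" % np.mean(np.sqrt(-neg_mse)))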
# In[381]:

cv = ShuffleSplit(n_splits=5, test_size=0.25)
scores = cross_val_score(reg, X, y, cv=cv)
print('Cross-validation scores using 5-split ShuffleSplit:\n', scores)
print('R^2: %0.2f (+/- %.3f)' % (scores.mean(), scores.std()))

# In[382]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), reg)
scores = cross_val_score(clf, X, y, cv=cv, scoring='r2')
print('Cross-validation scores using StandardScaler inside a pipeline:\n', scores)
print('R^2: %0.2f (+/- %.3f)' % (scores.mean(), scores.std()))

# In[383]:

# Life expectancy becomes the target: drop it (and the categorical region) from the features
X = df_global.drop(['esperanza_vida', 'region'], axis=1)
y = df_global['esperanza_vida'].values
df_columns = df_global.drop(['esperanza_vida', 'region'], axis=1).columns

# In[384]:

from sklearn.linear_model import Lasso

lasso4 = Lasso(alpha=0.4, normalize=True)
lasso4.fit(X, y)
print("Lasso regression coefficients with alpha=0.4:\n", lasso4.coef_)
lasso2 = Lasso(alpha=0.2, normalize=True)
lasso2.fit(X, y)
print("Lasso regression coefficients with alpha=0.2:\n", lasso2.coef_)

x_axis = range(len(X.columns))
# Plot the coefficients
plt.plot(x_axis, lasso4.coef_)
plt.plot(x_axis, lasso2.coef_)
plt.xticks(x_axis, X.columns.values, rotation=25)
plt.grid(True)
plt.margins(0.02)
plt.title('Coefficients using Lasso regression')
plt.savefig('snapshot/global_lasso_regression', bbox_inches='tight', dpi=100);
plt.show()

# In[385]:

def plotRidgeRegressionScores(alpha_space, cv_scores, cv_scores_std):
    # the scores arrive as plain lists, so convert before doing arithmetic on them
    cv_scores = np.asarray(cv_scores)
    cv_scores_std = np.asarray(cv_scores_std)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alpha_space, cv_scores)
    std_error = cv_scores_std / np.sqrt(10)
    ax.fill_between(alpha_space, cv_scores - std_error, cv_scores + std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_ylim([0, 0.5])
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
    plt.show()

# In[386]:

from sklearn.linear_model import Ridge

alpha_space = np.logspace(-3, 0, 50)
ridge_reg_scores = []
ridge_reg_scores_std = []
ridge = Ridge(normalize=True)
for alpha in alpha_space:
    ridge.alpha = alpha
    ridge_reg_cv_scores = cross_val_score(ridge, X, y, cv=5)
    ridge_reg_scores.append(np.mean(ridge_reg_cv_scores))
    ridge_reg_scores_std.append(np.std(ridge_reg_cv_scores))
plotRidgeRegressionScores(alpha_space, ridge_reg_scores, ridge_reg_scores_std)

# In[387]:

diabetes = pd.read_csv('diabetes.csv').iloc[:, 1:]
X = diabetes.drop(['diabetes'], axis=1)
y = diabetes['diabetes']
X.columns

# In[388]:

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('K-Nearest Neighbors classifier score:', knn.score(X_test, y_test))
print('Classification report:\n', classification_report(y_test, y_pred,
      target_names=['No Diabetes', 'Has Diabetes']))

# In[389]:

# Repeat the split with stratification to preserve the class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('K-Nearest Neighbors classifier score:', knn.score(X_test, y_test))
print('Classification report:\n', classification_report(y_test, y_pred,
      target_names=['No Diabetes', 'Has Diabetes']))

# In[390]:

neighbors = np.arange(1, 15)
train = np.empty(len(neighbors))
test = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train[i] = knn.score(X_train, y_train)
    test[i] = knn.score(X_test, y_test)

plt.title('k-Nearest Neighbors: Varying Number of Neighbors')
plt.plot(neighbors, train, label='Training Accuracy')
plt.plot(neighbors, test, label='Testing Accuracy')
plt.legend(loc='best')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.savefig('snapshot/global_varying_knn_numbers', bbox_inches='tight', dpi=100);

# ### Best number of neighbors: k = 5
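# The choice of k can also be read off programmatically from the accuracy arrays just
# computed; a minimal sketch (the winning k depends on the random train/test split,
# so it may differ from 5 on another run):

best_k = neighbors[np.argmax(test)]
print('Best k on this test split: %d (accuracy %.4f)' % (best_k, test.max()))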
# In[391]:

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(['Random guess', 'ROC AUC score: %.4f' % roc_auc_score(y_test, y_pred_prob)])
plt.title('ROC Curve: Logistic Regression')
plt.savefig('snapshot/global_roc_logistic_regression', bbox_inches='tight', dpi=100);

# In[392]:

cv_auc = cross_val_score(logreg, X, y, cv=10, scoring='roc_auc')
print("Mean AUC over 10 folds: %.4f" % np.mean(cv_auc))

# In[393]:

from sklearn.model_selection import GridSearchCV

c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X_train, y_train)
y_pred = logreg_cv.predict(X_test)
y_pred_prob = logreg_cv.predict_proba(X_test)[:, 1]
acc = logreg_cv.score(X_test, y_test)  # classifier .score() returns accuracy
mse = mean_squared_error(y_test, y_pred)
print("Logistic regression best parameters: %s" % logreg_cv.best_params_)
print("Best CV score: %.4f" % logreg_cv.best_score_)
print("Logistic regression test accuracy: {}".format(acc))
print("Logistic regression MSE: {}".format(mse))
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# In[394]:

from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {"max_depth": [3, 5, 7, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}
tree = DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)
tree_cv.fit(X_train, y_train)
y_pred = tree_cv.predict(X_test)
y_pred_prob = tree_cv.predict_proba(X_test)[:, 1]
acc = tree_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print("Decision tree best parameters: %s" % tree_cv.best_params_)
print("Best CV score: %.4f" % tree_cv.best_score_)
print("DecisionTreeClassifier test accuracy: {}".format(acc))
print("DecisionTreeClassifier MSE: {}".format(mse))
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# In[395]:

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

l1_space = np.linspace(0, 1, 30)  # l1_ratio must lie in [0, 1]
param_grid = {'l1_ratio': l1_space}
elastic_net = ElasticNet()
gm_cv = GridSearchCV(elastic_net, param_grid, cv=5)
gm_cv.fit(X_train, y_train)
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print("Tuned ElasticNet best params: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))

# In[396]:

df_global_dummy = pd.get_dummies(df_global, drop_first=True)  # drops the first category, region_America
df_global_dummy.columns

# In[397]:

# When the region is America, every region_* dummy variable is zero:
df_global_dummy.iloc[2:3, -5:]
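# For contrast, a minimal sketch of the same one-hot encoding without drop_first,
# restricted to the 'region' column: it keeps the redundant region_America column
# that drop_first removes above.

pd.get_dummies(df_global[['region']]).head()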
# In[398]:

X = df_global_dummy.drop('fertilidad', axis=1)
y = df_global_dummy[['fertilidad']]

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.5, normalize=True)
ridge_cv = cross_val_score(ridge, X, y, cv=5)
print(ridge_cv)
print(np.mean(ridge_cv))

# In[399]:

ridge.fit(X, y)
plt.plot(range(len(ridge.coef_[0])), ridge.coef_[0])
plt.xticks(range(len(ridge.coef_[0])), X.columns, rotation=35)
plt.ylabel('Coefficient Values')
plt.title('Coefficients using Ridge regression')
plt.savefig('snapshot/global_ridge_regression', bbox_inches='tight', dpi=100);

# In[400]:

X = diabetes.drop(['diabetes'], axis=1)
y = diabetes['diabetes']

# In[401]:

# Knock out 30 random cells to simulate missing values
for i in range(30):
    idx = np.random.choice(np.arange(len(X)), 1)
    idy = np.random.choice(np.arange(len(X.columns[:-1])), 1)
    X.iloc[idx, idy] = np.nan
X.info()

# In[402]:

X.isnull().sum()

# In[403]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# In[404]:

from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_clean = pd.DataFrame(imp.fit_transform(X), columns=X.columns)

# In[405]:

print('Are there any null values left?')
X_clean.isnull().sum()

# In[406]:

from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
svm = SVC(probability=True)
steps = [('imputer', imp), ('SVM', svm)]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob_svm = pipeline.predict_proba(X_test)[:, 1]
print('Classification report:\n', classification_report(y_test, y_pred))
print("ROC AUC: %.4f" % roc_auc_score(y_test, y_pred_prob_svm))

# In[407]:

steps = [('imputer', imp), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob_knn = pipeline.predict_proba(X_test)[:, 1]
print('Classification report:\n', classification_report(y_test, y_pred))
print("ROC AUC: %.4f" % roc_auc_score(y_test, y_pred_prob_knn))

# In[408]:

steps = [('imputer', imp), ('logreg', LogisticRegression())]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob_lgr = pipeline.predict_proba(X_test)[:, 1]
print('Classification report:\n', classification_report(y_test, y_pred))
print("ROC AUC: %.4f" % roc_auc_score(y_test, y_pred_prob_lgr))

# In[409]:

steps = [('imputer', imp), ('tree', DecisionTreeClassifier(max_depth=5))]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob_dtc = pipeline.predict_proba(X_test)[:, 1]
print('Classification report:\n', classification_report(y_test, y_pred))
print("ROC AUC: %.4f" % roc_auc_score(y_test, y_pred_prob_dtc))

# In[410]:

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('imputer', imp),
                     ('scaler', StandardScaler()),
                     ('knn', KNeighborsClassifier(n_neighbors=5))])
knn_standard_scale = pipeline.fit(X_train, y_train)
y_pred = knn_standard_scale.predict(X_test)
y_pred_prob_knn_ss = knn_standard_scale.predict_proba(X_test)[:, 1]
print('Classification report:\n', classification_report(y_test, y_pred))
print('KNN with 5 neighbors, test accuracy: %.4f' % knn_standard_scale.score(X_test, y_test))
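# Before plotting the ROC curves, a small optional summary of the probability-based
# test AUCs collected above (a sketch; exact values depend on the random split and
# on the randomly injected NaNs):

auc_summary = pd.DataFrame({
    'model': ['SVM', 'KNN', 'Logistic Regression', 'Decision Tree', 'KNN + StandardScaler'],
    'test_auc': [roc_auc_score(y_test, p) for p in
                 (y_pred_prob_svm, y_pred_prob_knn, y_pred_prob_lgr,
                  y_pred_prob_dtc, y_pred_prob_knn_ss)],
}).sort_values('test_auc', ascending=False)
print(auc_summary)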
# In[411]:

from sklearn.metrics import auc

for label, y_pred_prob in zip(["SVM", "KNN", "LGR", "DTC", "KNN_SS"],
                              [y_pred_prob_svm, y_pred_prob_knn, y_pred_prob_lgr,
                               y_pred_prob_dtc, y_pred_prob_knn_ss]):
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=label + ", AUC: %.4f" % auc(fpr, tpr))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.title('Benchmarking ROC curves & AUCs')
plt.savefig('snapshot/global_benchmarking', bbox_inches='tight', dpi=100);

# # KNN Selected

# In[412]:

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

plt.figure()
plt.title("KNN-5 Learning Curve")
plt.xlabel("Training examples")
plt.ylabel("Score")
cv = ShuffleSplit(n_splits=100, test_size=0.2)
train_sizes, train_scores, test_scores = learning_curve(
    knn_standard_scale, X, y, cv=cv, n_jobs=3,
    train_sizes=np.linspace(.1, 1.0, 15))
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
print('train_sizes:', train_sizes)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.legend(loc="lower right")
plt.savefig('snapshot/global_knn6_learning_curve', bbox_inches='tight', dpi=100);

# In[413]:

knn_standard_scale.get_params()

# In[414]:

from sklearn.model_selection import validation_curve

param_range = np.linspace(3, 14, 12, dtype=int)
train_scores, test_scores = validation_curve(
    knn_standard_scale, X, y, param_name="knn__n_neighbors",
    param_range=param_range, cv=cv, scoring="accuracy", n_jobs=3)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("KNN Validation Curve: varying n_neighbors")
plt.xlabel("n neighbors")
plt.ylabel("Score")
plt.ylim(.6, 1)
lw = 2
plt.plot(param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="darkorange", lw=lw)
plt.plot(param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="navy", lw=lw)
print(train_scores_mean, train_scores_std)
plt.legend(loc="best")
plt.savefig('snapshot/global_knn6_validation_curve', bbox_inches='tight', dpi=100);

# In[415]:

pipeline = Pipeline([('imputer', imp),
                     ('scaler', StandardScaler()),
                     ('knn', KNeighborsClassifier())])
parameters = {'knn__n_neighbors': [3, 4, 5, 6, 7]}
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print('KNeighbors classifier - classification report:\n', classification_report(y_test, y_pred))
print("Best Parameters: %s" % cv.best_params_)
print("Best Accuracy: %.4f" % cv.best_score_)
print("Test Accuracy: %.4f" % cv.score(X_test, y_test))
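# Beyond the best estimator, GridSearchCV exposes every candidate it evaluated via
# cv_results_; a minimal sketch listing the mean CV accuracy per parameter setting:

for params, mean_score in zip(cv.cv_results_['params'], cv.cv_results_['mean_test_score']):
    print('%s -> mean CV accuracy: %.4f' % (params, mean_score))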
# In[416]:

pipeline = Pipeline([('imputer', imp),
                     ('scaler', StandardScaler()),
                     ('SVM', SVC())])
parameters = {'SVM__C': [1, 10, 100],
              'SVM__gamma': [0.1, 0.01, 0.001]}
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print('Support vector classification - classification report:\n', classification_report(y_test, y_pred))
print("Best Parameters: %s" % cv.best_params_)
print("Best Accuracy: %.4f" % cv.best_score_)
print("Test Accuracy: %.4f" % cv.score(X_test, y_test))

# In[417]:

pipeline = Pipeline([('imputer', imp),
                     ('scaler', StandardScaler()),
                     ('lgr', LogisticRegression())])
parameters = {'lgr__C': [1, 10, 100],
              'lgr__penalty': ['l1', 'l2']}
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print('Logistic regression - classification report:\n', classification_report(y_test, y_pred))
print("Best Parameters: %s" % cv.best_params_)
print("Best Accuracy: %.4f" % cv.best_score_)
print("Test Accuracy: %.4f" % cv.score(X_test, y_test))

# In[418]:

pipeline = Pipeline([('imputer', imp),
                     ('scaler', StandardScaler()),
                     ('tree', DecisionTreeClassifier())])
parameters = {"tree__max_depth": [3, 5, 7, None],
              "tree__max_features": randint(1, 9),
              "tree__min_samples_leaf": randint(1, 9),
              "tree__criterion": ["gini", "entropy"]}
cv = RandomizedSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print('Decision tree classifier - classification report:\n', classification_report(y_test, y_pred))
print("Best Parameters: %s" % cv.best_params_)
print("Best Accuracy: %.4f" % cv.best_score_)
print("Test Accuracy: %.4f" % cv.score(X_test, y_test))

# In[419]:

X = df_global_dummy.drop(['esperanza_vida'], axis=1)
y = df_global_dummy['esperanza_vida']

# In[420]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# In[421]:

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('elasticnet', ElasticNet())])
parameters = {"elasticnet__l1_ratio": np.linspace(0.1, 0.9, 9),
              "elasticnet__normalize": [True, False],
              "elasticnet__selection": ['cyclic', 'random']}
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print("Best Parameters: %s" % cv.best_params_)
print("Best R^2: %.4f" % cv.best_score_)
print("Test R^2: %.4f" % cv.score(X_test, y_test))

# In[422]:

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('lasso', Lasso())])
parameters = {"lasso__alpha": np.linspace(0.001, 0.5, 20),  # alpha must be non-negative
              "lasso__normalize": [True, False],
              "lasso__selection": ['cyclic', 'random']}
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print("Best Parameters: %s" % cv.best_params_)
print("Best R^2: %.4f" % cv.best_score_)
print("Test R^2: %.4f" % cv.score(X_test, y_test))

# In[423]:

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('ridge', Ridge())])
parameters = {"ridge__alpha": np.linspace(0.001, 1, 20),  # alpha must be non-negative
              "ridge__normalize": [True, False]}
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print("Best Parameters: %s" % cv.best_params_)
print("Best R^2: {}".format(cv.best_score_))
print("Test R^2: %.4f" % cv.score(X_test, y_test))

# In[ ]:
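# A final optional check, reusing the mean_squared_error import from earlier: the test
# RMSE of the tuned Ridge pipeline alongside its R^2 (a sketch; values depend on the
# 60/40 split above):

rmse = np.sqrt(mean_squared_error(y_test, cv.predict(X_test)))
print("Test RMSE: %.4f" % rmse)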