#Necesario para que los plots de matplotlib aparezcan en el notebook
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
carpeta_datos="G:/Mi unidad/DOCTORADO/Docencia/Curso_2018-2019/IN_2018-2019/data/"
#True si cada variable categórica se convierte en varias binarias (tantas como categorías),
#False si solo se convierte la categórica a numérica (ordinal)
binarizar = False
'''
devuelve un DataFrame, los valores perdidos notados como '?' se convierten a NaN,
si no, se consideraría '?' como una categoría más
'''
if not binarizar:
adult_orig = pd.read_csv(carpeta_datos+'adult.csv')
else:
adult_orig = pd.read_csv(carpeta_datos+'adult.csv',na_values="?")
print("------ Lista de características y tipos (object=categórica)")
print(adult_orig.dtypes,"\n")
print("------ Distribución de datos en la característica 'workclass'")
print(adult_orig['workclass'].value_counts(),"\n")
print("------ Y en la clase")
print(adult_orig['class'].value_counts(),"\n")
------ Lista de características y tipos (object=categórica) age int64 workclass object fnlwgt int64 education object education-num int64 marital-status object occupation object relationship object race object sex object capital-gain int64 capital-loss int64 hours-per-week int64 native-country object class object dtype: object ------ Distribución de datos en la característica 'workclass' Private 22696 Self-emp-not-inc 2541 Local-gov 2093 ? 1836 State-gov 1298 Self-emp-inc 1116 Federal-gov 960 Without-pay 14 Never-worked 7 Name: workclass, dtype: int64 ------ Y en la clase <=50K 24720 >50K 7841 Name: class, dtype: int64
adult_orig.all
<bound method DataFrame.all of age workclass fnlwgt education education-num \ 0 39 State-gov 77516 Bachelors 13 1 50 Self-emp-not-inc 83311 Bachelors 13 2 38 Private 215646 HS-grad 9 3 53 Private 234721 11th 7 4 28 Private 338409 Bachelors 13 5 37 Private 284582 Masters 14 6 49 Private 160187 9th 5 7 52 Self-emp-not-inc 209642 HS-grad 9 8 31 Private 45781 Masters 14 9 42 Private 159449 Bachelors 13 10 37 Private 280464 Some-college 10 11 30 State-gov 141297 Bachelors 13 12 23 Private 122272 Bachelors 13 13 32 Private 205019 Assoc-acdm 12 14 40 Private 121772 Assoc-voc 11 15 34 Private 245487 7th-8th 4 16 25 Self-emp-not-inc 176756 HS-grad 9 17 32 Private 186824 HS-grad 9 18 38 Private 28887 11th 7 19 43 Self-emp-not-inc 292175 Masters 14 20 40 Private 193524 Doctorate 16 21 54 Private 302146 HS-grad 9 22 35 Federal-gov 76845 9th 5 23 43 Private 117037 11th 7 24 59 Private 109015 HS-grad 9 25 56 Local-gov 216851 Bachelors 13 26 19 Private 168294 HS-grad 9 27 54 ? 180211 Some-college 10 28 39 Private 367260 HS-grad 9 29 49 Private 193366 HS-grad 9 ... ... ... ... ... ... 32531 30 ? 33811 Bachelors 13 32532 34 Private 204461 Doctorate 16 32533 54 Private 337992 Bachelors 13 32534 37 Private 179137 Some-college 10 32535 22 Private 325033 12th 8 32536 34 Private 160216 Bachelors 13 32537 30 Private 345898 HS-grad 9 32538 38 Private 139180 Bachelors 13 32539 71 ? 287372 Doctorate 16 32540 45 State-gov 252208 HS-grad 9 32541 41 ? 202822 HS-grad 9 32542 72 ? 129912 HS-grad 9 32543 45 Local-gov 119199 Assoc-acdm 12 32544 31 Private 199655 Masters 14 32545 39 Local-gov 111499 Assoc-acdm 12 32546 37 Private 198216 Assoc-acdm 12 32547 43 Private 260761 HS-grad 9 32548 65 Self-emp-not-inc 99359 Prof-school 15 32549 43 State-gov 255835 Some-college 10 32550 43 Self-emp-not-inc 27242 Some-college 10 32551 32 Private 34066 10th 6 32552 43 Private 84661 Assoc-voc 11 32553 32 Private 116138 Masters 14 32554 53 Private 321865 Masters 14 32555 22 Private 310152 Some-college 10 32556 27 Private 257302 Assoc-acdm 12 32557 40 Private 154374 HS-grad 9 32558 58 Private 151910 HS-grad 9 32559 22 Private 201490 HS-grad 9 32560 52 Self-emp-inc 287927 HS-grad 9 marital-status occupation relationship \ 0 Never-married Adm-clerical Not-in-family 1 Married-civ-spouse Exec-managerial Husband 2 Divorced Handlers-cleaners Not-in-family 3 Married-civ-spouse Handlers-cleaners Husband 4 Married-civ-spouse Prof-specialty Wife 5 Married-civ-spouse Exec-managerial Wife 6 Married-spouse-absent Other-service Not-in-family 7 Married-civ-spouse Exec-managerial Husband 8 Never-married Prof-specialty Not-in-family 9 Married-civ-spouse Exec-managerial Husband 10 Married-civ-spouse Exec-managerial Husband 11 Married-civ-spouse Prof-specialty Husband 12 Never-married Adm-clerical Own-child 13 Never-married Sales Not-in-family 14 Married-civ-spouse Craft-repair Husband 15 Married-civ-spouse Transport-moving Husband 16 Never-married Farming-fishing Own-child 17 Never-married Machine-op-inspct Unmarried 18 Married-civ-spouse Sales Husband 19 Divorced Exec-managerial Unmarried 20 Married-civ-spouse Prof-specialty Husband 21 Separated Other-service Unmarried 22 Married-civ-spouse Farming-fishing Husband 23 Married-civ-spouse Transport-moving Husband 24 Divorced Tech-support Unmarried 25 Married-civ-spouse Tech-support Husband 26 Never-married Craft-repair Own-child 27 Married-civ-spouse ? Husband 28 Divorced Exec-managerial Not-in-family 29 Married-civ-spouse Craft-repair Husband ... ... ... ... 32531 Never-married ? Not-in-family 32532 Married-civ-spouse Prof-specialty Husband 32533 Married-civ-spouse Exec-managerial Husband 32534 Divorced Adm-clerical Unmarried 32535 Never-married Protective-serv Own-child 32536 Never-married Exec-managerial Not-in-family 32537 Never-married Craft-repair Not-in-family 32538 Divorced Prof-specialty Unmarried 32539 Married-civ-spouse ? Husband 32540 Separated Adm-clerical Own-child 32541 Separated ? Not-in-family 32542 Married-civ-spouse ? Husband 32543 Divorced Prof-specialty Unmarried 32544 Divorced Other-service Not-in-family 32545 Married-civ-spouse Adm-clerical Wife 32546 Divorced Tech-support Not-in-family 32547 Married-civ-spouse Machine-op-inspct Husband 32548 Never-married Prof-specialty Not-in-family 32549 Divorced Adm-clerical Other-relative 32550 Married-civ-spouse Craft-repair Husband 32551 Married-civ-spouse Handlers-cleaners Husband 32552 Married-civ-spouse Sales Husband 32553 Never-married Tech-support Not-in-family 32554 Married-civ-spouse Exec-managerial Husband 32555 Never-married Protective-serv Not-in-family 32556 Married-civ-spouse Tech-support Wife 32557 Married-civ-spouse Machine-op-inspct Husband 32558 Widowed Adm-clerical Unmarried 32559 Never-married Adm-clerical Own-child 32560 Married-civ-spouse Exec-managerial Wife race sex capital-gain capital-loss \ 0 White Male 2174 0 1 White Male 0 0 2 White Male 0 0 3 Black Male 0 0 4 Black Female 0 0 5 White Female 0 0 6 Black Female 0 0 7 White Male 0 0 8 White Female 14084 0 9 White Male 5178 0 10 Black Male 0 0 11 Asian-Pac-Islander Male 0 0 12 White Female 0 0 13 Black Male 0 0 14 Asian-Pac-Islander Male 0 0 15 Amer-Indian-Eskimo Male 0 0 16 White Male 0 0 17 White Male 0 0 18 White Male 0 0 19 White Female 0 0 20 White Male 0 0 21 Black Female 0 0 22 Black Male 0 0 23 White Male 0 2042 24 White Female 0 0 25 White Male 0 0 26 White Male 0 0 27 Asian-Pac-Islander Male 0 0 28 White Male 0 0 29 White Male 0 0 ... ... ... ... ... 32531 Asian-Pac-Islander Female 0 0 32532 White Male 0 0 32533 Asian-Pac-Islander Male 0 0 32534 White Female 0 0 32535 Black Male 0 0 32536 White Female 0 0 32537 Black Male 0 0 32538 Black Female 15020 0 32539 White Male 0 0 32540 White Female 0 0 32541 Black Female 0 0 32542 White Male 0 0 32543 White Female 0 0 32544 Other Female 0 0 32545 White Female 0 0 32546 White Female 0 0 32547 White Male 0 0 32548 White Male 1086 0 32549 White Female 0 0 32550 White Male 0 0 32551 Amer-Indian-Eskimo Male 0 0 32552 White Male 0 0 32553 Asian-Pac-Islander Male 0 0 32554 White Male 0 0 32555 White Male 0 0 32556 White Female 0 0 32557 White Male 0 0 32558 White Female 0 0 32559 White Male 0 0 32560 White Female 15024 0 hours-per-week native-country class 0 40 United-States <=50K 1 13 United-States <=50K 2 40 United-States <=50K 3 40 United-States <=50K 4 40 Cuba <=50K 5 40 United-States <=50K 6 16 Jamaica <=50K 7 45 United-States >50K 8 50 United-States >50K 9 40 United-States >50K 10 80 United-States >50K 11 40 India >50K 12 30 United-States <=50K 13 50 United-States <=50K 14 40 ? >50K 15 45 Mexico <=50K 16 35 United-States <=50K 17 40 United-States <=50K 18 50 United-States <=50K 19 45 United-States >50K 20 60 United-States >50K 21 20 United-States <=50K 22 40 United-States <=50K 23 40 United-States <=50K 24 40 United-States <=50K 25 40 United-States >50K 26 40 United-States <=50K 27 60 South >50K 28 80 United-States <=50K 29 40 United-States <=50K ... ... ... ... 32531 99 United-States <=50K 32532 60 United-States >50K 32533 50 Japan >50K 32534 39 United-States <=50K 32535 35 United-States <=50K 32536 55 United-States >50K 32537 46 United-States <=50K 32538 45 United-States >50K 32539 10 United-States >50K 32540 40 United-States <=50K 32541 32 United-States <=50K 32542 25 United-States <=50K 32543 48 United-States <=50K 32544 30 United-States <=50K 32545 20 United-States >50K 32546 40 United-States <=50K 32547 40 Mexico <=50K 32548 60 United-States <=50K 32549 40 United-States <=50K 32550 50 United-States <=50K 32551 40 United-States <=50K 32552 45 United-States <=50K 32553 11 Taiwan <=50K 32554 40 United-States >50K 32555 40 United-States <=50K 32556 38 United-States <=50K 32557 40 United-States >50K 32558 40 United-States <=50K 32559 20 United-States <=50K 32560 40 United-States >50K [32561 rows x 15 columns]>
adult_orig.columns
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class'], dtype='object')
#Visualización del paquete pandas: https://pandas.pydata.org/pandas-docs/stable/visualization.html
#Algunos ejemplos
adult_orig.plot.hist(x='age', y='hours-per-week',bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x16b0b22db00>
adult_orig.boxplot(showfliers=False)
adult_orig.boxplot(column=['hours-per-week'])
<matplotlib.axes._subplots.AxesSubplot at 0x2714788d518>
adult_orig.boxplot(column='hours-per-week', by='sex')
<matplotlib.axes._subplots.AxesSubplot at 0x27147882b00>
#'''
# gráfico de barras horizontales con la proporción de cada clase
plt.figure(1)
plt.clf()
import seaborn as sns
ax = sns.countplot(y="class", data=adult_orig, color="c");
ncount = adult_orig.shape[0]
for p in ax.patches:
val_x=p.get_bbox().get_points()[:,0]
val_y=p.get_bbox().get_points()[1,1]
ax.annotate('{:.0f} ({:.1f}%)'.format(val_x[1], 100.*val_x[1]/ncount), (val_x.mean(), (val_y-0.4)), ha='center', va='center')
#'''
#'''
print("------ Preparando el scatter matrix...")
plt.figure(2)
plt.clf()
# para scatter matrix, se convierten las variables categóricas a numéricas
adult_int = adult_orig
char_cols = adult_int.dtypes.pipe(lambda x: x[x == 'object']).index #lista de columnas con var. categóticas (las de tipo 'object')
for c in char_cols:
adult_int[c] = pd.factorize(adult_int[c])[0]
lista_vars = list(adult_int)
lista_vars.remove('class') #excluimos la columna 'class' del plot
#se genera el scatter matrix
sns.set()
sns_plot = sns.pairplot(adult_int, vars=lista_vars, hue="class", diag_kind="kde") #en hue indicamos que la columna 'class' define los colores
sns_plot.savefig("adult_scatter_plot.png")
print("")
#'''
------ Preparando el scatter matrix...
C:\Users\elena\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
<Figure size 432x288 with 0 Axes>
'''
si el dataset contiene variables categóricas con cadenas, es necesario convertirlas a numéricas antes de usar 'fit', y para
no hacerlas ordinales, mejor convertirlas a variables binarias con get_dummies
Otras alternativas para convertir las variables categóricas es usar LabelEncoder, One-Hot-Encoding o LabelBinarizer en la matriz numpy (ver más abajo)
Para saber más: http://pbpython.com/categorical-encoding.html
'''
# devuelve una lista de las características categóricas excluyendo la columna 'class' que contiene la clase
lista_categoricas = [x for x in adult_orig.columns if (adult_orig[x].dtype == object and adult_orig[x].name != 'class')]
if not binarizar:
adult = adult_orig
else:
# reemplaza las cateogóricas por binarias
adult = pd.get_dummies(adult_orig, columns=lista_categoricas)
# coloco la columna que contiene la clase como última columna por convención
clase = adult['class']
adult.drop(labels=['class'], axis=1,inplace = True)
adult.insert(len(adult.columns), 'class', clase)
# separamos el DataFrame en dos arrays numpy, uno con las características (X) y otro (y) con la clase
# si la última columna contiene la clase, se puede separar así
X = adult.values[:,0:len(adult.columns)-1]
y = adult.values[:,len(adult.columns)-1]
print("X", X)
print("y", y)
'''
#también se puede separar indicando los nombres de las columnas
columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"]
X = adult[list(columns)].values
y = adult["class"].values
'''
X [[ 39 0 77516 ... 0 40 0] [ 50 1 83311 ... 0 13 0] [ 38 2 215646 ... 0 40 0] ... [ 58 2 151910 ... 0 40 0] [ 22 2 201490 ... 0 20 0] [ 52 6 287927 ... 0 40 0]] y [0 0 0 ... 0 0 1]
'\n#también se puede separar indicando los nombres de las columnas\ncolumns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"]\nX = adult[list(columns)].values\ny = adult["class"].values\n'
'''
Si las variables categóricas tienen muchas categorías, se generarán muchas variables y algunos algoritmos (por ejemplo, SVM) serán
extremadamente lentos. Se puede optar por solo convertirlas a variables numéricas (ordinales) sin binarizar. Esto se haría si no se ha
ejecutado pd.get_dummies() previamente. No funciona si hay valores perdidos notados como NaN
'''
if not binarizar:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for i in range(0,X.shape[1]):
if isinstance(X[0,i],str):
X[:,i] = le.fit_transform(X[:,i])
# validación cruzada, pero sin control de semilla ni particionado estratificado
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree, X, y, cv=5, scoring='accuracy')
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
[0.81191463 0.80912162 0.81342138 0.81772113 0.81342138] Accuracy: 0.81 (+/- 0.01)
'''
Validación cruzada con particionado estratificado y control de la aleatoriedad fijando la semilla
'''
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn import preprocessing
import numpy
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
le = preprocessing.LabelEncoder()
def validacion_cruzada(modelo, X, y, cv):
y_test_all = []
y_prob_all = []
for train, test in cv.split(X, y):
modelo = modelo.fit(X[train],y[train])
y_pred = modelo.predict(X[test])
y_prob = modelo.predict_proba(X[test])[:,1] #la segunda columna es la clase positiva '>50K' en adult
y_test_bin = le.fit_transform(y[test]) #se convierte a binario para AUC: '>50K' -> 1 (clase positiva) y '<=50K' -> 0 en adult
print("Accuracy: {:6.2f}%, F1-score: {:.4f}, G-mean: {:.4f}, AUC: {:.4f}".format(accuracy_score(y[test],y_pred)*100 , f1_score(y[test],y_pred,average='macro'), geometric_mean_score(y[test],y_pred,average='macro'), roc_auc_score(y_test_bin,y_prob)))
y_test_all = numpy.concatenate([y_test_all,y_test_bin])
y_prob_all = numpy.concatenate([y_prob_all,y_prob])
print("")
return modelo, y_test_all, y_prob_all
'''
Dibuja la curva ROC
'''
from sklearn.metrics import roc_curve, auc
def curva_ROC(figura_id,new,y_test,y_prob,nombre):
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figura_id)
if new:
plt.clf()
plt.plot(fpr, tpr, lw=2, label=nombre+' (%0.4f)' % roc_auc) #color='darkorange',
plt.plot([0, 1], [0, 1], color='navy', lw=1.5, linestyle='--')
plt.legend(loc="lower right")
if new:
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.show()
return roc_auc
#'''
print("------ Árbol de decisión...")
from sklearn import tree
arbol = tree.DecisionTreeClassifier(random_state=0, max_depth=10) #podemos limitar a profundidad 5 para generar un árbol legible aunque pierda algo de precisión
arbol, y_test_arbol, y_prob_arbol = validacion_cruzada(arbol,X,y,skf)
#curva_ROC(3,True,y_test_arbol,y_prob_arbol,'Árbol')
------ Árbol de decisión... Accuracy: 85.98%, F1-score: 0.8019, G-mean: 0.7924, AUC: 0.9035 Accuracy: 86.36%, F1-score: 0.8005, G-mean: 0.7822, AUC: 0.8991 Accuracy: 85.61%, F1-score: 0.7891, G-mean: 0.7711, AUC: 0.8996 Accuracy: 84.98%, F1-score: 0.7866, G-mean: 0.7763, AUC: 0.8930 Accuracy: 84.69%, F1-score: 0.7775, G-mean: 0.7622, AUC: 0.8943
'''
Para visualizar el árbol generado, se puede usar graphviz, que debe ser previamente instalado
Por ejemplo, desde Anaconda Navigator: Environments / Seleccionar "Not installed" / Buscar "graphviz" / Marcar + "Apply"
Incluir el directorio "...Anaconda3\pkgs\graphviz-2.38.0-4\Library\bin\graphviz" en las variables de entorno PATH y GRAPHVIZ_DOT (variable nueva que debe crearse)
'''
print("------ Generando una visualización del árbol en 'adult.pdf'...")
import graphviz
feat = list(adult)
feat.remove('class')
dot_data = tree.export_graphviz(arbol, out_file=None, filled=True, feature_names=feat, class_names=['menos_50K', 'mas_50K'], rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render()
#graph.render("adult", view=True) #genera un fichero adult.pdf con el árbol
#'''
------ Generando una visualización del árbol en 'adult.pdf'...
'Source.gv.pdf'
print("------ XGB...")
import xgboost as xgb
clf = xgb.XGBClassifier(n_estimators = 200)
clf, y_test_clf, y_prob_clf = validacion_cruzada(clf,X,y,skf)
curva_ROC(3,False,y_test_clf,y_prob_clf,'XGB')
------ XGB... Accuracy: 87.12%, F1-score: 0.8083, G-mean: 0.7853, AUC: 0.9259 Accuracy: 87.65%, F1-score: 0.8197, G-mean: 0.8007, AUC: 0.9320 Accuracy: 86.99%, F1-score: 0.8086, G-mean: 0.7883, AUC: 0.9252 Accuracy: 86.72%, F1-score: 0.8010, G-mean: 0.7771, AUC: 0.9235 Accuracy: 86.53%, F1-score: 0.7987, G-mean: 0.7754, AUC: 0.9226
0.9257465529502421
'''
Visualizar las características más importantes según la frecuencia con que se usan en los árboles de XGB (sobre el último modelo de la CV)
'''
plt.figure(4)
plt.clf()
features = list(adult)
mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features))
type(clf.get_booster().get_fscore())
ts = pd.Series(clf.get_booster().get_fscore())
ts.index = ts.reset_index()['index'].map(mapFeat)
ax2=ts.sort_values()[-20:].plot(kind="barh", figsize = (8,8), title=("20 características más importantes"), color='orange')
ax2.set_xlabel("importancia")
ax2.set_ylabel("característica")
Text(0,0.5,'característica')