#Necesario para que los plots de matplotlib aparezcan en el notebook
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# Folder that holds the course data files (machine-specific absolute path).
carpeta_datos = "G:/Mi unidad/DOCTORADO/Docencia/Curso_2018-2019/IN_2018-2019/data/"

# True: each categorical variable is later expanded into several binary columns
# (one per category). False: categoricals are only mapped to numeric (ordinal) codes.
binarizar = False

# read_csv returns a DataFrame. Only when binarizar is set are cells written as
# '?' parsed as NaN; otherwise '?' is kept and would count as one more category.
_opciones_lectura = {'delimiter': ';'}
if binarizar:
    _opciones_lectura['na_values'] = "?"
bank_orig = pd.read_csv(carpeta_datos + 'bank-additional-full.csv', **_opciones_lectura)

# Column dtypes (object means categorical).
print("------ Lista de características y tipos (object=categórica)")
print(bank_orig.dtypes, "\n")

# Frequency of every value of the 'job' feature.
print("------ Distribución de datos en la característica 'job'")
print(bank_orig['job'].value_counts(), "\n")

# Frequency of every value of the class column 'y'.
print("------ Y en la clase")
print(bank_orig['y'].value_counts(), "\n")
------ Lista de características y tipos (object=categórica) age int64 job object marital object education object default object housing object loan object contact object month object day_of_week object duration int64 campaign int64 pdays int64 previous int64 poutcome object emp.var.rate float64 cons.price.idx float64 cons.conf.idx float64 euribor3m float64 nr.employed float64 y object dtype: object ------ Distribución de datos en la característica 'job' admin. 10422 blue-collar 9254 technician 6743 services 3969 management 2924 retired 1720 entrepreneur 1456 self-employed 1421 housemaid 1060 unemployed 1014 student 875 unknown 330 Name: job, dtype: int64 ------ Y en la clase no 36548 yes 4640 Name: y, dtype: int64
# Preview the first and last rows of the DataFrame.
# The previous expression, `bank_orig.all`, only referenced the bound method
# DataFrame.all without calling it; the rows were shown merely as part of that
# method's repr. head()/tail() build the intended preview explicitly.
pd.concat([bank_orig.head(), bank_orig.tail()])
<bound method DataFrame.all of age job marital education default housing \ 0 56 housemaid married basic.4y no no 1 57 services married high.school unknown no 2 37 services married high.school no yes 3 40 admin. married basic.6y no no 4 56 services married high.school no no 5 45 services married basic.9y unknown no 6 59 admin. married professional.course no no 7 41 blue-collar married unknown unknown no 8 24 technician single professional.course no yes 9 25 services single high.school no yes 10 41 blue-collar married unknown unknown no 11 25 services single high.school no yes 12 29 blue-collar single high.school no no 13 57 housemaid divorced basic.4y no yes 14 35 blue-collar married basic.6y no yes 15 54 retired married basic.9y unknown yes 16 35 blue-collar married basic.6y no yes 17 46 blue-collar married basic.6y unknown yes 18 50 blue-collar married basic.9y no yes 19 39 management single basic.9y unknown no 20 30 unemployed married high.school no no 21 55 blue-collar married basic.4y unknown yes 22 55 retired single high.school no yes 23 41 technician single high.school no yes 24 37 admin. married high.school no yes 25 35 technician married university.degree no no 26 59 technician married unknown no yes 27 39 self-employed married basic.9y unknown no 28 54 technician single university.degree unknown no 29 55 unknown married university.degree unknown unknown ... ... ... ... ... ... ... 41158 35 technician divorced basic.4y no no 41159 35 technician divorced basic.4y no yes 41160 33 admin. married university.degree no no 41161 33 admin. married university.degree no yes 41162 60 blue-collar married basic.4y no yes 41163 35 technician divorced basic.4y no yes 41164 54 admin. married professional.course no no 41165 38 housemaid divorced university.degree no no 41166 32 admin. married university.degree no no 41167 32 admin. 
married university.degree no yes 41168 38 entrepreneur married university.degree no no 41169 62 services married high.school no yes 41170 40 management divorced university.degree no yes 41171 33 student married professional.course no yes 41172 31 admin. single university.degree no yes 41173 62 retired married university.degree no yes 41174 62 retired married university.degree no yes 41175 34 student single unknown no yes 41176 38 housemaid divorced high.school no yes 41177 57 retired married professional.course no yes 41178 62 retired married university.degree no no 41179 64 retired divorced professional.course no yes 41180 36 admin. married university.degree no no 41181 37 admin. married university.degree no yes 41182 29 unemployed single basic.4y no yes 41183 73 retired married professional.course no yes 41184 46 blue-collar married professional.course no no 41185 56 retired married university.degree no yes 41186 44 technician married professional.course no no 41187 74 retired married professional.course no yes loan contact month day_of_week ... campaign pdays previous \ 0 no telephone may mon ... 1 999 0 1 no telephone may mon ... 1 999 0 2 no telephone may mon ... 1 999 0 3 no telephone may mon ... 1 999 0 4 yes telephone may mon ... 1 999 0 5 no telephone may mon ... 1 999 0 6 no telephone may mon ... 1 999 0 7 no telephone may mon ... 1 999 0 8 no telephone may mon ... 1 999 0 9 no telephone may mon ... 1 999 0 10 no telephone may mon ... 1 999 0 11 no telephone may mon ... 1 999 0 12 yes telephone may mon ... 1 999 0 13 no telephone may mon ... 1 999 0 14 no telephone may mon ... 1 999 0 15 yes telephone may mon ... 1 999 0 16 no telephone may mon ... 1 999 0 17 yes telephone may mon ... 1 999 0 18 yes telephone may mon ... 1 999 0 19 no telephone may mon ... 1 999 0 20 no telephone may mon ... 1 999 0 21 no telephone may mon ... 1 999 0 22 no telephone may mon ... 1 999 0 23 no telephone may mon ... 1 999 0 24 no telephone may mon ... 
1 999 0 25 yes telephone may mon ... 1 999 0 26 no telephone may mon ... 1 999 0 27 no telephone may mon ... 1 999 0 28 no telephone may mon ... 2 999 0 29 unknown telephone may mon ... 1 999 0 ... ... ... ... ... ... ... ... ... 41158 no cellular nov tue ... 1 999 0 41159 no cellular nov tue ... 1 9 4 41160 no cellular nov tue ... 1 999 0 41161 no cellular nov tue ... 1 999 1 41162 no cellular nov tue ... 2 4 1 41163 no cellular nov tue ... 3 4 2 41164 no cellular nov tue ... 2 10 1 41165 no cellular nov wed ... 2 999 0 41166 no telephone nov wed ... 1 999 1 41167 no cellular nov wed ... 3 999 0 41168 no cellular nov wed ... 2 999 0 41169 no cellular nov wed ... 5 999 0 41170 no cellular nov wed ... 2 999 4 41171 no telephone nov thu ... 1 999 0 41172 no cellular nov thu ... 1 999 0 41173 no cellular nov thu ... 1 999 2 41174 no cellular nov thu ... 1 1 6 41175 no cellular nov thu ... 1 999 2 41176 yes cellular nov thu ... 1 999 0 41177 no cellular nov thu ... 6 999 0 41178 no cellular nov thu ... 2 6 3 41179 no cellular nov fri ... 3 999 0 41180 no cellular nov fri ... 2 999 0 41181 no cellular nov fri ... 1 999 0 41182 no cellular nov fri ... 1 9 1 41183 no cellular nov fri ... 1 999 0 41184 no cellular nov fri ... 1 999 0 41185 no cellular nov fri ... 2 999 0 41186 no cellular nov fri ... 1 999 0 41187 no cellular nov fri ... 
3 999 1 poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m \ 0 nonexistent 1.1 93.994 -36.4 4.857 1 nonexistent 1.1 93.994 -36.4 4.857 2 nonexistent 1.1 93.994 -36.4 4.857 3 nonexistent 1.1 93.994 -36.4 4.857 4 nonexistent 1.1 93.994 -36.4 4.857 5 nonexistent 1.1 93.994 -36.4 4.857 6 nonexistent 1.1 93.994 -36.4 4.857 7 nonexistent 1.1 93.994 -36.4 4.857 8 nonexistent 1.1 93.994 -36.4 4.857 9 nonexistent 1.1 93.994 -36.4 4.857 10 nonexistent 1.1 93.994 -36.4 4.857 11 nonexistent 1.1 93.994 -36.4 4.857 12 nonexistent 1.1 93.994 -36.4 4.857 13 nonexistent 1.1 93.994 -36.4 4.857 14 nonexistent 1.1 93.994 -36.4 4.857 15 nonexistent 1.1 93.994 -36.4 4.857 16 nonexistent 1.1 93.994 -36.4 4.857 17 nonexistent 1.1 93.994 -36.4 4.857 18 nonexistent 1.1 93.994 -36.4 4.857 19 nonexistent 1.1 93.994 -36.4 4.857 20 nonexistent 1.1 93.994 -36.4 4.857 21 nonexistent 1.1 93.994 -36.4 4.857 22 nonexistent 1.1 93.994 -36.4 4.857 23 nonexistent 1.1 93.994 -36.4 4.857 24 nonexistent 1.1 93.994 -36.4 4.857 25 nonexistent 1.1 93.994 -36.4 4.857 26 nonexistent 1.1 93.994 -36.4 4.857 27 nonexistent 1.1 93.994 -36.4 4.857 28 nonexistent 1.1 93.994 -36.4 4.857 29 nonexistent 1.1 93.994 -36.4 4.857 ... ... ... ... ... ... 
41158 nonexistent -1.1 94.767 -50.8 1.035 41159 success -1.1 94.767 -50.8 1.035 41160 nonexistent -1.1 94.767 -50.8 1.035 41161 failure -1.1 94.767 -50.8 1.035 41162 success -1.1 94.767 -50.8 1.035 41163 success -1.1 94.767 -50.8 1.035 41164 success -1.1 94.767 -50.8 1.035 41165 nonexistent -1.1 94.767 -50.8 1.030 41166 failure -1.1 94.767 -50.8 1.030 41167 nonexistent -1.1 94.767 -50.8 1.030 41168 nonexistent -1.1 94.767 -50.8 1.030 41169 nonexistent -1.1 94.767 -50.8 1.030 41170 failure -1.1 94.767 -50.8 1.030 41171 nonexistent -1.1 94.767 -50.8 1.031 41172 nonexistent -1.1 94.767 -50.8 1.031 41173 failure -1.1 94.767 -50.8 1.031 41174 success -1.1 94.767 -50.8 1.031 41175 failure -1.1 94.767 -50.8 1.031 41176 nonexistent -1.1 94.767 -50.8 1.031 41177 nonexistent -1.1 94.767 -50.8 1.031 41178 success -1.1 94.767 -50.8 1.031 41179 nonexistent -1.1 94.767 -50.8 1.028 41180 nonexistent -1.1 94.767 -50.8 1.028 41181 nonexistent -1.1 94.767 -50.8 1.028 41182 success -1.1 94.767 -50.8 1.028 41183 nonexistent -1.1 94.767 -50.8 1.028 41184 nonexistent -1.1 94.767 -50.8 1.028 41185 nonexistent -1.1 94.767 -50.8 1.028 41186 nonexistent -1.1 94.767 -50.8 1.028 41187 failure -1.1 94.767 -50.8 1.028 nr.employed y 0 5191.0 no 1 5191.0 no 2 5191.0 no 3 5191.0 no 4 5191.0 no 5 5191.0 no 6 5191.0 no 7 5191.0 no 8 5191.0 no 9 5191.0 no 10 5191.0 no 11 5191.0 no 12 5191.0 no 13 5191.0 no 14 5191.0 no 15 5191.0 no 16 5191.0 no 17 5191.0 no 18 5191.0 no 19 5191.0 no 20 5191.0 no 21 5191.0 no 22 5191.0 no 23 5191.0 no 24 5191.0 no 25 5191.0 no 26 5191.0 no 27 5191.0 no 28 5191.0 no 29 5191.0 no ... ... ... 
41158 4963.6 yes 41159 4963.6 yes 41160 4963.6 yes 41161 4963.6 no 41162 4963.6 no 41163 4963.6 yes 41164 4963.6 yes 41165 4963.6 yes 41166 4963.6 yes 41167 4963.6 no 41168 4963.6 no 41169 4963.6 no 41170 4963.6 no 41171 4963.6 yes 41172 4963.6 yes 41173 4963.6 yes 41174 4963.6 yes 41175 4963.6 no 41176 4963.6 no 41177 4963.6 no 41178 4963.6 yes 41179 4963.6 no 41180 4963.6 no 41181 4963.6 yes 41182 4963.6 no 41183 4963.6 yes 41184 4963.6 no 41185 4963.6 no 41186 4963.6 yes 41187 4963.6 no [41188 rows x 21 columns]>
# Bare expression: as the last statement of a notebook cell, the DataFrame is rendered as the cell output.
bank_orig
age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
5 | 45 | services | married | basic.9y | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
6 | 59 | admin. | married | professional.course | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
7 | 41 | blue-collar | married | unknown | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
8 | 24 | technician | single | professional.course | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
9 | 25 | services | single | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
10 | 41 | blue-collar | married | unknown | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
11 | 25 | services | single | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
12 | 29 | blue-collar | single | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
13 | 57 | housemaid | divorced | basic.4y | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
14 | 35 | blue-collar | married | basic.6y | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
15 | 54 | retired | married | basic.9y | unknown | yes | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
16 | 35 | blue-collar | married | basic.6y | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
17 | 46 | blue-collar | married | basic.6y | unknown | yes | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
18 | 50 | blue-collar | married | basic.9y | no | yes | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
19 | 39 | management | single | basic.9y | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
20 | 30 | unemployed | married | high.school | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
21 | 55 | blue-collar | married | basic.4y | unknown | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
22 | 55 | retired | single | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
23 | 41 | technician | single | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
24 | 37 | admin. | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
25 | 35 | technician | married | university.degree | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
26 | 59 | technician | married | unknown | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
27 | 39 | self-employed | married | basic.9y | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
28 | 54 | technician | single | university.degree | unknown | no | no | telephone | may | mon | ... | 2 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
29 | 55 | unknown | married | university.degree | unknown | unknown | unknown | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41158 | 35 | technician | divorced | basic.4y | no | no | no | cellular | nov | tue | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | yes |
41159 | 35 | technician | divorced | basic.4y | no | yes | no | cellular | nov | tue | ... | 1 | 9 | 4 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | yes |
41160 | 33 | admin. | married | university.degree | no | no | no | cellular | nov | tue | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | yes |
41161 | 33 | admin. | married | university.degree | no | yes | no | cellular | nov | tue | ... | 1 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | no |
41162 | 60 | blue-collar | married | basic.4y | no | yes | no | cellular | nov | tue | ... | 2 | 4 | 1 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | no |
41163 | 35 | technician | divorced | basic.4y | no | yes | no | cellular | nov | tue | ... | 3 | 4 | 2 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | yes |
41164 | 54 | admin. | married | professional.course | no | no | no | cellular | nov | tue | ... | 2 | 10 | 1 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | yes |
41165 | 38 | housemaid | divorced | university.degree | no | no | no | cellular | nov | wed | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | yes |
41166 | 32 | admin. | married | university.degree | no | no | no | telephone | nov | wed | ... | 1 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | yes |
41167 | 32 | admin. | married | university.degree | no | yes | no | cellular | nov | wed | ... | 3 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | no |
41168 | 38 | entrepreneur | married | university.degree | no | no | no | cellular | nov | wed | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | no |
41169 | 62 | services | married | high.school | no | yes | no | cellular | nov | wed | ... | 5 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | no |
41170 | 40 | management | divorced | university.degree | no | yes | no | cellular | nov | wed | ... | 2 | 999 | 4 | failure | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | no |
41171 | 33 | student | married | professional.course | no | yes | no | telephone | nov | thu | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | yes |
41172 | 31 | admin. | single | university.degree | no | yes | no | cellular | nov | thu | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | yes |
41173 | 62 | retired | married | university.degree | no | yes | no | cellular | nov | thu | ... | 1 | 999 | 2 | failure | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | yes |
41174 | 62 | retired | married | university.degree | no | yes | no | cellular | nov | thu | ... | 1 | 1 | 6 | success | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | yes |
41175 | 34 | student | single | unknown | no | yes | no | cellular | nov | thu | ... | 1 | 999 | 2 | failure | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | no |
41176 | 38 | housemaid | divorced | high.school | no | yes | yes | cellular | nov | thu | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | no |
41177 | 57 | retired | married | professional.course | no | yes | no | cellular | nov | thu | ... | 6 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | no |
41178 | 62 | retired | married | university.degree | no | no | no | cellular | nov | thu | ... | 2 | 6 | 3 | success | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | yes |
41179 | 64 | retired | divorced | professional.course | no | yes | no | cellular | nov | fri | ... | 3 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41180 | 36 | admin. | married | university.degree | no | no | no | cellular | nov | fri | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41181 | 37 | admin. | married | university.degree | no | yes | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
41182 | 29 | unemployed | single | basic.4y | no | yes | no | cellular | nov | fri | ... | 1 | 9 | 1 | success | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41183 | 73 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
41184 | 46 | blue-collar | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41185 | 56 | retired | married | university.degree | no | yes | no | cellular | nov | fri | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41186 | 44 | technician | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
41187 | 74 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 3 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41188 rows × 21 columns
# Show the column labels of the DataFrame.
bank_orig.columns
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'], dtype='object')
# pandas plotting reference: https://pandas.pydata.org/pandas-docs/stable/visualization.html
# A few examples follow.
# Alternative call kept for reference (not executed):
#bank_orig.plot.hist(x='age', y='euribor3m',bins=20)
# Histogram of the 'age' column using 20 bins.
bank_orig['age'].plot.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x1e4e2b1ba58>
# Box plots of the DataFrame's columns; outlier markers (fliers) are hidden.
bank_orig.boxplot(showfliers=False)
<matplotlib.axes._subplots.AxesSubplot at 0x249a3957cf8>
# Box plot restricted to the 'age' column; outlier markers (fliers) are hidden.
bank_orig.boxplot(column=['age'], showfliers=False)
<matplotlib.axes._subplots.AxesSubplot at 0x249a3d204e0>
# Group by the values of another attribute: one 'age' box per 'marital' value,
# with outlier markers (fliers) hidden.
bank_orig.boxplot(column=['age'], by='marital', showfliers=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1e4e2bbacc0>
# Horizontal bar chart with the share of each class; every bar is labelled
# with its count and its percentage over all rows.
plt.figure(1)  # select (or create) figure 1
plt.clf()  # wipe whatever the figure contained
import seaborn as sns  # Python visualisation library built on matplotlib
ax = sns.countplot(y="y", data=bank_orig, color="c")
ncount = len(bank_orig)  # total number of rows
for barra in ax.patches:
    # get_points() holds the two corners that define the bar:
    # row 0 = lower-left (x, y), row 1 = upper-right (x, y)
    esquinas = barra.get_bbox().get_points()
    val_x = esquinas[:, 0]
    val_y = esquinas[1, 1]
    # For a horizontal bar the right edge (val_x[1]) is the count.
    etiqueta = '{:.0f} ({:.1f}%)'.format(val_x[1], 100.*val_x[1]/ncount)
    ax.annotate(etiqueta, (val_x.mean(), (val_y-0.4)), ha='center', va='center')
# Vertical bar chart with the share of each class; every bar is labelled
# with its count and its percentage over all rows.
plt.figure(1)  # select (or create) figure 1
plt.clf()  # wipe whatever the figure contained
import seaborn as sns  # Python visualisation library built on matplotlib
ax = sns.countplot(x="y", data=bank_orig, color="c")  # draw the basic bar chart
ncount = len(bank_orig)  # total number of instances
for barra in ax.patches:  # one patch per bar
    # get_points() holds the two corners that define the bar:
    # row 0 = lower-left (x, y), row 1 = upper-right (x, y)
    esquinas = barra.get_bbox().get_points()
    val_x = esquinas[:, 0]
    val_y = esquinas[1, 1]
    # For a vertical bar the top edge (val_y) is the count; the label sits mid-height.
    etiqueta = '{:.0f} ({:.1f}%)'.format(val_y, 100.*val_y/ncount)
    ax.annotate(etiqueta, (val_x.mean(), (val_y/2.)), ha='center', va='center')
# Build and save a seaborn pair plot (scatter matrix) of all features, coloured by class.
print("------ Preparando el scatter matrix...")
plt.figure(2)
plt.clf()
# For the scatter matrix, categorical variables are converted to numeric codes.
# NOTE(review): this binds a second name to the SAME DataFrame (no copy), so the
# factorize loop below also overwrites the categorical columns of bank_orig,
# including 'y', in place. Code that reads bank_orig afterwards sees integer
# codes instead of strings; decide deliberately before switching to .copy().
bank_int = bank_orig
# pipe applies a function (here a lambda) to the dtypes Series
char_cols = bank_int.dtypes.pipe(lambda x: x[x == 'object']).index # labels of the categorical columns (those of dtype 'object')
for c in char_cols:
    # factorize encodes the values as an enumeration; element [0] of its result
    # is the array of integer codes, which replaces the original column
    bank_int[c] = pd.factorize(bank_int[c])[0]
# factorize is useful to get a numeric representation of an array when all that matters is telling distinct values apart
lista_vars = list(bank_int)
lista_vars.remove('y') # the 'y' column is excluded from the plotted variables
# generate the scatter matrix
sns.set()
sns_plot = sns.pairplot(bank_int, vars=lista_vars, hue="y", diag_kind="kde") # hue: the 'y' column defines the colours
# diag_kind: kind of plot drawn on the diagonal
sns_plot.savefig("bank_scatter_plot.png")
print("")
------ Preparando el scatter matrix...
C:\Users\elena\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
'\nsns_plot.savefig("bank_scatter_plot.png")\nprint("")\n'
<Figure size 432x288 with 0 Axes>
'''
si el dataset contiene variables categóricas con cadenas, es necesario convertirlas a numéricas antes de usar 'fit', y para
no hacerlas ordinales, mejor convertirlas a variables binarias con get_dummies
Otras alternativas para convertir las variables categóricas es usar LabelEncoder, One-Hot-Encoding o LabelBinarizer en la matriz numpy (ver más abajo)
Para saber más: http://pbpython.com/categorical-encoding.html
'''
# English gloss of the note above: string-valued categorical variables must be
# made numeric before calling 'fit'; to avoid imposing an order on them they can
# be expanded into binary columns with get_dummies. LabelEncoder, one-hot
# encoding or LabelBinarizer on the numpy matrix are alternatives.

# Names of the categorical (dtype object) columns, excluding the class column 'y'.
lista_categoricas = [x for x in bank_orig.columns if (bank_orig[x].dtype == object and bank_orig[x].name != 'y')]
if not binarizar:
    # NOTE(review): no copy is made, so `bank` and `bank_orig` are the same
    # object and the in-place drop/insert below also apply to bank_orig.
    bank = bank_orig
else:
    # replace the categorical columns by binary indicator columns
    bank = pd.get_dummies(bank_orig, columns=lista_categoricas)
# by convention, move the class column so that it is the last column
clase = bank['y']
bank.drop(labels=['y'], axis=1,inplace = True)
bank.insert(len(bank.columns), 'y', clase)
# split the DataFrame into two numpy arrays: the features (X) and the class (y)
# since the last column holds the class, positional slicing is enough
X = bank.values[:,0:len(bank.columns)-1]
y = bank.values[:,len(bank.columns)-1]
print("X", X)
print("y", y)
# The string below holds an alternative split by column names. It is not
# executed, but as the final expression of the cell it is echoed as its output.
'''
#también se puede separar indicando los nombres de las columnas
columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m', 'nr.employed']
X = bank[list(columns)].values
y = bank["y"].values
'''
X [[ 5.6000e+01 0.0000e+00 0.0000e+00 ... -3.6400e+01 4.8570e+00 5.1910e+03] [ 5.7000e+01 1.0000e+00 0.0000e+00 ... -3.6400e+01 4.8570e+00 5.1910e+03] [ 3.7000e+01 1.0000e+00 0.0000e+00 ... -3.6400e+01 4.8570e+00 5.1910e+03] ... [ 5.6000e+01 5.0000e+00 0.0000e+00 ... -5.0800e+01 1.0280e+00 4.9636e+03] [ 4.4000e+01 4.0000e+00 0.0000e+00 ... -5.0800e+01 1.0280e+00 4.9636e+03] [ 7.4000e+01 5.0000e+00 0.0000e+00 ... -5.0800e+01 1.0280e+00 4.9636e+03]] y [0. 0. 0. ... 0. 1. 0.]
'\n#también se puede separar indicando los nombres de las columnas\ncolumns = [\'age\', \'job\', \'marital\', \'education\', \'default\', \'housing\', \'loan\',\n \'contact\', \'month\', \'day_of_week\', \'duration\', \'campaign\', \'pdays\',\n \'previous\', \'poutcome\', \'emp.var.rate\', \'cons.price.idx\',\n \'cons.conf.idx\', \'euribor3m\', \'nr.employed\']\nX = bank[list(columns)].values\ny = bank["y"].values\n'
# When categorical variables have many categories, binarising them creates many
# columns and some algorithms (SVM, for example) become extremely slow. The
# alternative is to map them to numeric (ordinal) codes without binarising,
# which is what happens here when pd.get_dummies() was not applied before.
# It does not work if missing values are present as NaN.
if not binarizar:
    from sklearn import preprocessing
    # LabelEncoder maps the original values to integers between 0 and
    # (number of distinct values - 1); it can turn non-numeric variables
    # into numeric ones.
    le = preprocessing.LabelEncoder()
    for i in range(X.shape[1]):
        columna = X[:, i]
        # A column is treated as categorical when its first entry is a string.
        if isinstance(columna[0], str):
            X[:, i] = le.fit_transform(columna)
# Cross-validation through cross_val_score with cv=5: the rows are not shuffled
# and no seed is involved in the partitioning.
# NOTE(review): the original comment said the split is not stratified; per the
# scikit-learn docs an integer cv with a classifier and a binary/multiclass
# target uses an unshuffled StratifiedKFold, so the folds are presumably
# stratified but follow the file's row order. Confirm for the installed version.
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree, X, y, cv=5, scoring='accuracy')
# per-fold accuracies, then their mean and twice their standard deviation
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
[0.88905074 0.36222384 0.61641175 0.17858444 0.1417992 ] Accuracy: 0.44 (+/- 0.56)
'''
Validación cruzada con particionado estratificado y control de la aleatoriedad fijando la semilla
'''
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn import preprocessing
import numpy
# 5 stratified folds, shuffled with a fixed seed so the partition is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
# Label encoder instance; no executable statement in the visible code below uses it.
le = preprocessing.LabelEncoder()
def validacion_cruzada(modelo, X, y, cv):
    """Run cross-validation fold by fold, printing metrics for each fold.

    Parameters:
        modelo: estimator offering fit, predict and predict_proba.
        X: feature array, indexed with the index arrays produced by cv.
        y: class array, indexed the same way.
        cv: splitter whose split(X, y) yields (train, test) index pairs.

    Returns:
        (modelo, y_test_all, y_prob_all): the estimator as fitted on the last
        fold, plus the test labels and the positive-class probabilities of
        every fold concatenated in fold order.
    """
    plantilla = "Accuracy: {:6.2f}%, F1-score: {:.4f}, G-mean: {:.4f}, AUC: {:.4f}"
    y_test_all = []
    y_prob_all = []

    for idx_train, idx_test in cv.split(X, y):
        modelo = modelo.fit(X[idx_train], y[idx_train])

        X_fold = X[idx_test]
        y_fold = y[idx_test]
        y_pred = modelo.predict(X_fold)
        # second predict_proba column = positive class '1' in bank-marketing
        y_prob = modelo.predict_proba(X_fold)[:, 1]

        # The labels are passed to the AUC as they are; were they the strings
        # 'yes'/'no', they would first need converting to 1/0.
        exactitud = accuracy_score(y_fold, y_pred) * 100
        f1_macro = f1_score(y_fold, y_pred, average='macro')
        g_media = geometric_mean_score(y_fold, y_pred, average='macro')
        auc_fold = roc_auc_score(y_fold, y_prob)
        print(plantilla.format(exactitud, f1_macro, g_media, auc_fold))

        y_test_all = numpy.concatenate([y_test_all, y_fold])
        y_prob_all = numpy.concatenate([y_prob_all, y_prob])

    print("")
    return modelo, y_test_all, y_prob_all
'''
Dibuja la curva ROC
'''
from sklearn.metrics import roc_curve, auc
def curva_ROC(figura_id,new,y_test,y_prob,nombre):
    """Draw a ROC curve on a matplotlib figure and return its area under the curve.

    Parameters:
        figura_id: identifier handed to plt.figure to select the figure.
        new: when truthy, the figure is cleared before drawing and the axis
            limits, axis labels and title are set afterwards.
        y_test: true labels, forwarded to roc_curve.
        y_prob: scores for the positive class, forwarded to roc_curve.
        nombre: legend text; the AUC with four decimals is appended to it.

    Returns:
        The area under the ROC curve as computed by auc.
    """
    tasa_fp, tasa_tp, _umbrales = roc_curve(y_test, y_prob)
    area = auc(tasa_fp, tasa_tp)
    etiqueta = nombre + ' (%0.4f)' % area

    plt.figure(figura_id)
    if new:
        plt.clf()

    plt.plot(tasa_fp, tasa_tp, lw=2, label=etiqueta)
    # dashed diagonal from (0, 0) to (1, 1) as a visual baseline
    plt.plot([0, 1], [0, 1], color='navy', lw=1.5, linestyle='--')
    plt.legend(loc="lower right")

    if new:
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')

    plt.show()
    return area
#'''
# Decision tree evaluated with the stratified cross-validation helper.
print("------ Árbol de decisión...")
from sklearn import tree  # from here on the name `tree` refers to the sklearn.tree module
# Depth is capped at 10 here; a smaller cap (e.g. 5) would give a more readable tree at some cost in accuracy.
arbol = tree.DecisionTreeClassifier(random_state=0, max_depth=10)
arbol, y_test_arbol, y_prob_arbol = validacion_cruzada(arbol,X,y,skf)
# ROC curve of the tree on a fresh figure 3 (currently disabled):
#curva_ROC(3,True,y_test_arbol,y_prob_arbol,'Árbol')
------ Árbol de decisión... Accuracy: 91.22%, F1-score: 0.7735, G-mean: 0.7652, AUC: 0.8833 Accuracy: 90.35%, F1-score: 0.7455, G-mean: 0.7330, AUC: 0.8880 Accuracy: 90.75%, F1-score: 0.7445, G-mean: 0.7221, AUC: 0.8800 Accuracy: 90.54%, F1-score: 0.7436, G-mean: 0.7252, AUC: 0.8636 Accuracy: 90.91%, F1-score: 0.7686, G-mean: 0.7639, AUC: 0.8817
# To visualise the generated tree, graphviz can be used; it has to be installed beforehand.
# For example, from Anaconda Navigator: Environments / select "Not installed" / search "graphviz" / tick + "Apply".
# Add the directory "...Anaconda3\pkgs\graphviz-2.38.0-4\Library\bin\graphviz" to the PATH and
# GRAPHVIZ_DOT environment variables (the latter is a new variable that must be created).
#
# These notes are comments rather than a bare string literal because the Windows
# path contains backslash sequences: in a non-raw string \p, \g and \L are
# invalid escapes (a warning on recent Python versions) and \b is read as a
# backspace character.
print("------ Generando una visualización del árbol en 'bank.pdf'...")
import graphviz
# feature names = every column of `bank` except the class column 'y'
feat = list(bank)
feat.remove('y')
# DOT description of the fitted tree `arbol`, with coloured, rounded nodes
dot_data = tree.export_graphviz(arbol, out_file=None, filled=True, feature_names=feat, class_names=['no', 'yes'], rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
#graph.render()
graph.render("bank", view=True) # writes a file bank.pdf with the tree and opens it
#'''
------ Generando una visualización del árbol en 'bank.pdf'...
'bank.pdf'
# XGBoost classifier with 200 estimators, evaluated with the same
# cross-validation helper and splitter.
print("------ XGB...")
import xgboost as xgb
clf = xgb.XGBClassifier(n_estimators = 200)
clf, y_test_clf, y_prob_clf = validacion_cruzada(clf,X,y,skf)
# Draw the ROC curve on figure 3 without clearing it (new=False); the returned
# AUC is the value of the cell's last expression.
curva_ROC(3,False,y_test_clf,y_prob_clf,'XGB')
------ XGB... Accuracy: 91.89%, F1-score: 0.7758, G-mean: 0.7502, AUC: 0.9493 Accuracy: 91.61%, F1-score: 0.7726, G-mean: 0.7519, AUC: 0.9499 Accuracy: 91.47%, F1-score: 0.7629, G-mean: 0.7374, AUC: 0.9497 Accuracy: 91.67%, F1-score: 0.7702, G-mean: 0.7456, AUC: 0.9482 Accuracy: 91.71%, F1-score: 0.7718, G-mean: 0.7477, AUC: 0.9500
0.9494267576319095
# Show the most important features according to how often they are used in the
# XGB trees (computed on the last model fitted during cross-validation).
plt.figure(4)
plt.clf()
features = list(bank)
# The booster reports features under positional names ("f0", "f1", ...);
# map each positional name back to the matching column name of `bank`.
mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features))
# A leftover debug statement, `type(clf.get_booster().get_fscore())`, was
# removed here: it recomputed the scores and discarded the result.
ts = pd.Series(clf.get_booster().get_fscore())
ts.index = ts.reset_index()['index'].map(mapFeat)
# keep the 20 largest scores; ascending order puts the top feature at the top of the horizontal bars
ax2=ts.sort_values()[-20:].plot(kind="barh", figsize = (8,8), title=("20 características más importantes"), color='orange')
ax2.set_xlabel("importancia")
ax2.set_ylabel("característica")
Text(0,0.5,'característica')