In [1]:
#Necesario para que los plots de matplotlib aparezcan en el notebook
%matplotlib inline
In [2]:
import matplotlib.pyplot as plt
import pandas as pd

carpeta_datos="G:/Mi unidad/DOCTORADO/Docencia/Curso_2018-2019/IN_2018-2019/data/"

#True si cada variable categórica se convierte en varias binarias (tantas como categorías),
#False si solo se convierte la categórica a numérica (ordinal)
binarizar = False

'''
devuelve un DataFrame, los valores perdidos notados como '?' se convierten a NaN,
si no, se consideraría '?' como una categoría más
'''

if not binarizar:
    adult_orig = pd.read_csv(carpeta_datos+'adult.csv')
else:
    adult_orig = pd.read_csv(carpeta_datos+'adult.csv',na_values="?")
    
print("------ Lista de características y tipos (object=categórica)")
print(adult_orig.dtypes,"\n")

print("------ Distribución de datos en la característica 'workclass'")
print(adult_orig['workclass'].value_counts(),"\n")

print("------ Y en la clase")
print(adult_orig['class'].value_counts(),"\n")
------ Lista de características y tipos (object=categórica)
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object 

------ Distribución de datos en la característica 'workclass'
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64 

------ Y en la clase
 <=50K    24720
 >50K      7841
Name: class, dtype: int64 

In [3]:
adult_orig.all
Out[3]:
<bound method DataFrame.all of        age          workclass  fnlwgt      education  education-num  \
0       39          State-gov   77516      Bachelors             13   
1       50   Self-emp-not-inc   83311      Bachelors             13   
2       38            Private  215646        HS-grad              9   
3       53            Private  234721           11th              7   
4       28            Private  338409      Bachelors             13   
5       37            Private  284582        Masters             14   
6       49            Private  160187            9th              5   
7       52   Self-emp-not-inc  209642        HS-grad              9   
8       31            Private   45781        Masters             14   
9       42            Private  159449      Bachelors             13   
10      37            Private  280464   Some-college             10   
11      30          State-gov  141297      Bachelors             13   
12      23            Private  122272      Bachelors             13   
13      32            Private  205019     Assoc-acdm             12   
14      40            Private  121772      Assoc-voc             11   
15      34            Private  245487        7th-8th              4   
16      25   Self-emp-not-inc  176756        HS-grad              9   
17      32            Private  186824        HS-grad              9   
18      38            Private   28887           11th              7   
19      43   Self-emp-not-inc  292175        Masters             14   
20      40            Private  193524      Doctorate             16   
21      54            Private  302146        HS-grad              9   
22      35        Federal-gov   76845            9th              5   
23      43            Private  117037           11th              7   
24      59            Private  109015        HS-grad              9   
25      56          Local-gov  216851      Bachelors             13   
26      19            Private  168294        HS-grad              9   
27      54                  ?  180211   Some-college             10   
28      39            Private  367260        HS-grad              9   
29      49            Private  193366        HS-grad              9   
...    ...                ...     ...            ...            ...   
32531   30                  ?   33811      Bachelors             13   
32532   34            Private  204461      Doctorate             16   
32533   54            Private  337992      Bachelors             13   
32534   37            Private  179137   Some-college             10   
32535   22            Private  325033           12th              8   
32536   34            Private  160216      Bachelors             13   
32537   30            Private  345898        HS-grad              9   
32538   38            Private  139180      Bachelors             13   
32539   71                  ?  287372      Doctorate             16   
32540   45          State-gov  252208        HS-grad              9   
32541   41                  ?  202822        HS-grad              9   
32542   72                  ?  129912        HS-grad              9   
32543   45          Local-gov  119199     Assoc-acdm             12   
32544   31            Private  199655        Masters             14   
32545   39          Local-gov  111499     Assoc-acdm             12   
32546   37            Private  198216     Assoc-acdm             12   
32547   43            Private  260761        HS-grad              9   
32548   65   Self-emp-not-inc   99359    Prof-school             15   
32549   43          State-gov  255835   Some-college             10   
32550   43   Self-emp-not-inc   27242   Some-college             10   
32551   32            Private   34066           10th              6   
32552   43            Private   84661      Assoc-voc             11   
32553   32            Private  116138        Masters             14   
32554   53            Private  321865        Masters             14   
32555   22            Private  310152   Some-college             10   
32556   27            Private  257302     Assoc-acdm             12   
32557   40            Private  154374        HS-grad              9   
32558   58            Private  151910        HS-grad              9   
32559   22            Private  201490        HS-grad              9   
32560   52       Self-emp-inc  287927        HS-grad              9   

               marital-status          occupation     relationship  \
0               Never-married        Adm-clerical    Not-in-family   
1          Married-civ-spouse     Exec-managerial          Husband   
2                    Divorced   Handlers-cleaners    Not-in-family   
3          Married-civ-spouse   Handlers-cleaners          Husband   
4          Married-civ-spouse      Prof-specialty             Wife   
5          Married-civ-spouse     Exec-managerial             Wife   
6       Married-spouse-absent       Other-service    Not-in-family   
7          Married-civ-spouse     Exec-managerial          Husband   
8               Never-married      Prof-specialty    Not-in-family   
9          Married-civ-spouse     Exec-managerial          Husband   
10         Married-civ-spouse     Exec-managerial          Husband   
11         Married-civ-spouse      Prof-specialty          Husband   
12              Never-married        Adm-clerical        Own-child   
13              Never-married               Sales    Not-in-family   
14         Married-civ-spouse        Craft-repair          Husband   
15         Married-civ-spouse    Transport-moving          Husband   
16              Never-married     Farming-fishing        Own-child   
17              Never-married   Machine-op-inspct        Unmarried   
18         Married-civ-spouse               Sales          Husband   
19                   Divorced     Exec-managerial        Unmarried   
20         Married-civ-spouse      Prof-specialty          Husband   
21                  Separated       Other-service        Unmarried   
22         Married-civ-spouse     Farming-fishing          Husband   
23         Married-civ-spouse    Transport-moving          Husband   
24                   Divorced        Tech-support        Unmarried   
25         Married-civ-spouse        Tech-support          Husband   
26              Never-married        Craft-repair        Own-child   
27         Married-civ-spouse                   ?          Husband   
28                   Divorced     Exec-managerial    Not-in-family   
29         Married-civ-spouse        Craft-repair          Husband   
...                       ...                 ...              ...   
32531           Never-married                   ?    Not-in-family   
32532      Married-civ-spouse      Prof-specialty          Husband   
32533      Married-civ-spouse     Exec-managerial          Husband   
32534                Divorced        Adm-clerical        Unmarried   
32535           Never-married     Protective-serv        Own-child   
32536           Never-married     Exec-managerial    Not-in-family   
32537           Never-married        Craft-repair    Not-in-family   
32538                Divorced      Prof-specialty        Unmarried   
32539      Married-civ-spouse                   ?          Husband   
32540               Separated        Adm-clerical        Own-child   
32541               Separated                   ?    Not-in-family   
32542      Married-civ-spouse                   ?          Husband   
32543                Divorced      Prof-specialty        Unmarried   
32544                Divorced       Other-service    Not-in-family   
32545      Married-civ-spouse        Adm-clerical             Wife   
32546                Divorced        Tech-support    Not-in-family   
32547      Married-civ-spouse   Machine-op-inspct          Husband   
32548           Never-married      Prof-specialty    Not-in-family   
32549                Divorced        Adm-clerical   Other-relative   
32550      Married-civ-spouse        Craft-repair          Husband   
32551      Married-civ-spouse   Handlers-cleaners          Husband   
32552      Married-civ-spouse               Sales          Husband   
32553           Never-married        Tech-support    Not-in-family   
32554      Married-civ-spouse     Exec-managerial          Husband   
32555           Never-married     Protective-serv    Not-in-family   
32556      Married-civ-spouse        Tech-support             Wife   
32557      Married-civ-spouse   Machine-op-inspct          Husband   
32558                 Widowed        Adm-clerical        Unmarried   
32559           Never-married        Adm-clerical        Own-child   
32560      Married-civ-spouse     Exec-managerial             Wife   

                      race      sex  capital-gain  capital-loss  \
0                    White     Male          2174             0   
1                    White     Male             0             0   
2                    White     Male             0             0   
3                    Black     Male             0             0   
4                    Black   Female             0             0   
5                    White   Female             0             0   
6                    Black   Female             0             0   
7                    White     Male             0             0   
8                    White   Female         14084             0   
9                    White     Male          5178             0   
10                   Black     Male             0             0   
11      Asian-Pac-Islander     Male             0             0   
12                   White   Female             0             0   
13                   Black     Male             0             0   
14      Asian-Pac-Islander     Male             0             0   
15      Amer-Indian-Eskimo     Male             0             0   
16                   White     Male             0             0   
17                   White     Male             0             0   
18                   White     Male             0             0   
19                   White   Female             0             0   
20                   White     Male             0             0   
21                   Black   Female             0             0   
22                   Black     Male             0             0   
23                   White     Male             0          2042   
24                   White   Female             0             0   
25                   White     Male             0             0   
26                   White     Male             0             0   
27      Asian-Pac-Islander     Male             0             0   
28                   White     Male             0             0   
29                   White     Male             0             0   
...                    ...      ...           ...           ...   
32531   Asian-Pac-Islander   Female             0             0   
32532                White     Male             0             0   
32533   Asian-Pac-Islander     Male             0             0   
32534                White   Female             0             0   
32535                Black     Male             0             0   
32536                White   Female             0             0   
32537                Black     Male             0             0   
32538                Black   Female         15020             0   
32539                White     Male             0             0   
32540                White   Female             0             0   
32541                Black   Female             0             0   
32542                White     Male             0             0   
32543                White   Female             0             0   
32544                Other   Female             0             0   
32545                White   Female             0             0   
32546                White   Female             0             0   
32547                White     Male             0             0   
32548                White     Male          1086             0   
32549                White   Female             0             0   
32550                White     Male             0             0   
32551   Amer-Indian-Eskimo     Male             0             0   
32552                White     Male             0             0   
32553   Asian-Pac-Islander     Male             0             0   
32554                White     Male             0             0   
32555                White     Male             0             0   
32556                White   Female             0             0   
32557                White     Male             0             0   
32558                White   Female             0             0   
32559                White     Male             0             0   
32560                White   Female         15024             0   

       hours-per-week  native-country   class  
0                  40   United-States   <=50K  
1                  13   United-States   <=50K  
2                  40   United-States   <=50K  
3                  40   United-States   <=50K  
4                  40            Cuba   <=50K  
5                  40   United-States   <=50K  
6                  16         Jamaica   <=50K  
7                  45   United-States    >50K  
8                  50   United-States    >50K  
9                  40   United-States    >50K  
10                 80   United-States    >50K  
11                 40           India    >50K  
12                 30   United-States   <=50K  
13                 50   United-States   <=50K  
14                 40               ?    >50K  
15                 45          Mexico   <=50K  
16                 35   United-States   <=50K  
17                 40   United-States   <=50K  
18                 50   United-States   <=50K  
19                 45   United-States    >50K  
20                 60   United-States    >50K  
21                 20   United-States   <=50K  
22                 40   United-States   <=50K  
23                 40   United-States   <=50K  
24                 40   United-States   <=50K  
25                 40   United-States    >50K  
26                 40   United-States   <=50K  
27                 60           South    >50K  
28                 80   United-States   <=50K  
29                 40   United-States   <=50K  
...               ...             ...     ...  
32531              99   United-States   <=50K  
32532              60   United-States    >50K  
32533              50           Japan    >50K  
32534              39   United-States   <=50K  
32535              35   United-States   <=50K  
32536              55   United-States    >50K  
32537              46   United-States   <=50K  
32538              45   United-States    >50K  
32539              10   United-States    >50K  
32540              40   United-States   <=50K  
32541              32   United-States   <=50K  
32542              25   United-States   <=50K  
32543              48   United-States   <=50K  
32544              30   United-States   <=50K  
32545              20   United-States    >50K  
32546              40   United-States   <=50K  
32547              40          Mexico   <=50K  
32548              60   United-States   <=50K  
32549              40   United-States   <=50K  
32550              50   United-States   <=50K  
32551              40   United-States   <=50K  
32552              45   United-States   <=50K  
32553              11          Taiwan   <=50K  
32554              40   United-States    >50K  
32555              40   United-States   <=50K  
32556              38   United-States   <=50K  
32557              40   United-States    >50K  
32558              40   United-States   <=50K  
32559              20   United-States   <=50K  
32560              40   United-States    >50K  

[32561 rows x 15 columns]>
In [4]:
adult_orig.columns
Out[4]:
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')
In [16]:
#Visualización del paquete pandas: https://pandas.pydata.org/pandas-docs/stable/visualization.html
#Algunos ejemplos

adult_orig.plot.hist(x='age', y='hours-per-week',bins=20)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x16b0b22db00>
In [ ]:
adult_orig.boxplot(showfliers=False)
In [7]:
adult_orig.boxplot(column=['hours-per-week'])
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x2714788d518>
In [8]:
adult_orig.boxplot(column='hours-per-week', by='sex')
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x27147882b00>
In [9]:
#'''
# gráfico de barras horizontales con la proporción de cada clase
plt.figure(1)
plt.clf()
import seaborn as sns
ax = sns.countplot(y="class", data=adult_orig, color="c");
ncount = adult_orig.shape[0]
for p in ax.patches:
    val_x=p.get_bbox().get_points()[:,0]
    val_y=p.get_bbox().get_points()[1,1]
    ax.annotate('{:.0f} ({:.1f}%)'.format(val_x[1], 100.*val_x[1]/ncount), (val_x.mean(), (val_y-0.4)), ha='center', va='center')
#'''
In [10]:
#'''
print("------ Preparando el scatter matrix...")
plt.figure(2)
plt.clf()
# para scatter matrix, se convierten las variables categóricas a numéricas
adult_int = adult_orig
char_cols = adult_int.dtypes.pipe(lambda x: x[x == 'object']).index #lista de columnas con var. categóticas (las de tipo 'object')
for c in char_cols:
    adult_int[c] = pd.factorize(adult_int[c])[0]
lista_vars = list(adult_int)
lista_vars.remove('class') #excluimos la columna 'class' del plot
#se genera el scatter matrix
sns.set()
sns_plot = sns.pairplot(adult_int, vars=lista_vars, hue="class", diag_kind="kde") #en hue indicamos que la columna 'class' define los colores
sns_plot.savefig("adult_scatter_plot.png")
print("")
#'''
------ Preparando el scatter matrix...
C:\Users\elena\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

<Figure size 432x288 with 0 Axes>