from sklearn import preprocessing
from IPython.display import display
import numpy as np
import pandas as pd
# Toy feature matrix: 3 samples x 3 features, mixed signs.
data = np.array([[2.2, 5.9, -1.8], [5.4, -3.2, -5.1], [-1.9, 4.2, 3.2]])
# Binarization: values strictly above the threshold map to 1, the rest to 0.
binarizer = preprocessing.Binarizer(threshold=1.5)
bin_data = binarizer.transform(data)
bin_data
array([[1., 1., 0.], [1., 0., 0.], [0., 1., 1.]])
# Per-feature (column-wise) mean of the raw data; notebook-style bare
# expression — the value only shows up in an interactive session.
data.mean(axis=0)
array([ 1.9 , 2.3 , -1.23333333])
# Per-feature (column-wise) standard deviation of the raw data.
data.std(axis=0)
array([2.98775278, 3.95052739, 3.41207008])
# Standardize each feature to zero mean and unit variance (z-scoring).
scaled_data = preprocessing.scale(data)
# Means after scaling are ~0 (only float round-off remains).
scaled_data.mean(axis=0)
array([0.00000000e+00, 0.00000000e+00, 7.40148683e-17])
# Standard deviations after scaling are exactly 1 for every feature.
scaled_data.std(axis=0)
array([1., 1., 1.])
# preprocessing.scale returned a copy — the original array is unchanged.
data
array([[ 2.2, 5.9, -1.8], [ 5.4, -3.2, -5.1], [-1.9, 4.2, 3.2]])
# Rescale every feature into the [0, 1] interval (min-max normalization).
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
# fit() learns per-column min/max; transform() applies the rescaling.
min_max_scaler.fit(data)
data_min_max = min_max_scaler.transform(data)
data_min_max
array([[0.56164384, 1. , 0.39759036], [1. , 0. , 0. ], [0. , 0.81318681, 1. ]])
Normalization: the act of bringing the values of each feature vector onto a common scale.
# The min-max scaler also worked on a copy; the source data is intact.
data
array([[ 2.2, 5.9, -1.8], [ 5.4, -3.2, -5.1], [-1.9, 4.2, 3.2]])
# L1 normalization: each row's absolute values sum to 1.
data_l1 = preprocessing.normalize(data, norm='l1')
# L2 normalization: each row has unit Euclidean length.
data_l2 = preprocessing.normalize(data, norm='l2')
data_l1
array([[ 0.22222222, 0.5959596 , -0.18181818], [ 0.39416058, -0.23357664, -0.37226277], [-0.20430108, 0.4516129 , 0.34408602]])
# Display the L2-normalized rows.
data_l2
array([[ 0.3359268 , 0.90089461, -0.2748492 ], [ 0.6676851 , -0.39566524, -0.63059148], [-0.33858465, 0.74845029, 0.57024784]])
# Map the three iris species names to integer class codes.
labels = ['setosa', 'versicolor', 'virginica']
# fit() sorts the classes alphabetically and returns the fitted encoder.
encoder = preprocessing.LabelEncoder().fit(labels)
LabelEncoder()
# Show the learned mapping: class name -> integer code.
for code, class_name in enumerate(encoder.classes_):
    print(class_name, code)
setosa 0
versicolor 1
virginica 2
# A fresh batch of labels encoded with the already-fitted encoder.
more_labels = ['versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor']
more_labels_encoded = encoder.transform(more_labels)
more_labels
['versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor']
# Integer codes corresponding to more_labels (setosa=0, versicolor=1, virginica=2).
more_labels_encoded
array([1, 1, 2, 0, 1])
# Load the UCI "Adult" census dataset directly from the archive.
# NOTE(review): the raw file has a leading space in every categorical
# value (e.g. ' State-gov'); those spaces survive into the dummy-column
# names created later, so downstream lookups must include them.
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None,
index_col=False, names=['age', 'workclass', 'fnlwgt', 'education',
'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'gender', 'capital-gain',
'capital-loss', 'hours-per-week', 'native-country',
'income'])
# Keep a small, manageable subset of columns for the demo.
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']]
display(data)
age | workclass | education | gender | hours-per-week | occupation | income | |
---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
... | ... | ... | ... | ... | ... | ... | ... |
32556 | 27 | Private | Assoc-acdm | Female | 38 | Tech-support | <=50K |
32557 | 40 | Private | HS-grad | Male | 40 | Machine-op-inspct | >50K |
32558 | 58 | Private | HS-grad | Female | 40 | Adm-clerical | <=50K |
32559 | 22 | Private | HS-grad | Male | 20 | Adm-clerical | <=50K |
32560 | 52 | Self-emp-inc | HS-grad | Female | 40 | Exec-managerial | >50K |
32561 rows × 7 columns
# Peek at the first 10 rows of the trimmed dataset.
data.head(10)
age | workclass | education | gender | hours-per-week | occupation | income | |
---|---|---|---|---|---|---|---|
0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
5 | 37 | Private | Masters | Female | 40 | Exec-managerial | <=50K |
6 | 49 | Private | 9th | Female | 16 | Other-service | <=50K |
7 | 52 | Self-emp-not-inc | HS-grad | Male | 45 | Exec-managerial | >50K |
8 | 31 | Private | Masters | Female | 50 | Prof-specialty | >50K |
9 | 42 | Private | Bachelors | Male | 40 | Exec-managerial | >50K |
# The seven columns retained for modeling.
list(data.columns)
['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']
# One-hot encode every non-numeric column; numeric columns ('age',
# 'hours-per-week') pass through unchanged.
data_dummies = pd.get_dummies(data)
# Note the leading space inside category names, e.g. 'workclass_ Private'.
list(data_dummies.columns)
['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']
# Features: every column from 'age' through the last occupation dummy.
# This label-based slice relies on the column ORDER produced by
# get_dummies, which places the two income dummies at the end.
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
X = features.values
# Binary target: 1 when income is '>50K' (leading space is intentional).
y = data_dummies['income_ >50K'].values
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Default 75/25 train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# max_iter raised from the default 100 so lbfgs converges on these
# unscaled features (age and hours-per-week dwarf the 0/1 dummies).
logreg = LogisticRegression(max_iter=1500)
logreg.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1500, multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
# Mean accuracy on the held-out test set (~0.81 per the recorded output).
logreg.score(X_test, y_test)
0.8088686893502027