Demo 1: Predicting Income from Census Data
A Logistic Regression Example
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Load the UCI Adult ("census income") dataset. The file has no header row,
# so column names are supplied explicitly.
inc_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education.num',
           'marital.status', 'occupation', 'relationship', 'race', 'sex',
           'capital.gain', 'capital.loss', 'hours.per.week',
           'native.country', 'income'])

# drop null values
# NOTE(review): in this dataset missing values are written as ' ?', which
# dropna() does not catch — consider read_csv(..., na_values=' ?') and verify.
inc_data = inc_data.dropna()

# 'education' carries the same information as 'education.num' (categorical
# vs. numeric form), so drop the categorical duplicate.
del inc_data['education']

# Convert every column (features and target) to integer codes so the
# estimator can consume them.
enc = LabelEncoder()
for col in inc_data.columns:
    inc_data[col] = enc.fit_transform(inc_data[col])

# target is stored in y
y = inc_data['income']
# X contains all other features, which we will use to predict target
X = inc_data.drop('income', axis=1)

# train/test split (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, y_train)

# make predictions on test set
pred_logit = logit.predict(X_test)
# e.g. array([1, 0, 0, ..., 0, 0, 1])

# measure accuracy — roughly 0.81 on this split
print(accuracy_score(y_true=y_test, y_pred=pred_logit))
Demo 2: Predict Higgs Boson Signal
A Decision Tree Example
# import necessary packages
import io
import urllib.request

import pandas as pd
from scipy.io.arff import loadarff
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# This manual download step is only necessary because the data is served as
# an ARFF file, which pandas cannot read directly.
raw = urllib.request.urlopen(
    'https://www.openml.org/data/download/2063675/phpZLgL9q').read(5005537)
# loadarff expects a text stream, so decode the downloaded bytes first.
dataset = loadarff(io.StringIO(raw.decode('utf-8')))
# dataset[0] holds the records, dataset[1] the ARFF metadata (column names).
higgs = pd.DataFrame(dataset[0], columns=dataset[1].names())

# target is stored in Y
Y = higgs['class']
# X contains all other features, which we will use to predict target
X = higgs.drop('class', axis=1)

# train/test split (70% train / 30% test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# build model and fit on train set; max_leaf_nodes caps tree size to limit
# overfitting
dTree = DecisionTreeClassifier(max_leaf_nodes=15)
dTree.fit(X_train, Y_train)

# make predictions on test set
dTree_pred = dTree.predict(X_test)
# e.g. array(['1', '1', '0', ..., '0', '0', '1'], dtype=object)

# measure accuracy — roughly 0.67 on this split
print(accuracy_score(y_true=Y_test, y_pred=dTree_pred))