Demo 1: Predicting Income from Census Data

A Logistic Regression Example

In [1]:
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
inc_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None, names = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income'])

# drop null values
inc_data = inc_data.dropna()

In [3]:
# the column is present in both categorical and numeric form
del inc_data['education']

# convert all features to categorical integer values
enc = LabelEncoder()
for i in inc_data.columns:
inc_data[i] = enc.fit_transform(inc_data[i])

In [4]:
# target is stored in y
y = inc_data['income']

# X contains all other features, which we will use to predict target
X = inc_data.drop('income', axis=1)

In [5]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, y_train)

Out[6]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
In [7]:
# make predictions on test set
pred_logit = logit.predict(X_test)
pred_logit

Out[7]:
array([1, 0, 0, ..., 0, 0, 1])
In [8]:
# measure accuracy
accuracy_score(y_true = y_test, y_pred = pred_logit)

Out[8]:
0.80898761388064289

Demo 2: Predict Higgs Boson Signal

A Decision Tree Example

In [9]:
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [10]:
# this code is only necessary because of the data is in an arff file
import urllib2
from StringIO import StringIO
higgs = pd.DataFrame(dataset[0], columns=dataset[1].names())

# target is stored in y
Y = higgs['class']

# X contains all other features, which we will use to predict target
X = higgs.drop('class', axis=1)

In [16]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=15)
dTree.fit(X_train, Y_train)

Out[16]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=15, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
In [17]:
# make predictions on test set
dTree_pred = dTree.predict(X_test)
dTree_pred

Out[17]:
array(['1', '1', '0', ..., '0', '0', '1'], dtype=object)
In [18]:
# measure accuracy
accuracy_score(y_true = Y_test, y_pred = dTree_pred)

Out[18]:
0.67222222222222228