Demo 1: Predicting Income from Census Data


A Logistic Regression Example

In [1]:
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
In [2]:
inc_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None, names = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income'])

# drop null values
inc_data = inc_data.dropna()
In [3]:
# the column is present in both categorical and numeric form
del inc_data['education']

# convert all features to categorical integer values
enc = LabelEncoder()
for i in inc_data.columns:
    inc_data[i] = enc.fit_transform(inc_data[i])
In [4]:
# target is stored in y
y = inc_data['income']

# X contains all other features, which we will use to predict target
X = inc_data.drop('income', axis=1)
In [5]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
In [6]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, y_train)
Out[6]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [7]:
# make predictions on test set
pred_logit = logit.predict(X_test)
pred_logit
Out[7]:
array([1, 0, 0, ..., 0, 0, 1])
In [8]:
# measure accuracy
accuracy_score(y_true = y_test, y_pred = pred_logit)
Out[8]:
0.80898761388064289


Demo 2: Predict Higgs Boson Signal


A Decision Tree Example

In [9]:
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
In [10]:
# this code is only necessary because of the data is in an arff file
from scipy.io.arff import loadarff
import urllib2
from StringIO import StringIO
data = urllib2.urlopen('https://www.openml.org/data/download/2063675/phpZLgL9q').read(5005537)
dataset = loadarff(StringIO(data))
higgs = pd.DataFrame(dataset[0], columns=dataset[1].names())

# target is stored in y
Y = higgs['class']

# X contains all other features, which we will use to predict target
X = higgs.drop('class', axis=1)
In [16]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=15)
dTree.fit(X_train, Y_train)
Out[16]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=15, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [17]:
# make predictions on test set
dTree_pred = dTree.predict(X_test)
dTree_pred
Out[17]:
array(['1', '1', '0', ..., '0', '0', '1'], dtype=object)
In [18]:
# measure accuracy
accuracy_score(y_true = Y_test, y_pred = dTree_pred)
Out[18]:
0.67222222222222228