Notebook

In [9]:

import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:

Train = pd.read_csv('/home/student/SPECT.train', header=None)
Test = pd.read_csv('/home/student/SPECT.test' ,header=None)

In [11]:

frames = [Train, Test]
Data = np.array(pd.concat(frames).values)
Y,X = Data[:,0],Data[:,1:]
x_train, x_test, y_train, y_test = train_test_split(X,Y,train_size=0.9,test_size=0.1)
x_train.shape, x_test, y_train, y_test

Out[11]:

((240, 22),
 array([[1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        [1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
        [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1],
        [1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1],
        [1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0],
        [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 1, 1, 1, 1]),
 array([0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
        1, 1, 1, 1]))

In [12]:

input_array = x_train
weights = [random.random() for x in range(22)]
s = sum(weights)
n_weights = [x/s for x in weights]
weights = n_weights
threshold = 0.5
actual_result = 1
learning_rate = 0.5

print(weights)

[0.03385511869397899, 0.06878489913730873, 0.07029712556220485, 0.0634493722610696, 0.06799944392105774, 0.07949590984370473, 0.04644816506348176, 0.055354324283170935, 0.067042440358336, 0.07758983139760595, 0.06355426822930273, 0.039611520541607596, 0.06814338785741138, 0.026586052976336465, 0.010372187070941386, 0.055822484955270125, 0.010100802025692903, 0.025179590172097695, 0.000509769752051636, 0.023711010123725288, 0.04436187929950815, 0.0017304164741353504]

In [13]:

def percept(input_array,y_train,weights,lr):
    n = len(input_array)
    m = len(input_array[0])
    error_flag = 0
    for i in range(n):
        
        actual_result = y_train[i]
        inputs = input_array[i]
        pred_op = sum([x*w for x,w in zip(inputs,weights)])
        if pred_op > threshold:
            OP = 1
        else:
            OP = 0
        pred_op = OP
        error = actual_result - pred_op
        if error > 0:
            error_flag = 1
            
            

        for j in range(m):
            weights[j] = weights[j] + lr*(actual_result-pred_op)*inputs[j]
            #pred_op = sum([x*w for x,w in zip(inputs,weigts)])
    return error_flag, weights

    
    

def Predict(weights,vector,my_res):
    t = len(vector)
    for it in range(t):
        if np.array(vector[it]).dot(np.array(weights)) > threshold:
            int_dot = 1
        else:
            int_dot = 0
        my_res = np.append(my_res,int_dot)
    return my_res
    

In [14]:

error_flag = 1
weights_ = np.array([])

acc = np.array([])
for threshold in np.arange(0.3,0.7,0.1):
    accuracy = np.array([])
    for learning_rate in np.arange(0.1,1,0.1):
        my_res = np.array([])
        #print(learning_rate)
        it = 0
        while it < 500:
            error_flag_,weights_ = percept(input_array,y_train,weights,learning_rate)
            #if it == 499:
                #print(weights)
            it += 1

        my_res = Predict(weights_,x_test,my_res)
        #print(len(my_res),len(y_test))
        
        accuracy = np.append(accuracy,sk.metrics.accuracy_score(y_test,my_res,normalize=True,sample_weight=None))
        #print(accuracy.shape)
    acc = np.concatenate((acc, accuracy), axis=0)
#print(weights)    

In [15]:

print(acc)
acc.shape

[ 0.88888889  0.81481481  0.77777778  0.85185185  0.81481481  0.81481481
  0.81481481  0.81481481  0.81481481  0.85185185  0.81481481  0.77777778
  0.81481481  0.77777778  0.77777778  0.81481481  0.85185185  0.81481481
  0.77777778  0.81481481  0.81481481  0.74074074  0.77777778  0.85185185
  0.77777778  0.77777778  0.81481481  0.85185185  0.88888889  0.85185185
  0.81481481  0.81481481  0.81481481  0.77777778  0.77777778  0.81481481]

Out[15]:

(36,)

In [16]:

acc_th = np.reshape(acc,(-1,9))
acc_th

Out[16]:

array([[ 0.88888889,  0.81481481,  0.77777778,  0.85185185,  0.81481481,
         0.81481481,  0.81481481,  0.81481481,  0.81481481],
       [ 0.85185185,  0.81481481,  0.77777778,  0.81481481,  0.77777778,
         0.77777778,  0.81481481,  0.85185185,  0.81481481],
       [ 0.77777778,  0.81481481,  0.81481481,  0.74074074,  0.77777778,
         0.85185185,  0.77777778,  0.77777778,  0.81481481],
       [ 0.85185185,  0.88888889,  0.85185185,  0.81481481,  0.81481481,
         0.81481481,  0.77777778,  0.77777778,  0.81481481]])

In [18]:

thresholds = [0.3,0.4,0.5,0.6]
print('max accuracy =',max(acc),'with threshold =',0.4,'and learning rate =',0.8)

max accuracy = 0.888888888889 with threshold = 0.4 and learning rate = 0.8

In [19]:

x_axis = np.arange(0.1,1,0.1)
plt.gca().set_prop_cycle('color',['red','green','blue','yellow'])

plt.plot(x_axis,acc_th[0])
plt.plot(x_axis,acc_th[1])
plt.plot(x_axis,acc_th[2])
plt.plot(x_axis,acc_th[3])
plt.legend(['th=0.3','th=0.4','th=0.5','th=0.6'], loc='lower right')
plt.ylabel('accuracy')
plt.xlabel('learning rate')
plt.show()

In [51]:

kf = KFold(267,n_folds=10)
acc = 0
totcmat = np.zeros((2,2))
totacc = 0
totpre = 0
totrec = 0
error_flag = 1
weights_ = np.array([])
weights = [random.random() for x in range(22)]
s = sum(weights)
n_weights = [x/s for x in weights]
weights = n_weights

for train_index, test_index in kf:
    my_res = np.array([])
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    it = 0
    while it < 500:
            error_flag_,weights_ = percept(X_train,y_train,weights,learning_rate)
            #if it == 499:
                #print(weights)
            it += 1
    my_res = Predict(weights_,X_test,my_res)
    cmat = sk.metrics.confusion_matrix(y_test,my_res,[1,0])
    totacc += (cmat[0][0]+cmat[1][1])/np.sum(cmat)
    totcmat += cmat
    totpre += (cmat[0][0])/(cmat[0][0]+cmat[0][1])
    totrec += (cmat[0][0])/(cmat[0][0]+cmat[1][0])
    acc = acc + np.sum(my_res == y_test)/len(my_res)

In [52]:

avgacc  = totacc/10
avgcmat = totcmat/10
avgpre = totpre/10
avgrec = totrec/10

In [53]:

sk.metrics.confusion_matrix(y_test,my_res)
print(avgacc)
print(avgcmat)
print(avgpre)
print(avgrec)

0.568376068376
[[ 11.4   9.8]
 [  1.7   3.8]]
0.502874902875
0.808823529412

In [ ]: