#!/usr/bin/env python
# coding: utf-8

# # Logistic Regression (로지스틱 회귀분석) 분류 기법

# ## 1. 로지스틱 회귀분석의 이해

# ### 1) 사전 지식
#   - logistic function (= sigmoid function)
#   - $g(z) = \dfrac{1}{1 + e^{-z}}$

# In[1]:


import math 
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
get_ipython().run_line_magic('matplotlib', 'inline')

def logistic(z):
    return 1.0 / (1.0 + np.power(math.e, -1.0 * z))

fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
ax1.grid(True)
xx = np.linspace(-5, 5, 100)
ax1.plot(xx,  logistic(xx))


#   - 즉, logistic 함수에 입력으로 들어가는 $z$ 변수에 대해 다음과 같은 성질이 있다.
#     - $g(z) >= 0.5$ if $z >= 0$ ($z$가 양수)
#     - $g(z) < 0.5$ if $z < 0$ ($z$가 음수)  

# ### 2) 로지스틱 회귀분석을 이해하기 위한 간단한 예제

# - 예제 (Age and coronary heart disease (CHD))
# ![CHD Data](./figures/chd.png)

# In[2]:


age = np.array([22, 23, 24, 27, 28, 30, 30, 32, 33, 35, 38, 40, 41, 46, 47, 48, 49,\
                49, 50, 51, 51, 52, 54, 55, 58, 60, 60, 62, 65, 67, 71, 77, 81])
chd = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1])
df = pd.DataFrame({'age': age, 'chd': chd})
df


# In[3]:


df.plot(kind='scatter', x='age', y='chd', figsize=(12, 8));


# - 위 그림에서...
#  - chd = 1은 심장병이 있다는 것이고 앞으로 y = 1 (양성 반응) 이라고 표기한다.
#  - chd = 0은 심장병이 없다는 것이고 앞으로 y = 0 (음성 반응) 이라고 표기한다.

# - 로지스틱 회귀 분석을 이해하기 위하여, 우선 위 데이터에 대하여 단순하게 ***회귀 분석 (Linear Regression)***을 수행하여 산출한 h 함수 (hyperthesis)를 plotting 

# In[4]:


slope, intercept, r_value, p_value, std_err = stats.linregress(age, chd)
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
ax1.scatter(age, chd)
xx = np.linspace(age.min(), age.max(), 2)
ax1.plot(xx, intercept + slope * xx)


# - 복습: 회귀 분석 (Linear Regression) 모형
#   - $n$개의 속성($n$개의 예측 변수)을 지닌 훈련 데이터 $n$-벡터 $x^i=\{x_{1}, x_{2}, ..., x_{n}\}$가 총 $m$ (즉, $1 \le i \le m$)개 주어지고,
#   - 각 $x^i$ 벡터마다 연관된 실수 값 $y^i$ (결과 변수)이 주어질 때,
#   - 임의의 $n$-벡터 $x^i =\{x_1, x_2,...,x_n\}$에 대해 Hypothesis Function $h_{\theta}(x^i)$ 는 다음과 같이 정의된다.
#   $$ h_{\theta}(x^i) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + \theta_n x_n$$
#     - 위 식에서 $\theta=\{\theta_0, \theta_1, \theta_2, ..., \theta_n\}$는 계수 벡터(Coefficient Vector)이다. 
#   - 위 모형을 좀 더 간단히 표현하면 아래와 같다.
#     - 항상 $x_0 = 1$일 때, 임의의 $(n+1)$-벡터 $x^i =\{x_0, x_1, x_2,...,x_n\}$에 대해
#     - $h_\theta(x^i) = \theta^T \cdot x^i$

# ### 3) 로지스틱 회귀 분석

# - 로지스틱 회귀 분석 (Logistic Regression) 모형
#   - ***logistic 함수와 기존 회귀 분석 모형을 합성한 함수 형태***
#   - $n$개의 속성($n$개의 예측 변수)을 지닌 훈련 데이터 및 $x_{0}=1$를 지닌 $(n+1)$-벡터 $x^ii=\{x_{0}, x_{1}, x_{2}, ..., x_{n}\}$가 총 $m$ (즉, $1 \le i \le m$)개 주어지고,
#   - 각 $X^i$ 벡터마다 연관된 분류 표기 값 $y^i$ ($y_i \in \{0, 1\}$)이 주어질 때,
#   - 임의의 $(n+1)$-벡터 $x^i =\{1, x_1, x_2,...,x_n\}$에 대해 Hypothesis Function $h_{\theta}^L(x^i)$ 는 다음과 같이 정의된다.
#   
#      $$h_\theta^L(x^i) = g(h_\theta(x^i))= \dfrac{1}{1 + e ^ {-h_\theta(x^i)}} = \dfrac{1}{1 + e ^ {-\theta^T \cdot x^i}}$$ 
# 
#    - 위 식에서 $\theta=\{\theta_0, \theta_1, \theta_2, ..., \theta_n\}$는 계수 벡터(Coefficient Vector)이다.

# - ***로지스틱 회귀 분석 모형($h_\theta^L(X)$)의 해석***
# 
#   - 계수 백터  $\theta$를 구했다고 가정할 때, 임의의 입력 $x$에 대해 $y=1$ (분류 결과가 1)이 되는 추정 확률
#   - 즉, $h_\theta^L(x) = P(y = 1 | x; \theta)$
#   - $h_\theta^L(x) = 0.75$의 의미
#     - 입력 $x$에 대해 심장병 (chd)이 존재할 확률이 0.75이다.

# - ***로지스틱 회귀 분석 모형($h_\theta^L(x)$)을 활용한 분류 결정 (Classification Decision)***
#   - $h_\theta^L(x) = g(h_\theta(x)) >= 0.5$ if $h_\theta(x) =  \theta^T \cdot x >= 0$
#     - 즉, $\theta^T \cdot x >= 0$이면 양성으로 분류하여 $y=1$로 분류한다.
#   - $h_\theta^L(x) = g(h_\theta(x)) < 0.5$ if $h_\theta(x) =  \theta^T \cdot x < 0$
#     - 즉, $\theta^T \cdot x < 0$이면 음성으로 분류하여 $y=0$로 분류한다.

# - 로지스틱 회귀 분석에서 계수 벡터 $\theta$를 구하는 수학적 모델
#   - 주어진 통계적 수치(훈련 데이터)들에 대해 다음 비용 함수 (Cost Function) $J^L(\theta)$를 구한다. 
#   
#      $$J^L(\theta) = \dfrac{1}{m} \sum_{i = 1}^m \big( h_\theta^L(x_i) - y_i \big)^2 = \dfrac{1}{m} \sum_{i=1}^m \big( \dfrac{1}{1 + e^{-h_\theta(x_i)}} - y_i \big)^2$$
#   
#     - 위 식에서 
#       - $y_i \in \{0, 1\}$
#   - 비용 함수 $J^L(\theta)$를 최소로 만드는 $\hat \theta$ 벡터가 로지스틱 회귀 분석에서 찾으려고 하는 것임
#   
#      $$\hat \theta = \newcommand{\argmin}{\arg\!\min}  \argmin_\theta J^L(\theta)$$

# - 비용 함수 $J^L(\theta)$를 최소로 만드는 $\theta$ 벡터를 구하는 방법
#   - Gradient Descent
#   - Stochastic Gradient Descent
#   - [참고]: http://www.cs.rpi.edu/~magdon/courses/LFD-Slides/SlidesLect09.pdf

# ## 2. Univariate Logistic Regression (단일 변수 로지스틱 회귀분석)

# - $m$개의 통계치 $x^i$ (즉, $1 \le i \le m$)와 이와 연관된 실수 값 $y^i$에 대하여
# - 새로운 통계 수치 $x$와 연관된 실수 값 $y$를 예측하기 위해 다음과 같은 모형을 고려함
#     $$$$ $$h_{\theta_0, \theta_1}(x^i) = \theta_0 + \theta_1 \cdot x^i$$ $$$$
#     $$$$ $$h_{\theta_0, \theta_1}^L(x^i) = g(h_{\theta_0, \theta_1}(x^i))= \dfrac{1}{1 + e ^ {-h_{\theta_0, \theta_1}(x^i)}} = \dfrac{1}{1 + e ^ {-(\theta_0 + \theta_1 \cdot x^i)}}$$ $$$$
# 
# - 최적의 h 함수를 위한 $\theta_0$와 $\theta_1$을 구하기 위하여 다음 비용 함수 $J(\theta_0, \theta_1)$를 최소로 만드는 $\theta_0, \theta_1$ 벡터를 구한다.
#     $$$$ $$J^L(\theta_0, \theta_1) = \dfrac{1}{m} \sum_{i = 1}^m \big( h_{\theta_0, \theta_1}^L(x^i) - y_i \big)^2 = \dfrac{1}{m} \sum_{i=1}^m \big( \dfrac{1}{1 + e^{-(\theta_0 + \theta_1 \cdot x^i)}} - y^i \big)^2$$ $$$$
#   - 위 식에서 
#     - $y_i \in \{0, 1\}$
#     - $\theta_0$는 intercept, $\theta_1$은 coefficient 라고 부른다.

# ### - sklearn 모듈 활용

# In[13]:


from sklearn import linear_model
regr = linear_model.LogisticRegression()

age_ = []
for i in age:
    age_.append([i])

print age_
print chd

regr = regr.fit(age_, chd)

print 'Coefficients:', regr.coef_
print 'Intercept:', regr.intercept_


# In[14]:


def h_theta(x):
    return regr.intercept_[0] + regr.coef_[0][0] * x


# In[15]:


fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
ax1.scatter(age, chd)
xx = np.linspace(age.min(), age.max(), 1000)
ax1.plot(xx, logistic(h_theta(xx)))


# - $h_\theta^L(x) = P(y = 1 | x; \theta) = 0.5$인 데이터 x 찾기
#   - 즉, $intercept + coef \times x = 0$인 데이터 x 찾기
#   - $x = - \dfrac{intercept}{coef}$

# In[16]:


print -1.0 * regr.intercept_[0] / regr.coef_[0][0]


# - $h_\theta^L(x) = P(y = 1 | x; \theta) = 0.5$을 직접 고려하여 x 찾기

# In[18]:


xx = np.linspace(age.min(), age.max(), 1000)
for x in xx:
    if abs(logistic(h_theta(x)) - 0.5000) < 0.0002:
        print x


# - 즉, 나이가 49.16 세부터는 심장병이 존재할 확률이 0.5 이상이며 "심장병 존재 가능자"로 분류할 수 있다.

# - 나이가 50세, 60세, 70세, 80세일 때 심장병이 존재할 확률은?

# In[19]:


print logistic(h_theta(50))
print logistic(h_theta(60))
print logistic(h_theta(70))
print logistic(h_theta(80))


# ### - 로지스틱 분석 이해를 위한 데이터 변형

# - 데이터를 조금 변경해보자.
#   - 변경된 데이터에서는 40세 부터 100% 심장병이 존재한다.

# In[20]:


age = np.array([22, 23, 24, 27, 28, 30, 30, 32, 33, 35, 38, 40, 41, 46, 47, 48, 49,\
                49, 50, 51, 51, 52, 54, 55, 58, 60, 60, 62, 65, 67, 71, 77, 81])
chd2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
df2 = pd.DataFrame({'age': age, 'chd': chd2})
regr2 = linear_model.LogisticRegression()

age_ = []
for i in age:
    age_.append((i,))
regr2 = regr2.fit(age_, chd2)

print 'Coefficients:', regr2.coef_
print 'Intercept:', regr2.intercept_


# In[22]:


def h_theta(x):
    return regr2.intercept_[0] + regr2.coef_[0][0] * x

fig2 = plt.figure(figsize=(12, 8))
ax2 = fig2.add_subplot(111)
ax2.scatter(age, chd2)
xx2 = np.linspace(age.min(), age.max(), 1000)
ax2.plot(xx2, logistic(h_theta(xx2)))


# In[24]:


print -1.0 * regr2.intercept_[0] / regr2.coef_[0][0]
xx2 = np.linspace(age.min(), age.max(), 1000)
for x in xx2:
    if abs(logistic(h_theta(x)) - 0.5000) < 0.0002:
        print x


# - 나이가 31.65 세부터는 심장병이 존재할 확률이 0.5 이상이며 "심장병 존재 가능자"로 분류할 수 있다.

# - 나이가 50세, 60세, 70세, 80세일 때 심장병이 존재할 확률은?

# In[25]:


print logistic(h_theta(50))
print logistic(h_theta(60))
print logistic(h_theta(70))
print logistic(h_theta(80))


# ## 3.  Multivariate Logistic Regression (다변수 로지스틱 회귀분석)

# ### 1) Mushroom Data Set 로드 및 scikit을 활용하기 위한 데이터 가공¶

# - Data Set 로드
#   - 21개의 Features (예측 변수)
#   - 2개의 분류 (타겟 변수, outcomes)

# In[26]:


import urllib2
path = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/mushroom/agaricus-lepiota.data'
raw_csv = urllib2.urlopen(path)
col_names = range(23)
df = pd.read_csv(raw_csv, names = col_names)


# - 수치형 데이터로 변경

# In[27]:


map_dic = {}
num_columns = df.shape[1]
for i in range(num_columns):
    unique_array = df[i].unique()
    map_dic_sub = {}
    for j in range(len(unique_array)):
        map_dic_sub[unique_array[j]] = j
    df[i] = df[i].map(map_dic_sub)
df


# - 예측 변수와 타겟 변수의 분리

# In[28]:


attributes = df.iloc[:, 1:22]
mushroom_data = attributes.values
mushroom_data


# In[29]:


target_series = df.iloc[:, 0]
mushroom_target = target_series.values
mushroom_target


# - train data와 test data의 분리 샘플 코드

# In[48]:


from sklearn.cross_validation import train_test_split

data, labels = np.arange(10).reshape((5, 2)), range(5)
print data
print labels

print 

data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.20)
print data_train, labels_train 
print data_test, labels_test


# - mushroom 데이터에 대한 train data와 test data의 분리

# In[50]:


data_train, data_test, labels_train, labels_test = train_test_split(mushroom_data, mushroom_target, test_size=0.20)
print len(data_train), len(labels_train) 
print len(data_test), len(labels_test)


# ### 2) scikit을 활용한 로지스틱 회귀분석¶

# In[51]:


regr3 = linear_model.LogisticRegression()
regr3.fit(data_train, labels_train)


# In[52]:


print 'Coefficients:', regr3.coef_
print 'Intercept:', regr3.intercept_


# In[53]:


print data_test[0]
print data_test[0].reshape(1, -1)


# In[57]:


print data_test[0], ":", labels_test[0]
print regr3.predict(data_test[0].reshape(1,-1))[0]


# In[58]:


print data_test[1], ":", labels_test[1]
print regr3.predict(data_test[1].reshape(1,-1))[0]


# - 로지스틱 회귀분석에 의한 분류 정확도

# In[63]:


predicted = []
for i in range(0, len(data_test)):
    predicted.append(regr3.predict(data_test[i].reshape(1,-1))[0] == labels_test[i])

total = len(predicted)
numTrue = 0
for i in range(0, total):
    if predicted[i]:
        numTrue = numTrue + 1
print float(numTrue) / total        


# ## 4. Iris Data Set 로드 및 scikit 활용 (Multiclass Classification)
# - [reference] Stanford's machine learning course presented by Professor Andrew Ng
# - 4개의 Features (예측 변수)
# - ***3개의 분류 (타겟 변수, outcomes)***

# ![Multiclass](./figures/multi-class-logistic-regression.png)

# - 훈련 데이터를 다음과 같이 3개의 Binary Logistic Regression 분류 문제로 변경한다. 
#   - Triangle (1) vs crosses and squares (0) 
#     - $h_\theta^{L1}(X) = P(y=1 | X; \theta)$
#   - Crosses (1) vs triangle and square (0) 
#     - $h_\theta^{L2}(X) = P(y=1 | X; \theta)$
#   - Square (1) vs crosses and square (0) 
#     - $h_\theta^{L2}(X) = P(y=1 | X; \theta)$    
#     
# <!-- -->    
# - 분류하려는 타킷 변수가 $k$개 이면 총 $k$개의 Binary Logistic Regression 분류 문제로 변경하여 해결한다.

# ![Multiclass](./figures/multi-class-logistic-regression2.png)

# In[64]:


from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()


# In[65]:


iris.keys()


# - 아래와 같이 분류 종류가 3가지이다.

# In[66]:


iris.target_names


# In[67]:


iris.feature_names


# In[68]:


print len(iris.data), len(iris.target)


# In[69]:


iris.data[0:5]


# In[70]:


iris.target[0:5]


# In[71]:


iris.data[50:55]


# In[72]:


iris.target[50:55]


# In[57]:


regr5 = linear_model.LogisticRegression()
regr5.fit(iris.data[:, :2], iris.target)


# In[58]:


print 'Coefficients:', regr5.coef_
print 'Intercept:', regr5.intercept_


# - 두 개의 속성에 대해서만 Logistic Regression 수행
#   - http://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html

# In[80]:


iris = load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

h = .02  # step size in the mesh

regr6 = linear_model.LogisticRegression()

# we create an instance of Neighbours Classifier and fit the data.
regr6.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
x_array = np.arange(x_min - .5, x_max + .5, h)
y_array = np.arange(y_min - .5, y_max + .5, h)
xx, yy = np.meshgrid(x_array, y_array)
Z = regr6.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
ax.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')