#!/usr/bin/env python
# coding: utf-8

# # Linear Regression (선형회귀)

# - 회귀
#   - 회귀: '다시 본디 상태로 되돌아 온다'는 뜻
#   - 영국의 유전학자 갈톤 (F. Galton, 1822 ~ 1891)에 의해서 회귀라는 용어가 등장
#     - 그의 유전에 관한 논문 'Family Likeness in Stature (1886)'에서 처음 회귀에 대하여 정의함
#       - 일반적으로 키가 큰 부모에게서 키 큰 자녀가, 키가 작은 부모에게서 키 작은 자녀가 태어난다. 그렇지만 자녀들의 평균 키는 전체 인구의 평균 키로 회귀하는 경향이 있다. 
#       - 즉, 키가 큰 부모이든 작은 부모이든 그들에게서 태어난 자녀들의 평균 키는 전체 평균 키 수준에 접근하는 현상으로 나타난다
#       - 이러한 현상을 '보편적 회귀법칙 (law of universal regression)' 이라 한다.
#   - 보편적 회귀법칙은 피어슨 (K. Pearson, 1857 ~ 1936) 에 의해서 체계적인 회귀분석 이론으로 정립되었음 
#     - 피어슨은, 1,078 개 가족 구성원 자료를 수집하여 아버지 키와 아들 키 사이에 존재하는 회귀법칙을 규명하였다. 
#     - 피어슨의 회귀법칙 결론
#       - 아버지 키가 비교적 큰 집단에서 태어난 아들들의 평균 키는 그 아버지 평균 키보다는 작다.
#       - 아버지 키가 비교적 작은 집단에서 태어난 아들들의 평균 키는 그 아버지 평균 키보다 크다.
#       - 따라서 아들의 키는 전체 평균 키 수준을 향하여 회귀하는 현상을 나타냄.
# 
# <!-- more -->
# 
# - 변수의 종류
#   - 결과 변수 (Outcome variable): 반응 변수 (Response Variable), 종속 변수 (Dependent Variable), 분류명 (Classified Name)
#   - 예측 변수 (Predictor): 설명 변수 (Explanatory Variable), 독립 변수 (Independent Variable), 특징 (Feature)
#   
# <!-- more -->
# 
# - 회귀분석
#   - 한 종속변수가 하나 이상의 독립변수에 의해 어떠한 영향을 받고 또한 어떠한 관계로 나타나는지 분석하는 기법
#   - 결과 변수의 값을 다른 예측 변수들로 부터 설명하고 예측하는 것
# 
# <!-- more -->
# 
# - 회귀분석의 종류
#   - 선형회귀 (Linear Regression)
#     - 여러 개의 예측 변수와 하나의 결과 변수 사이에 선형 관계 (Linear Relationship)가 존재한다는 가정하에 그러한 변수 사이의 선형 구조를 *모형화* 하는 것
#     - **선형 관계 (Linear Relationship)**
#       - 예측 변수 $X1$에 대해서 결과 변수 $Y1$이 결정되고, 예측 변수 $X2$에 대해서는 결과 변수 $Y2$가 결정된다고 할 때, $aX1 + bX2$에 대해서는 결과 변수가 $aY1 + bY2$가 될 때 예측 변수와 결과 변수 사이에는 선형 관계가 있다고 말한다.
#   - 비선형회귀 (Nonlinear Regression)
#     - 종속변수와 독립변수 간의 임의적 관계로서 모형화
# 
# <!-- more -->
# 
# - 선형회귀분석 모형 (Hypothesis Function)
#   - 회귀 분석 알고리즘 및 훈련 데이터를 통하여 임의의 예측 함수 h를 생성
#   - 예측 함수 h를 hypothesis 라고 부름
#   - 예측 함수 h에 새로운 예측 변수를 넣으면 결과 변수를 출력함 (결과를 예측함)
#   
# <!-- more -->
# 
# - Hypothesis Function 정의
#   - $n$개의 속성($n$개의 예측 변수)을 지닌 훈련 데이터 $n$-벡터 $x^i=\{x_{1}, x_{2}, ..., x_{n}\}$가 총 $m$ (즉, $1 \le i \le m$)개 주어지고,
#   - 각 $x^i$ 벡터마다 연관된 실수 값 $y^i$ (결과 변수)이 주어질 때,
#   - 임의의 $n$-벡터 $x^i =\{x_1, x_2,...,x_n\}$에 대해 Hypothesis Function $h_{\theta}(x^i)$ 는 다음과 같이 정의된다.
#     $$h_{\theta}(x^i) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + \theta_n x_n$$
#     
#     - 위 식에서 $\theta=\{\theta_0, \theta_1, \theta_2, ..., \theta_n\}$는 계수 벡터(Coefficient Vector)이다.

# - 계수 벡터 $\theta$를 구하는 수학적 모델
#   - 주어진 통계적 수치(훈련 데이터)들에 대해 다음 비용 함수 (Cost Function) $J(\theta)$를 구한다. 
#   
#   $$J(\theta) = \dfrac{1}{2m} \sum_{i=1}^m \big( h_\theta(x^i) - y^i \big)^2$$
#   
#     - 비용 함수 $J(\theta)$를 Square Error Function 이라고도 칭함
#   - 선형회귀분석의 목적: 비용 함수 $J(\theta)$를 최소로 만드는 $\hat{\theta}$ 벡터를 찾기
#   
#   $$\newcommand{\argmin}{\arg\!\min} \hat{\theta} = \argmin_\theta J(\theta)$$

# - 비용 함수 $J(\theta)$를 최소로 만드는 $\hat{\theta}$ 벡터를 구하는 방법
#   - 대표적인 방법: Gradient Descent

# ## 1. Univariate Linear Regression (단일 변수 선형 회귀분석)

# - $m$개의 통계치 $x^i$ (즉, $1 \le i \le m$)와 이와 연관된 실수 값 $y^i$에 대하여 새로운 통계 수치 $x$와 연관된 실수 값 $y$를 예측하기 위해 다음과 같은 모형 (Hyporthesis Function)을 고려함
#     $$h_{\theta_0, \theta_1}(x^i) = \theta_0 + \theta_1 \cdot x^i$$    

# - 최적의 $h$ 함수를 위한 $\theta_0$와 $\theta_1$을 구하기 위하여 다음 비용 함수 $J(\theta_0, \theta_1)$를 고려
#     $$J(\theta_0, \theta_1) = \dfrac{1}{2m}\sum_{i=1}^m \big( \theta_0 + \theta_1\cdot x^i - y^i \big)^2$$

# ### 1) 필요한 모듈을 지역 이름 공간에 불러오기

# In[1]:


from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
get_ipython().run_line_magic('matplotlib', 'inline')


# ### 2) 통계 수치 데이터 정의 및 Pandas를 활용한 데이터프레임 정의
# - 최고 기온과 에어콘 판매 대수 데이터 (다소 극단적인 예) 

# In[2]:


data = {
    'Temperature': [26, 27, 28, 29, 30, 31, 32, 33],
    'Number of Sells': [270, 280, 290, 300, 310, 320, 330, 340]
}
df = pd.DataFrame(data)


# In[3]:


df


# In[4]:


df['Temperature'].values


# In[5]:


df['Number of Sells'].values


# ### 3) 통계 수치에 대한 Scatter Plot 생성

# In[6]:


df.plot(kind="scatter", x="Temperature", y="Number of Sells")


# ### 4) scipy 모듈을 활용한 선형 회귀 분석

# In[7]:


slope, intercept, r_value, p_value, std_err = stats.linregress(df['Temperature'].values, df['Number of Sells'].values)


# In[8]:


format = "%40s: %12.10f"
print format % ("slope", slope)
print format % ("intercept", intercept)
print format % ("r_value (Correlation Coefficient)", r_value)
print format % ("r-squared (Coefficient of Determination)", r_value**2)
print format % ("p_value (Hyperthesis Testing)", p_value)
print format % ("std_err (Standard Error)", std_err)


# ### 5) 선형 회귀식 및 예측

# - 회귀식: $y = intercept + slope \times x$
#   - 위 예제의 회귀식: $y = 10.0 + 10.0 \times x$
# 
# - 질문: 온도가 34일 때 예상 에어콘 판매량은? $10 + 10 \times 34 = 350$

# ### 6) 상관 계수 (correlation coefficient)
# - r-value (Pearson correlation coefficient): 
#   $$ r-value = \frac{cov(x_i, y_i)}{\sigma_{x_i} \sigma_{y_i}}$$
#   
#   where 
#   - $cov(x_i,y_i)$ is the covariance of $x_i$ and $y_i$
#     - corvariance: https://goo.gl/ryS1dO
#   - $\sigma_{x_i}$ and $\sigma_{y_i}$ are the standard deviation of $x_i$ and $y_i$, respectively.
# - r-value의 의미
#   - $x_i$ 값들과 $y_i$ 값들 사이의 관계
#     - 1: 강한 양의 상관 관계
#     - 0: 아무 관계 없음
#     - -1: 강한 음의 상관 관계
#   - 참고: https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient

# - [참고] Covariance 구하는 Python 코딩

# In[9]:


def cov(a, b):

    if len(a) != len(b):
        return

    a_mean = np.mean(a)
    b_mean = np.mean(b)

    sum = 0

    for i in range(0, len(a)):
        sum += ((a[i] - a_mean) * (b[i] - b_mean))

    return sum/(len(a) - 1)


# - 위 코드에서 마지막 라인에 1을 빼는 이유
#   - 자유도 (Degree of Feeedom): http://math7.tistory.com/113

# - numpy를 활용한 r-value (상관 계수) 구하기

# In[10]:


a = np.cov(df['Temperature'].values, df['Number of Sells'].values, ddof = 1)[0][1]
print a
b = np.std(df['Temperature'].values, ddof = 1)
print b
c = np.std(df['Number of Sells'].values, ddof = 1)
print c
print a / (b * c)


# - numpy를 이용한 r-value (상관 계수) 구하기

# In[11]:


np.corrcoef(df['Temperature'].values, df['Number of Sells'].values, ddof = 1)[0][1]


# ### 7) 결정 계수 (Coefficient of Determination)
# - r-squared: 추정된 회귀선이 표본 자료에 적합한가를 측정하는 결정 계수
# - 0 에서 1 사이에 값을 갖는다
#   - r-squared이 0에 가까울 때: 두 변수 X와 Y 사이에는 관계가 거의 없으며 회귀분석을 적용함에 적합치 않다는 의미
#   - r-squared이 1에 가까울 때: 두 변수 X와 Y 사이에는 매우 강한 관계가 있으며 회귀분석 적용이 적합함을 의미
#     - r-squared가 1이라는 것은 두 변수 X와 Y가 확정적(Deterministic) 관계이며 그 관계성에 오차가 없으므로 회귀분석을 적용할 이유가 없다.

# ### 8) 가설 검증 (Hyperthesis Testing)과 p-value (유의 확률)
# - 다음의 두 가지 가설을 세울 수 있음
#   - **Null Hyperthesis** (귀무 가설): 예측 변수(Predictor)와 결과 변수(Outcome Variable)사이에 아무런 관계가 없다.
#   - **Alternative Hypothesis** (대립 가설): 예측 변수(Predictor)와 결과 변수(Outcome Variable)사이에 (선형)관계가 존재한다.
# - p-value (유의 확률): **Null Hyperthesis**가 만족할 확률
# - 통계학적으로, **p-value < 0.05** 이면 **Null Hyperthesis**가 만족할 확률은 거의 없다고 간주하고 **Alternative Hyperthesis**를 받아들인다.
#   - 참고: http://www.editage.co.kr/edm-nw-v6.html
# 
# 

# ### 9) 표준 오차 (Standard Error)

# - 표준 오차 (std_err): 통계치 ($x_i$, $y_i$)들과 선형 회귀 모델사이의 평균 거리
# - 표준 오차가 적을 수록 그러한 통계치가 선형 회귀 모델식에 가깝게 분포

# ## 2. Simple Linear Regression - II (실제 통계 데이터를 활용한 선형 회귀분석)

# ### 1) 데이터 설명
# - URL
#   - https://raw.githubusercontent.com/bluebibi/LINK_ML_BIG_DATA/master/death_rate.csv
# - D: 사망률
# - A1 ~ A15: 사망률에 영향이 있을 것 같은 각종 요인 데이터
# - References
#   - Richard Gunst and Robert Mason, Regression Analysis and Its Applications: a data-oriented approach, Dekker, 1980, pages 370-371
#   - Gary McDonald and Richard Schwing, Instabilities of regression estimates relating air pollution to mortality, Technometrics, Volume 15, Number 3, pages 463-482, 1973.
#   - Helmut Spaeth, Mathematical Algorithms for Linear Regression, Academic Press, 1991, ISBN 0-12-656460-4.
#     
# - The death rate is to be represented as a function of other variables.
# - There are 60 rows of data. The data includes:
#   - I: the index;
#   - A1: the average annual precipitation (강수량);
#   - A2: the average January temperature;
#   - A3: the average July temperature;
#   - A4: the size of the population older than 65;
#   - A5: the number of members per household;
#   - A6: the number of years of schooling for persons over 22;
#   - A7: the number of households with fully equipped kitchens;
#   - A8: the population per square mile; 
#   - A9: the size of the nonwhite population;
#   - A10: the number of office workers;
#   - A11: the number of families with an income less than $3000;
#   - A12: the hydrocarbon pollution index;
#   - A13: the nitric oxide pollution index;
#   - A14: the sulfur dioxide pollution index;
#   - A15: the degree of atmospheric moisture.
#   - D: the death rate.

# ### 2) Pandas를 활용한 데이터프레임 정의

# In[7]:


import urllib2
import json

path = 'https://raw.githubusercontent.com/bluebibi/LINK_ML_BIG_DATA/master/death_rate.csv'
raw_csv = urllib2.urlopen(path)
df = pd.read_csv(raw_csv)


# In[8]:


df.head()


# ### 3) 회귀 분석을 위한 몇가지 요인 선택

# In[25]:


print df['A1'].values
print df['D'].values


# In[32]:


corr_dic = {}
for i in range(1,16):
    corr_dic[i] = np.corrcoef(df['A' + str(i)].values, df['D'].values, ddof = 1)[0][1]
print corr_dic

print 

sorted_corr_dic = sorted(corr_dic.items(), key=lambda x: x[1], reverse=True)
print sorted_corr_dic


# - 선택한 요인
#   - A9: the size of the nonwhite population
#   - A1: the average annual precipitation (강수량)
#   - A6: the number of years of schooling for persons over 22

# In[33]:


df_sub = df[['A9','A1','A6', 'D']]


# In[34]:


df_sub.head()


# ### 4) 통계 수치에 대한 Scatter Plot 생성

# In[35]:


fig = plt.figure(figsize=(17, 6))
ax1 = fig.add_subplot(131)
ax1.scatter(df_sub['A9'], df_sub['D'])
ax1.set_title("Data Rate vs. Size of the nonwhite population")

ax2 = fig.add_subplot(132)
ax2.scatter(df_sub['A1'], df_sub['D'])
ax2.set_title("Data Rate vs. Average annual precipitation")

ax3 = fig.add_subplot(133)
ax3.scatter(df_sub['A6'], df_sub['D'])
ax3.set_title("Data Rate vs. Number of years of schooling for persons over 22")


# ### 5) scipy 모듈을 활용한 선형 회귀 분석

# In[36]:


slope, intercept, r_value, p_value, std_err = stats.linregress(df_sub['A9'].values, df_sub['D'].values)


# In[37]:


format = "%40s: %12.10f"
print format % ("slope", slope)
print format % ("intercept", intercept)
print format % ("r_value (Correlation Coefficient)", r_value)
print format % ("r-squared (Coefficient of Determination)", r_value**2)
print format % ("p_value (Hyperthesis Testing)", p_value)
print format % ("std_err (Standard Error)", std_err)


# - 모든 Predicator 변수들에 대한 분석

# In[39]:


predicator_analysis = {}
for i in range(1, 16):
    predicator_analysis[i] = Series(np.empty(6), index=['slope', 'intercept', 'r_value', 'r_squared', 'p_value', 'std_err'])
    predicator_analysis[i][0],\
    predicator_analysis[i][1],\
    predicator_analysis[i][2],\
    predicator_analysis[i][4],\
    predicator_analysis[i][5] = stats.linregress(df['A' + str(i)].values, df['D'].values)
    predicator_analysis[i][3] = predicator_analysis[i][2] ** 2


format1 = "%3s %15s %15s %15s %15s %15s %15s"
format2 = "%3d %15f %15f %15f %15f %15f %15f"
print format1 % ('No.', 'slope', 'intercept', 'r_value', 'r_squared', 'p_value', 'std_err')
for i in range(1, 16):
    lst = [i]
    for j in range(6):
        lst.append(predicator_analysis[i][j])
    print format2 % tuple(lst)


# ### 6) 선형 회귀식 및 예측

#  - 연간 강수량 (A1)과 사망률간의 관계: $y = 0.82 + 0.003 * x$ 
#    - 강수량이 70일 때 사망률은? $0.82 + 0.003 * 70 = 1.03$

# ### 7) 선형 회귀식과 Scatter Plot을 함께 생성

# In[43]:


fig = plt.figure(figsize=(17, 6))
ax1 = fig.add_subplot(131)
ax1.scatter(df_sub['A9'], df_sub['D'])
line_plot_x1 = np.linspace(df_sub['A9'].min(), df_sub['A9'].max(), 10)
slope, intercept, r_value, p_value, std_err = stats.linregress(df_sub['A9'].values, df_sub['D'].values)
ax1.plot(line_plot_x1, intercept + slope * line_plot_x1)

ax2 = fig.add_subplot(132)
ax2.scatter(df_sub['A1'], df_sub['D'])
line_plot_x2 = np.linspace(df_sub['A1'].min(), df_sub['A1'].max(), 10)
slope, intercept, r_value, p_value, std_err = stats.linregress(df_sub['A1'].values, df_sub['D'].values)
ax2.plot(line_plot_x2, intercept + slope * line_plot_x2)

ax3 = fig.add_subplot(133)
ax3.scatter(df_sub['A6'], df_sub['D'])
line_plot_x3 = np.linspace(df_sub['A6'].min(), df_sub['A6'].max(), 10)
slope, intercept, r_value, p_value, std_err = stats.linregress(df_sub['A6'].values, df_sub['D'].values)
ax3.plot(line_plot_x3, intercept + slope * line_plot_x3)


# ## 3. Multivariate Linear Regression (다변수 선형 회귀분석) 
# ### - sklearn 모듈을 활용한 다중 변수 선형 회귀

# In[47]:


from sklearn import linear_model
regr = linear_model.LinearRegression()


# ### 1) A9과 A1에 대한 사망률 변화
#  - A9: the size of the nonwhite population
#  - A1: the average annual precipitation (강수량)

# In[130]:


df[['A9', 'A1']].head()


# In[131]:


X = zip(df['A9'], df['A1'])
print X
y = df['D'].values


# In[132]:


regr = regr.fit(X, y)

print 'Coefficients:', regr.coef_
print 'Intercept:', regr.intercept_


# ### 2) 선형 회귀식 및 예측

# - 선형 회귀식: $y = 0.8280 + 0.0036 * A9 + 0.0018 * A1$ 

# In[133]:


test_x = [36, 12]
print regr.predict(test_x)
print 0.8280 + 0.0036 * test_x[0] + 0.0018 * test_x[1] 


# ### 3) Scatter Plot 및 선형 회귀 Plane 생성

# In[134]:


# Plot outputs
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(17, 7))
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(df['A9'], df['A1'], df_sub['D'])

ax2 = fig.add_subplot(122, projection='3d')
ax2.scatter(df['A9'], df['A1'], df_sub['D'])
# create x,y
xx, yy = np.meshgrid(range(int(df['A9'].min()), int(df['A9'].max())), range(int(df['A1'].min()), int(df['A1'].max())))
# calculate corresponding z
z = 0.8280 + 0.0036 * xx + 0.0018 * yy
ax2.plot_surface(xx, yy, z, rstride=1, cstride=1, linewidth=0, color="yellow", shade=False)


# ## 4. Tensorflow를 활용한 회귀분석

# In[135]:


import tensorflow as tf
import numpy as np

# Numpy 랜덤으로 100개의 가짜 데이터 채우기.
x_data = np.float32(np.random.rand(2, 100))
# 학습 레이블(목표값)은 아래의 식으로 산출. (W = [0.1, 0.2], b = 0.3)
y_data = np.dot([0.100, 0.200], x_data) + 0.300
print type(x_data), x_data.shape
print type(y_data), y_data.shape


# In[145]:


import tensorflow as tf
import numpy as np

x_data = df[['A9', 'A1']].T
y_data = df['D']
x_data = x_data.as_matrix().astype('float32')
y_data = y_data.as_matrix().astype('float32')
print type(x_data), x_data.shape
print type(y_data), y_data.shape


# - 입력 데이터와 W, b를 사용해 선형 모델 정의

# In[146]:


# b는 0,
b = tf.Variable(tf.zeros([1]))
# W는 1x2 형태의 웨이트 변수
W = tf.Variable(tf.zeros([1, 2]))
y = tf.matmul(W, x_data) + b
print W.get_shape()
print y.get_shape()


# - 손실과 학습 함수 정의 --> 평균 제곱 오차가 최소화 되는 지점을 경사하강법으로 구함

# In[155]:


# 손실 함수 정의
loss = tf.reduce_mean(tf.square(y - y_data))
# 경사하강법으로 손실 함수를 최소화 (0.0005는 학습 비율)
optimizer = tf.train.GradientDescentOptimizer(0.0005)
# 학습 오퍼레이션 정의
train = optimizer.minimize(loss)


# - 학습 세션 시작
# Coefficients: [ 0.00364445  0.00183603]
# Intercept: 0.827988572515

# In[162]:


# 모든 변수를 초기화.
init = tf.initialize_all_variables()

# 세션 시작
sess = tf.Session()
sess.run(init)

# 200000번 학습.
for step in xrange(0, 200001):
    sess.run(train)
    if step % 10000 == 0:
        print step, sess.run(W), sess.run(b)


# ## 5. Refererence

# - http://scikit-learn.org/stable/modules/linear_model.html
# - http://radimrehurek.com/data_science_python/
# - http://nbviewer.ipython.org/github/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb
# - https://gist.github.com/haje01/202ac276bace4b25dd3f