import math
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
%matplotlib inline
def logistic(z):
    return 1.0 / (1.0 + np.exp(-z))  # g(z) = 1 / (1 + e^{-z})
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
ax1.grid(True)
xx = np.linspace(-5, 5, 100)
ax1.plot(xx, logistic(xx));
age = np.array([22, 23, 24, 27, 28, 30, 30, 32, 33, 35, 38, 40, 41, 46, 47, 48, 49,\
49, 50, 51, 51, 52, 54, 55, 58, 60, 60, 62, 65, 67, 71, 77, 81])
chd = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1])
df = pd.DataFrame({'age': age, 'chd': chd})
df
 | age | chd
---|---|---
0 | 22 | 0 |
1 | 23 | 0 |
2 | 24 | 0 |
3 | 27 | 0 |
4 | 28 | 0 |
5 | 30 | 0 |
6 | 30 | 0 |
7 | 32 | 0 |
8 | 33 | 0 |
9 | 35 | 1 |
10 | 38 | 0 |
11 | 40 | 0 |
12 | 41 | 1 |
13 | 46 | 0 |
14 | 47 | 0 |
15 | 48 | 0 |
16 | 49 | 1 |
17 | 49 | 0 |
18 | 50 | 1 |
19 | 51 | 0 |
20 | 51 | 1 |
21 | 52 | 0 |
22 | 54 | 0 |
23 | 55 | 1 |
24 | 58 | 1 |
25 | 60 | 1 |
26 | 60 | 0 |
27 | 62 | 1 |
28 | 65 | 1 |
29 | 67 | 1 |
30 | 71 | 1 |
31 | 77 | 1 |
32 | 81 | 1 |
df.plot(kind='scatter', x='age', y='chd', figsize=(12, 8));
slope, intercept, r_value, p_value, std_err = stats.linregress(age, chd)
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
ax1.scatter(age, chd)
xx = np.linspace(age.min(), age.max(), 2)
ax1.plot(xx, intercept + slope * xx);
The straight-line fit above is a poor model for a binary outcome: its predictions are unbounded and fall below 0 or above 1 outside a narrow age range. The logistic model introduced next squashes the linear predictor into $(0, 1)$.
The Logistic Regression Model
*A function formed by composing the logistic function with the ordinary regression model*
Suppose we are given $m$ training examples (i.e., $1 \le i \le m$), each an $(n+1)$-vector $x^i = \{x_0, x_1, x_2, ..., x_n\}$ holding $n$ attributes ($n$ predictor variables) together with $x_0 = 1$,
and for each vector $x^i$ an associated class label $y^i$ ($y^i \in \{0, 1\}$).
For an arbitrary $(n+1)$-vector $x^i = \{1, x_1, x_2, ..., x_n\}$, the hypothesis function $h_{\theta}^L(x^i)$ is defined as follows.
$$h_\theta^L(x^i) = g(h_\theta(x^i))= \dfrac{1}{1 + e ^ {-h_\theta(x^i)}} = \dfrac{1}{1 + e ^ {-\theta^T \cdot x^i}}$$
In this formula, $\theta = \{\theta_0, \theta_1, \theta_2, ..., \theta_n\}$ is the coefficient vector.
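A minimal NumPy sketch of this hypothesis function, assuming `theta` and `x` are length-$(n+1)$ arrays with `x[0] = 1` (the name `h_theta_L` and the sample values are illustrative, not from the original):
def h_theta_L(theta, x):
    z = np.dot(theta, x)             # theta^T . x
    return 1.0 / (1.0 + np.exp(-z))  # g(z), the logistic function

print(h_theta_L(np.array([0.0, 0.0]), np.array([1.0, 3.0])))  # 0.5, since z = 0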
*Interpreting the logistic regression model $h_\theta^L(x)$*
The value $h_\theta^L(x)$ always lies in $(0, 1)$ and is interpreted as the estimated probability that $y = 1$ for the input $x$.
The mathematical model for finding the coefficient vector $\theta$ in logistic regression
For the given observations (the training data), form the following cost function $J^L(\theta)$.
$$J^L(\theta) = \dfrac{1}{m} \sum_{i = 1}^m \big( h_\theta^L(x^i) - y^i \big)^2 = \dfrac{1}{m} \sum_{i=1}^m \Big( \dfrac{1}{1 + e^{-h_\theta(x^i)}} - y^i \Big)^2$$
The vector $\hat \theta$ that minimizes the cost function $J^L(\theta)$ is what logistic regression seeks:
$$\hat \theta = \newcommand{\argmin}{\arg\!\min} \argmin_\theta J^L(\theta)$$
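A direct NumPy sketch of this cost, assuming `X` is an $m \times (n+1)$ matrix whose first column is all ones and `y` is a length-$m$ 0/1 vector (the name `cost_L` is illustrative). Note that this squared-error cost follows the text above; the more common formulation uses the cross-entropy (log) loss, which makes the minimization convex.
def cost_L(theta, X, y):
    predictions = 1.0 / (1.0 + np.exp(-np.dot(X, theta)))  # h_theta^L per row
    return np.mean((predictions - y) ** 2)                 # J^L(theta)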
Given $m$ observations $x^i$ (i.e., $1 \le i \le m$) and their associated values $y^i$,
to predict the value $y$ associated with a new observation $x$, consider the following model:
$$h_{\theta_0, \theta_1}(x^i) = \theta_0 + \theta_1 \cdot x^i$$
$$h_{\theta_0, \theta_1}^L(x^i) = g(h_{\theta_0, \theta_1}(x^i)) = \dfrac{1}{1 + e ^ {-h_{\theta_0, \theta_1}(x^i)}} = \dfrac{1}{1 + e ^ {-(\theta_0 + \theta_1 \cdot x^i)}}$$
To obtain the best hypothesis $h$, find the $\theta_0$ and $\theta_1$ that minimize the following cost function $J^L(\theta_0, \theta_1)$:
$$J^L(\theta_0, \theta_1) = \dfrac{1}{m} \sum_{i = 1}^m \big( h_{\theta_0, \theta_1}^L(x^i) - y^i \big)^2 = \dfrac{1}{m} \sum_{i=1}^m \Big( \dfrac{1}{1 + e^{-(\theta_0 + \theta_1 \cdot x^i)}} - y^i \Big)^2$$
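Before turning to scikit-learn, a hedged numerical sketch: minimizing the squared-error cost above over $(\theta_0, \theta_1)$ for the age/chd data with `scipy.optimize.minimize` (the function name `cost` is illustrative). scikit-learn's `LogisticRegression`, used below, minimizes a regularized log-loss instead, so its coefficients will differ.
from scipy.optimize import minimize

def cost(theta):
    # squared-error cost J^L(theta_0, theta_1) over the age/chd data
    z = theta[0] + theta[1] * age
    return np.mean((1.0 / (1.0 + np.exp(-z)) - chd) ** 2)

result = minimize(cost, x0=np.array([0.0, 0.0]), method='Nelder-Mead')
print(result.x)  # approximate minimizer (theta_0, theta_1)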
from sklearn import linear_model
regr = linear_model.LogisticRegression()
age_ = []
for i in age:
    age_.append([i])  # scikit-learn expects a 2-D array: one row per sample
print age_
print chd
regr = regr.fit(age_, chd)
print 'Coefficients:', regr.coef_
print 'Intercept:', regr.intercept_
[[22], [23], [24], [27], [28], [30], [30], [32], [33], [35], [38], [40], [41], [46], [47], [48], [49], [49], [50], [51], [51], [52], [54], [55], [58], [60], [60], [62], [65], [67], [71], [77], [81]]
[0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1]
Coefficients: [[ 0.03851967]]
Intercept: [-1.89317939]
def h_theta(x):
    return regr.intercept_[0] + regr.coef_[0][0] * x
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
ax1.scatter(age, chd)
xx = np.linspace(age.min(), age.max(), 1000)
ax1.plot(xx, logistic(h_theta(xx)));
print -1.0 * regr.intercept_[0] / regr.coef_[0][0]
49.1483851852
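The value printed above is the model's decision boundary. Since $g(z) = 0.5$ exactly when $z = 0$, the boundary is the age at which the linear predictor vanishes:
$$\theta_0 + \theta_1 \cdot x = 0 \quad \Rightarrow \quad x = -\dfrac{\theta_0}{\theta_1} \approx 49.15$$
The grid search below confirms this numerically.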
xx = np.linspace(age.min(), age.max(), 1000)
for x in xx:
    if abs(logistic(h_theta(x)) - 0.5000) < 0.0002:
        print x
49.1671671672
print logistic(h_theta(50))
print logistic(h_theta(60))
print logistic(h_theta(70))
print logistic(h_theta(80))
0.508200244269
0.603004707604
0.690657986785
0.766453692561
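These probabilities can also be read directly from the fitted model: `predict_proba` returns one column per class, and column 1 (the probability of chd = 1) should match `logistic(h_theta(x))` above.
# Column 1 of predict_proba is P(chd = 1 | age), matching logistic(h_theta(x)).
print(regr.predict_proba([[50], [60], [70], [80]])[:, 1])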
chd2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
df2 = pd.DataFrame({'age': age, 'chd': chd2})
regr2 = linear_model.LogisticRegression()
age_ = []
for i in age:
    age_.append((i,))
regr2 = regr2.fit(age_, chd2)
print 'Coefficients:', regr2.coef_
print 'Intercept:', regr2.intercept_
Coefficients: [[ 0.07131931]]
Intercept: [-2.25741509]
def h_theta(x):
    return regr2.intercept_[0] + regr2.coef_[0][0] * x
fig2 = plt.figure(figsize=(12, 8))
ax2 = fig2.add_subplot(111)
ax2.scatter(age, chd2)
xx2 = np.linspace(age.min(), age.max(), 1000)
ax2.plot(xx2, logistic(h_theta(xx2)))
[<matplotlib.lines.Line2D at 0x11b741990>]
print -1.0 * regr2.intercept_[0] / regr2.coef_[0][0]
31.6522293282
xx2 = np.linspace(age.min(), age.max(), 1000)
for x in xx2:
    if abs(logistic(h_theta(x)) - 0.5000) < 0.0002:
        print x
(For this steeper fit, no point of the 1000-value grid falls within the 0.0002 tolerance, so the loop prints nothing; the analytic value above is the boundary.)
print logistic(h_theta(50))
print logistic(h_theta(60))
print logistic(h_theta(70))
print logistic(h_theta(80))
0.787270468923
0.883061158965
0.939056959384
0.969175310841
import urllib2
path = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/mushroom/agaricus-lepiota.data'
raw_csv = urllib2.urlopen(path)
col_names = range(23)
df = pd.read_csv(raw_csv, names = col_names)
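Every column of this dataset holds single-letter categorical codes; column 0 is the class label, 'e' (edible) or 'p' (poisonous). A quick look at the raw values before the integer-encoding loop below:
print(df[0].unique())  # class labels, e.g. ['p' 'e']
print(df[1].unique())  # cap-shape codes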
# Integer-encode every column: map each distinct value to an integer
# in order of first appearance
num_columns = df.shape[1]
for i in range(num_columns):
    unique_array = df[i].unique()
    map_dic_sub = {}
    for j in range(len(unique_array)):
        map_dic_sub[unique_array[j]] = j
    df[i] = df[i].map(map_dic_sub)
df
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
2 | 1 | 1 | 0 | 2 | 0 | 2 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
3 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 3 | 1 | 3 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 1 |
5 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
6 | 1 | 1 | 0 | 2 | 0 | 1 | 0 | 0 | 1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 |
7 | 1 | 1 | 1 | 2 | 0 | 2 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 |
8 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
9 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
10 | 1 | 0 | 1 | 1 | 0 | 2 | 0 | 0 | 1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
11 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
12 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
13 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 |
14 | 1 | 0 | 2 | 0 | 1 | 3 | 0 | 1 | 1 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 1 |
15 | 1 | 2 | 2 | 3 | 1 | 3 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 |
16 | 1 | 3 | 2 | 2 | 1 | 3 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 1 |
17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
18 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
20 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 |
21 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 |
22 | 1 | 1 | 1 | 1 | 0 | 2 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 |
23 | 1 | 1 | 1 | 2 | 0 | 1 | 0 | 0 | 1 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
24 | 1 | 1 | 0 | 2 | 0 | 2 | 0 | 0 | 1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
25 | 0 | 3 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 |
26 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
27 | 1 | 0 | 1 | 2 | 0 | 2 | 0 | 0 | 1 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
28 | 1 | 3 | 2 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 |
29 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8094 | 1 | 1 | 0 | 3 | 1 | 3 | 0 | 1 | 1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 1 | 1 |
8095 | 0 | 0 | 1 | 8 | 1 | 8 | 0 | 0 | 1 | 10 | ... | 2 | 7 | 8 | 0 | 0 | 2 | 4 | 4 | 5 | 3 |
8096 | 1 | 4 | 2 | 2 | 1 | 3 | 0 | 1 | 1 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 1 | 1 |
8097 | 0 | 4 | 1 | 0 | 1 | 7 | 0 | 0 | 0 | 8 | ... | 3 | 2 | 1 | 0 | 0 | 0 | 1 | 4 | 3 | 6 |
8098 | 0 | 4 | 0 | 4 | 1 | 6 | 0 | 0 | 0 | 8 | ... | 3 | 0 | 1 | 0 | 0 | 0 | 1 | 4 | 3 | 3 |
8099 | 1 | 4 | 2 | 2 | 1 | 3 | 0 | 1 | 1 | 4 | ... | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 0 | 1 |
8100 | 1 | 3 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 11 | ... | 0 | 6 | 7 | 0 | 1 | 0 | 0 | 8 | 3 | 6 |
8101 | 0 | 4 | 0 | 4 | 1 | 7 | 0 | 0 | 0 | 8 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 4 |
8102 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 1 | 0 | 0 | 1 | 5 | 6 |
8103 | 1 | 4 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 1 | 0 | 0 | 6 | 5 | 6 |
8104 | 1 | 4 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 1 | 3 | 6 |
8105 | 1 | 4 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 1 | 0 | 0 | 7 | 3 | 6 |
8106 | 1 | 4 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 11 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 1 | 3 | 6 |
8107 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 1 | 5 | 6 |
8108 | 0 | 4 | 1 | 4 | 1 | 6 | 0 | 0 | 0 | 8 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 6 |
8109 | 1 | 1 | 0 | 2 | 1 | 3 | 0 | 1 | 1 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 1 | 1 |
8110 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 11 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 1 | 3 | 6 |
8111 | 1 | 4 | 0 | 2 | 1 | 3 | 0 | 1 | 1 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 1 | 1 |
8112 | 1 | 4 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 11 | ... | 0 | 6 | 7 | 0 | 1 | 0 | 0 | 8 | 3 | 6 |
8113 | 0 | 4 | 1 | 4 | 1 | 6 | 0 | 0 | 0 | 8 | ... | 3 | 2 | 1 | 0 | 0 | 0 | 1 | 4 | 3 | 3 |
8114 | 0 | 3 | 1 | 8 | 1 | 8 | 1 | 0 | 1 | 10 | ... | 2 | 7 | 8 | 0 | 0 | 2 | 4 | 4 | 5 | 3 |
8115 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 6 | 3 | 6 |
8116 | 0 | 4 | 1 | 0 | 1 | 7 | 0 | 0 | 0 | 8 | ... | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 6 |
8117 | 0 | 4 | 0 | 4 | 1 | 6 | 0 | 0 | 0 | 8 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 3 |
8118 | 0 | 4 | 1 | 0 | 1 | 4 | 0 | 0 | 0 | 8 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 3 |
8119 | 1 | 4 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 8 | 5 | 6 |
8120 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 1 | 0 | 0 | 8 | 3 | 6 |
8121 | 1 | 3 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 1 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 8 | 5 | 6 |
8122 | 0 | 4 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 8 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 6 |
8123 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | 0 | 1 | 10 | ... | 0 | 6 | 7 | 0 | 2 | 0 | 0 | 6 | 5 | 6 |
8124 rows × 23 columns
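For reference, pandas has a built-in that performs the same encoding: `pd.factorize` assigns integer codes in order of first appearance, exactly like the manual `map_dic_sub` dictionaries above. A sketch of the equivalent loop:
# Equivalent integer encoding with pd.factorize (codes follow order of
# first appearance, matching the manual mapping above):
for i in range(df.shape[1]):
    df[i] = pd.factorize(df[i])[0]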
attributes = df.iloc[:, 1:22]  # feature columns 1-21 (iloc's end index is exclusive, so column 22 is left out)
mushroom_data = attributes.values
mushroom_data
array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 1, ..., 0, 1, 1], [1, 0, 2, ..., 0, 1, 1], ..., [3, 0, 0, ..., 0, 8, 5], [4, 1, 0, ..., 1, 4, 3], [0, 0, 0, ..., 0, 6, 5]])
target_series = df.iloc[:, 0]
mushroom_target = target_series.values
mushroom_target
array([0, 1, 1, ..., 1, 0, 1])
from sklearn.cross_validation import train_test_split
data, labels = np.arange(10).reshape((5, 2)), range(5)
print data
print labels
print
data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.20)
print data_train, labels_train
print data_test, labels_test
[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
[0, 1, 2, 3, 4]

[[2 3]
 [0 1]
 [6 7]
 [8 9]] [1, 0, 3, 4]
[[4 5]] [2]
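Note that `train_test_split` shuffles at random, so the split differs on every run; passing a `random_state` (an illustrative addition, not used in the original) pins it down:
# Reproducible split (random_state is an illustrative addition):
data_train, data_test, labels_train, labels_test = \
    train_test_split(data, labels, test_size=0.20, random_state=0)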
data_train, data_test, labels_train, labels_test = train_test_split(mushroom_data, mushroom_target, test_size=0.20)
print len(data_train), len(labels_train)
print len(data_test), len(labels_test)
6499 6499
1625 1625
regr3 = linear_model.LogisticRegression()
regr3.fit(data_train, labels_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
print 'Coefficients:', regr3.coef_
print 'Intercept:', regr3.intercept_
Coefficients: [[ 0.04030487  1.17110534  0.03021184  8.46849072 -3.0813763   0.33461377
   3.23485394  9.85864101 -0.33056911  1.70857447  2.0504694  -2.20960252
  -2.39936225 -0.66134385  0.7534827   0.          0.46046821  2.92733412
   0.12170563 -1.47497958  0.66568998]]
Intercept: [-2.32896608]
print data_test[0]
print data_test[0].reshape(1, -1)
[0 0 2 0 2 0 1 0 3 1 2 0 0 0 0 0 0 0 0 2 3]
[[0 0 2 0 2 0 1 0 3 1 2 0 0 0 0 0 0 0 0 2 3]]
print data_test[0], ":", labels_test[0]
print regr3.predict(data_test[0].reshape(1,-1))[0]
[0 0 2 0 2 0 1 0 3 1 2 0 0 0 0 0 0 0 0 2 3] : 1
0
(This first test example is misclassified: the true label is 1 but the model predicts 0.)
print data_test[1], ":", labels_test[1]
print regr3.predict(data_test[1].reshape(1,-1))[0]
[0 0 4 1 4 0 0 0 8 1 4 0 0 0 0 0 0 0 1 4 3] : 0
0
predicted = []
for i in range(0, len(data_test)):
    predicted.append(regr3.predict(data_test[i].reshape(1,-1))[0] == labels_test[i])
total = len(predicted)
numTrue = 0
for i in range(0, total):
    if predicted[i]:
        numTrue = numTrue + 1
print float(numTrue) / total
0.983384615385
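The accuracy loop above can be collapsed into a single call: `score` on a fitted scikit-learn classifier returns exactly this mean accuracy over the given test set.
# One-line equivalent of the manual accuracy computation above:
print(regr3.score(data_test, labels_test))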
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
iris.keys()
['target_names', 'data', 'target', 'DESCR', 'feature_names']
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='|S10')
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print len(iris.data), len(iris.target)
150 150
iris.data[0:5]
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])
iris.target[0:5]
array([0, 0, 0, 0, 0])
iris.data[50:55]
array([[ 7. ,  3.2,  4.7,  1.4],
       [ 6.4,  3.2,  4.5,  1.5],
       [ 6.9,  3.1,  4.9,  1.5],
       [ 5.5,  2.3,  4. ,  1.3],
       [ 6.5,  2.8,  4.6,  1.5]])
iris.target[50:55]
array([1, 1, 1, 1, 1])
regr5 = linear_model.LogisticRegression()
regr5.fit(iris.data[:, :2], iris.target)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0)
print 'Coefficients:', regr5.coef_
print 'Intercept:', regr5.intercept_
Coefficients: [[-2.49579289  4.01011301]
 [ 0.49709451 -1.63380222]
 [ 1.15921404 -1.77736568]]
Intercept: [ 0.81713932  1.22543562 -2.22516119]
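With three classes, this `LogisticRegression` (in its one-vs-rest mode, per the fitted model's parameters) learns one row of coefficients and one intercept per class, and predicts the class whose linear score is largest. A small sketch of that decision for a single sample:
# Manual one-vs-rest decision: compute each class's linear score
# theta_0 + theta^T x and take the argmax; this should agree with predict().
x = iris.data[0, :2]
scores = regr5.intercept_ + np.dot(regr5.coef_, x)
print(np.argmax(scores))       # manual decision
print(regr5.predict([x])[0])   # scikit-learn's decision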
iris = load_iris()
X = iris.data[:, :2] # we only take the first two features.
Y = iris.target
h = .02 # step size in the mesh
regr6 = linear_model.LogisticRegression()
# create a logistic regression classifier and fit it to the data
regr6.fit(X, Y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
x_array = np.arange(x_min - .5, x_max + .5, h)
y_array = np.arange(y_min - .5, y_max + .5, h)
xx, yy = np.meshgrid(x_array, y_array)
Z = regr6.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
# Plot also the training points
ax.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width');