In [1]:

# Import Dependencies
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import load_boston
boston = load_boston()

독립 변수와 종속 변수를 분리한다.¶

In [2]:

X_data = pd.DataFrame(boston.data, columns=boston.feature_names)
y_data = pd.DataFrame(boston.target, columns=["Target"])

In [3]:

X_data.head()

Out[3]:

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33

In [4]:

y_data.head()

Out[4]:

	Target
0	24.0
1	21.6
2	34.7
3	33.4
4	36.2

Train Test 데이터를 분리한다¶

In [5]:

from sklearn.model_selection import train_test_split

In [6]:

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=1)

In [7]:

X_train.shape, X_test.shape, y_train.shape, y_test.shape

Out[7]:

((404, 13), (102, 13), (404, 1), (102, 1))

StandardScaler를 사용하여 스케일링한다¶

In [8]:

from sklearn.preprocessing import StandardScaler
# 객체로 사용해야 나중에 Test데이터에 같은 Mean, Variance를 사용할 수 있다.
scaler = StandardScaler()
scaler.fit(X_train)

Out[8]:

StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:

X_train = pd.DataFrame(data=scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_train.head()

Out[9]:

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
42	-0.386768	-0.495593	-0.609290	-0.293294	-0.899583	-0.144968	-2.150030	0.894455	-0.746330	-1.008508	-0.248578	0.286742	-0.966850
58	-0.385349	0.579239	-0.869526	-0.293294	-0.856756	-0.179832	-1.357820	1.882903	-0.169594	-0.706413	0.582147	0.366695	-0.821168
385	1.439108	-0.495593	1.026692	-0.293294	1.258877	-1.440773	1.057367	-1.132950	1.675959	1.556337	0.812904	0.434727	2.501775
78	-0.396082	-0.495593	0.256216	-0.293294	-0.993801	-0.053448	-0.499009	0.560803	-0.515636	-0.031142	0.120633	0.319883	-0.060845
424	0.560723	-0.495593	1.026692	-0.293294	0.265300	-1.022396	0.093395	-0.832059	1.675959	1.556337	0.812904	-3.866459	0.607906

In [10]:

# 주의 : Test는 Fit을하면 안된다!
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_test.head()

Out[10]:

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
307	-0.396864	0.923185	-1.300817	-0.293294	-0.694015	0.842866	0.082879	-0.303729	-0.284942	-1.073665	-0.017821	0.434727	-0.728209
343	-0.399481	1.869037	-1.066897	-0.293294	-0.591232	0.620603	-0.404365	0.899742	-0.515636	-0.196998	-0.387032	0.434727	-0.776769
47	-0.377155	-0.495593	-0.609290	-0.293294	-0.899583	-0.346892	0.615692	0.879585	-0.746330	-1.008508	-0.248578	0.389227	0.835448
67	-0.395926	0.041823	-0.732098	-0.293294	-1.233630	-0.567702	-1.631238	1.261294	-0.630983	-0.345084	0.212936	0.427180	-0.649124
362	0.000604	-0.495593	1.026692	-0.293294	1.858449	-1.317293	0.990765	-0.813129	1.675959	1.556337	0.812904	0.258523	-0.359147

Tensorflow에서 사용할 땐 Numpy 데이터 타입으로 사용할 예정이니 변환하자¶

In [11]:

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
type(X_train), type(y_train), type(X_test), type(y_test)

Out[11]:

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

Tensorflow Model 정의¶

In [12]:

# Learning Rate
lr = 0.01

# 가중치를 몇번 업데이트 할 것인가?
epochs = 2000

# Features 독립 변수
X = tf.placeholder(dtype=tf.float32, shape=[None, X_train.shape[1]])
# Labels 종속 변수
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Weight 가중치, 초기값은 정규분포에서 랜덤하게 뽑는다
W = tf.Variable(tf.random_normal([X_train.shape[1], 1]))
# Bias 초기값은 정규분포에서 랜덤하게 뽑는다
b = tf.Variable(tf.random_normal([1]))

In [13]:

# tf.Variable을 사용했거나, 메서드 내부적으로 변수가 존재하는 경우에는 Variables
# 초기화해줘야 한다.
init = tf.global_variables_initializer()

In [14]:

# 우리가 예측하는 값 W*X + b
hypothesis = tf.add(tf.matmul(X, W), b)

# cost function으로는 MSE를 사용
cost = tf.reduce_mean(tf.square(y - hypothesis))

# Gradient Descent 방법으로 최적화
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(cost)

In [15]:

# cost_history를 기록하면 마지막에 epoch 변화에 따른 cost 변화를 확인할 때 편리하다
cost_history = np.empty(shape=[1], dtype=float)

In [16]:

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(0, epochs):
        # optimizer에서 반환하는 값은 의미가 없으니 _로 받아주자
        _, err = sess.run([optimizer, cost], feed_dict={X: X_train, y: y_train})
        
        cost_history = np.append(cost_history, err)
        
        # 100 번에 한번씩 Error 변화를 확인하자
        if epoch%100 == 0:
            print('Epoch: {0}, Error: {1}'.format(epoch, err))
    
    print('Epoch: {0}, Error: {1}'.format(epoch + 1, err))
    
    # 우리가 설정한 Epochs만큼의 학습이 끝난 후에 나온 값을 확인하기 위해 받아두자
    updated_W = sess.run(W)
    updated_b = sess.run(b)
    
    # Test 데이터를 예측한 값
    y_pred = sess.run(hypothesis, feed_dict={X: X_test})
    
    # Mean Squared Error
    mse = sess.run(tf.reduce_mean(tf.square(y_pred - y_test)))

Epoch: 0, Error: 604.2737426757812
Epoch: 100, Error: 33.710201263427734
Epoch: 200, Error: 23.05824851989746
Epoch: 300, Error: 22.47170639038086
Epoch: 400, Error: 22.255775451660156
Epoch: 500, Error: 22.13414764404297
Epoch: 600, Error: 22.059106826782227
Epoch: 700, Error: 22.009729385375977
Epoch: 800, Error: 21.975486755371094
Epoch: 900, Error: 21.950754165649414
Epoch: 1000, Error: 21.932348251342773
Epoch: 1100, Error: 21.918357849121094
Epoch: 1200, Error: 21.907577514648438
Epoch: 1300, Error: 21.899185180664062
Epoch: 1400, Error: 21.892616271972656
Epoch: 1500, Error: 21.8874568939209
Epoch: 1600, Error: 21.88338851928711
Epoch: 1700, Error: 21.88017463684082
Epoch: 1800, Error: 21.877634048461914
Epoch: 1900, Error: 21.87563133239746
Epoch: 2000, Error: 21.874052047729492

In [17]:

# 최종 Bias
print('Trained Bias: \n', updated_b)
print('Trained Weights: \n', updated_W)

Trained Bias: 
 [22.52223]
Trained Weights: 
 [[-1.0024459 ]
 [ 1.3155702 ]
 [ 0.05793529]
 [ 0.5868825 ]
 [-2.2774897 ]
 [ 2.1402504 ]
 [ 0.11897256]
 [-3.1695282 ]
 [ 2.4372222 ]
 [-1.6443572 ]
 [-2.1415503 ]
 [ 0.67585033]
 [-3.9236772 ]]

In [18]:

print('Mean Squared Error: ',mse)

Mean Squared Error:  23.407308966316197

In [19]:

plt.plot(range(len(cost_history)), cost_history)
plt.axis([0,epochs,0,np.max(cost_history)])
plt.xlabel('Epochs')
plt.ylabel('Cost')
plt.title('Cost 변화', fontsize=15)
plt.show()

In [20]:

plt.scatter(y_test, y_pred)
plt.xlabel(u"실제 집값")
plt.ylabel(u"집값 예측치")
plt.title("집값 예측치와 실제 집값의 관계", fontsize=15)
plt.show()

Tensorflow Estimator API를 사용하여 Linear Regression하는 방법¶

In [21]:

X_data.columns

Out[21]:

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')

In [22]:

np.array(X_train).shape[1:]

Out[22]:

(13,)

In [23]:

feat_cols = [tf.feature_column.numeric_column('x', shape=np.array(X_train).shape[1:])]

In [24]:

# Make Feature Columns
feat_cols = [tf.feature_column.numeric_column('x', shape=np.array(X_train).shape[1:])]
# Make Input Function
input_func = tf.estimator.inputs.numpy_input_fn({'x': X_train}, y_train, batch_size=1, num_epochs=2000, shuffle=True)

In [25]:

# Define Linear Regressor Model
# Supported Optimizers: ('Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD')
linear_model = tf.estimator.LinearRegressor(feature_columns=feat_cols, optimizer='Adam')

INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x119148320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

In [26]:

# Set up Estimator Training Inputs
train_input_func = tf.estimator.inputs.numpy_input_fn(X_train, y_train, batch_size=1, num_epochs=1000, shuffle=False)
# Set up Estimator Test Inputs
eval_input_func = tf.estimator.inputs.numpy_input_fn({'x': X_test}, y_test, batch_size=1, num_epochs=1, shuffle=False)

In [27]:

# Train the Linear Regressor Estimator
linear_model.train(input_fn=input_func, steps=2000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt.
INFO:tensorflow:loss = 225.0, step = 1
INFO:tensorflow:global_step/sec: 890.828
INFO:tensorflow:loss = 253.62346, step = 101 (0.114 sec)
INFO:tensorflow:global_step/sec: 1038.08
INFO:tensorflow:loss = 0.07589463, step = 201 (0.097 sec)
INFO:tensorflow:global_step/sec: 1002.72
INFO:tensorflow:loss = 2.1461585, step = 301 (0.100 sec)
INFO:tensorflow:global_step/sec: 1143.59
INFO:tensorflow:loss = 0.66625136, step = 401 (0.087 sec)
INFO:tensorflow:global_step/sec: 1009.44
INFO:tensorflow:loss = 21.957817, step = 501 (0.099 sec)
INFO:tensorflow:global_step/sec: 988.006
INFO:tensorflow:loss = 8.972441, step = 601 (0.101 sec)
INFO:tensorflow:global_step/sec: 1050.7
INFO:tensorflow:loss = 0.17537297, step = 701 (0.095 sec)
INFO:tensorflow:global_step/sec: 1006.66
INFO:tensorflow:loss = 6.7108874, step = 801 (0.100 sec)
INFO:tensorflow:global_step/sec: 1124.59
INFO:tensorflow:loss = 38.028217, step = 901 (0.089 sec)
INFO:tensorflow:global_step/sec: 1075.72
INFO:tensorflow:loss = 3.514638, step = 1001 (0.093 sec)
INFO:tensorflow:global_step/sec: 1086.39
INFO:tensorflow:loss = 0.009802317, step = 1101 (0.092 sec)
INFO:tensorflow:global_step/sec: 1124.27
INFO:tensorflow:loss = 6.4648666, step = 1201 (0.090 sec)
INFO:tensorflow:global_step/sec: 1099.32
INFO:tensorflow:loss = 1.2076899, step = 1301 (0.090 sec)
INFO:tensorflow:global_step/sec: 1145.83
INFO:tensorflow:loss = 21.732845, step = 1401 (0.087 sec)
INFO:tensorflow:global_step/sec: 1134.79
INFO:tensorflow:loss = 0.6822102, step = 1501 (0.089 sec)
INFO:tensorflow:global_step/sec: 1114.55
INFO:tensorflow:loss = 31.725891, step = 1601 (0.090 sec)
INFO:tensorflow:global_step/sec: 1090.22
INFO:tensorflow:loss = 1.2142678, step = 1701 (0.092 sec)
INFO:tensorflow:global_step/sec: 1036.86
INFO:tensorflow:loss = 19.478325, step = 1801 (0.095 sec)
INFO:tensorflow:global_step/sec: 1112.72
INFO:tensorflow:loss = 62.885754, step = 1901 (0.090 sec)
INFO:tensorflow:Saving checkpoints for 2000 into /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt.
INFO:tensorflow:Loss for final step: 55.036022.

Out[27]:

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x119148eb8>

In [28]:

# Test the Model
test_metrics = linear_model.evaluate(input_fn=eval_input_func, steps=100)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-01-10:59:40
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-11-01-10:59:40
INFO:tensorflow:Saving dict for global step 2000: average_loss = 26.29743, global_step = 2000, loss = 26.29743
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt-2000

In [29]:

# Get Predicted Values as an Array
predicted_vals = []

for pred in linear_model.predict(input_fn=eval_input_func):
    predicted_vals.append(pred['predictions'])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

In [30]:

plt.scatter(y_test, predicted_vals)
plt.xlabel(u"실제 집값")
plt.ylabel(u"집값 예측치")
plt.title("집값 예측치와 실제 집값의 관계", fontsize=15)
plt.show()

In [ ]: