# Import Dependencies
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import load_boston
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston is reportedly absent from recent scikit-learn
# releases — confirm the installed version still provides it.
boston = load_boston()
# Feature matrix as a DataFrame labelled with the dataset's feature names.
X_data = pd.DataFrame(boston.data, columns=boston.feature_names)
# Regression target as a single-column DataFrame named "Target".
y_data = pd.DataFrame(boston.target, columns=["Target"])
# Preview the first rows of the features.
X_data.head()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
# Preview the first rows of the target column.
y_data.head()
Target | |
---|---|
0 | 24.0 |
1 | 21.6 |
2 | 34.7 |
3 | 33.4 |
4 | 36.2 |
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X_data, y_data, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts
# Show the shape of each of the four resulting frames.
tuple(part.shape for part in split_parts)
((404, 13), (102, 13), (404, 1), (102, 1))
from sklearn.preprocessing import StandardScaler
# Keep the scaler as an object so that the same mean and variance can be
# applied to the test data later.
scaler = StandardScaler()
# Fit on the training features only.
scaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
# Standardize the training features with the fitted scaler, keeping the
# original column labels and row index.
scaled_train = scaler.transform(X_train)
X_train = pd.DataFrame(scaled_train, index=X_train.index, columns=X_train.columns)
X_train.head()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42 | -0.386768 | -0.495593 | -0.609290 | -0.293294 | -0.899583 | -0.144968 | -2.150030 | 0.894455 | -0.746330 | -1.008508 | -0.248578 | 0.286742 | -0.966850 |
58 | -0.385349 | 0.579239 | -0.869526 | -0.293294 | -0.856756 | -0.179832 | -1.357820 | 1.882903 | -0.169594 | -0.706413 | 0.582147 | 0.366695 | -0.821168 |
385 | 1.439108 | -0.495593 | 1.026692 | -0.293294 | 1.258877 | -1.440773 | 1.057367 | -1.132950 | 1.675959 | 1.556337 | 0.812904 | 0.434727 | 2.501775 |
78 | -0.396082 | -0.495593 | 0.256216 | -0.293294 | -0.993801 | -0.053448 | -0.499009 | 0.560803 | -0.515636 | -0.031142 | 0.120633 | 0.319883 | -0.060845 |
424 | 0.560723 | -0.495593 | 1.026692 | -0.293294 | 0.265300 | -1.022396 | 0.093395 | -0.832059 | 1.675959 | 1.556337 | 0.812904 | -3.866459 | 0.607906 |
# Caution: never fit the scaler on the test set — only transform it with the
# scaler that was fitted on the training data.
scaled_test = scaler.transform(X_test)
X_test = pd.DataFrame(scaled_test, index=X_test.index, columns=X_test.columns)
X_test.head()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
307 | -0.396864 | 0.923185 | -1.300817 | -0.293294 | -0.694015 | 0.842866 | 0.082879 | -0.303729 | -0.284942 | -1.073665 | -0.017821 | 0.434727 | -0.728209 |
343 | -0.399481 | 1.869037 | -1.066897 | -0.293294 | -0.591232 | 0.620603 | -0.404365 | 0.899742 | -0.515636 | -0.196998 | -0.387032 | 0.434727 | -0.776769 |
47 | -0.377155 | -0.495593 | -0.609290 | -0.293294 | -0.899583 | -0.346892 | 0.615692 | 0.879585 | -0.746330 | -1.008508 | -0.248578 | 0.389227 | 0.835448 |
67 | -0.395926 | 0.041823 | -0.732098 | -0.293294 | -1.233630 | -0.567702 | -1.631238 | 1.261294 | -0.630983 | -0.345084 | 0.212936 | 0.427180 | -0.649124 |
362 | 0.000604 | -0.495593 | 1.026692 | -0.293294 | 1.858449 | -1.317293 | 0.990765 | -0.813129 | 1.675959 | 1.556337 | 0.812904 | 0.258523 | -0.359147 |
# Convert all four splits to NumPy arrays in one pass.
X_train, y_train, X_test, y_test = (
    np.array(split) for split in (X_train, y_train, X_test, y_test)
)
# Show the resulting container types.
tuple(type(split) for split in (X_train, y_train, X_test, y_test))
(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
# Learning rate (gradient-descent step size).
lr = 0.01
# How many times the weights are updated.
epochs = 2000

# Number of input features, taken from the training matrix.
n_features = X_train.shape[1]

# Features (independent variables).
X = tf.placeholder(dtype=tf.float32, shape=[None, n_features])
# Labels (dependent variable).
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Weights; initial values are drawn at random from a normal distribution.
W = tf.Variable(tf.random_normal([n_features, 1]))
# Bias; initial value is drawn at random from a normal distribution.
b = tf.Variable(tf.random_normal([1]))
# Whenever tf.Variable is used (or a method creates variables internally),
# the variables have to be initialized before use.
init = tf.global_variables_initializer()
# Predicted value: X·W + b
hypothesis = tf.matmul(X, W) + b
# Cost function: mean squared error between labels and predictions.
residual = y - hypothesis
cost = tf.reduce_mean(tf.square(residual))
# Optimize with plain gradient descent.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(cost)
# Record the cost of every epoch so the cost curve can be inspected afterwards.
# A plain list is used while training: seeding the history with np.empty would
# leave an uninitialised value as its first entry, and np.append reallocates
# the whole array on every epoch.
cost_log = []
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(epochs):
        # The value returned for the optimizer op is meaningless; discard it.
        _, err = sess.run([optimizer, cost], feed_dict={X: X_train, y: y_train})
        cost_log.append(err)
        # Report the error once every 100 epochs.
        if epoch % 100 == 0:
            print('Epoch: {0}, Error: {1}'.format(epoch, err))
    # Final report after the last epoch; skipped when no epoch was run so that
    # `err` is never referenced before assignment.
    if cost_log:
        print('Epoch: {0}, Error: {1}'.format(epochs, err))
    # Read back the parameters reached after `epochs` updates.
    updated_W = sess.run(W)
    updated_b = sess.run(b)
    # Predictions for the test data.
    y_pred = sess.run(hypothesis, feed_dict={X: X_test})

# One float per epoch, as an array for the plotting code that follows.
cost_history = np.array(cost_log, dtype=float)
# Mean squared error on the test data; both operands are already NumPy arrays,
# so no extra graph ops or session run are needed.
mse = np.mean(np.square(y_pred - y_test))
Epoch: 0, Error: 604.2737426757812 Epoch: 100, Error: 33.710201263427734 Epoch: 200, Error: 23.05824851989746 Epoch: 300, Error: 22.47170639038086 Epoch: 400, Error: 22.255775451660156 Epoch: 500, Error: 22.13414764404297 Epoch: 600, Error: 22.059106826782227 Epoch: 700, Error: 22.009729385375977 Epoch: 800, Error: 21.975486755371094 Epoch: 900, Error: 21.950754165649414 Epoch: 1000, Error: 21.932348251342773 Epoch: 1100, Error: 21.918357849121094 Epoch: 1200, Error: 21.907577514648438 Epoch: 1300, Error: 21.899185180664062 Epoch: 1400, Error: 21.892616271972656 Epoch: 1500, Error: 21.8874568939209 Epoch: 1600, Error: 21.88338851928711 Epoch: 1700, Error: 21.88017463684082 Epoch: 1800, Error: 21.877634048461914 Epoch: 1900, Error: 21.87563133239746 Epoch: 2000, Error: 21.874052047729492
# Final bias and weights read back after training.
print('Trained Bias: \n', updated_b)
print('Trained Weights: \n', updated_W)
Trained Bias: [22.52223] Trained Weights: [[-1.0024459 ] [ 1.3155702 ] [ 0.05793529] [ 0.5868825 ] [-2.2774897 ] [ 2.1402504 ] [ 0.11897256] [-3.1695282 ] [ 2.4372222 ] [-1.6443572 ] [-2.1415503 ] [ 0.67585033] [-3.9236772 ]]
# Report the mean squared error computed on the test predictions.
print('Mean Squared Error: ',mse)
Mean Squared Error: 23.407308966316197
# NOTE(review): `plt` is not imported anywhere in the visible source;
# presumably it is matplotlib.pyplot — confirm the import exists before
# running these cells.
# Plot every recorded cost value against its position in cost_history.
plt.plot(range(len(cost_history)), cost_history)
# x-axis spans 0..epochs, y-axis spans 0..largest recorded cost.
plt.axis([0,epochs,0,np.max(cost_history)])
plt.xlabel('Epochs')
plt.ylabel('Cost')
plt.title('Cost 변화', fontsize=15)
plt.show()
# Scatter the test targets (x-axis) against the model's predictions (y-axis).
plt.scatter(y_test, y_pred)
plt.xlabel(u"실제 집값")
plt.ylabel(u"집값 예측치")
plt.title("집값 예측치와 실제 집값의 관계", fontsize=15)
plt.show()
# Show the feature column labels.
X_data.columns
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='object')
# Per-row shape of the training features (everything after the row axis).
np.array(X_train).shape[1:]
(13,)
# Make feature columns: one numeric column keyed 'x' whose shape is the
# per-row shape of X_train. (The original assigned this twice with identical
# arguments; the redundant statement is dropped.)
feat_cols = [tf.feature_column.numeric_column('x', shape=np.array(X_train).shape[1:])]
# Make the input function: feeds X_train under the 'x' key, one example per
# batch, shuffled, for up to 2000 passes over the data.
input_func = tf.estimator.inputs.numpy_input_fn({'x': X_train}, y_train, batch_size=1, num_epochs=2000, shuffle=True)
# Define the linear regressor model.
# Supported optimizers: ('Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD')
linear_model = tf.estimator.LinearRegressor(feature_columns=feat_cols, optimizer='Adam')
INFO:tensorflow:Using default config. WARNING:tensorflow:Using temporary folder as model directory: /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t INFO:tensorflow:Using config: {'_model_dir': '/var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x119148320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
# Set up estimator training inputs. The features are passed as a dict keyed
# 'x' so they line up with the 'x' feature column and with the other input
# functions in this file (the original passed the bare array).
# NOTE(review): this input function is not used by the train call below,
# which trains from `input_func` — confirm which one is intended.
train_input_func = tf.estimator.inputs.numpy_input_fn({'x': X_train}, y_train, batch_size=1, num_epochs=1000, shuffle=False)
# Set up estimator test inputs: a single unshuffled pass over the test data,
# one example per batch.
eval_input_func = tf.estimator.inputs.numpy_input_fn({'x': X_test}, y_test, batch_size=1, num_epochs=1, shuffle=False)
# Train the linear regressor estimator for 2000 steps.
linear_model.train(input_fn=input_func, steps=2000)
INFO:tensorflow:Calling model_fn. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Create CheckpointSaverHook. INFO:tensorflow:Graph was finalized. INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op. INFO:tensorflow:Saving checkpoints for 0 into /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt. INFO:tensorflow:loss = 225.0, step = 1 INFO:tensorflow:global_step/sec: 890.828 INFO:tensorflow:loss = 253.62346, step = 101 (0.114 sec) INFO:tensorflow:global_step/sec: 1038.08 INFO:tensorflow:loss = 0.07589463, step = 201 (0.097 sec) INFO:tensorflow:global_step/sec: 1002.72 INFO:tensorflow:loss = 2.1461585, step = 301 (0.100 sec) INFO:tensorflow:global_step/sec: 1143.59 INFO:tensorflow:loss = 0.66625136, step = 401 (0.087 sec) INFO:tensorflow:global_step/sec: 1009.44 INFO:tensorflow:loss = 21.957817, step = 501 (0.099 sec) INFO:tensorflow:global_step/sec: 988.006 INFO:tensorflow:loss = 8.972441, step = 601 (0.101 sec) INFO:tensorflow:global_step/sec: 1050.7 INFO:tensorflow:loss = 0.17537297, step = 701 (0.095 sec) INFO:tensorflow:global_step/sec: 1006.66 INFO:tensorflow:loss = 6.7108874, step = 801 (0.100 sec) INFO:tensorflow:global_step/sec: 1124.59 INFO:tensorflow:loss = 38.028217, step = 901 (0.089 sec) INFO:tensorflow:global_step/sec: 1075.72 INFO:tensorflow:loss = 3.514638, step = 1001 (0.093 sec) INFO:tensorflow:global_step/sec: 1086.39 INFO:tensorflow:loss = 0.009802317, step = 1101 (0.092 sec) INFO:tensorflow:global_step/sec: 1124.27 INFO:tensorflow:loss = 6.4648666, step = 1201 (0.090 sec) INFO:tensorflow:global_step/sec: 1099.32 INFO:tensorflow:loss = 1.2076899, step = 1301 (0.090 sec) INFO:tensorflow:global_step/sec: 1145.83 INFO:tensorflow:loss = 21.732845, step = 1401 (0.087 sec) INFO:tensorflow:global_step/sec: 1134.79 INFO:tensorflow:loss = 0.6822102, step = 1501 (0.089 sec) INFO:tensorflow:global_step/sec: 1114.55 INFO:tensorflow:loss = 31.725891, step = 1601 (0.090 sec) 
INFO:tensorflow:global_step/sec: 1090.22 INFO:tensorflow:loss = 1.2142678, step = 1701 (0.092 sec) INFO:tensorflow:global_step/sec: 1036.86 INFO:tensorflow:loss = 19.478325, step = 1801 (0.095 sec) INFO:tensorflow:global_step/sec: 1112.72 INFO:tensorflow:loss = 62.885754, step = 1901 (0.090 sec) INFO:tensorflow:Saving checkpoints for 2000 into /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt. INFO:tensorflow:Loss for final step: 55.036022.
<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x119148eb8>
# Test the model on the whole test set. eval_input_func yields one example
# per step for a single pass, so a fixed steps=100 stops before all 102 test
# rows are consumed; steps=None evaluates until the input is exhausted.
test_metrics = linear_model.evaluate(input_fn=eval_input_func, steps=None)
INFO:tensorflow:Calling model_fn. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Starting evaluation at 2018-11-01-10:59:40 INFO:tensorflow:Graph was finalized. INFO:tensorflow:Restoring parameters from /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt-2000 INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op. INFO:tensorflow:Evaluation [10/100] INFO:tensorflow:Evaluation [20/100] INFO:tensorflow:Evaluation [30/100] INFO:tensorflow:Evaluation [40/100] INFO:tensorflow:Evaluation [50/100] INFO:tensorflow:Evaluation [60/100] INFO:tensorflow:Evaluation [70/100] INFO:tensorflow:Evaluation [80/100] INFO:tensorflow:Evaluation [90/100] INFO:tensorflow:Evaluation [100/100] INFO:tensorflow:Finished evaluation at 2018-11-01-10:59:40 INFO:tensorflow:Saving dict for global step 2000: average_loss = 26.29743, global_step = 2000, loss = 26.29743 INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt-2000
# Collect the 'predictions' entry of every result yielded by the estimator
# for the evaluation inputs.
predicted_vals = [
    prediction['predictions']
    for prediction in linear_model.predict(input_fn=eval_input_func)
]
INFO:tensorflow:Calling model_fn. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Graph was finalized. INFO:tensorflow:Restoring parameters from /var/folders/gc/y94kqvf109v1_tthvbls56wc0000gn/T/tmp28oygs1t/model.ckpt-2000 INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op.
# NOTE(review): `plt` is not imported anywhere in the visible source;
# presumably it is matplotlib.pyplot — confirm the import exists.
# Scatter the test targets (x-axis) against the estimator's predictions (y-axis).
plt.scatter(y_test, predicted_vals)
plt.xlabel(u"실제 집값")
plt.ylabel(u"집값 예측치")
plt.title("집값 예측치와 실제 집값의 관계", fontsize=15)
plt.show()