#!/usr/bin/env python # coding: utf-8 # **19장 – 대규모 텐서플로 모델 훈련과 배포** # _이 노트북은 19장에 있는 모든 샘플 코드를 담고 있습니다._ # # #

# # Setup

# First, import a few modules, make Matplotlib plot figures inline, and prepare
# a function to save figures. Also check that Python 3.5 or later is installed
# (the code works with Python 2.x too, but it will soon be unsupported, so using
# Python 3 is recommended), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0.

# In[1]:


import re


def _version_tuple(version):
    """Return the leading ``(major, minor)`` numbers of a version string.

    Comparing version strings directly is lexicographic, so for example
    ``"10.0" >= "2.0"`` is False. Comparing integer tuples avoids that.

    Raises:
        ValueError: if `version` does not start with ``<digits>.<digits>``.
    """
    match = re.match(r"(\d+)\.(\d+)", version)
    if match is None:
        raise ValueError("Unrecognized version string: {!r}".format(version))
    return tuple(int(part) for part in match.groups())


# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert _version_tuple(sklearn.__version__) >= (0, 20)

try:
    # %tensorflow_version only exists in Colab.
    get_ipython().run_line_magic('tensorflow_version', '2.x')
    get_ipython().system('echo "deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" > /etc/apt/sources.list.d/tensorflow-serving.list')
    get_ipython().system('curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -')
    get_ipython().system('apt update && apt-get install -y tensorflow-model-server')
    get_ipython().run_line_magic('pip', 'install -q -U tensorflow-serving-api')
    IS_COLAB = True
except Exception:
    # Any failure above is taken to mean we are not running in Colab.
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _version_tuple(tf.__version__) >= (2, 0)

if not tf.config.list_physical_devices('GPU'):
    print("감지된 GPU가 없습니다. GPU가 없으면 LSTM과 CNN이 매우 느릴 수 있습니다.")
    if IS_COLAB:
        print("런타임 > 런타임 유형 변경 메뉴를 선택하고 하드웨어 가속기로 GPU를 고르세요.")

# Common imports
import numpy as np
import os

# To make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deploy"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: File name without extension; also printed as a progress message.
        tight_layout: If true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("그림 저장", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# # Deploying TensorFlow models with TensorFlow Serving (TFS)
#
# We will use the REST API or the gRPC API.

# ## Saving and loading a `SavedModel`

# In[2]:


(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()
# Add a trailing channel axis and scale the pixel values by 1/255.
X_train_full = X_train_full[..., np.newaxis].astype(np.float32) / 255.
X_test = X_test[..., np.newaxis].astype(np.float32) / 255.
# The first 5000 training samples are held out for validation.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_new = X_test[:3]


# In[3]:


np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28, 1]),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-2),
              metrics=["accuracy"])
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))


# In[4]:


np.round(model.predict(X_new), 2)


# In[5]:


model_version = "0001"
model_name = "my_mnist_model"
model_path = os.path.join(model_name, model_version)
model_path


# In[6]:


get_ipython().system('rm -rf {model_name}')


# In[7]:


tf.saved_model.save(model, model_path)


# In[8]:


# Print the saved model's directory tree, indented by directory depth.
for root, dirs, files in os.walk(model_name):
    indent = ' ' * root.count(os.sep)
    print('{}{}/'.format(indent, os.path.basename(root)))
    for filename in files:
        print('{}{}'.format(indent + ' ', filename))


# In[9]:


get_ipython().system('saved_model_cli show --dir {model_path}')


# In[10]:


get_ipython().system('saved_model_cli show --dir {model_path} --tag_set serve')


# In[11]:


get_ipython().system('saved_model_cli show --dir {model_path} --tag_set serve --signature_def serving_default')


# In[12]:


get_ipython().system('saved_model_cli show --dir {model_path} --all')


# Writing X_new to an `npy` file makes it easy to pass to the model:

# In[13]:


np.save("my_mnist_tests.npy", X_new)


# In[14]:


input_name = model.input_names[0]
input_name
# Now use `saved_model_cli` to make predictions for the samples we just saved:

# In[15]:


get_ipython().system('saved_model_cli run --dir {model_path} --tag_set serve --signature_def serving_default --inputs {input_name}=my_mnist_tests.npy')


# In[16]:


# Hard-coded probabilities for the three samples, rounded to two decimals.
np.round(
    [
        [1.1347984e-04, 1.5187356e-07, 9.7032893e-04, 2.7640699e-03, 3.7826971e-06,
         7.6876910e-05, 3.9140293e-08, 9.9559116e-01, 5.3502394e-05, 4.2665208e-04],
        [8.2443521e-04, 3.5493889e-05, 9.8826385e-01, 7.0466995e-03, 1.2957400e-07,
         2.3389691e-04, 2.5639210e-03, 9.5886099e-10, 1.0314899e-03, 8.7952529e-08],
        [4.4693781e-05, 9.7028232e-01, 9.0526715e-03, 2.2641101e-03, 4.8766597e-04,
         2.8800720e-03, 2.2714981e-03, 8.3753867e-03, 4.0439744e-03, 2.9759688e-04],
    ],
    2,
)


# ## TensorFlow Serving

# Install [Docker](https://docs.docker.com/install/) if you don't have it yet.
# Then run:
#
# ```bash
# docker pull tensorflow/serving
#
# export ML_PATH=$HOME/ml # or wherever this project is
# docker run -it --rm -p 8500:8500 -p 8501:8501 \
#    -v "$ML_PATH/my_mnist_model:/models/my_mnist_model" \
#    -e MODEL_NAME=my_mnist_model \
#    tensorflow/serving
# ```
#
# When you are done, press Ctrl-C to shut down the server.

# Alternatively, if `tensorflow_model_server` is installed (for example, when
# running this notebook in Colab), run the next three cells to start the server:

# In[17]:


# MODEL_DIR is the parent directory of the version directory.
os.environ["MODEL_DIR"] = os.path.dirname(os.path.abspath(model_path))


# In[18]:


get_ipython().run_cell_magic(
    'bash',
    '--bg',
    'nohup tensorflow_model_server \\\n --rest_api_port=8501 \\\n --model_name=my_mnist_model \\\n --model_base_path="${MODEL_DIR}" >server.log 2>&1\n',
)


# In[19]:


get_ipython().system('tail server.log')


# In[20]:


import json

# Request body for the REST predict call, serialized to a JSON string.
input_data_json = json.dumps(
    {"signature_name": "serving_default", "instances": X_new.tolist()}
)


# In[21]:


repr(input_data_json)[:1500] + "..."
# Now let's use TensorFlow Serving's REST API to make predictions:

# In[22]:


import requests

SERVER_URL = 'http://localhost:8501/v1/models/my_mnist_model:predict'
# Use the same 10 s bound as the gRPC call further down, so an unresponsive
# server raises an exception instead of blocking indefinitely.
response = requests.post(SERVER_URL, data=input_data_json, timeout=10.0)
response.raise_for_status() # raise an exception in case of error
response = response.json()


# In[23]:


response.keys()


# In[24]:


y_proba = np.array(response["predictions"])
y_proba.round(2)


# ### Using the gRPC API

# In[25]:


from tensorflow_serving.apis.predict_pb2 import PredictRequest

request = PredictRequest()
request.model_spec.name = model_name
request.model_spec.signature_name = "serving_default"
input_name = model.input_names[0]
request.inputs[input_name].CopyFrom(tf.make_tensor_proto(X_new))


# In[26]:


import grpc
from tensorflow_serving.apis import prediction_service_pb2_grpc

channel = grpc.insecure_channel('localhost:8500')
predict_service = prediction_service_pb2_grpc.PredictionServiceStub(channel)
response = predict_service.Predict(request, timeout=10.0)


# In[27]:


response


# Convert the response to a tensor:

# In[28]:


output_name = model.output_names[0]
outputs_proto = response.outputs[output_name]
y_proba = tf.make_ndarray(outputs_proto)
y_proba.round(2)


# If the client does not use the TensorFlow library, convert the response to a
# NumPy array instead:

# In[29]:


output_name = model.output_names[0]
outputs_proto = response.outputs[output_name]
# Rebuild the array from the proto's flat float values and its declared dims.
shape = [dim.size for dim in outputs_proto.tensor_shape.dim]
y_proba = np.array(outputs_proto.float_val).reshape(shape)
y_proba.round(2)


# ## Deploying a new model version

# In[30]:


np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28, 1]),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-2),
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10,
                    validation_data=(X_valid, y_valid))


# In[31]:


model_version = "0002"
model_name = "my_mnist_model"
model_path = os.path.join(model_name, model_version)
model_path


# In[32]:


tf.saved_model.save(model, model_path)


# In[33]:


# Print the saved model's directory tree, indented by directory depth.
for root, dirs, files in os.walk(model_name):
    indent = ' ' * root.count(os.sep)
    print('{}{}/'.format(indent, os.path.basename(root)))
    for filename in files:
        print('{}{}'.format(indent + ' ', filename))


# **Warning**: you may need to wait a moment until the new model has been loaded
# by TensorFlow Serving:

# In[34]:


import requests

SERVER_URL = 'http://localhost:8501/v1/models/my_mnist_model:predict'
# Bound the wait so an unresponsive server raises instead of blocking forever.
response = requests.post(SERVER_URL, data=input_data_json, timeout=10.0)
response.raise_for_status()
response = response.json()


# In[35]:


response.keys()


# In[36]:


y_proba = np.array(response["predictions"])
y_proba.round(2)


# # Deploying a model to Google Cloud AI Platform

# Follow the instructions in the book to deploy the model to Google Cloud AI
# Platform, then download the service account's private key and save it to the
# `my_service_account_private_key.json` file in the project directory. Also
# update `project_id`:

# In[37]:


project_id = "onyx-smoke-242003"


# In[38]:


import googleapiclient.discovery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "my_service_account_private_key.json"
model_id = "my_mnist_model"
model_path = "projects/{}/models/{}".format(project_id, model_id)
model_path += "/versions/v0001/" # if you want to run a specific version
ml_resource = googleapiclient.discovery.build("ml", "v1").projects()


# In[39]:


def predict(X):
    """Query the deployed model through `ml_resource` and return its outputs.

    Reads the module-level `model_path`, `ml_resource` and `output_name`.
    NOTE(review): `output_name` is not assigned in this cell; it is presumably
    the value set in the gRPC section -- confirm before running this cell alone.

    Args:
        X: Array of instances; sent as ``X.tolist()``.

    Returns:
        A NumPy array holding the `output_name` entry of each prediction.

    Raises:
        RuntimeError: if the response contains an ``"error"`` key.
    """
    # Named differently from the module-level `input_data_json` string so the
    # JSON payload used by the REST cells is not shadowed here.
    request_body = {"signature_name": "serving_default",
                    "instances": X.tolist()}
    request = ml_resource.predict(name=model_path, body=request_body)
    response = request.execute()
    if "error" in response:
        raise RuntimeError(response["error"])
    return np.array([pred[output_name] for pred in response["predictions"]])


# In[40]:


Y_probas = predict(X_new)
np.round(Y_probas, 2)


# # Using GPUs

# **Note**: `tf.test.is_gpu_available()` is deprecated. Use
# `tf.config.list_physical_devices('GPU')` instead.
# In[41]:


#tf.test.is_gpu_available() # deprecated
tf.config.list_physical_devices('GPU')


# In[42]:


tf.test.gpu_device_name()


# In[43]:


tf.test.is_built_with_cuda()


# In[44]:


from tensorflow.python.client.device_lib import list_local_devices

devices = list_local_devices()
devices


# # Distributed training

# In[45]:


keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)


# In[46]:


def create_model():
    """Build and return the (uncompiled) convolutional classifier.

    The network takes 28x28x1 inputs and ends in a 10-unit softmax layer.
    """
    # Options shared by all three convolutional layers.
    conv_options = {"activation": "relu", "padding": "same"}
    layers = [
        keras.layers.Conv2D(filters=64, kernel_size=7,
                            input_shape=[28, 28, 1], **conv_options),
        keras.layers.MaxPooling2D(pool_size=2),
        keras.layers.Conv2D(filters=128, kernel_size=3, **conv_options),
        keras.layers.Conv2D(filters=128, kernel_size=3, **conv_options),
        keras.layers.MaxPooling2D(pool_size=2),
        keras.layers.Flatten(),
        keras.layers.Dense(units=64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(units=10, activation='softmax'),
    ]
    return keras.models.Sequential(layers)


# In[47]:


batch_size = 100
model = create_model()
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-2),
    metrics=["accuracy"],
)
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_valid, y_valid),
    batch_size=batch_size,
)


# In[48]:


keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

distribution = tf.distribute.MirroredStrategy()

# Change the default all-reduce algorithm:
#distribution = tf.distribute.MirroredStrategy(
#    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())

# Specify the list of GPUs to use:
#distribution = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

# Use the central storage strategy instead:
#distribution = tf.distribute.experimental.CentralStorageStrategy()

#if IS_COLAB and "COLAB_TPU_ADDR" in os.environ:
#  tpu_address = "grpc://" + os.environ["COLAB_TPU_ADDR"]
#else:
#  tpu_address = ""
#resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
#tf.config.experimental_connect_to_cluster(resolver)
#tf.tpu.experimental.initialize_tpu_system(resolver)
#distribution = tf.distribute.experimental.TPUStrategy(resolver)

with distribution.scope():
    model = create_model()
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=keras.optimizers.SGD(learning_rate=1e-2),
                  metrics=["accuracy"])


# In[49]:


batch_size = 100 # must be divisible by the number of workers
model.fit(X_train, y_train, epochs=10,
          validation_data=(X_valid, y_valid), batch_size=batch_size)


# In[50]:


model.predict(X_new)


# Custom training loop:

# In[51]:


keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

K = keras.backend

distribution = tf.distribute.MirroredStrategy()

with distribution.scope():
    model = create_model()
    optimizer = keras.optimizers.SGD()

with distribution.scope():
    dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).repeat().batch(batch_size)
    # `make_dataset_iterator()` / `iterator.initialize()` are not part of the
    # TF 2 `tf.distribute.Strategy` API; distribute the dataset and iterate it
    # with a regular Python iterator instead.
    input_iterator = iter(distribution.experimental_distribute_dataset(dataset))


@tf.function
def _distributed_train_step(inputs):
    """Run one training step on every replica and return the summed loss."""
    def step_fn(inputs):
        X, y = inputs
        with tf.GradientTape() as tape:
            # training=True so that the Dropout layer is active while training.
            Y_proba = model(X, training=True)
            # Dividing by the global batch size makes the per-replica losses
            # add up to the mean loss over the whole batch.
            loss = K.sum(keras.losses.sparse_categorical_crossentropy(y, Y_proba)) / batch_size

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

    # `Strategy.run()` replaces the removed `experimental_run()`.
    per_replica_losses = distribution.run(step_fn, args=(inputs,))
    mean_loss = distribution.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_losses, axis=None)
    return mean_loss


def train_step():
    """Fetch the next distributed batch, train on it and return the loss."""
    return _distributed_train_step(next(input_iterator))


n_epochs = 10
with distribution.scope():
    for epoch in range(n_epochs):
        print("Epoch {}/{}".format(epoch + 1, n_epochs))
        for iteration in range(len(X_train) // batch_size):
            print("\rLoss: {:.3f}".format(train_step().numpy()), end="")
        print()


# ## Training across multiple servers

# A TensorFlow cluster is a group of TensorFlow processes running in parallel,
# usually on different servers, and talking to each other to complete some
# work, such as training or executing a neural network. Each TF process in the
# cluster is called a "task" (or a "TF server"). A task has an IP address, a
# port, and a type (also called its role or its job).
# The type can be `"worker"`, `"chief"`, `"ps"` (parameter server) or
# `"evaluator"`:
# * Each **worker** performs computations, usually on a machine with one or
#   more GPUs.
# * The **chief** performs computations as well, but it also handles extra work
#   such as writing TensorBoard logs or saving checkpoints. There is a single
#   chief in a cluster, and usually the first worker is the chief (i.e.,
#   worker #0).
# * A **parameter server** (ps) only holds variable values. It is usually a
#   CPU-only machine.
# * The **evaluator** takes care of evaluation. There is usually a single
#   evaluator in a cluster.
#
# The set of tasks that share the same type is often called a "job". For
# example, the "worker" job is the set of all workers.
#
# To start a TensorFlow cluster, you must first define it. This means
# specifying all the tasks (IP address, TCP port, and type). For example, the
# following cluster specification defines a cluster with three tasks (two
# workers and one parameter server). It is a dictionary with one key per job,
# and the values are lists of task addresses:

# In[52]:


cluster_spec = dict(
    worker=[
        "machine-a.example.com:2222",  # /job:worker/task:0
        "machine-b.example.com:2222",  # /job:worker/task:1
    ],
    ps=["machine-c.example.com:2222"],  # /job:ps/task:0
)


# Every task in the cluster may communicate with every other task, so make sure
# to configure your firewall to authorize all communications between these
# machines on these ports (it is usually simpler to use the same port on every
# machine).
#
# When a task is started, it needs to be told its type and its index (the task
# index is also called the task id). The common way to specify everything at
# once (the cluster spec and the current task's type and id) is to set the
# `TF_CONFIG` environment variable before starting the program. It must be a
# JSON-encoded dictionary containing the cluster specification (under the
# `"cluster"` key) and the type and index of the task to start (under the
# `"task"` key). For example, the following `TF_CONFIG` environment variable
# defines the same cluster as above, with two workers and one parameter server,
# and specifies that the task to start is worker #1:

# In[53]:


import os
import json

os.environ["TF_CONFIG"] = json.dumps(
    dict(cluster=cluster_spec, task=dict(type="worker", index=1))
)
os.environ["TF_CONFIG"]


# Some platforms (e.g., Google Cloud ML Engine) automatically set this
# environment variable for you.

# TensorFlow's `TFConfigClusterResolver` class reads the cluster specification
# from the environment variable:

# In[54]:


import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
resolver.cluster_spec()


# In[55]:


resolver.task_type


# In[56]:


resolver.task_id


# Now let's start a simple cluster with two worker tasks, both running on the
# local machine. We will use the `MultiWorkerMirroredStrategy` to train a model
# across these two tasks.
#
# The first step is to write the training code. Since this code will be used to
# run both workers, each in its own process, we write it to a separate Python
# file, `my_mnist_multiworker_task.py`.
# The code is relatively straightforward, but there are a couple of important
# things to note:
# * We create the `MultiWorkerMirroredStrategy` before doing anything else with
#   TensorFlow.
# * Only one of the workers takes care of TensorBoard logging and saving
#   checkpoints. As mentioned earlier, this worker is called the *chief*, and
#   by convention it is worker #0.

# In[57]:


# The cell body below is written verbatim to my_mnist_multiworker_task.py.
get_ipython().run_cell_magic('writefile', 'my_mnist_multiworker_task.py', '\nimport os\nimport numpy as np\nimport tensorflow as tf\nfrom tensorflow import keras\nimport time\n\n# 프로그램 시작 부분에\ndistribution = tf.distribute.MultiWorkerMirroredStrategy()\n\nresolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()\nprint("Starting task {}{}".format(resolver.task_type, resolver.task_id))\n\n# 워커 #0이 체크포인트 저장과 텐서보드 로깅을 수행합니다\nif resolver.task_id == 0:\n root_logdir = os.path.join(os.curdir, "my_mnist_multiworker_logs")\n run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")\n run_dir = os.path.join(root_logdir, run_id)\n callbacks = [\n keras.callbacks.TensorBoard(run_dir),\n keras.callbacks.ModelCheckpoint("my_mnist_multiworker_model.h5",\n save_best_only=True),\n ]\nelse:\n callbacks = []\n\n# MNIST 데이터셋을 로드하고 준비합니다\n(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\nX_train_full = X_train_full[..., np.newaxis] / 255.\nX_valid, X_train = X_train_full[:5000], X_train_full[5000:]\ny_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n\nwith distribution.scope():\n model = keras.models.Sequential([\n keras.layers.Conv2D(filters=64, kernel_size=7, activation="relu",\n padding="same", input_shape=[28, 28, 1]),\n keras.layers.MaxPooling2D(pool_size=2),\n keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu",\n padding="same"), \n keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu",\n padding="same"),\n keras.layers.MaxPooling2D(pool_size=2),\n keras.layers.Flatten(),\n keras.layers.Dense(units=64, activation=\'relu\'),\n keras.layers.Dropout(0.5),\n keras.layers.Dense(units=10, activation=\'softmax\'),\n ])\n model.compile(loss="sparse_categorical_crossentropy",\n optimizer=keras.optimizers.SGD(learning_rate=1e-2),\n metrics=["accuracy"])\n\nmodel.fit(X_train, y_train, validation_data=(X_valid, y_valid),\n epochs=10, callbacks=callbacks)\n')


# In a real-world application there would typically be a single worker per
# machine, but in this example we run both workers on the same machine. As a
# result (if there is a GPU), both workers will try to use all the available
# GPU RAM, which may lead to an out-of-memory error. To avoid this, you can use
# the `CUDA_VISIBLE_DEVICES` environment variable to assign a different GPU to
# each worker. Or you can simply disable GPU support, like this:

# In[58]:


os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


# We are now ready to start both workers, each in its own process, using
# Python's `subprocess` module. Before starting each process, we must set the
# `TF_CONFIG` environment variable appropriately, changing only the task index:

# In[59]:


import subprocess
import sys

cluster_spec = {"worker": ["127.0.0.1:9901", "127.0.0.1:9902"]}

for index, worker_address in enumerate(cluster_spec["worker"]):
    # Each child process inherits the environment as it is when it is started,
    # so TF_CONFIG is updated right before every launch.
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": cluster_spec,
        "task": {"type": "worker", "index": index}
    })
    # Launch with the interpreter running this notebook (a bare "python" may
    # resolve to a different installation) and without going through a shell.
    subprocess.Popen([sys.executable, "my_mnist_multiworker_task.py"])


# That's it! Our TensorFlow cluster is now running, but we can't see it in this
# notebook because it runs in separate processes (if you are running this
# notebook in Jupyter, you can see the worker logs in Jupyter's server logs).
#
# Since the chief (worker #0) writes TensorBoard logs, we can use TensorBoard
# to view the training progress. Run the following cell, then click on the
# settings button in the TensorBoard interface and check the "Reload data" box
# so that TensorBoard automatically refreshes every 30 seconds. Once the first
# training epoch is finished (which may take a few minutes) and TensorBoard
# refreshes, the SCALARS tab will appear. Click on this tab to view the
# progress of the model's training and validation accuracy.

# In[60]:


get_ipython().run_line_magic('load_ext', 'tensorboard')
get_ipython().run_line_magic('tensorboard', '--logdir=./my_mnist_multiworker_logs --port=6006')


# Once training is over, the best model checkpoint is saved to the
# `my_mnist_multiworker_model.h5` file. You can load it using
# `keras.models.load_model()` and use it for predictions:

# In[61]:


from tensorflow import keras

model = keras.models.load_model("my_mnist_multiworker_model.h5")
Y_pred = model.predict(X_new)
np.argmax(Y_pred, axis=-1)


# That's all for this chapter's notebook! I hope you found it useful. 😊

# # Exercise Solutions

# ## 1. to 8.
#
# See Appendix A.

# ## 9.
# _Exercise: Train a model (any model you like) and deploy it to TF Serving or
# Google Cloud AI Platform. Write the client code to query it using the REST
# API or the gRPC API. Update the model and deploy the new version. Your client
# code will now query the new version. Roll back to the first version._

# Follow the steps in the "Deploying TensorFlow models with TensorFlow Serving
# (TFS)" section above.

# ## 10.
# _Exercise: Train any model across multiple GPUs on the same machine using the
# `MirroredStrategy` (if you do not have access to GPUs, you can use Colab with
# a GPU runtime and create two virtual GPUs). Train the model again using the
# `CentralStorageStrategy` and compare the training time._

# Follow the steps in the [Distributed training](#분산-훈련) section above.

# ## 11.
# _Exercise: Train a small model on Google Cloud AI Platform, using black box
# hyperparameter tuning._

# Follow the instructions on pages 837-838 of the book. You can also read
# [this documentation](https://cloud.google.com/ai-platform/training/docs/hyperparameter-tuning-overview)
# and follow the example in this nice
# [blog post](https://towardsdatascience.com/how-to-do-bayesian-hyper-parameter-tuning-on-a-blackbox-model-882009552c6d)
# by Lak Lakshmanan.