! mkdir -p census_data
! gsutil cp gs://cloudml-public/census/data/adult.data.csv census_data/adult.data.csv
! gsutil cp gs://cloudml-public/census/data/adult.test.csv census_data/adult.test.csv

# list the contents of the data directory as a check
!ls -l census_data
! head census_data/adult.data.csv

TRAIN_FILES = ['census_data/adult.data.csv']
EVAL_FILES  = ['census_data/adult.test.csv']

%env TRAIN_FILE=census_data/adult.data.csv
%env EVAL_FILE=census_data/adult.test.csv

from __future__ import division
from __future__ import print_function

import argparse
import multiprocessing
import os
import time

import tensorflow as tf
from tensorflow.contrib.learn.python.learn.utils import (
    saved_model_export_utils)
print(tf.__version__)

CSV_COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', 'income_bracket']
CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]
LABEL_COLUMN = 'income_bracket'
LABELS = [' <=50K', ' >50K']

# Define the initial ingestion of each feature used by your model.
# Additionally, provide metadata about the feature.
INPUT_COLUMNS = [
    # Categorical base columns

    # For categorical columns with known values we can provide lists
    # of values ahead of time.
    tf.feature_column.categorical_column_with_vocabulary_list(
        'gender', [' Female', ' Male']),

    tf.feature_column.categorical_column_with_vocabulary_list(
        'race',
        [' Amer-Indian-Eskimo', ' Asian-Pac-Islander',
         ' Black', ' Other', ' White']
    ),
    tf.feature_column.categorical_column_with_vocabulary_list(
        'education',
        [' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
         ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
         ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th',
         ' 1st-4th', ' Preschool', ' 12th']),
    tf.feature_column.categorical_column_with_vocabulary_list(
        'marital_status',
        [' Married-civ-spouse', ' Divorced', ' Married-spouse-absent',
         ' Never-married', ' Separated', ' Married-AF-spouse', ' Widowed']),
    tf.feature_column.categorical_column_with_vocabulary_list(
        'relationship',
        [' Husband', ' Not-in-family', ' Wife', ' Own-child', ' Unmarried',
         ' Other-relative']),
    tf.feature_column.categorical_column_with_vocabulary_list(
        'workclass',
        [' Self-emp-not-inc', ' Private', ' State-gov',
         ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc',
         ' Without-pay', ' Never-worked']
    ),

    # For columns with a large number of values, or unknown values
    # We can use a hash function to convert to categories.
    tf.feature_column.categorical_column_with_hash_bucket(
        'occupation', hash_bucket_size=100, dtype=tf.string),
    tf.feature_column.categorical_column_with_hash_bucket(
        'native_country', hash_bucket_size=100, dtype=tf.string),

    # Continuous base columns.
    tf.feature_column.numeric_column('age'),
    tf.feature_column.numeric_column('education_num'),
    tf.feature_column.numeric_column('capital_gain'),
    tf.feature_column.numeric_column('capital_loss'),
    tf.feature_column.numeric_column('hours_per_week'),
]

# Now we'll define the unused columns-- those we won't use for this example.
# In this case, there's just one: 'fnlwgt'.
UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - {LABEL_COLUMN}
print('unused columns: %s' % UNUSED_COLUMNS)
    

def build_estimator(config, embedding_size=8, hidden_units=None):
  """Build a wide and deep model for predicting income category.
  """
  (gender, race, education, marital_status, relationship,
   workclass, occupation, native_country, age,
   education_num, capital_gain, capital_loss, hours_per_week) = INPUT_COLUMNS

  # Continuous columns can be converted to categorical via bucketization
  age_buckets = tf.feature_column.bucketized_column(
      age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

  # Wide columns and deep columns.
  wide_columns = [
      # Interactions between different categorical features can also
      # be added as new virtual features.
      tf.feature_column.crossed_column(
          ['education', 'occupation'], hash_bucket_size=int(1e4)),
      tf.feature_column.crossed_column(
          [age_buckets, race, 'occupation'], hash_bucket_size=int(1e6)),
      tf.feature_column.crossed_column(
          ['native_country', 'occupation'], hash_bucket_size=int(1e4)),
      gender,
      native_country,
      education,
      occupation,
      workclass,
      marital_status,
      relationship,
      age_buckets,
  ]

  deep_columns = [
      # Use indicator columns for low dimensional vocabularies
      tf.feature_column.indicator_column(workclass),
      tf.feature_column.indicator_column(education),
      tf.feature_column.indicator_column(marital_status),
      tf.feature_column.indicator_column(gender),
      tf.feature_column.indicator_column(relationship),
      tf.feature_column.indicator_column(race),

      # Use embedding columns for high dimensional vocabularies
      tf.feature_column.embedding_column(
          native_country, dimension=embedding_size),
      tf.feature_column.embedding_column(occupation, dimension=embedding_size),
      age,
      education_num,
      capital_gain,
      capital_loss,
      hours_per_week,
  ]

  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=wide_columns,
      dnn_feature_columns=deep_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25]
  )


output_dir = "census_%s" % (int(time.time()))
print(output_dir)

run_config = tf.estimator.RunConfig()
run_config = run_config.replace(model_dir=output_dir)

FIRST_LAYER_SIZE = 100  # Number of nodes in the first layer of the DNN
NUM_LAYERS = 4  # Number of layers in the DNN
SCALE_FACTOR = 0.7  # How quickly should the size of the layers in the DNN decay
EMBEDDING_SIZE = 8  # Number of embedding dimensions for categorical columns

estimator = build_estimator(
    embedding_size=EMBEDDING_SIZE,
    # Construct layers sizes with exponential decay
    hidden_units=[
        max(2, int(FIRST_LAYER_SIZE *
                   SCALE_FACTOR**i))
        for i in range(NUM_LAYERS)
    ],
    config=run_config
)

def parse_label_column(label_string_tensor):
  """Parses a string tensor into the label tensor
  """
  # Build a Hash Table inside the graph
  table = tf.contrib.lookup.index_table_from_tensor(tf.constant(LABELS))

  # Use the hash table to convert string labels to ints and one-hot encode
  return table.lookup(label_string_tensor)

def parse_csv(rows_string_tensor):
  """Takes the string input tensor and returns a dict of rank-2 tensors."""

  # Takes a rank-1 tensor and converts it into rank-2 tensor
  # Example if the data is ['csv,line,1', 'csv,line,2', ..] to
  # [['csv,line,1'], ['csv,line,2']] which after parsing will result in a
  # tuple of tensors: [['csv'], ['csv']], [['line'], ['line']], [[1], [2]]
  row_columns = tf.expand_dims(rows_string_tensor, -1)
  columns = tf.decode_csv(row_columns, record_defaults=CSV_COLUMN_DEFAULTS)
  features = dict(zip(CSV_COLUMNS, columns))

  # Remove unused columns
  for col in UNUSED_COLUMNS:
    features.pop(col)
  return features

# This function returns a (features, indices) tuple, where features is a dictionary of
# Tensors, and indices is a single Tensor of label indices.
def input_fn(filenames,
                      num_epochs=None,
                      shuffle=True,
                      skip_header_lines=0,
                      batch_size=200):
  """Generates features and labels for training or evaluation.
  """

  dataset = tf.data.TextLineDataset(filenames).skip(skip_header_lines).map(parse_csv)

  if shuffle:
    dataset = dataset.shuffle(buffer_size=batch_size * 10)
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  iterator = dataset.make_one_shot_iterator()
  features = iterator.get_next()
  return features, parse_label_column(features.pop(LABEL_COLUMN))

train_input = lambda: input_fn(
    TRAIN_FILES,
    batch_size=40
)

# Don't shuffle evaluation data
eval_input = lambda: input_fn(
    EVAL_FILES,
    batch_size=40,
    shuffle=False
)

train_spec = tf.estimator.TrainSpec(train_input,
                                  max_steps=1000
                                  )

def json_serving_input_fn():
  """Build the serving inputs."""
  inputs = {}
  for feat in INPUT_COLUMNS:
    inputs[feat.name] = tf.placeholder(shape=[None], dtype=feat.dtype)

  return tf.estimator.export.ServingInputReceiver(inputs, inputs)

exporter = tf.estimator.FinalExporter('census',
      json_serving_input_fn)
eval_spec = tf.estimator.EvalSpec(eval_input,
                                steps=100,
                                exporters=[exporter],
                                name='census-eval'
                                )

tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

# List the directory that contains the model.  You'll use this info in the next section too.
!ls -R $output_dir/export/census

# Now, view the model signature
# This is an example.  Edit this command to use your own directory path
!saved_model_cli show --dir $output_dir/export/census/<timestamp> --tag serve --signature_def predict

! cat test.json
# This is an example. Edit this command to use your own directory path.
! gcloud ml-engine local predict --model-dir $output_dir/export/census/<timestamp> --json-instances test.json

output_dir = "census_%s" % (int(time.time()))
%env OUTPUT_DIR=$output_dir
! gcloud ml-engine local train --package-path trainer \
                           --module-name trainer.task \
                           -- \
                           --train-files $TRAIN_FILE \
                           --eval-files $EVAL_FILE \
                           --train-steps 1000 \
                           --job-dir $OUTPUT_DIR \
                           --eval-steps 100

job_name = "census_job_%s" % (int(time.time()))

# Edit the following to point to your GCS bucket directory
gcs_job_dir = "gs://your-gcs-bucket/path/%s" % job_name
# For training on CMLE, we'll use datasets stored in Google Cloud Storage (GCS) instead of local files.
%env GCS_TRAIN_FILE=gs://cloudml-public/census/data/adult.data.csv
%env GCS_EVAL_FILE=gs://cloudml-public/census/data/adult.test.csv
%env SCALE_TIER=STANDARD_1
%env JOB_NAME=$job_name
%env GCS_JOB_DIR=$gcs_job_dir

# submit your distributed training job to CMLE
!gcloud ml-engine jobs submit training $JOB_NAME --scale-tier $SCALE_TIER \
    --runtime-version 1.4 --job-dir $GCS_JOB_DIR \
    --module-name trainer.task --package-path trainer/ \
    --region us-central1 \
    -- --train-steps 10000 --train-files $GCS_TRAIN_FILE --eval-files $GCS_EVAL_FILE --eval-steps 100      

# Run this when the training job is finished.  Look for the directory with the 'saved_model.pb' file.
!gsutil ls -R $GCS_JOB_DIR

# This is just an example.
# Edit this path to point to the GCS directory that contains your saved_model.pb binary
%env MODEL_BINARY=gs://$gcs_job_dir/export/census/<timestamp>/

!gcloud ml-engine models create census --regions us-central1
!gcloud ml-engine models list
!gcloud ml-engine versions list --model census

!gcloud ml-engine versions create v1 --model census --origin $MODEL_BINARY --runtime-version 1.4

# Use your deployed model for prediction
!cat test.json
!gcloud ml-engine predict --model census --version v1 --json-instances test.json

!cat config_custom_gpus.yaml

job_name = "census_job_%s" % (int(time.time()))
# Edit the following to point to your GCS bucket directory
gcs_job_dir = "gs://your-gcs-bucket/path/%s" % job_name
%env GCS_TRAIN_FILE=gs://cloudml-public/census/data/adult.data.csv
%env GCS_EVAL_FILE=gs://cloudml-public/census/data/adult.test.csv
%env SCALE_TIER=CUSTOM
%env JOB_NAME=$job_name
%env GCS_JOB_DIR=$gcs_job_dir

!gcloud ml-engine jobs submit training $JOB_NAME --scale-tier $SCALE_TIER \
    --runtime-version 1.4 --job-dir $GCS_JOB_DIR \
    --module-name trainer.task --package-path trainer/ \
    --region us-central1 --config config_custom_gpus.yaml \
    -- --train-steps 15000 --train-files $GCS_TRAIN_FILE --eval-files $GCS_EVAL_FILE --eval-steps 100  
    

job_name = "census_job_%s" % (int(time.time()))
# Edit the following to point to your GCS bucket directory
gcs_job_dir = "gs://your-gcs-bucket/path/%s" % job_name
%env GCS_TRAIN_FILE=gs://cloudml-public/census/data/adult.data.csv
%env GCS_EVAL_FILE=gs://cloudml-public/census/data/adult.test.csv
%env SCALE_TIER=STANDARD_1
%env JOB_NAME=$job_name
%env GCS_JOB_DIR=$gcs_job_dir

# We'll use the `hptuning_config.yaml` file for this run.
!cat hptuning_config.yaml

!gcloud ml-engine jobs submit training $JOB_NAME --scale-tier $SCALE_TIER \
    --runtime-version 1.4 --job-dir $GCS_JOB_DIR \
    --module-name trainer.task --package-path trainer/ \
    --region us-central1 --config hptuning_config.yaml \
    -- --train-steps 15000 --train-files $GCS_TRAIN_FILE --eval-files $GCS_EVAL_FILE --eval-steps 100