! mkdir -p census_data ! gsutil cp gs://cloudml-public/census/data/adult.data.csv census_data/adult.data.csv ! gsutil cp gs://cloudml-public/census/data/adult.test.csv census_data/adult.test.csv # list the contents of the data directory as a check !ls -l census_data ! head census_data/adult.data.csv TRAIN_FILES = ['census_data/adult.data.csv'] EVAL_FILES = ['census_data/adult.test.csv'] %env TRAIN_FILE=census_data/adult.data.csv %env EVAL_FILE=census_data/adult.test.csv from __future__ import division from __future__ import print_function import argparse import multiprocessing import os import time import tensorflow as tf from tensorflow.contrib.learn.python.learn.utils import ( saved_model_export_utils) print(tf.__version__) CSV_COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'] CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''], [0], [0], [0], [''], ['']] LABEL_COLUMN = 'income_bracket' LABELS = [' <=50K', ' >50K'] # Define the initial ingestion of each feature used by your model. # Additionally, provide metadata about the feature. INPUT_COLUMNS = [ # Categorical base columns # For categorical columns with known values we can provide lists # of values ahead of time. tf.feature_column.categorical_column_with_vocabulary_list( 'gender', [' Female', ' Male']), tf.feature_column.categorical_column_with_vocabulary_list( 'race', [' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White'] ), tf.feature_column.categorical_column_with_vocabulary_list( 'education', [' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th', ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th', ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th', ' Preschool', ' 12th']), tf.feature_column.categorical_column_with_vocabulary_list( 'marital_status', [' Married-civ-spouse', ' Divorced', ' Married-spouse-absent', ' Never-married', ' Separated', ' Married-AF-spouse', ' Widowed']), tf.feature_column.categorical_column_with_vocabulary_list( 'relationship', [' Husband', ' Not-in-family', ' Wife', ' Own-child', ' Unmarried', ' Other-relative']), tf.feature_column.categorical_column_with_vocabulary_list( 'workclass', [' Self-emp-not-inc', ' Private', ' State-gov', ' Federal-gov', ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'] ), # For columns with a large number of values, or unknown values # We can use a hash function to convert to categories. tf.feature_column.categorical_column_with_hash_bucket( 'occupation', hash_bucket_size=100, dtype=tf.string), tf.feature_column.categorical_column_with_hash_bucket( 'native_country', hash_bucket_size=100, dtype=tf.string), # Continuous base columns. tf.feature_column.numeric_column('age'), tf.feature_column.numeric_column('education_num'), tf.feature_column.numeric_column('capital_gain'), tf.feature_column.numeric_column('capital_loss'), tf.feature_column.numeric_column('hours_per_week'), ] # Now we'll define the unused columns-- those we won't use for this example. # In this case, there's just one: 'fnlwgt'. UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - {LABEL_COLUMN} print('unused columns: %s' % UNUSED_COLUMNS) def build_estimator(config, embedding_size=8, hidden_units=None): """Build a wide and deep model for predicting income category. """ (gender, race, education, marital_status, relationship, workclass, occupation, native_country, age, education_num, capital_gain, capital_loss, hours_per_week) = INPUT_COLUMNS # Continuous columns can be converted to categorical via bucketization age_buckets = tf.feature_column.bucketized_column( age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) # Wide columns and deep columns. wide_columns = [ # Interactions between different categorical features can also # be added as new virtual features. tf.feature_column.crossed_column( ['education', 'occupation'], hash_bucket_size=int(1e4)), tf.feature_column.crossed_column( [age_buckets, race, 'occupation'], hash_bucket_size=int(1e6)), tf.feature_column.crossed_column( ['native_country', 'occupation'], hash_bucket_size=int(1e4)), gender, native_country, education, occupation, workclass, marital_status, relationship, age_buckets, ] deep_columns = [ # Use indicator columns for low dimensional vocabularies tf.feature_column.indicator_column(workclass), tf.feature_column.indicator_column(education), tf.feature_column.indicator_column(marital_status), tf.feature_column.indicator_column(gender), tf.feature_column.indicator_column(relationship), tf.feature_column.indicator_column(race), # Use embedding columns for high dimensional vocabularies tf.feature_column.embedding_column( native_country, dimension=embedding_size), tf.feature_column.embedding_column(occupation, dimension=embedding_size), age, education_num, capital_gain, capital_loss, hours_per_week, ] return tf.estimator.DNNLinearCombinedClassifier( config=config, linear_feature_columns=wide_columns, dnn_feature_columns=deep_columns, dnn_hidden_units=hidden_units or [100, 70, 50, 25] ) output_dir = "census_%s" % (int(time.time())) print(output_dir) run_config = tf.estimator.RunConfig() run_config = run_config.replace(model_dir=output_dir) FIRST_LAYER_SIZE = 100 # Number of nodes in the first layer of the DNN NUM_LAYERS = 4 # Number of layers in the DNN SCALE_FACTOR = 0.7 # How quickly should the size of the layers in the DNN decay EMBEDDING_SIZE = 8 # Number of embedding dimensions for categorical columns estimator = build_estimator( embedding_size=EMBEDDING_SIZE, # Construct layers sizes with exponential decay hidden_units=[ max(2, int(FIRST_LAYER_SIZE * SCALE_FACTOR**i)) for i in range(NUM_LAYERS) ], config=run_config ) def parse_label_column(label_string_tensor): """Parses a string tensor into the label tensor """ # Build a Hash Table inside the graph table = tf.contrib.lookup.index_table_from_tensor(tf.constant(LABELS)) # Use the hash table to convert string labels to ints and one-hot encode return table.lookup(label_string_tensor) def parse_csv(rows_string_tensor): """Takes the string input tensor and returns a dict of rank-2 tensors.""" # Takes a rank-1 tensor and converts it into rank-2 tensor # Example if the data is ['csv,line,1', 'csv,line,2', ..] to # [['csv,line,1'], ['csv,line,2']] which after parsing will result in a # tuple of tensors: [['csv'], ['csv']], [['line'], ['line']], [[1], [2]] row_columns = tf.expand_dims(rows_string_tensor, -1) columns = tf.decode_csv(row_columns, record_defaults=CSV_COLUMN_DEFAULTS) features = dict(zip(CSV_COLUMNS, columns)) # Remove unused columns for col in UNUSED_COLUMNS: features.pop(col) return features # This function returns a (features, indices) tuple, where features is a dictionary of # Tensors, and indices is a single Tensor of label indices. def input_fn(filenames, num_epochs=None, shuffle=True, skip_header_lines=0, batch_size=200): """Generates features and labels for training or evaluation. """ dataset = tf.data.TextLineDataset(filenames).skip(skip_header_lines).map(parse_csv) if shuffle: dataset = dataset.shuffle(buffer_size=batch_size * 10) dataset = dataset.repeat(num_epochs) dataset = dataset.batch(batch_size) iterator = dataset.make_one_shot_iterator() features = iterator.get_next() return features, parse_label_column(features.pop(LABEL_COLUMN)) train_input = lambda: input_fn( TRAIN_FILES, batch_size=40 ) # Don't shuffle evaluation data eval_input = lambda: input_fn( EVAL_FILES, batch_size=40, shuffle=False ) train_spec = tf.estimator.TrainSpec(train_input, max_steps=1000 ) def json_serving_input_fn(): """Build the serving inputs.""" inputs = {} for feat in INPUT_COLUMNS: inputs[feat.name] = tf.placeholder(shape=[None], dtype=feat.dtype) return tf.estimator.export.ServingInputReceiver(inputs, inputs) exporter = tf.estimator.FinalExporter('census', json_serving_input_fn) eval_spec = tf.estimator.EvalSpec(eval_input, steps=100, exporters=[exporter], name='census-eval' ) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) # List the directory that contains the model. You'll use this info in the next section too. !ls -R $output_dir/export/census # Now, view the model signature # This is an example. Edit this command to use your own directory path !saved_model_cli show --dir $output_dir/export/census/ --tag serve --signature_def predict ! cat test.json # This is an example. Edit this command to use your own directory path. ! gcloud ml-engine local predict --model-dir $output_dir/export/census/ --json-instances test.json output_dir = "census_%s" % (int(time.time())) %env OUTPUT_DIR=$output_dir ! gcloud ml-engine local train --package-path trainer \ --module-name trainer.task \ -- \ --train-files $TRAIN_FILE \ --eval-files $EVAL_FILE \ --train-steps 1000 \ --job-dir $OUTPUT_DIR \ --eval-steps 100 job_name = "census_job_%s" % (int(time.time())) # Edit the following to point to your GCS bucket directory gcs_job_dir = "gs://your-gcs-bucket/path/%s" % job_name # For training on CMLE, we'll use datasets stored in Google Cloud Storage (GCS) instead of local files. %env GCS_TRAIN_FILE=gs://cloudml-public/census/data/adult.data.csv %env GCS_EVAL_FILE=gs://cloudml-public/census/data/adult.test.csv %env SCALE_TIER=STANDARD_1 %env JOB_NAME=$job_name %env GCS_JOB_DIR=$gcs_job_dir # submit your distributed training job to CMLE !gcloud ml-engine jobs submit training $JOB_NAME --scale-tier $SCALE_TIER \ --runtime-version 1.4 --job-dir $GCS_JOB_DIR \ --module-name trainer.task --package-path trainer/ \ --region us-central1 \ -- --train-steps 10000 --train-files $GCS_TRAIN_FILE --eval-files $GCS_EVAL_FILE --eval-steps 100 # Run this when the training job is finished. Look for the directory with the 'saved_model.pb' file. !gsutil ls -R $GCS_JOB_DIR # This is just an example. # Edit this path to point to the GCS directory that contains your saved_model.pb binary %env MODEL_BINARY=gs://$gcs_job_dir/export/census// !gcloud ml-engine models create census --regions us-central1 !gcloud ml-engine models list !gcloud ml-engine versions list --model census !gcloud ml-engine versions create v1 --model census --origin $MODEL_BINARY --runtime-version 1.4 # Use your deployed model for prediction !cat test.json !gcloud ml-engine predict --model census --version v1 --json-instances test.json !cat config_custom_gpus.yaml job_name = "census_job_%s" % (int(time.time())) # Edit the following to point to your GCS bucket directory gcs_job_dir = "gs://your-gcs-bucket/path/%s" % job_name %env GCS_TRAIN_FILE=gs://cloudml-public/census/data/adult.data.csv %env GCS_EVAL_FILE=gs://cloudml-public/census/data/adult.test.csv %env SCALE_TIER=CUSTOM %env JOB_NAME=$job_name %env GCS_JOB_DIR=$gcs_job_dir !gcloud ml-engine jobs submit training $JOB_NAME --scale-tier $SCALE_TIER \ --runtime-version 1.4 --job-dir $GCS_JOB_DIR \ --module-name trainer.task --package-path trainer/ \ --region us-central1 --config config_custom_gpus.yaml \ -- --train-steps 15000 --train-files $GCS_TRAIN_FILE --eval-files $GCS_EVAL_FILE --eval-steps 100 job_name = "census_job_%s" % (int(time.time())) # Edit the following to point to your GCS bucket directory gcs_job_dir = "gs://your-gcs-bucket/path/%s" % job_name %env GCS_TRAIN_FILE=gs://cloudml-public/census/data/adult.data.csv %env GCS_EVAL_FILE=gs://cloudml-public/census/data/adult.test.csv %env SCALE_TIER=STANDARD_1 %env JOB_NAME=$job_name %env GCS_JOB_DIR=$gcs_job_dir # We'll use the `hptuning_config.yaml` file for this run. !cat hptuning_config.yaml !gcloud ml-engine jobs submit training $JOB_NAME --scale-tier $SCALE_TIER \ --runtime-version 1.4 --job-dir $GCS_JOB_DIR \ --module-name trainer.task --package-path trainer/ \ --region us-central1 --config hptuning_config.yaml \ -- --train-steps 15000 --train-files $GCS_TRAIN_FILE --eval-files $GCS_EVAL_FILE --eval-steps 100