from __future__ import print_function
import sys, os
import tempfile, urllib, zipfile
# Confirm that we're using Python 2 (urllib.urlretrieve below is the Python 2
# spelling of that API). Compare by value: `is` tests object identity and only
# appears to work for small ints because of CPython's integer caching.
assert sys.version_info.major == 2, 'Oops, not running Python 2'

# Set up some globals for our file paths. Everything lives under a fresh
# temporary directory created for this session.
BASE_DIR = tempfile.mkdtemp()
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'chicago_taxi_output')
TRAIN_DATA = os.path.join(DATA_DIR, 'train', 'data.csv')
EVAL_DATA = os.path.join(DATA_DIR, 'eval', 'data.csv')
SERVING_DATA = os.path.join(DATA_DIR, 'serving', 'data.csv')

# Download the zip file from GCP and unzip it.
# urlretrieve returns (local_filename, headers); only the filename is used.
# The name avoids shadowing the builtin `zip`.
zip_path, _headers = urllib.urlretrieve('https://storage.googleapis.com/tfx-colab-datasets/chicago_data.zip')

# Open the archive once and let the context manager close that same handle.
# Calling ZipFile(...).close() on a second, freshly opened object would leave
# the handle that performed the extraction unclosed.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(BASE_DIR)

print("Here's what we downloaded:")
!ls -lR {os.path.join(BASE_DIR, 'data')}
Here's what we downloaded: total 0 drwxr-xr-x 3 byeon staff 96 4 14 10:55 eval drwxr-xr-x 3 byeon staff 96 4 14 10:55 serving drwxr-xr-x 3 byeon staff 96 4 14 10:55 train /var/folders/f7/lrsclmhd6mx2hgq049xw8dv80000gn/T/tmp5hoTPP/data/eval: total 1256 -rw-r--r-- 1 byeon staff 641080 4 14 10:55 data.csv /var/folders/f7/lrsclmhd6mx2hgq049xw8dv80000gn/T/tmp5hoTPP/data/serving: total 32 -rw-r--r-- 1 byeon staff 12727 4 14 10:55 data.csv /var/folders/f7/lrsclmhd6mx2hgq049xw8dv80000gn/T/tmp5hoTPP/data/train: total 2504 -rw-r--r-- 1 byeon staff 1281866 4 14 10:55 data.csv
!pip2 install -q tensorflow_data_validation
import tensorflow_data_validation as tfdv
# Report the version of the TFDV package that was just imported.
version_banner = 'TFDV version: {}'.format(tfdv.version.__version__)
print(version_banner)
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
/usr/local/lib/python2.7/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. from ._conv import register_converters as _register_converters
TFDV version: 0.13.1
# Compute statistics over the training CSV; later cells compare other
# datasets against these and infer the schema from them.
train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATA)
WARNING:root:Couldn't find python-snappy so the implementation of _TFRecordUtil._masked_crc32c is not as fast as it could be.
# Render the training statistics on their own.
tfdv.visualize_statistics(lhs_statistics=train_stats)

# Infer a schema from the training statistics and show it as a table.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)
Feature name | Type | Presence | Valency | Domain |
---|---|---|---|---|
'fare' | FLOAT | required | | - |
'trip_start_hour' | INT | required | | - |
'pickup_census_tract' | BYTES | optional | | - |
'dropoff_census_tract' | FLOAT | optional | single | - |
'company' | STRING | optional | single | 'company' |
'trip_start_timestamp' | INT | required | | - |
'pickup_longitude' | FLOAT | required | | - |
'trip_start_month' | INT | required | | - |
'trip_miles' | FLOAT | required | | - |
'dropoff_longitude' | FLOAT | optional | single | - |
'dropoff_community_area' | FLOAT | optional | single | - |
'pickup_community_area' | INT | required | | - |
'payment_type' | STRING | required | | 'payment_type' |
'trip_seconds' | FLOAT | optional | single | - |
'trip_start_day' | INT | required | | - |
'tips' | FLOAT | required | | - |
'pickup_latitude' | FLOAT | required | | - |
'dropoff_latitude' | FLOAT | optional | single | - |
Domain | Values |
---|---|
'company' | '0118 - 42111 Godfrey S.Awir', '0694 - 59280 Chinesco Trans Inc', '1085 - 72312 N and W Cab Co', '2733 - 74600 Benny Jona', '2809 - 95474 C & D Cab Co Inc.', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3385 - 23210 Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '4197 - 41842 Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5006 - Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '5997 - AW Services Inc.', '6488 - 83287 Zuha Taxi', '6743 - Luhak Corp', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation' |
'payment_type' | 'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown' |
# Compute stats for evaluation data
eval_stats = tfdv.generate_statistics_from_csv(EVAL_DATA)

# Compare evaluation data with training data (eval on the left, training on
# the right).
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

# Check eval data for errors by validating the eval data stats using the
# previously inferred schema.
anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'payment_type' | Unexpected string values | Examples contain values missing from the schema: Prcard (<1%). |
'company' | Unexpected string values | Examples contain values missing from the schema: 2092 - 61288 Sbeih company (<1%), 2192 - 73487 Zeymane Corp (<1%), 2192 - Zeymane Corp (<1%), 2823 - 73307 Seung Lee (<1%), 3094 - 24059 G.L.B. Cab Co (<1%), 3319 - CD Cab Co (<1%), 3385 - Eman Cab (<1%), 3897 - 57856 Ilie Malec (<1%), 4053 - 40193 Adwar H. Nikola (<1%), 4197 - Royal Star (<1%), 585 - 88805 Valley Cab Co (<1%), 5874 - Sergey Cab Corp. (<1%), 6057 - 24657 Richard Addo (<1%), 6574 - Babylon Express Inc. (<1%), 6742 - 83735 Tasha ride inc (<1%). |
# Relax the minimum fraction of values that must come from the domain for feature company.
company = tfdv.get_feature(schema, 'company')
company.distribution_constraints.min_domain_mass = 0.9

# Add new value to the domain of feature payment_type.
# `value` is a repeated field, so an unconditional append would add another
# copy of 'Prcard' every time this cell is re-executed; only add it once.
payment_type_domain = tfdv.get_domain(schema, 'payment_type')
if 'Prcard' not in payment_type_domain.value:
    payment_type_domain.value.append('Prcard')

# Validate eval stats after updating the schema
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)
# Compute statistics for the serving data and validate them against the
# schema that was inferred from training data and then updated.
serving_stats = tfdv.generate_statistics_from_csv(data_location=SERVING_DATA)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats, schema=schema)
tfdv.display_anomalies(serving_anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'tips' | Column dropped | Column is completely missing |
'trip_seconds' | Expected data of type: FLOAT but got INT | |
# Regenerate the serving statistics, this time passing the schema and asking
# for feature types to be taken from it, then validate again.
options = tfdv.StatsOptions(infer_type_from_schema=True, schema=schema)
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=SERVING_DATA, stats_options=options)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats, schema=schema)
tfdv.display_anomalies(serving_anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'tips' | Column dropped | Column is completely missing |
# All features are by default in both TRAINING and SERVING environments.
# These are repeated fields: appending unconditionally would record each
# name again whenever this cell is re-executed, so add only what is missing.
for env_name in ('TRAINING', 'SERVING'):
    if env_name not in schema.default_environment:
        schema.default_environment.append(env_name)

# Specify that 'tips' feature is not in SERVING environment.
tips_feature = tfdv.get_feature(schema, 'tips')
if 'SERVING' not in tips_feature.not_in_environment:
    tips_feature.not_in_environment.append('SERVING')

# Validate the serving statistics in the SERVING environment.
serving_anomalies_with_env = tfdv.validate_statistics(
    serving_stats, schema, environment='SERVING')
tfdv.display_anomalies(serving_anomalies_with_env)
# Add skew comparator for 'payment_type' feature.
payment_type = tfdv.get_feature(schema, 'payment_type')
payment_type.skew_comparator.infinity_norm.threshold = 0.01

# Add drift comparator for 'company' feature.
company = tfdv.get_feature(schema, 'company')
company.drift_comparator.infinity_norm.threshold = 0.001

# Validate the training statistics, supplying the eval statistics as the
# "previous" dataset and the serving statistics as the serving dataset.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
    serving_statistics=serving_stats,
)
tfdv.display_anomalies(skew_anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'payment_type' | High Linfty distance between serving and training | The Linfty distance between serving and training is 0.0225 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: Credit Card |
'company' | High Linfty distance between current and previous | The Linfty distance between current and previous is 0.00820891 (up to six significant digits), above the threshold 0.001. The feature value with maximum difference is: Blue Ribbon Taxi Association Inc. |
from tensorflow.python.lib.io import file_io
from google.protobuf import text_format
# Write the final schema in text format to schema.pbtxt under OUTPUT_DIR,
# creating the output directory first.
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')
file_io.recursive_create_dir(OUTPUT_DIR)
tfdv.write_schema_text(schema, schema_file)
!cat {schema_file}
feature { name: "fare" type: FLOAT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "trip_start_hour" type: INT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "pickup_census_tract" type: BYTES presence { min_count: 0 } } feature { name: "dropoff_census_tract" value_count { min: 1 max: 1 } type: FLOAT presence { min_count: 1 } } feature { name: "company" value_count { min: 1 max: 1 } type: BYTES domain: "company" presence { min_count: 1 } distribution_constraints { min_domain_mass: 0.9 } drift_comparator { infinity_norm { threshold: 0.001 } } } feature { name: "trip_start_timestamp" type: INT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "pickup_longitude" type: FLOAT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "trip_start_month" type: INT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "trip_miles" type: FLOAT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "dropoff_longitude" value_count { min: 1 max: 1 } type: FLOAT presence { min_count: 1 } } feature { name: "dropoff_community_area" value_count { min: 1 max: 1 } type: FLOAT presence { min_count: 1 } } feature { name: "pickup_community_area" type: INT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "payment_type" type: BYTES domain: "payment_type" presence { min_fraction: 1.0 min_count: 1 } skew_comparator { infinity_norm { threshold: 0.01 } } shape { dim { size: 1 } } } feature { name: "trip_seconds" value_count { min: 1 max: 1 } type: FLOAT presence { min_count: 1 } } feature { name: "trip_start_day" type: INT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "tips" type: FLOAT presence { min_fraction: 1.0 min_count: 1 } not_in_environment: "SERVING" shape { dim { size: 1 } } } feature { name: 
"pickup_latitude" type: FLOAT presence { min_fraction: 1.0 min_count: 1 } shape { dim { size: 1 } } } feature { name: "dropoff_latitude" value_count { min: 1 max: 1 } type: FLOAT presence { min_count: 1 } } string_domain { name: "company" value: "0118 - 42111 Godfrey S.Awir" value: "0694 - 59280 Chinesco Trans Inc" value: "1085 - 72312 N and W Cab Co" value: "2733 - 74600 Benny Jona" value: "2809 - 95474 C & D Cab Co Inc." value: "3011 - 66308 JBL Cab Inc." value: "3152 - 97284 Crystal Abernathy" value: "3201 - C&D Cab Co Inc" value: "3201 - CID Cab Co Inc" value: "3253 - 91138 Gaither Cab Co." value: "3385 - 23210 Eman Cab" value: "3623 - 72222 Arrington Enterprises" value: "3897 - Ilie Malec" value: "4053 - Adwar H. Nikola" value: "4197 - 41842 Royal Star" value: "4615 - 83503 Tyrone Henderson" value: "4615 - Tyrone Henderson" value: "4623 - Jay Kim" value: "5006 - 39261 Salifu Bawa" value: "5006 - Salifu Bawa" value: "5074 - 54002 Ahzmi Inc" value: "5074 - Ahzmi Inc" value: "5129 - 87128" value: "5129 - 98755 Mengisti Taxi" value: "5129 - Mengisti Taxi" value: "5724 - KYVI Cab Inc" value: "585 - Valley Cab Co" value: "5864 - 73614 Thomas Owusu" value: "5864 - Thomas Owusu" value: "5874 - 73628 Sergey Cab Corp." value: "5997 - 65283 AW Services Inc." value: "5997 - AW Services Inc." value: "6488 - 83287 Zuha Taxi" value: "6743 - Luhak Corp" value: "Blue Ribbon Taxi Association Inc." value: "C & D Cab Co Inc" value: "Chicago Elite Cab Corp." value: "Chicago Elite Cab Corp. 
(Chicago Carriag" value: "Chicago Medallion Leasing INC" value: "Chicago Medallion Management" value: "Choice Taxi Association" value: "Dispatch Taxi Affiliation" value: "KOAM Taxi Association" value: "Northwest Management LLC" value: "Taxi Affiliation Services" value: "Top Cab Affiliation" } string_domain { name: "payment_type" value: "Cash" value: "Credit Card" value: "Dispute" value: "No Charge" value: "Pcard" value: "Unknown" value: "Prcard" } default_environment: "TRAINING" default_environment: "SERVING"