anton@xavier:~$ sudo -H python3 -m pip install jetson-stats -U ... Successfully installed jetson-stats-3.0.3
anton@xavier:~$ jetson_release - NVIDIA Jetson AGX Xavier [16GB] * Jetpack 4.5 [L4T 32.5.0] * NV Power Mode: MAXN - Type: 0 * jetson_stats.service: active - Libraries: * CUDA: 10.2.89 * cuDNN: 8.0.0.180 * TensorRT: 7.1.3.0 * Visionworks: 1.6.0.501 * OpenCV: 4.1.1 compiled CUDA: NO * VPI: ii libnvvpi1 1.0.12 arm64 NVIDIA Vision Programming Interface library * Vulkan: 1.2.70
anton@xavier:~$ cat /etc/lsb-release DISTRIB_ID=Ubuntu DISTRIB_RELEASE=18.04 DISTRIB_CODENAME=bionic DISTRIB_DESCRIPTION="Ubuntu 18.04.5 LTS"
anton@xavier:~$ lscpu Architecture: aarch64 Byte Order: Little Endian CPU(s): 8 On-line CPU(s) list: 0-7 Thread(s) per core: 1 Core(s) per socket: 2 Socket(s): 4 Vendor ID: Nvidia Model: 0 Model name: ARMv8 Processor rev 0 (v8l) Stepping: 0x0 CPU max MHz: 2265.6001 CPU min MHz: 115.2000 BogoMIPS: 62.50 L1d cache: 64K L1i cache: 128K L2 cache: 2048K L3 cache: 4096K Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp
anton@xavier:~$ sudo jetson_clocks anton@xavier:~$ sudo jetson_clocks --show SOC family:tegra194 Machine:Jetson-AGX Online CPUs: 0-7 cpu0: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu1: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu2: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu3: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu4: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu5: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu6: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 cpu7: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0 GPU MinFreq=1377000000 MaxFreq=1377000000 CurrentFreq=1377000000 EMC MinFreq=204000000 MaxFreq=2133000000 CurrentFreq=2133000000 FreqOverride=1 Fan: PWM=0 NV Power Mode: MAXN
anton@xavier:~$ ck compile program:tool-print-cuda-devices anton@xavier:~$ ck run program:tool-print-cuda-devices ... GPU Device ID: 0 GPU Name: Xavier GPU compute capability: 7.2 CUDA driver version: 10.2 CUDA runtime version: 10.2 Global memory: 33479647232 Max clock rate: 1377.000000 MHz Total amount of shared memory per block: 49152 Total number of registers available per block: 65536 Warp size: 32 Maximum number of threads per multiprocessor: 2048 Maximum number of threads per block: 1024 Max dimension size of a thread block X: 1024 Max dimension size of a thread block Y: 1024 Max dimension size of a thread block Z: 64 Max dimension size of a grid size X: 2147483647 Max dimension size of a grid size Y: 65535 Max dimension size of a grid size Z: 65535
anton@xavier:~$ df -h Filesystem Size Used Avail Use% Mounted on /dev/mmcblk0p1 28G 17G 11G 62% / none 16G 0 16G 0% /dev tmpfs 16G 52K 16G 1% /dev/shm tmpfs 16G 30M 16G 1% /run tmpfs 5.0M 4.0K 5.0M 1% /run/lock tmpfs 16G 0 16G 0% /sys/fs/cgroup /dev/mmcblk1p1 361G 300G 42G 88% /sd tmpfs 3.2G 12K 3.2G 1% /run/user/120 tmpfs 3.2G 0 3.2G 0% /run/user/1000 tmpfs 3.2G 0 3.2G 0% /run/user/1004
anton@xavier:~$ git clone https://github.com/mlcommons/inference_results_v0.7.git
We reused the datasets we generated while reproducing the v0.5 results.
anton@xavier:~$ export MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path anton@xavier:~$ mkdir $MLPERF_SCRATCH_PATH anton@xavier:~$ cd $MLPERF_SCRATCH_PATH anton@xavier:/datasets/mlperf_scratch_path$ ln -s /datasets/inference_results_v0.5-nvidia/closed/NVIDIA/build/preprocessed_data preprocessed_data anton@xavier:/datasets/mlperf_scratch_path$ ln -s /datasets/inference_results_v0.5-nvidia/closed/NVIDIA/build/data data anton@xavier:/datasets/mlperf_scratch_path$ ls -la /datasets/mlperf_scratch_path/ total 20 drwxrwsr-x 3 anton dvdt 4096 Jan 28 11:10 . drwxrwsr-x 17 root dvdt 4096 Jan 28 09:53 .. lrwxrwxrwx 1 anton dvdt 64 Jan 28 11:10 data -> /datasets/inference_results_v0.5-nvidia/closed/NVIDIA/build/data drwxrwsr-x 5 anton dvdt 4096 Jan 28 09:54 models lrwxrwxrwx 1 anton dvdt 77 Jan 28 11:10 preprocessed_data -> /datasets/inference_results_v0.5-nvidia/closed/NVIDIA/build/preprocessed_data
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ git diff Makefile diff --git a/closed/NVIDIA/Makefile b/closed/NVIDIA/Makefile index 4679ec29..5bdf6a59 100644 --- a/closed/NVIDIA/Makefile +++ b/closed/NVIDIA/Makefile @@ -218,7 +218,7 @@ endif ############################## DOWNLOAD_MODEL ############################## -BENCHMARKS = resnet50 ssd-resnet34 ssd-mobilenet bert dlrm rnnt 3d-unet +BENCHMARKS = resnet50 ssd-resnet34 ssd-mobilenet bert rnnt # dlrm 3d-unet .PHONY: download_model download_model: link_dirs
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path make download_model
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ git diff code/harness/ diff --git a/closed/NVIDIA/code/harness/lwis/include/lwis_buffers.h b/closed/NVIDIA/code/harness/lwis/include/lwis_buffers.h index 5a79260c..4a9e52de 100644 --- a/closed/NVIDIA/code/harness/lwis/include/lwis_buffers.h +++ b/closed/NVIDIA/code/harness/lwis/include/lwis_buffers.h @@ -80,10 +80,10 @@ inline int64_t volume(const nvinfer1::Dims& d, const nvinfer1::TensorFormat& for case nvinfer1::TensorFormat::kCHW2: spv = 2; channelDim = d_new.nbDims - 3; break; case nvinfer1::TensorFormat::kCHW4: spv = 4; channelDim = d_new.nbDims - 3; break; case nvinfer1::TensorFormat::kHWC8: spv = 8; channelDim = d_new.nbDims - 3; break; - case nvinfer1::TensorFormat::kDHWC8: spv = 8; channelDim = d_new.nbDims - 4; break; + //case nvinfer1::TensorFormat::kDHWC8: spv = 8; channelDim = d_new.nbDims - 4; break; case nvinfer1::TensorFormat::kCHW16: spv = 16; channelDim = d_new.nbDims - 3; break; case nvinfer1::TensorFormat::kCHW32: spv = 32; channelDim = d_new.nbDims - 3; break; - case nvinfer1::TensorFormat::kCDHW32: spv = 32; channelDim = d_new.nbDims - 4; break; + //case nvinfer1::TensorFormat::kCDHW32: spv = 32; channelDim = d_new.nbDims - 4; break; case nvinfer1::TensorFormat::kLINEAR: default: spv = 1; channelDim = -1; break; }
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ make build ...
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ ls -la build/ total 20 drwxrwsr-x 5 anton dvdt 4096 Jan 28 16:06 . drwxrwsr-x 12 anton dvdt 4096 Jan 28 09:53 .. lrwxrwxrwx 1 anton dvdt 34 Jan 28 09:54 data -> /datasets/mlperf_scratch_path/data drwxrwsr-x 3 anton dvdt 4096 Jan 28 11:10 engines drwxrwsr-x 16 anton dvdt 4096 Jan 28 16:05 inference lrwxrwxrwx 1 anton dvdt 36 Jan 28 09:54 models -> /datasets/mlperf_scratch_path/models drwxrwsr-x 3 anton dvdt 4096 Jan 28 16:06 plugins lrwxrwxrwx 1 anton dvdt 47 Jan 28 09:54 preprocessed_data -> /datasets/mlperf_scratch_path/preprocessed_data
anton@xavier:~$ grep AGX_Xavier /datasets/inference_results_v0.7/closed/NVIDIA/configs/resnet50/Offline/config.json -A 17 "AGX_Xavier": { "concurrent_offline_expected_qps": 2181, "dla_batch_size": 32, "dla_copy_streams": 1, "dla_core": 0, "dla_inference_streams": 1, "dla_offline_expected_qps": 396, "gpu_batch_size": 64, "gpu_copy_streams": 1, "gpu_inference_streams": 1, "gpu_offline_expected_qps": 1478.33, "input_dtype": "int8", "input_format": "linear", "map_path": "data_maps/imagenet/val_map.txt", "precision": "int8", "tensor_path": "${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear", "use_graphs": false },
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make generate_engines RUN_ARGS="--benchmarks=resnet50 --scenarios=offline" ... [2021-01-29 08:04:08,685 main.py:153 INFO] Finished building engines for resnet50 benchmark in Offline scenario. Time taken to generate engines: 65.47717833518982 seconds real 1m6.612s user 0m24.812s sys 0m7.088s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ find . \ -name resnet50-Offline*.plan -exec du -hs {} \; 48M ./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-gpu-b64-int8.default.plan 33M ./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-dla-b32-int8.default.plan
Samples per second: 2074.53 Result is : VALID
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=resnet50 --scenarios=offline --test_mode=PerformanceOnly" ... ================================================ MLPerf Results Summary ================================================ SUT name : LWIS_Server Scenario : Offline Mode : Performance Samples per second: 2087.07 Result is : VALID Min duration satisfied : Yes Min queries satisfied : Yes ================================================ Additional Stats ================================================ Min latency (ns) : 47041359 Max latency (ns) : 68970284749 Mean latency (ns) : 34509141573 50.00 percentile latency (ns) : 34507536580 90.00 percentile latency (ns) : 62052439865 95.00 percentile latency (ns) : 65528745973 97.00 percentile latency (ns) : 66902483725 99.00 percentile latency (ns) : 68276603731 99.90 percentile latency (ns) : 68878743387 ================================================ Test Parameters Used ================================================ samples_per_query : 143946 target_qps : 2181 target_latency (ns): 0 max_async_queries : 1 min_duration (ms): 60000 max_duration (ms): 0 min_query_count : 1 max_query_count : 0 qsl_rng_seed : 12786827339337101903 sample_index_rng_seed : 12640797754436136668 schedule_rng_seed : 3135815929913719677 accuracy_log_rng_seed : 0 accuracy_log_probability : 0 accuracy_log_sampling_target : 0 print_timestamps : false performance_issue_unique : false performance_issue_same : false performance_issue_same_index : 0 performance_sample_count : 2048 No warnings encountered during test. No errors encountered during test. Finished running actual test. 
Device Device:0 processed: 1 batches of size 10 1551 batches of size 64 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 1408 BatchedCudaMemcpy Calls: 1530 Device Device:0.DLA-0 processed: 696 batches of size 32 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 Device Device:0.DLA-1 processed: 700 batches of size 32 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-02-10 22:03:53,421 main.py:341 INFO] Result: Samples per second: 2087.07 and Result is : VALID ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-Offline: resnet50: Samples per second: 2087.07 and Result is : VALID ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-Offline: resnet50: No accuracy results in PerformanceOnly mode. real 1m24.386s user 1m13.924s sys 0m5.764s
accuracy=76.004%, good=38002, total=50000 hash=745b138bf6552d30a42b23fde8f2519867d16a6ca319aad954e4dae897bf452c
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=resnet50 --scenarios=offline --test_mode=AccuracyOnly" [2021-01-28 23:26:07,164 main.py:692 INFO] Detected System ID: AGX_Xavier [2021-01-28 23:26:07,164 main.py:546 INFO] Using config files: configs/resnet50/Offline/config.json [2021-01-28 23:26:07,165 __init__.py:283 INFO] Parsing config file configs/resnet50/Offline/config.json ... [2021-01-28 23:26:07,166 main.py:559 INFO] Processing config "AGX_Xavier_resnet50_Offline" [2021-01-28 23:26:07,166 main.py:294 INFO] Running harness for resnet50 benchmark in Offline scenario... concurrent_offline_expected_qps : 2181 dla_batch_size : 32 dla_copy_streams : 1 dla_core : 0 dla_inference_streams : 1 dla_offline_expected_qps : 396 gpu_batch_size : 64 gpu_copy_streams : 1 gpu_inference_streams : 1 gpu_offline_expected_qps : 1478.33 input_dtype : int8 input_format : linear map_path : data_maps/imagenet/val_map.txt precision : int8 tensor_path : ${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear use_graphs : False system_id : AGX_Xavier scenario : Offline benchmark : resnet50 config_name : AGX_Xavier_resnet50_Offline accuracy_level : 99% optimization_level : plugin-enabled inference_server : lwis config_ver : default system_name : None test_mode : AccuracyOnly log_dir : /datasets/inference_results_v0.7/closed/NVIDIA/build/logs/2021.01.28-23.26.06 gpu_num_bundles : 2 [2021-01-28 23:26:07,182 __init__.py:207 INFO] Running command: ./build/bin/harness_default --logfile_outdir="/datasets/inference_results_v0.7/closed/NVIDIA/build/logs/2021.01.28-23.26.06/AGX_Xavier_TRT/resnet50/Offline" --logfile_prefix="mlperf_log_" --performance_sample_count=2048 --test_mode="AccuracyOnly" --gpu_copy_streams=1 --gpu_inference_streams=1 --dla_batch_size=32 --dla_copy_streams=1 --dla_inference_streams=1 --use_graphs=false --gpu_batch_size=64 
--map_path="data_maps/imagenet/val_map.txt" --tensor_path="${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear" --gpu_engines="./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-gpu-b64-int8.default.plan" --mlperf_conf_path="measurements/AGX_Xavier_TRT/resnet50/Offline/mlperf.conf" --user_conf_path="measurements/AGX_Xavier_TRT/resnet50/Offline/user.conf" --dla_engines="./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-dla-b32-int8.default.plan" --scenario Offline --model resnet50 [2021-01-28 23:26:07,183 __init__.py:213 INFO] Overriding Environment &&&& RUNNING Default_Harness # ./build/bin/harness_default [I] mlperf.conf path: measurements/AGX_Xavier_TRT/resnet50/Offline/mlperf.conf [I] user.conf path: measurements/AGX_Xavier_TRT/resnet50/Offline/user.conf [I] Device:0: ./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-gpu-b64-int8.default.plan has been successfully loaded. [I] Device:0.DLA-0: ./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-dla-b32-int8.default.plan has been successfully loaded. [I] Device:0.DLA-1: ./build/engines/AGX_Xavier/resnet50/Offline/resnet50-Offline-dla-b32-int8.default.plan has been successfully loaded. [I] Creating batcher thread: 0 EnableBatcherThreadPerDevice: false Starting warmup. Running for a minimum of 5 seconds. Finished warmup. Ran for 7.81217s. Starting running actual test. No warnings encountered during test. No errors encountered during test. Finished running actual test. 
Device Device:0 processed: 1 batches of size 16 537 batches of size 64 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 538 Device Device:0.DLA-0 processed: 244 batches of size 32 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 Device Device:0.DLA-1 processed: 244 batches of size 32 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-01-28 23:27:03,219 main.py:341 INFO] Result: Cannot find performance result. Maybe you are running in AccuracyOnly mode. [2021-01-28 23:27:03,252 __init__.py:207 INFO] Running command: python3 build/inference/vision/classification_and_detection/tools/accuracy-imagenet.py --mlperf-accuracy-file /datasets/inference_results_v0.7/closed/NVIDIA/build/logs/2021.01.28-23.26.06/AGX_Xavier_TRT/resnet50/Offline/mlperf_log_accuracy.json --imagenet-val-file data_maps/imagenet/val_map.txt --dtype int32 accuracy=76.040%, good=38020, total=50000 ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-Offline: resnet50: Cannot find performance result. Maybe you are running in AccuracyOnly mode. ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-Offline: resnet50: Accuracy = 76.040, Threshold = 75.695. Accuracy test PASSED. real 0m57.327s user 0m45.560s sys 0m7.316s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ grep AGX_Xavier /datasets/inference_results_v0.7/closed/NVIDIA/configs/resnet50/SingleStream/config.json -A 11 "AGX_Xavier": { "gpu_batch_size": 1, "gpu_copy_streams": 1, "gpu_inference_streams": 1, "gpu_single_stream_expected_latency_ns": 2273000, "input_dtype": "int8", "input_format": "linear", "map_path": "data_maps/imagenet/val_map.txt", "precision": "int8", "tensor_path": "${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear", "use_graphs": false },
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make generate_engines RUN_ARGS="--benchmarks=resnet50 --scenarios=singlestream" ... [2021-01-29 23:14:18,111 main.py:153 INFO] Finished building engines for resnet50 benchmark in SingleStream scenario. Time taken to generate engines: 56.77854871749878 seconds real 0m58.002s user 0m23.136s sys 0m6.292s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ find . \ -name resnet50-SingleStream*.plan -exec du -hs {} \; 44M ./build/engines/AGX_Xavier/resnet50/SingleStream/resnet50-SingleStream-gpu-b1-int8.default.plan
90th percentile latency (ns) : 2144898 Result is : VALID ... QPS w/ loadgen overhead : 470.07 QPS w/o loadgen overhead : 472.77
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=resnet50 --scenarios=SingleStream --test_mode=PerformanceOnly" ... ================================================ MLPerf Results Summary ================================================ SUT name : LWIS_Server Scenario : Single Stream Mode : Performance 90th percentile latency (ns) : 2078719 Result is : VALID Min duration satisfied : Yes Min queries satisfied : Yes ================================================ Additional Stats ================================================ QPS w/ loadgen overhead : 485.07 QPS w/o loadgen overhead : 488.48 Min latency (ns) : 1980890 Max latency (ns) : 15165301 Mean latency (ns) : 2047146 50.00 percentile latency (ns) : 2038621 90.00 percentile latency (ns) : 2078719 95.00 percentile latency (ns) : 2098912 97.00 percentile latency (ns) : 2118465 99.00 percentile latency (ns) : 2187844 99.90 percentile latency (ns) : 2564821 ================================================ Test Parameters Used ================================================ samples_per_query : 1 target_qps : 439.947 target_latency (ns): 0 max_async_queries : 1 min_duration (ms): 60000 max_duration (ms): 0 min_query_count : 1024 max_query_count : 0 qsl_rng_seed : 12786827339337101903 sample_index_rng_seed : 12640797754436136668 schedule_rng_seed : 3135815929913719677 accuracy_log_rng_seed : 0 accuracy_log_probability : 0 accuracy_log_sampling_target : 0 print_timestamps : false performance_issue_unique : false performance_issue_same : false performance_issue_same_index : 0 performance_sample_count : 2048 No warnings encountered during test. No errors encountered during test. Finished running actual test. 
Device Device:0 processed: 29106 batches of size 1 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 29106 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-01-29 23:25:15,883 main.py:341 INFO] Result: 90th percentile latency (ns) : 2078719 and Result is : VALID ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-SingleStream: resnet50: 90th percentile latency (ns) : 2078719 and Result is : VALID ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-SingleStream: resnet50: No accuracy results in PerformanceOnly mode. real 1m16.374s user 1m6.484s sys 0m5.004s
accuracy=76.064%, good=38032, total=50000 hash=7458cd3f1154670a0d063c87b38d2eba7aa8c1921f2558a46333cfef8d9b4036
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=resnet50 --scenarios=SingleStream --test_mode=AccuracyOnly" ... accuracy=76.078%, good=38039, total=50000 ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-SingleStream: resnet50: Cannot find performance result. Maybe you are running in AccuracyOnly mode. ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-SingleStream: resnet50: Accuracy = 76.078, Threshold = 75.695. Accuracy test PASSED. real 4m27.773s user 2m9.392s sys 0m11.332s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ grep AGX_Xavier /datasets/inference_results_v0.7/closed/NVIDIA/configs/resnet50/MultiStream/config.json -A 17 "AGX_Xavier": { "concurrent_multi_stream_samples_per_query": 96, "dla_batch_size": 15, "dla_copy_streams": 2, "dla_core": 0, "dla_inference_streams": 4, "dla_multi_stream_samples_per_query": 15, "gpu_batch_size": 66, "gpu_copy_streams": 2, "gpu_inference_streams": 4, "gpu_multi_stream_samples_per_query": 66, "input_dtype": "int8", "input_format": "linear", "map_path": "data_maps/imagenet/val_map.txt", "precision": "int8", "tensor_path": "${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear", "use_graphs": false },
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make generate_engines RUN_ARGS="--benchmarks=resnet50 --scenarios=multistream" ... [2021-01-28 23:38:23,569 main.py:153 INFO] Finished building engines for resnet50 benchmark in MultiStream scenario. Time taken to generate engines: 113.68001079559326 seconds real 1m54.783s user 0m31.604s sys 0m15.164s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ find . \ -name resnet50-MultiStream*.plan -exec du -hs {} \; 96M ./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-gpu-b66-int8.default.plan 29M ./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-dla-b15-int8.default.plan
Samples per query : 96 Result is : VALID
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=resnet50 --scenarios=MultiStream --test_mode=PerformanceOnly" [2021-01-28 23:40:13,526 main.py:692 INFO] Detected System ID: AGX_Xavier [2021-01-28 23:40:13,527 main.py:546 INFO] Using config files: configs/resnet50/MultiStream/config.json [2021-01-28 23:40:13,527 __init__.py:283 INFO] Parsing config file configs/resnet50/MultiStream/config.json ... [2021-01-28 23:40:13,528 main.py:559 INFO] Processing config "AGX_Xavier_resnet50_MultiStream" [2021-01-28 23:40:13,528 main.py:294 INFO] Running harness for resnet50 benchmark in MultiStream scenario... concurrent_multi_stream_samples_per_query : 96 dla_batch_size : 15 dla_copy_streams : 2 dla_core : 0 dla_inference_streams : 4 dla_multi_stream_samples_per_query : 15 gpu_batch_size : 66 gpu_copy_streams : 2 gpu_inference_streams : 4 gpu_multi_stream_samples_per_query : 66 input_dtype : int8 input_format : linear map_path : data_maps/imagenet/val_map.txt precision : int8 tensor_path : ${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear use_graphs : False system_id : AGX_Xavier scenario : MultiStream benchmark : resnet50 config_name : AGX_Xavier_resnet50_MultiStream accuracy_level : 99% optimization_level : plugin-enabled inference_server : lwis config_ver : default system_name : None test_mode : PerformanceOnly log_dir : /datasets/inference_results_v0.7/closed/NVIDIA/build/logs/2021.01.28-23.40.13 gpu_num_bundles : 2 [2021-01-28 23:40:13,542 __init__.py:207 INFO] Running command: ./build/bin/harness_default --logfile_outdir="/datasets/inference_results_v0.7/closed/NVIDIA/build/logs/2021.01.28-23.40.13/AGX_Xavier_TRT/resnet50/MultiStream" --logfile_prefix="mlperf_log_" --performance_sample_count=2048 --test_mode="PerformanceOnly" --gpu_copy_streams=2 --gpu_inference_streams=4 --dla_batch_size=15 --dla_copy_streams=2 --dla_inference_streams=4 --use_graphs=false 
--gpu_batch_size=66 --map_path="data_maps/imagenet/val_map.txt" --tensor_path="${PREPROCESSED_DATA_DIR}/imagenet/ResNet50/int8_linear" --gpu_engines="./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-gpu-b66-int8.default.plan" --mlperf_conf_path="measurements/AGX_Xavier_TRT/resnet50/MultiStream/mlperf.conf" --user_conf_path="measurements/AGX_Xavier_TRT/resnet50/MultiStream/user.conf" --dla_engines="./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-dla-b15-int8.default.plan" --scenario MultiStream --model resnet50 [2021-01-28 23:40:13,542 __init__.py:213 INFO] Overriding Environment &&&& RUNNING Default_Harness # ./build/bin/harness_default [I] mlperf.conf path: measurements/AGX_Xavier_TRT/resnet50/MultiStream/mlperf.conf [I] user.conf path: measurements/AGX_Xavier_TRT/resnet50/MultiStream/user.conf [I] Device:0: ./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-gpu-b66-int8.default.plan has been successfully loaded. [I] Device:0.DLA-0: ./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-dla-b15-int8.default.plan has been successfully loaded. [I] Device:0.DLA-1: ./build/engines/AGX_Xavier/resnet50/MultiStream/resnet50-MultiStream-dla-b15-int8.default.plan has been successfully loaded. [E] [TRT] Profile 0 has been chosen by another IExecutionContext. Use another profileIndex or destroy the IExecutionContext that use this profile. [W] [TRT] Could not set default profile 0 for execution context. Profile index must be set explicitly. [I] Creating batcher thread: 0 EnableBatcherThreadPerDevice: false Starting warmup. Running for a minimum of 5 seconds. Finished warmup. Ran for 6.81739s. Starting running actual test. 
================================================ MLPerf Results Summary ================================================ SUT name : LWIS_Server Scenario : Multi Stream Mode : Performance Samples per query : 96 Result is : VALID Performance constraints satisfied : Yes Min duration satisfied : Yes Min queries satisfied : Yes ================================================ Additional Stats ================================================ Intervals between each IssueQuery:"qps" : 20, "ms" : 50 50.00 percentile : 1 90.00 percentile : 1 95.00 percentile : 1 97.00 percentile : 1 99.00 percentile : 1 99.90 percentile : 1 Per-query latency:"target_ns" : 50000000, "target_ms" : 50 50.00 percentile latency (ns) : 48075642 90.00 percentile latency (ns) : 48764781 95.00 percentile latency (ns) : 48964945 97.00 percentile latency (ns) : 49093855 99.00 percentile latency (ns) : 49334248 99.90 percentile latency (ns) : 49785859 Per-sample latency: Min latency (ns): 45713517 Max latency (ns): 62771126 Mean latency (ns) : 47592969 50.00 percentile latency (ns) : 47553785 90.00 percentile latency (ns) : 48336265 95.00 percentile latency (ns) : 48552497 97.00 percentile latency (ns) : 48713470 99.00 percentile latency (ns) : 48975182 99.90 percentile latency (ns) : 49500776 ================================================ Test Parameters Used ================================================ samples_per_query : 96 target_qps : 20 target_latency (ns): 50000000 max_async_queries : 1 min_duration (ms): 60000 max_duration (ms): 0 min_query_count : 270336 max_query_count : 0 qsl_rng_seed : 12786827339337101903 sample_index_rng_seed : 12640797754436136668 schedule_rng_seed : 3135815929913719677 accuracy_log_rng_seed : 0 accuracy_log_probability : 0 accuracy_log_sampling_target : 0 print_timestamps : false performance_issue_unique : false performance_issue_same : false performance_issue_same_index : 0 performance_sample_count : 2048 No warnings encountered during test. 
No errors encountered during test. Finished running actual test. Equivalent QPS computed by samples_per_query*target_qps : 1920 Device Device:0 processed: 1 batches of size 30 270336 batches of size 66 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 270337 Device Device:0.DLA-0 processed: 270335 batches of size 15 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 Device Device:0.DLA-1 processed: 270335 batches of size 15 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-01-29 03:25:56,386 main.py:341 INFO] Result: Samples per query : 96 and Result is : VALID ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-MultiStream: resnet50: Samples per query : 96 and Result is : VALID ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-MultiStream: resnet50: No accuracy results in PerformanceOnly mode. real 225m43.401s user 8m35.936s sys 3m57.940s
accuracy=76.044%, good=38022, total=50000 hash=3183baa0e60b647fc54e5469b23fc91353bf8bccc807dd249766cd3ed04d0593
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=resnet50 --scenarios=MultiStream --test_mode=AccuracyOnly" ... accuracy=76.046%, good=38023, total=50000 ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-MultiStream: resnet50: Cannot find performance result. Maybe you are running in AccuracyOnly mode. ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-MultiStream: resnet50: Accuracy = 76.046, Threshold = 75.695. Accuracy test PASSED. real 0m59.871s user 0m24.236s sys 0m6.596s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ grep AGX_Xavier /datasets/inference_results_v0.7/closed/NVIDIA/configs/ssd-resnet34/Offline/config.json -A 18 "AGX_Xavier": { "concurrent_offline_expected_qps": 50, "dla_batch_size": 1, "dla_copy_streams": 1, "dla_core": 0, "dla_inference_streams": 1, "dla_offline_expected_qps": 10, "gpu_batch_size": 2, "gpu_copy_streams": 4, "gpu_inference_streams": 1, "gpu_offline_expected_qps": 35.1243, "input_dtype": "int8", "input_format": "linear", "map_path": "data_maps/coco/val_map.txt", "min_query_count": 24576, "precision": "int8", "tensor_path": "${PREPROCESSED_DATA_DIR}/coco/val2017/SSDResNet34/int8_linear", "use_graphs": false },
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make generate_engines RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=offline" ... [2021-01-29 17:52:19,860 main.py:153 INFO] Finished building engines for ssd-resnet34 benchmark in Offline scenario. Time taken to generate engines: 464.4344849586487 seconds real 7m46.577s user 2m12.112s sys 0m15.832s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ find . \ -name ssd-resnet34-Offline*.plan -exec du -hs {} \; 148M ./build/engines/AGX_Xavier/ssd-resnet34/Offline/ssd-resnet34-Offline-gpu-b2-int8.default.plan 23M ./build/engines/AGX_Xavier/ssd-resnet34/Offline/ssd-resnet34-Offline-dla-b1-int8.default.plan
Samples per second: 50.9757 Result is : VALID
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=Offline --test_mode=PerformanceOnly" ... ================================================ MLPerf Results Summary ================================================ SUT name : LWIS_Server Scenario : Offline Mode : Performance Samples per second: 50.8099 Result is : VALID Min duration satisfied : Yes Min queries satisfied : Yes ================================================ Additional Stats ================================================ Min latency (ns): 58655035 Max latency (ns): 483684969003 Mean latency (ns) : 241873131399 50.00 percentile latency (ns) : 241862755973 90.00 percentile latency (ns) : 435313276430 95.00 percentile latency (ns) : 459447810260 97.00 percentile latency (ns) : 469144908922 99.00 percentile latency (ns) : 478833559170 99.90 percentile latency (ns) : 483161611392 ================================================ Test Parameters Used ================================================ samples_per_query : 24576 target_qps : 50 target_latency (ns): 0 max_async_queries : 1 min_duration (ms): 60000 max_duration (ms): 0 min_query_count : 1 max_query_count : 0 qsl_rng_seed : 12786827339337101903 sample_index_rng_seed : 12640797754436136668 schedule_rng_seed : 3135815929913719677 accuracy_log_rng_seed : 0 accuracy_log_probability : 0 accuracy_log_sampling_target : 0 print_timestamps : false performance_issue_unique : false performance_issue_same : false performance_issue_same_index : 0 performance_sample_count : 64 No warnings encountered during test. No errors encountered during test. Finished running actual test. 
Device Device:0 processed: 8193 batches of size 2 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 8193 Device Device:0.DLA-0 processed: 4095 batches of size 1 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 Device Device:0.DLA-1 processed: 4095 batches of size 1 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 0 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-01-29 22:55:27,391 main.py:341 INFO] Result: Samples per second: 50.8099 and Result is : VALID ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-Offline: ssd-resnet34: Samples per second: 50.8099 and Result is : VALID ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-Offline: ssd-resnet34: No accuracy results in PerformanceOnly mode. real 8m15.281s user 8m8.460s sys 0m6.888s
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.200 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.380 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.187 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.121 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.257 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.203 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.331 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.411 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.428 mAP=20.048% hash=95b218c56f20ce3e77d5af209df4b8219bf86c6339aebf93ac3d954126f60fc9
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=Offline --test_mode=AccuracyOnly" ... Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.201 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.380 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.187 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.121 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.257 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.203 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.331 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.411 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.428 mAP=20.057% ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-Offline: ssd-resnet34: Cannot find performance result. Maybe you are running in AccuracyOnly mode. ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-Offline: ssd-resnet34: Accuracy = 20.057, Threshold = 19.800. Accuracy test PASSED. real 8m52.357s user 5m58.536s sys 0m15.148s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ grep AGX_Xavier /datasets/inference_results_v0.7/closed/NVIDIA/configs/ssd-resnet34/SingleStream/config.json -A 11 "AGX_Xavier": { "gpu_batch_size": 1, "gpu_copy_streams": 1, "gpu_inference_streams": 1, "gpu_single_stream_expected_latency_ns": 29478000, "input_dtype": "int8", "input_format": "linear", "map_path": "data_maps/coco/val_map.txt", "precision": "int8", "tensor_path": "${PREPROCESSED_DATA_DIR}/coco/val2017/SSDResNet34/int8_linear", "use_graphs": false },
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make generate_engines RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=singlestream" ... [2021-01-29 17:45:05,185 main.py:153 INFO] Finished building engines for ssd-resnet34 benchmark in SingleStream scenario. Time taken to generate engines: 63.35827445983887 seconds real 1m5.118s user 0m15.400s sys 0m5.076s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ find . -name ssd-resnet34-SingleStream*.plan -exec du -hs {} \; 37M ./build/engines/AGX_Xavier/ssd-resnet34/SingleStream/ssd-resnet34-SingleStream-gpu-b1-int8.default.plan
90th percentile latency (ns) : 28531845 Result is : VALID
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=SingleStream --test_mode=PerformanceOnly" ... ================================================ MLPerf Results Summary ================================================ SUT name : LWIS_Server Scenario : Single Stream Mode : Performance 90th percentile latency (ns) : 28554901 Result is : VALID Min duration satisfied : Yes Min queries satisfied : Yes ================================================ Additional Stats ================================================ QPS w/ loadgen overhead : 35.17 QPS w/o loadgen overhead : 35.23 Min latency (ns) : 28109905 Max latency (ns) : 32556991 Mean latency (ns) : 28385289 50.00 percentile latency (ns) : 28361924 90.00 percentile latency (ns) : 28554901 95.00 percentile latency (ns) : 28625392 97.00 percentile latency (ns) : 28688816 99.00 percentile latency (ns) : 28884610 99.90 percentile latency (ns) : 29821107 ================================================ Test Parameters Used ================================================ samples_per_query : 1 target_qps : 33.9236 target_latency (ns): 0 max_async_queries : 1 min_duration (ms): 60000 max_duration (ms): 0 min_query_count : 1024 max_query_count : 0 qsl_rng_seed : 12786827339337101903 sample_index_rng_seed : 12640797754436136668 schedule_rng_seed : 3135815929913719677 accuracy_log_rng_seed : 0 accuracy_log_probability : 0 accuracy_log_sampling_target : 0 print_timestamps : false performance_issue_unique : false performance_issue_same : false performance_issue_same_index : 0 performance_sample_count : 64 No warnings encountered during test. No errors encountered during test. Finished running actual test. 
Device Device:0 processed: 2111 batches of size 1 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 2111 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-01-29 22:39:22,362 main.py:341 INFO] Result: 90th percentile latency (ns) : 28554901 and Result is : VALID ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-SingleStream: ssd-resnet34: 90th percentile latency (ns) : 28554901 and Result is : VALID ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-SingleStream: ssd-resnet34: No accuracy results in PerformanceOnly mode. real 1m9.437s user 1m4.720s sys 0m3.964s
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.201 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.381 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.188 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.121 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.258 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.203 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.332 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.353 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.411 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.430 mAP=20.111% hash=526aac286ebb67218a3528397b4aecbff9269cbe01307069569345d9c3fbb445
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=SingleStream --test_mode=AccuracyOnly" ... Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.201 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.381 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.188 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.121 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.258 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.203 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.332 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.353 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.411 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.430 mAP=20.111% ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-SingleStream: ssd-resnet34: Cannot find performance result. Maybe you are running in AccuracyOnly mode. ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-SingleStream: ssd-resnet34: Accuracy = 20.111, Threshold = 19.800. Accuracy test PASSED. real 12m55.238s user 8m38.988s sys 0m16.408s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ grep AGX_Xavier /datasets/inference_results_v0.7/closed/NVIDIA/configs/ssd-resnet34/MultiStream/config.json -A 9 "AGX_Xavier": { "gpu_batch_size": 2, "gpu_multi_stream_samples_per_query": 2, "input_dtype": "int8", "input_format": "linear", "map_path": "data_maps/coco/val_map.txt", "precision": "int8", "tensor_path": "${PREPROCESSED_DATA_DIR}/coco/val2017/SSDResNet34/int8_linear", "use_graphs": false },
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make generate_engines RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=multistream" ... [2021-01-29 07:33:48,592 main.py:153 INFO] Finished building engines for ssd-resnet34 benchmark in MultiStream scenario. Time taken to generate engines: 263.8264467716217 seconds real 4m25.496s user 0m32.000s sys 0m35.784s
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ find . -name ssd-resnet34-MultiStream*.plan -exec du -hs {} \; 148M ./build/engines/AGX_Xavier/ssd-resnet34/MultiStream/ssd-resnet34-MultiStream-gpu-b2-int8.default.plan
Samples per query : 2 Result is : VALID
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=MultiStream --test_mode=PerformanceOnly" ... ================================================ MLPerf Results Summary ================================================ SUT name : LWIS_Server Scenario : Multi Stream Mode : Performance Samples per query : 2 Result is : VALID Performance constraints satisfied : Yes Min duration satisfied : Yes Min queries satisfied : Yes ================================================ Additional Stats ================================================ Intervals between each IssueQuery: "qps" : 15, "ms" : 66.6667 50.00 percentile : 1 90.00 percentile : 1 95.00 percentile : 1 97.00 percentile : 1 99.00 percentile : 1 99.90 percentile : 1 Per-query latency: "target_ns" : 66666666, "target_ms" : 66.6667 50.00 percentile latency (ns) : 55624812 90.00 percentile latency (ns) : 55913099 95.00 percentile latency (ns) : 56006924 97.00 percentile latency (ns) : 56067621 99.00 percentile latency (ns) : 56189283 99.90 percentile latency (ns) : 56401684 Per-sample latency: Min latency (ns) : 55180175 Max latency (ns) : 61310689 Mean latency (ns) : 55646311 50.00 percentile latency (ns) : 55624812 90.00 percentile latency (ns) : 55913099 95.00 percentile latency (ns) : 56006924 97.00 percentile latency (ns) : 56067621 99.00 percentile latency (ns) : 56189283 99.90 percentile latency (ns) : 56401684 ================================================ Test Parameters Used ================================================ samples_per_query : 2 target_qps : 15 target_latency (ns): 66666666 max_async_queries : 1 min_duration (ms): 60000 max_duration (ms): 0 min_query_count : 270336 max_query_count : 0 qsl_rng_seed : 12786827339337101903 sample_index_rng_seed : 12640797754436136668 schedule_rng_seed : 3135815929913719677 accuracy_log_rng_seed : 0 accuracy_log_probability : 0 
accuracy_log_sampling_target : 0 print_timestamps : false performance_issue_unique : false performance_issue_same : false performance_issue_same_index : 0 performance_sample_count : 64 No warnings encountered during test. No errors encountered during test. Finished running actual test. Equivalent QPS computed by samples_per_query*target_qps : 30 Device Device:0 processed: 270336 batches of size 2 Memcpy Calls: 0 PerSampleCudaMemcpy Calls: 0 BatchedCudaMemcpy Calls: 270336 &&&& PASSED Default_Harness # ./build/bin/harness_default [2021-01-29 13:12:12,503 main.py:341 INFO] Result: Samples per query : 2 and Result is : VALID ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-MultiStream: ssd-resnet34: Samples per query : 2 and Result is : VALID ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-MultiStream: ssd-resnet34: No accuracy results in PerformanceOnly mode. real 300m37.676s user 5m11.760s sys 0m27.856s
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.201 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.381 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.188 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.121 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.258 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.203 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.332 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.353 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.411 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.430 mAP=20.111% hash=859242388c9a94513b189eb58a55bd11ad1d2f7d094880dfb72157a7ac5e45fd
anton@xavier:/datasets/inference_results_v0.7/closed/NVIDIA$ time \ MLPERF_SCRATCH_PATH=/datasets/mlperf_scratch_path \ make run_harness RUN_ARGS="--benchmarks=ssd-resnet34 --scenarios=MultiStream --test_mode=AccuracyOnly" ... Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.201 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.381 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.188 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.121 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.258 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.203 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.332 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.353 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.411 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.430 mAP=20.111% ======================= Perf harness results: ======================= AGX_Xavier_TRT-default-MultiStream: ssd-resnet34: Cannot find performance result. Maybe you are running in AccuracyOnly mode. ======================= Accuracy results: ======================= AGX_Xavier_TRT-default-MultiStream: ssd-resnet34: Accuracy = 20.111, Threshold = 19.800. Accuracy test PASSED. real 11m44.285s user 4m22.432s sys 0m17.144s