From 0eaa93ebc238b696f4a6eb41afa0d8250d408e04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B3=8A=E9=9C=86?= Date: Mon, 22 Jan 2024 19:58:38 +0800 Subject: [PATCH] [TensorRT] Upgrade TF-TRT version to TF2's implementation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- tensorflow/compiler/tf2tensorrt/BUILD | 662 +- .../compiler/tf2tensorrt/_pywrap_py_utils.pyi | 19 + .../compiler/tf2tensorrt/common/datavec.h | 38 + .../compiler/tf2tensorrt/common/utils.cc | 242 + .../compiler/tf2tensorrt/common/utils.h | 133 +- .../tf2tensorrt/convert/algorithm_selector.cc | 272 + .../tf2tensorrt/convert/algorithm_selector.h | 121 + .../convert/algorithm_selector_test.cc | 97 + .../tf2tensorrt/convert/convert_graph.cc | 610 +- .../tf2tensorrt/convert/convert_graph.h | 51 +- .../tf2tensorrt/convert/convert_graph_test.cc | 79 +- .../tf2tensorrt/convert/convert_nodes.cc | 6175 +++++---- .../tf2tensorrt/convert/convert_nodes.h | 574 +- .../tf2tensorrt/convert/convert_nodes_test.cc | 11333 ++++++++++------ .../tf2tensorrt/convert/logger_registry.cc | 60 + .../tf2tensorrt/convert/logger_registry.h | 58 + .../convert/logger_registry_test.cc | 34 + .../tf2tensorrt/convert/op_converter.h | 225 + .../convert/op_converter_registry.cc | 158 + .../convert/op_converter_registry.h | 104 + .../convert/op_converter_registry_test.cc | 67 + .../tf2tensorrt/convert/op_converter_test.cc | 123 + .../tf2tensorrt/convert/ops/binary_ops.cc | 235 + .../convert/ops/data_format_vec_permute.cc | 179 + .../tf2tensorrt/convert/ops/fill_ops.cc | 316 + .../tf2tensorrt/convert/ops/layer_utils.h | 736 + .../tf2tensorrt/convert/ops/like_ops.cc | 95 + .../tf2tensorrt/convert/ops/log_softmax.cc | 104 + .../convert/ops/quantization_ops.cc | 426 + .../convert/ops/quantization_ops.h | 76 + .../convert/ops/quantization_ops_test.cc | 619 + .../tf2tensorrt/convert/ops/selectv2.cc | 220 + .../tf2tensorrt/convert/ops/softmax.cc | 81 + .../compiler/tf2tensorrt/convert/ops/tile.cc | 208 + .../tf2tensorrt/convert/ops/unary_ops.cc | 251 + .../tf2tensorrt/convert/ops/variable_ops.cc | 370 + .../tf2tensorrt/convert/timing_cache.cc | 87 + .../tf2tensorrt/convert/timing_cache.h | 70 + .../convert/trt_layout_optimization_pass.cc | 97 + .../convert/trt_layout_optimization_pass.h | 69 + .../convert/trt_optimization_pass.cc | 322 +- .../convert/trt_optimization_pass.h | 57 +- .../tf2tensorrt/convert/trt_parameters.cc | 104 + .../tf2tensorrt/convert/trt_parameters.h | 72 + .../compiler/tf2tensorrt/convert/utils.cc | 261 +- .../compiler/tf2tensorrt/convert/utils.h | 369 +- .../compiler/tf2tensorrt/convert/weights.cc | 216 + .../compiler/tf2tensorrt/convert/weights.h | 295 + .../kernels/get_calibration_data_op.cc | 6 +- .../tf2tensorrt/kernels/trt_engine_op.cc | 1413 +- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 246 +- .../kernels/trt_engine_resource_ops.cc | 144 +- .../kernels/trt_engine_resource_ops_test.cc | 313 +- .../ops/get_calibration_data_op.cc | 6 +- .../compiler/tf2tensorrt/ops/trt_engine_op.cc | 21 +- .../ops/trt_engine_resource_ops.cc | 7 +- .../compiler/tf2tensorrt/plugin/trt_plugin.cc | 6 +- .../compiler/tf2tensorrt/plugin/trt_plugin.h | 12 +- .../compiler/tf2tensorrt/segment/segment.cc | 841 +- .../compiler/tf2tensorrt/segment/segment.h | 46 +- .../tf2tensorrt/segment/segment_test.cc | 329 +- .../tf2tensorrt/segment/union_find.cc | 154 + .../compiler/tf2tensorrt/segment/union_find.h | 181 +- .../tf2tensorrt/stub/NvInferPlugin_7_0.inc | 95 + 
.../compiler/tf2tensorrt/stub/NvInfer_7_0.inc | 47 + .../tf2tensorrt/stub/nvinfer_plugin_stub.cc | 11 +- .../compiler/tf2tensorrt/stub/nvinfer_stub.cc | 14 +- .../compiler/tf2tensorrt/tensorrt_test.cc | 233 +- .../compiler/tf2tensorrt/trt_convert_api.cc | 512 + .../compiler/tf2tensorrt/trt_convert_api.h | 129 + .../tf2tensorrt/trt_convert_api_test.cc | 358 + .../compiler/tf2tensorrt/utils/py_utils.cc | 31 +- .../compiler/tf2tensorrt/utils/py_utils.h | 5 + .../compiler/tf2tensorrt/utils/py_utils.i | 2 +- .../tf2tensorrt/utils/py_utils_wrapper.cc | 46 + .../tf2tensorrt/utils/trt_allocator.cc | 24 +- .../tf2tensorrt/utils/trt_allocator.h | 19 +- .../tf2tensorrt/utils/trt_engine_utils.cc | 286 + .../tf2tensorrt/utils/trt_engine_utils.h | 82 + .../tf2tensorrt/utils/trt_execution_context.h | 43 + .../utils/trt_experimental_features.cc | 35 + .../utils/trt_experimental_features.h | 31 + .../tf2tensorrt/utils/trt_int8_calibrator.cc | 11 +- .../tf2tensorrt/utils/trt_int8_calibrator.h | 16 +- .../compiler/tf2tensorrt/utils/trt_logger.cc | 95 +- .../compiler/tf2tensorrt/utils/trt_logger.h | 17 +- .../tf2tensorrt/utils/trt_lru_cache.cc | 56 +- .../tf2tensorrt/utils/trt_lru_cache.h | 108 +- .../utils/trt_shape_optimization_profiles.cc | 664 + .../utils/trt_shape_optimization_profiles.h | 351 + .../trt_shape_optimization_profiles_test.cc | 256 + .../tf2tensorrt/utils/trt_tensor_proxy.h | 75 +- .../tf2tensorrt/utils/trt_testutils.cc | 76 + .../tf2tensorrt/utils/trt_testutils.h | 183 + .../tf2tensorrt/utils/trt_testutils_test.cc | 99 + .../core/framework/selective_registration.h | 65 + .../core/profiler/lib/annotated_traceme.h | 59 + tensorflow/core/util/device_name_utils.h | 16 + third_party/tensorrt/BUILD.tpl | 16 +- third_party/tensorrt/tensorrt_configure.bzl | 49 +- 100 files changed, 26474 insertions(+), 8636 deletions(-) create mode 100644 tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi create mode 100644 tensorflow/compiler/tf2tensorrt/common/datavec.h create mode 100644 tensorflow/compiler/tf2tensorrt/common/utils.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/logger_registry.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc create mode 100644 
tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/timing_cache.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/weights.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/weights.h create mode 100644 tensorflow/compiler/tf2tensorrt/segment/union_find.cc create mode 100644 tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc create mode 100644 tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc create mode 100644 tensorflow/compiler/tf2tensorrt/trt_convert_api.cc create mode 100644 tensorflow/compiler/tf2tensorrt/trt_convert_api.h create mode 100644 tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc create mode 100755 tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc create mode 100644 tensorflow/core/profiler/lib/annotated_traceme.h diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 2b9196f874c..4e35a4912d2 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -5,39 +5,45 @@ load( "//tensorflow:tensorflow.bzl", - "tf_cc_test", + "VERSION", "tf_copts", "tf_cuda_library", "tf_custom_op_library_additional_deps", - "tf_gen_op_libs", "tf_gen_op_wrapper_py", - "tf_gpu_kernel_library", + "tf_cuda_cc_test", + "tf_gen_op_libs", + "tf_py_wrap_cc", + "tf_custom_op_py_library", + "pybind_extension", ) -load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") -load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") + load( "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library", ) + +# Platform specific build config +load( + 
"//tensorflow/core/platform:default/build_config_root.bzl", + "if_static", +) + load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") -# Placeholder for Google-internal load statements. package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = ["//visibility:public"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) -exports_files(["LICENSE"]) - cc_library( name = "tensorrt_stub", srcs = if_tensorrt([ "stub/nvinfer_stub.cc", "stub/nvinfer_plugin_stub.cc", ]), - textual_hdrs = glob(["stub/*.inc", "common/utils.h"]), + textual_hdrs = glob(["stub/*.inc"]), deps = if_tensorrt([ "@local_config_tensorrt//:tensorrt_headers", "//tensorflow/core:lib", @@ -48,8 +54,8 @@ cc_library( alias( name = "tensorrt_lib", actual = select({ - "//tensorflow:oss": ":tensorrt_stub", - "//conditions:default": "@local_config_tensorrt//:tensorrt", + "@local_config_tensorrt//:use_static_tensorrt": "@local_config_tensorrt//:tensorrt", + "//conditions:default": ":tensorrt_stub", }), visibility = ["//visibility:private"], ) @@ -57,22 +63,155 @@ alias( tf_cuda_cc_test( name = "tensorrt_test_cc", size = "small", - srcs = ["tensorrt_test.cc"], + srcs = [ + "tensorrt_test.cc", + ], tags = [ + "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":trt_logging", ":utils", - "//tensorflow/core:gpu_init", "//tensorflow/core:lib", + "//tensorflow/core:gpu_init", + "//tensorflow/core:test", + "//tensorflow/core:test_main", "//tensorflow/core:stream_executor", + ] + if_tensorrt([ + ":tensorrt_lib", + ]), +) + +cc_library( + name = "trt_convert_api", + srcs = ["trt_convert_api.cc"], + hdrs = [ + "trt_convert_api.h", + ], + copts = tf_copts(), + deps = [ + ":trt_parameters", + ":trt_resources", + "//tensorflow/cc/tools:freeze_saved_model", + "//tensorflow/core:direct_session", + "//tensorflow/core:framework", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/grappler/clusters:single_machine", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + +filegroup( + name = "headers", + srcs = [ + "trt_convert_api.h", + ], +) + +tf_cuda_cc_test( + name = "trt_convert_api_test", + size = "small", + srcs = ["trt_convert_api_test.cc"], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", + ], + deps = [ + ":common_utils", + ":testutils", + ":trt_conversion", + ":trt_convert_api", + ":trt_logging", + ":trt_op_kernels", + ":trt_resources", + ":utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/cc:scope", + "//tensorflow/core:array_ops_op_lib", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:direct_session", + "//tensorflow/core:framework", + "//tensorflow/core:function_ops_op_lib", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:math_ops_op_lib", + "//tensorflow/core:no_op_op_lib", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:state_ops_op_lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:assign_op", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:partitioned_function_ops", + 
"//tensorflow/core/kernels:resource_variable_ops", + ], +) + +cc_library( + name = "common_utils", + srcs = ["common/utils.cc"], + hdrs = [ + "common/datavec.h", + "common/utils.h", + ], + copts = tf_copts(), + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core/platform:logging", + "//tensorflow/core/profiler/lib:traceme", + ] + if_tensorrt([":tensorrt_lib"]), +) + +cc_library( + name = "testutils", + testonly = 1, + srcs = ["utils/trt_testutils.cc"], + hdrs = [ + "utils/trt_testutils.h", + ], + copts = tf_copts(), + visibility = ["//visibility:private"], + deps = [ + ":trt_conversion", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/framework:tensor_testutil", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_cc_test( + name = "testutils_test", + size = "small", + srcs = ["utils/trt_testutils_test.cc"], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", + ], + deps = [ + ":testutils", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:protobuf", ] + if_tensorrt([ ":tensorrt_lib", - "@local_config_cuda//cuda:cuda_headers", ]), ) @@ -85,26 +224,30 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":common_utils", ":trt_allocator", ":trt_conversion", + ":trt_engine_utils", ":trt_logging", ":trt_plugins", ":trt_resources", ":utils", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@local_config_cuda//cuda:cuda_headers", - "//tensorflow/core:core_cpu_lib_no_ops", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:stream_executor", "//tensorflow/core:stream_executor_headers_lib", + "//tensorflow/core:core_cpu_lib_no_ops", "//tensorflow/core/grappler/costs:graph_properties", - "//tensorflow/stream_executor/lib", - ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), + "//tensorflow/core:stream_executor", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ] + if_tensorrt([ + ":tensorrt_lib", + "@local_config_cuda//cuda:cuda_headers", + ]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -119,13 +262,14 @@ cc_library( ":trt_logging", ":trt_plugins", ":trt_resources", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -138,23 +282,32 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ + ":common_utils", + ":testutils", ":trt_engine_instance_proto_cc", ":trt_engine_resource_op_kernels", ":trt_engine_resource_ops_op_lib", ":trt_logging", ":trt_resources", + ":utils", + "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", 
"//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/framework:fake_input", "//tensorflow/core/kernels:ops_testutil", "//tensorflow/core/kernels:resource_variable_ops", + "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -166,77 +319,115 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ + ":testutils", + ":trt_conversion", ":trt_op_kernels", ":trt_op_libs", ":trt_resources", - ":trt_conversion", - ":utils", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", "//tensorflow/cc:cc_ops", "//tensorflow/cc:function_ops", - "//tensorflow/cc:ops", "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/framework:fake_input", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:ops_testutil", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + "@eigen_archive//:eigen3", ] + if_tensorrt([ "@local_config_cuda//cuda:cuda_headers", ]), ) -tf_gen_op_libs( - op_lib_names = [ - "trt_engine_op", - "get_calibration_data_op", - "trt_engine_resource_ops", - ], -) +tf_gen_op_libs(op_lib_names = [ + "trt_engine_op", + "get_calibration_data_op", + "trt_engine_resource_ops", +]) cc_library( name = "trt_op_libs", deps = [ ":get_calibration_data_op_op_lib", ":trt_engine_op_op_lib", + ":trt_engine_utils", ], ) +tf_cuda_library( + name = "trt_engine_utils", + srcs = [ + "utils/trt_engine_utils.cc", + "utils/trt_shape_optimization_profiles.cc", + ], + hdrs = [ + "utils/trt_engine_utils.h", + "utils/trt_execution_context.h", + "utils/trt_shape_optimization_profiles.h", + ], + deps = [ + ":common_utils", + ":trt_allocator", + ":trt_logging", + ":trt_parameters", + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_headers_lib", + "//tensorflow/core/platform:status", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + tf_cuda_library( name = "trt_logging", srcs = ["utils/trt_logger.cc"], hdrs = ["utils/trt_logger.h"], visibility = ["//visibility:public"], deps = [ + ":common_utils", + ":logger_registry", + ":utils", "//tensorflow/core:lib_proto_parsing", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) tf_gen_op_wrapper_py( name = "trt_ops", deps = [ - ":trt_engine_resource_ops_op_lib", ":trt_op_libs", ], ) -tf_custom_op_py_library( - name = "trt_ops_loader", - srcs_version = "PY2AND3", - deps = [ - ":trt_ops", - ":wrap_py_utils", - "//tensorflow/python:errors", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:platform", - "//tensorflow/python:resources", +tf_cuda_library( + name = "trt_parameters", + srcs = ["convert/trt_parameters.cc"], + hdrs = [ + "convert/trt_parameters.h", ], + copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ] + 
if_tensorrt([":tensorrt_lib"]), ) tf_cuda_library( @@ -248,18 +439,21 @@ tf_cuda_library( hdrs = [ "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", + "utils/trt_shape_optimization_profiles.h", "utils/trt_tensor_proxy.h", ], deps = [ + ":common_utils", ":trt_allocator", + ":trt_engine_utils", ":trt_logging", ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core:graph", "//tensorflow/core:gpu_runtime", + "//tensorflow/core:graph", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core/grappler:op_types", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -274,7 +468,7 @@ tf_cuda_library( ] + if_tensorrt([":tensorrt_lib"]), ) -tf_cc_test( +tf_cuda_cc_test( name = "trt_allocator_test", size = "small", srcs = ["utils/trt_allocator_test.cc"], @@ -289,7 +483,7 @@ tf_cc_test( ], ) -tf_cc_test( +tf_cuda_cc_test( name = "trt_lru_cache_test", size = "small", srcs = ["utils/trt_lru_cache_test.cc"], @@ -304,33 +498,205 @@ tf_cc_test( ], ) +tf_cuda_cc_test( + name = "trt_shape_optimization_profiles_test", + size = "small", + srcs = ["utils/trt_shape_optimization_profiles_test.cc"], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", + ], + deps = [ + ":trt_resources", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cuda_library( + name = "logger_registry", + srcs = ["convert/logger_registry.cc"], + hdrs = [ + "convert/logger_registry.h", + ], + copts = tf_copts(), + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_library( + name = "trt_weights", + srcs = ["convert/weights.cc"], + hdrs = [ + "convert/weights.h", + ], + copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_library( + name = "op_converter", + srcs = [], + hdrs = [ + "convert/op_converter.h", + ], + deps = [ + ":trt_parameters", + ":trt_weights", + ] + if_tensorrt([":tensorrt_lib"]), +) + +# This rule contains static variables for the converter registry. Do not depend +# on it directly; use :op_converter_registry, and link against +# libtensorflow_framework.so for the registry symbols. The library +# libtensorflow_framework.so depends on this target so that users can +# register custom op converters without the need to incorporate Tensorflow into +# their build system. 
+tf_cuda_library( + name = "op_converter_registry_impl", + srcs = ["convert/op_converter_registry.cc"], + hdrs = [ + "convert/op_converter_registry.h", + ], + visibility = ["//tensorflow:__subpackages__"], + deps = [ + ":op_converter", + ":utils", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_library( + name = "op_converter_registry", + hdrs = [ + "convert/op_converter_registry.h", + ], + copts = tf_copts(), + deps = [ + ":op_converter", + ":utils", + "//tensorflow/core:lib", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + ":op_converter_registry_impl", + ], +) + +tf_py_wrap_cc( + name = "wrap_py_utils", + srcs = ["utils/py_utils.i"], + copts = tf_copts(), + deps = [ + ":py_utils", + "//third_party/python_runtime:headers", + ], +) + +tf_custom_op_py_library( + name = "trt_ops_loader", + srcs_version = "PY2AND3", + deps = [ + ":wrap_py_utils", + ":trt_ops", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:resources", + ], +) + +tf_cuda_cc_test( + name = "op_converter_registry_test", + size = "small", + srcs = ["convert/op_converter_registry_test.cc"], + tags = [ + "no_windows", + "nomac", + ], + deps = [ + ":op_converter_registry", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cuda_library( + name = "algorithm_selector", + srcs = [ + "convert/algorithm_selector.cc", + ], + hdrs = [ + "convert/algorithm_selector.h", + ], + deps = [":common_utils"] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_cc_test( + name = "algorithm_selector_test", + srcs = [ + "convert/algorithm_selector_test.cc", + ], + deps = [ + ":algorithm_selector", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ] + if_tensorrt([":tensorrt_lib"]), +) + # Library for the node-level conversion portion of TensorRT operation creation tf_cuda_library( name = "trt_conversion", srcs = [ "convert/convert_graph.cc", "convert/convert_nodes.cc", + "convert/ops/binary_ops.cc", + "convert/ops/data_format_vec_permute.cc", + "convert/ops/fill_ops.cc", + "convert/ops/like_ops.cc", + "convert/ops/log_softmax.cc", + "convert/ops/quantization_ops.cc", + "convert/ops/selectv2.cc", + "convert/ops/softmax.cc", + "convert/ops/tile.cc", + "convert/ops/unary_ops.cc", + "convert/ops/variable_ops.cc", + "convert/timing_cache.cc", "convert/trt_optimization_pass.cc", ], hdrs = [ "convert/convert_graph.h", "convert/convert_nodes.h", + "convert/ops/layer_utils.h", + "convert/ops/quantization_ops.h", + "convert/timing_cache.h", "convert/trt_optimization_pass.h", ], deps = [ + ":algorithm_selector", + ":common_utils", + ":logger_registry", + ":op_converter", + ":op_converter_registry", ":segment", ":trt_allocator", - ":trt_plugins", ":trt_logging", + ":trt_parameters", + ":trt_plugins", ":trt_resources", + ":trt_weights", ":utils", - "@com_google_absl//absl/strings", - "//tensorflow/core/grappler/clusters:cluster", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core/grappler:utils", + "//tensorflow/cc:array_ops", "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//tensorflow/core:gpu_runtime", @@ -338,11 +704,23 @@ tf_cuda_library( "//tensorflow/core:lib", 
"//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:core_cpu", "//tensorflow/core/grappler:devices", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler/optimizers:meta_optimizer", - ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), + "//tensorflow/core/grappler/utils:functions", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/tools/graph_transforms:transform_utils", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), alwayslink = 1, ) @@ -354,18 +732,17 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ + ":testutils", + ":trt_conversion", ":trt_op_kernels", ":trt_op_libs", - ":trt_conversion", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_base", "//tensorflow/core:direct_session", @@ -374,40 +751,104 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/clusters:cluster", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", ] + if_tensorrt([":tensorrt_lib"]), ) tf_cuda_cc_test( name = "convert_nodes_test", size = "medium", - srcs = ["convert/convert_nodes_test.cc"], + srcs = [ + "convert/convert_nodes_test.cc", + "convert/op_converter_test.cc", + ], tags = [ "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ - ":trt_logging", + ":testutils", ":trt_conversion", + ":trt_engine_utils", + ":trt_logging", ":trt_plugins", - "@com_google_googletest//:gtest", + ":utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:identity_op", + "//tensorflow/core/kernels:resource_variable_ops", + "//tensorflow/core/platform:status_matchers", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + ] + if_tensorrt([ + ":tensorrt_lib", + "@local_config_cuda//cuda:cuda_headers", + ]), +) + +tf_cuda_cc_test( + name = "convert_qdq_test", + size = "medium", + srcs = [ + "convert/ops/quantization_ops_test.cc", + ], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + "notap", # Fails w/ tensorrt 8.x + ], + deps = [ + ":testutils", + ":trt_conversion", + ":trt_convert_api", + 
":trt_engine_utils", + ":trt_logging", + ":trt_op_kernels", + ":trt_plugins", + ":trt_resources", + ":utils", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/compiler/jit:shape_inference", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensor_testutil", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:nn", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:pooling_ops", + "//tensorflow/core/platform:status_matchers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", ] + if_tensorrt([ ":tensorrt_lib", "@local_config_cuda//cuda:cuda_headers", @@ -415,21 +856,46 @@ tf_cuda_cc_test( ) # Library for the segmenting portion of TensorRT operation creation +cc_library( + name = "union_find", + srcs = ["segment/union_find.cc"], + hdrs = [ + "segment/union_find.h", + ], + copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "segment", srcs = ["segment/segment.cc"], hdrs = [ "segment/segment.h", - "segment/union_find.h", ], copts = tf_copts(), deps = [ + ":common_utils", + ":union_find", + ":utils", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:core_cpu", + "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", "@com_google_protobuf//:protobuf_headers", ], ) @@ -453,22 +919,9 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", ], ) -#tf_gpu_kernel_library( -# name = "plugin_cast", -# srcs = ["plugin/plugin_cast.cu.cc"], -# deps = [ -# ":trt_plugins", -# "//tensorflow/core:framework_lite", -# ] + if_tensorrt([ -# "@local_config_cuda//cuda:cuda_headers", -# "@local_config_tensorrt//:tensorrt", -# ]), -#) - tf_cuda_library( name = "trt_plugins", srcs = ["plugin/trt_plugin.cc"], @@ -481,17 +934,25 @@ tf_cuda_library( cc_library( name = "utils", - srcs = ["convert/utils.cc"], + srcs = [ + "convert/utils.cc", + "utils/trt_experimental_features.cc", + ], hdrs = [ "common/utils.h", "convert/utils.h", + "utils/trt_experimental_features.h", "utils/trt_tensor_proxy.h", ], copts = tf_copts(), deps = [ "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", "//tensorflow/core:lib_proto_parsing", - ], + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), ) tf_proto_library( @@ -501,23 +962,18 @@ tf_proto_library( protodeps = tf_additional_all_protos(), ) -cc_library( +tf_cuda_library( name = "py_utils", srcs = 
["utils/py_utils.cc"], hdrs = ["utils/py_utils.h"], - copts = tf_copts(), + copts = select({ + "@local_config_tensorrt//:use_static_tensorrt": ["TF_USE_TENSORRT_STATIC=1"], + "//conditions:default": [], + }), deps = if_tensorrt([ + ":common_utils", ":tensorrt_lib", + ":op_converter_registry", "//tensorflow/stream_executor/platform:dso_loader", ]), ) - -tf_py_wrap_cc( - name = "wrap_py_utils", - srcs = ["utils/py_utils.i"], - copts = tf_copts(), - deps = [ - ":py_utils", - "//third_party/python_runtime:headers", - ], -) diff --git a/tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi b/tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi new file mode 100644 index 00000000000..1ef7abbd7d1 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi @@ -0,0 +1,19 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +def get_linked_tensorrt_version() -> tuple[int,int,int]: ... +def get_loaded_tensorrt_version() -> tuple[int,int,int]: ... +def get_registered_op_converters() -> list[str]: ... +def is_tensorrt_enabled() -> bool: ... diff --git a/tensorflow/compiler/tf2tensorrt/common/datavec.h b/tensorflow/compiler/tf2tensorrt/common/datavec.h new file mode 100644 index 00000000000..eff32f1f521 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/datavec.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_DATAVEC_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_DATAVEC_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace tensorrt { + +// Input/output data format for OpConverterTest::BuildAndRun(). +struct InputOutputData { + size_t TotalBytes() const { return tensor.TotalBytes(); } + string name; + Tensor tensor; +}; + +using DataVec = std::vector; + +} // namespace tensorrt +} // namespace tensorflow +#endif diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc new file mode 100644 index 00000000000..14f0e3d487c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc @@ -0,0 +1,242 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" + +#include + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "absl/base/call_once.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "third_party/tensorrt/NvInferPlugin.h" + +#endif + +namespace tensorflow { +namespace tensorrt { + +std::tuple GetLinkedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetLoadedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + int ver = getInferLibVersion(); + int major = ver / 1000; + ver = ver - major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +namespace tensorflow { +namespace tensorrt { + +Status GetTrtBindingIndex(const char* tensor_name, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index) { + tensorflow::profiler::TraceMe activity( + "GetTrtBindingIndex", tensorflow::profiler::TraceMeLevel::kInfo); + // If the engine has been built for K profiles, the first getNbBindings() / K + // bindings are used by profile number 0, the following getNbBindings() / K + // bindings are used by profile number 1 etc. + // + // GetBindingIndex(tensor_name) returns the binding index for the progile 0. + // We can also consider it as a "binding_index_within_profile". + *binding_index = cuda_engine->getBindingIndex(tensor_name); + if (*binding_index == -1) { + const string msg = absl::StrCat("Input node ", tensor_name, " not found"); + return errors::NotFound(msg); + } + int n_profiles = cuda_engine->getNbOptimizationProfiles(); + // If we have more then one optimization profile, then we need to shift the + // binding index according to the following formula: + // binding_index_within_engine = binding_index_within_profile + + // profile_index * bindings_per_profile + const int bindings_per_profile = cuda_engine->getNbBindings() / n_profiles; + *binding_index = *binding_index + profile_index * bindings_per_profile; + return Status::OK(); +} + +Status GetTrtBindingIndex(int network_input_index, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index) { + const string input_name = + absl::StrCat(IONamePrefixes::kInputPHName, network_input_index); + return GetTrtBindingIndex(input_name.c_str(), profile_index, cuda_engine, + binding_index); +} + +namespace { + +void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { +#if defined(PLATFORM_WINDOWS) + LOG_WARNING_WITH_PREFIX + << "Windows support is provided experimentally. 
No guarantee is made " + "regarding functionality or engineering support. Use at your own " + "risk."; +#endif + LOG(INFO) << "Linked TensorRT version: " + << absl::StrJoin(GetLinkedTensorRTVersion(), "."); + LOG(INFO) << "Loaded TensorRT version: " + << absl::StrJoin(GetLoadedTensorRTVersion(), "."); + + bool plugin_initialized = initLibNvInferPlugins(trt_logger, ""); + if (!plugin_initialized) { + LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " + "fail later."; + } + + int num_trt_plugins = 0; + nvinfer1::IPluginCreator* const* trt_plugin_creator_list = + getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); + if (!trt_plugin_creator_list) { + LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; + } else { + VLOG(1) << "Found the following " << num_trt_plugins + << " TensorRT plugins in registry:"; + for (int i = 0; i < num_trt_plugins; ++i) { + if (!trt_plugin_creator_list[i]) { + LOG_WARNING_WITH_PREFIX + << "TensorRT plugin at index " << i + << " is not accessible (null pointer returned by " + "getPluginCreatorList for this plugin)"; + } else { + VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); + } + } + } +} + +} // namespace + +void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { + static absl::once_flag once; + absl::call_once(once, InitializeTrtPlugins, trt_logger); +} + +} // namespace tensorrt +} // namespace tensorflow + +namespace nvinfer1 { +std::ostream& operator<<(std::ostream& os, + const nvinfer1::TensorFormat& format) { + os << "nvinfer1::TensorFormat::"; + switch (format) { + case nvinfer1::TensorFormat::kLINEAR: + os << "kLINEAR"; + break; + + case nvinfer1::TensorFormat::kCHW2: + os << "kCHW2"; + break; + + case nvinfer1::TensorFormat::kHWC8: + os << "kHWC8"; + break; + + case nvinfer1::TensorFormat::kCHW4: + os << "kCHW4"; + break; + + case nvinfer1::TensorFormat::kCHW16: + os << "kCHW16"; + break; + + case nvinfer1::TensorFormat::kCHW32: + os << "kCHW32"; + break; + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + case nvinfer1::TensorFormat::kDHWC8: + os << "kDHWC8"; + break; + + case nvinfer1::TensorFormat::kCDHW32: + os << "kCDHW32"; + break; + + case nvinfer1::TensorFormat::kHWC: + os << "kHWC"; + break; + + case nvinfer1::TensorFormat::kDLA_LINEAR: + os << "kDLA_LINEAR"; + break; + + case nvinfer1::TensorFormat::kDLA_HWC4: + os << "kDLA_HWC4"; + break; + + case nvinfer1::TensorFormat::kHWC16: + os << "kHWC16"; + break; +#endif + + default: + os << "unknown format"; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const nvinfer1::DataType& v) { + os << "nvinfer1::DataType::"; + switch (v) { + case nvinfer1::DataType::kFLOAT: + os << "kFLOAT"; + break; + case nvinfer1::DataType::kHALF: + os << "kHalf"; + break; +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + os << "kFP8"; + break; +#endif + case nvinfer1::DataType::kINT8: + os << "kINT8"; + break; + case nvinfer1::DataType::kINT32: + os << "kINT32"; + break; + case nvinfer1::DataType::kBOOL: + os << "kBOOL"; + break; +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + os << "kUINT8"; + break; +#endif + } + return os; +} +} // namespace nvinfer1 + +#endif diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h index 4bfce409127..08e837a410c 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.h +++ b/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -16,8 +16,13 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ +#include #include +#include "absl/strings/str_join.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + namespace tensorflow { namespace tensorrt { // Returns the compile time TensorRT library version information @@ -30,31 +35,143 @@ std::tuple GetLoadedTensorRTVersion(); } // namespace tensorrt } // namespace tensorflow -#define IS_TRT_VERSION_GE(major, minor, patch, build) \ - ((NV_TENSORRT_MAJOR > major) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH > patch) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) - #if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" #include "third_party/tensorrt/NvInfer.h" +#define ERROR_LOC __FILE__, ":", __LINE__ + +#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ + return errors::Internal("TFTRT::", __FUNCTION__, "\n", ERROR_LOC, \ + " failed to add TRT layer, at: ", node); + +#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \ + if (ptr == nullptr) { \ + TFTRT_INTERNAL_ERROR_AT_NODE(node); \ + } + +// Use this macro within functions that return a Status or StatusOR to check +// boolean conditions. If the condition fails, it returns an +// errors::Internal message with the file and line number. +#define TRT_ENSURE(x) \ + if (!(x)) { \ + return errors::Internal(ERROR_LOC, " TRT_ENSURE failure"); \ + } + +// Checks that a Status or ::stream_executor::port::StatusOr object does not +// carry an error message. If it does have an error, returns an errors::Internal +// instance containing the error message, along with the file and line number. +// For pointer-containing ::stream_executor::port::StatusOr, use the below +// TRT_ENSURE_PTR_OK macro. +#define TRT_ENSURE_OK(x) \ + if (!x.ok()) { \ + return errors::Internal(ERROR_LOC, " TRT_ENSURE_OK failure:\n ", \ + x.status().ToString()); \ + } + +// Checks that a ::stream_executor::port::StatusOrobject does not carry an +// error, and that the contained T* is non-null. If it does have an error +// status, returns an errors::Internal instance containing the error message, +// along with the file and line number. +#define TRT_ENSURE_PTR_OK(x) \ + TRT_ENSURE_OK(x); \ + if (x.ValueOrDie() == nullptr) { \ + return errors::Internal(ERROR_LOC, " pointer had null value"); \ + } + namespace tensorflow { namespace tensorrt { +#define IS_TRT_VERSION_GE(major, minor, patch, build) \ + ((NV_TENSORRT_MAJOR > major) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ + NV_TENSORRT_PATCH > patch) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ + NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) #define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: " // Initializes the TensorRT plugin registry if this hasn't been done yet. void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger); +class IONamePrefixes { + public: + static constexpr const char* const kInputPHName = "TensorRTInputPH_"; + static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; +}; + +// Gets the binding index of a tensor in an engine. 
+// +// The binding index is looked up using the tensor's name and the profile index. +// Profile index should be set to zero, if we do not have optimization profiles. +Status GetTrtBindingIndex(const char* tensor_name, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index); + +// Gets the binding index of a tensor in an engine. +// +// Same as above, but uses the network input index to identify the tensor. +Status GetTrtBindingIndex(int network_input_idx, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index); } // namespace tensorrt } // namespace tensorflow +namespace nvinfer1 { +// Prints nvinfer1::Dims or any drived type to the given ostream. Per GTest +// printing requirements, this must be in the nvinfer1 namespace. +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& v) { + os << "nvinfer1::Dims["; + os << absl::StrJoin(std::vector(v.d, v.d + v.nbDims), ","); + os << "]"; + return os; +} + +// Returns true if any two derived nvinfer1::Dims type structs are equivalent. +inline bool operator==(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { + if (rhs.nbDims != lhs.nbDims) { + return false; + } + for (int i = 0; i < lhs.nbDims; i++) { + if (rhs.d[i] != lhs.d[i]) { + return false; + } + } + return true; +} + +// Returns false if any 2 subclasses of nvinfer1::Dims are equivalent. +inline bool operator!=(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { + return !(rhs == lhs); +} + +// Prints nvinfer1::INetworkDefinition* information to the given ostream. +inline std::ostream& operator<<(std::ostream& os, + nvinfer1::INetworkDefinition* n) { + os << "nvinfer1::INetworkDefinition{\n"; + std::vector layer_idxs(n->getNbLayers()); + std::iota(layer_idxs.begin(), layer_idxs.end(), 0); + os << absl::StrJoin(layer_idxs, "\n ", + [n](std::string* out, const int layer_idx) { + out->append(n->getLayer(layer_idx)->getName()); + }); + os << "}"; + return os; +} + +// Prints the TensorFormat enum name to the stream. +std::ostream& operator<<(std::ostream& os, + const nvinfer1::TensorFormat& format); + +// Prints the DataType enum name to the stream. +std::ostream& operator<<(std::ostream& os, const nvinfer1::DataType& data_type); + +} // namespace nvinfer1 + #endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc new file mode 100644 index 00000000000..82ed2254989 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc @@ -0,0 +1,272 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h" + +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/util/env_var.h" +#include "third_party/tensorrt/NvInfer.h" + +// getAlgorithmIOInfo is deprecated in TRT >= 8, replaced by +// getAlgorithmIOInfoByIndex. +#if IS_TRT_VERSION_GE(8, 0, 0, 0) +#define ALGORITHM_IO_INFO_BY_IDX(alg, idx) *(alg).getAlgorithmIOInfoByIndex(idx) +#else +#define ALGORITHM_IO_INFO_BY_IDX(alg, idx) (alg).getAlgorithmIOInfo(idx) +#endif + +namespace nvinfer1 { + +std::ostream& operator<<(std::ostream& os, + const nvinfer1::IAlgorithmContext& ctx) { + os << "AlgorithmContext(name=" << ctx.getName() + << ",nbInputs=" << ctx.getNbInputs() << ",nbOutputs=" << ctx.getNbOutputs() + << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const nvinfer1::IAlgorithm& alg) { + const nvinfer1::IAlgorithmVariant& variant = alg.getAlgorithmVariant(); + os << "Algorithm(" + << "variant.implementation=" << variant.getImplementation() + << ",variant.tactic=" << variant.getTactic() + << ",timingMSec=" << alg.getTimingMSec() + << ",workspaceSize=" << alg.getWorkspaceSize() << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, + const nvinfer1::IAlgorithmIOInfo& info) { + os << "IOTensor(format=" << info.getTensorFormat() + << ",dtype=" << info.getDataType() << ",strides=" << info.getStrides() + << ")"; + return os; +} +} // namespace nvinfer1 + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +bool operator>=(const AlgorithmSelectorImpl::TRTVersion& lhs, + const AlgorithmSelectorImpl::TRTVersion& rhs) { + if (lhs[0] > rhs[0]) return true; + if (lhs[0] == rhs[0] && lhs[1] > rhs[1]) return true; + if (lhs[0] == rhs[0] && lhs[1] == rhs[1] && lhs[2] > rhs[2]) return true; + if (lhs[0] == rhs[0] && lhs[1] == rhs[1] && lhs[2] == rhs[2] && + lhs[3] >= rhs[3]) { + return true; + } + return false; +} + +bool AlgorithmSelectorImpl::IsTrtVersionGE(const TRTVersion& version) const { + return version_ >= version; +} + +bool AlgorithmSelectorImpl::IsShuffleLayer(ImplementationID id) const { + if (IsTrtVersionGE({8, 2, 0, 0})) { + return id == 0x80000000 + 13; + } + if (IsTrtVersionGE({8, 0, 0, 0})) { + return id == 0x80000000 + 14; + } + if (IsTrtVersionGE({7, 2, 0, 0})) { + return id == 0x80000000 + 16; + } + return id == 18; +} + +std::set +AlgorithmSelectorImpl::GetBannedTRT72TuringTactics() { + static const std::set banned_turing_72{ + // turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc_gelu_tn_v1 + -5927686925093575778, + // turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc_gelu_tn_v1 + -3848538574386518527, + // turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc_gelu_tn_v1 + -959009792490796596}; + return banned_turing_72; +} + +bool AlgorithmSelectorImpl::IsBannedTactic(TacticID id) const { + // Disable problematic FP16-Turing tactics in TensorRT 7.2. 
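The version predicate used throughout algorithm_selector.cc boils down to the lexicographic operator>= defined a few lines above. A minimal standalone sketch of that ordering follows, assuming nothing beyond the standard library; the names are illustrative, not from the patch.

// Editorial sketch, not part of the patch: same ordering as
// operator>=(TRTVersion, TRTVersion) above. IsTrtVersionGE({7, 2, 0, 0}) &&
// !IsTrtVersionGE({8, 0, 0, 0}) therefore matches versions at or above 7.2
// and strictly below 8.0, which is the range the banned-tactic check targets.
#include <array>
#include <cassert>

using Version = std::array<int, 4>;  // {major, minor, patch, build}

bool VersionGE(const Version& lhs, const Version& rhs) {
  for (int i = 0; i < 4; ++i) {
    if (lhs[i] > rhs[i]) return true;
    if (lhs[i] < rhs[i]) return false;
  }
  return true;  // all four components equal
}

int main() {
  assert(VersionGE({7, 2, 1, 0}, {7, 2, 0, 0}));   // inside the 7.2 range
  assert(!VersionGE({7, 2, 1, 0}, {8, 0, 0, 0}));  // but below 8.0 GA
  assert(VersionGE({8, 0, 0, 0}, {8, 0, 0, 0}));   // equality counts as >=
  return 0;
}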
+ if (IsTrtVersionGE({7, 2, 0, 0}) && !IsTrtVersionGE({8, 0, 0, 0})) { + auto banned_turing_72 = GetBannedTRT72TuringTactics(); + return banned_turing_72.find(id) != banned_turing_72.end(); + } + return false; +} + +bool AlgorithmSelectorImpl::AllowShuffleAlgorithm( + TacticID tactic, nvinfer1::DataType input_dtype, + nvinfer1::TensorFormat input_format) const { + if (IsTrtVersionGE({8, 0, 0, 0}) && !IsTrtVersionGE({8, 0, 3, 0})) { + // Reject shuffle node when input format is linear row major INT8 + // format in TensorRT 8.0 GA. + return !(input_format == nvinfer1::TensorFormat::kLINEAR && + input_dtype == nvinfer1::DataType::kINT8); + } + + if (IsTrtVersionGE({7, 2, 0, 0}) && !IsTrtVersionGE({8, 0, 0, 0})) { + // For TRT 7.2, accept shuffle node when input format is not 32-wide + // channel vectorized row major FP32 format + return !(input_format == nvinfer1::TensorFormat::kCHW32 && + input_dtype == nvinfer1::DataType::kFLOAT); + } + return true; +} + +bool AlgorithmSelectorImpl::IsAlgorithmSelectorRequired() const { + // If we are in turing for TensorRT 7.2, we need the selector for shuffle and + // avoiding specfic Turing tactics. + if (IsTrtVersionGE({7, 2, 0, 0}) && !IsTrtVersionGE({8, 0, 0, 0})) { + return true; + } + + // If we are in TensorRT 8.0 GA, we want to reject certain types of shuffles. + if (IsTrtVersionGE({8, 0, 0, 0}) && !IsTrtVersionGE({8, 0, 3, 0})) { + return true; + } + + return false; +} + +namespace { + +string FormatAlgorithmList(const nvinfer1::IAlgorithmContext& ctx, + absl::Span algs) { + return absl::StrFormat( + "%s:\n\t%s", absl::FormatStreamed(ctx), + absl::StrJoin( + algs, "\n\t", + [&ctx](std::string* out, const nvinfer1::IAlgorithm* const alg) { + absl::StrAppendFormat(out, "%s", absl::FormatStreamed(*alg)); + for (int i = 0; i < ctx.getNbInputs() + ctx.getNbOutputs(); i++) { + absl::StrAppendFormat( + out, "\n\t\t%s", + absl::FormatStreamed(ALGORITHM_IO_INFO_BY_IDX(*alg, i))); + } + })); +} + +} // namespace + +TftrtAlgorithmSelector::TftrtAlgorithmSelector() + : fixed_algorithm_idx_(GetFixedAlgorithmID()), + selector_(AlgorithmSelectorImpl::CompileTimeTRTVersion()) {} + +absl::optional TftrtAlgorithmSelector::GetFixedAlgorithmID() { + int64 trt_algorithm_idx = 0; + constexpr auto null_idx = + std::numeric_limits::min(); + Status status = tensorflow::ReadInt64FromEnvVar("TF_TRT_FIXED_ALGORITHM_ID", + /*default_val=*/null_idx, + &trt_algorithm_idx); + if (!status.ok()) { + LOG(ERROR) << status; + return absl::nullopt; + } + if (trt_algorithm_idx != null_idx) { + return std::max(static_cast(trt_algorithm_idx), 0); + } + return absl::nullopt; +} + +bool TftrtAlgorithmSelector::AlgorithmPolicy( + const nvinfer1::IAlgorithmContext& context, + const nvinfer1::IAlgorithm& alg) const { + const nvinfer1::IAlgorithmVariant& variant = alg.getAlgorithmVariant(); + + // Check if this tactic ID is banned. 
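GetFixedAlgorithmID() above implements a debugging override: when TF_TRT_FIXED_ALGORITHM_ID is set, every layer is forced to that algorithm index. Below is a reduced sketch of the same pattern using std::getenv and std::stoll in place of tensorflow::ReadInt64FromEnvVar; only the environment variable name comes from the patch, the rest is illustrative.

// Editorial sketch, not part of the patch: simplified form of
// TftrtAlgorithmSelector::GetFixedAlgorithmID().
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <optional>
#include <string>

std::optional<int64_t> FixedAlgorithmIdFromEnv() {
  const char* raw = std::getenv("TF_TRT_FIXED_ALGORITHM_ID");
  if (raw == nullptr) return std::nullopt;  // unset: use normal selection
  try {
    const long long value = std::stoll(raw);
    return std::max<long long>(value, 0);   // negative values clamp to 0
  } catch (...) {
    return std::nullopt;                    // unparsable: fall back
  }
}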
+ TacticID tactic_id = variant.getTactic(); + if (selector_.IsBannedTactic(tactic_id)) { + return false; + } + + if (selector_.IsShuffleLayer(variant.getImplementation())) { + return selector_.AllowShuffleAlgorithm( + tactic_id, alg.getAlgorithmIOInfo(0).getDataType(), + alg.getAlgorithmIOInfo(0).getTensorFormat()); + } + return true; +} + +int32_t TftrtAlgorithmSelector::selectAlgorithms( + const nvinfer1::IAlgorithmContext& algoContext, + const nvinfer1::IAlgorithm* const* algoChoices, int32_t nbChoices, + int32_t* selection) noexcept { + if (fixed_algorithm_idx_) { + LOG(WARNING) << "Forcing TRT algorithm selection to: ID = " + << *fixed_algorithm_idx_; + selection[0] = std::min(*fixed_algorithm_idx_, nbChoices - 1); + return 1; + } + + int num_selections = 0; + + VLOG(1) << "Algorithm selection choices: " + << FormatAlgorithmList(algoContext, + absl::MakeSpan(algoChoices, nbChoices)); + + for (int i = 0; i < nbChoices; i++) { + const nvinfer1::IAlgorithm& alg = *algoChoices[i]; + + // Check layer-specific issues. + if (!AlgorithmPolicy(algoContext, alg)) { + LOG(WARNING) << absl::StrFormat("Rejecting Algorithm: %s ", + absl::FormatStreamed(alg)); + continue; + } + selection[num_selections++] = i; + } + return num_selections; +} + +// Called by TensorRT to report choices it made. +void TftrtAlgorithmSelector::reportAlgorithms( + const nvinfer1::IAlgorithmContext* const* algoContexts, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbAlgorithms) noexcept { + if (VLOG_IS_ON(1)) { + string selection_msg = "Algorithms selected:\n"; + for (int i = 0; i < nbAlgorithms; i++) { + absl::StrAppend(&selection_msg, + FormatAlgorithmList(*algoContexts[i], + absl::MakeSpan(algoChoices + i, 1))); + } + VLOG(1) << selection_msg; + } +} + +std::unique_ptr MaybeCreateAlgorithmSelector() { + auto selector = std::make_unique(); + + if (selector->IsRequired()) { + return selector; + } + + return nullptr; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h new file mode 100644 index 00000000000..1ce0def0c75 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h @@ -0,0 +1,121 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include +#include +#include + +#include "absl/types/optional.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// Implements core algorithm selection logic in a testable manner. The policy +// implemented depends on the given TRT version. 
We have this class because TRT +// interfaces make it difficult to directly test an IAlgorithmSelector +// implementation. +class AlgorithmSelectorImpl { + public: + using TRTVersion = std::array; + using ImplementationID = int64_t; + using TacticID = int64_t; + + static constexpr TRTVersion CompileTimeTRTVersion() { + return TRTVersion{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH, + NV_TENSORRT_BUILD}; + } + + explicit AlgorithmSelectorImpl( + const TRTVersion& version = CompileTimeTRTVersion()) + : version_(version) {} + + bool IsShuffleLayer(ImplementationID id) const; + + bool IsBannedTactic(TacticID id) const; + + // Returns true if the algorithm implementing the IShuffleLayer is acceptable. + bool AllowShuffleAlgorithm(TacticID tactic, nvinfer1::DataType input_dtype, + nvinfer1::TensorFormat input_format) const; + + bool IsTrtVersionGE(const TRTVersion& version) const; + + // Returns true if we know at compile time that the algorithm selector + // should be required. This is a conservative estimate. + bool IsAlgorithmSelectorRequired() const; + + static std::set GetBannedTRT72TuringTactics(); + + private: + TRTVersion version_; +}; + +// Impelements the TRT IAlgorithmSelector interface. The method +// "selectAlgorithms" selects allowable algorithms for each layer, and +// "reportAlgorithms" summarizes the algorithms selected by TensorRT. +class TftrtAlgorithmSelector : public nvinfer1::IAlgorithmSelector { + private: + using TacticID = AlgorithmSelectorImpl::TacticID; + + // An index we should choose for all algorithms. Used for debugging. + absl::optional fixed_algorithm_idx_; + + AlgorithmSelectorImpl selector_; + + public: + TftrtAlgorithmSelector(); + + // If the environment variable TF_TRT_FIXED_ALGORITHM_ID is empty, this + // function returns nullopt. Otherwise, it returns the specified number. + static absl::optional GetFixedAlgorithmID(); + + // Returns true if the algorithm associated with context is acceptable. + bool AlgorithmPolicy(const nvinfer1::IAlgorithmContext& context, + const nvinfer1::IAlgorithm& alg) const; + + // This function fills the array "selection" with the indices of selected + // algorithm candidates from "algoChoices", each of which is an implementation + // for the kernel described by the given IAlgorithmContext. It should return a + // number in [0, nbChoices] indicating the number of selected indices. If 0 is + // returned, TensorRT will use its default selection mechanism. + int32_t selectAlgorithms(const nvinfer1::IAlgorithmContext& algoContext, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbChoices, + int32_t* selection) noexcept override; + + // Called by TensorRT to report choices it made. + void reportAlgorithms(const nvinfer1::IAlgorithmContext* const* algoContexts, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbAlgorithms) noexcept override; + + bool IsRequired() const { + return selector_.IsAlgorithmSelectorRequired() || + fixed_algorithm_idx_ != absl::nullopt; + } +}; + +// Returns an initialized AlgorithmSelector if an algorithm selector is required +// for the current TRT version. Otherwise, returns nullptr. 
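The declaration of MaybeCreateAlgorithmSelector follows below. As a hedged usage sketch (not taken from this patch; it assumes TensorRT >= 7.2, where IBuilderConfig::setAlgorithmSelector is available, and MaybeAttachSelector is a hypothetical name), the returned selector is attached to the builder configuration and must outlive the engine build, since TensorRT only stores the raw pointer:

#include "third_party/tensorrt/NvInfer.h"

// `selector` is whatever MaybeCreateAlgorithmSelector() returned; passing a
// null pointer leaves TensorRT's default tactic selection in place.
void MaybeAttachSelector(nvinfer1::IBuilderConfig* builder_config,
                         nvinfer1::IAlgorithmSelector* selector) {
  if (selector != nullptr) {
    builder_config->setAlgorithmSelector(selector);
  }
}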
+std::unique_ptr MaybeCreateAlgorithmSelector(); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc new file mode 100644 index 00000000000..12eb1fabc86 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc @@ -0,0 +1,97 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h" + +#include + +#include +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +TEST(TestAlgorithmSelector, TensorRT7_1) { + // Verify that the algorithm selector for TRT 7.1 is not required. + AlgorithmSelectorImpl sel71({7, 1, 3, 4}); + ASSERT_FALSE(sel71.IsAlgorithmSelectorRequired()); +} + +TEST(TestAlgorithmSelector, TensorRT7_2) { + // Verify that the algorithm selector for TRT 7.2 is required. + AlgorithmSelectorImpl sel72({7, 2, 0, 0}); + ASSERT_TRUE(sel72.IsAlgorithmSelectorRequired()); + + // Check that the correct tactics are banned. + auto turing_tactics = AlgorithmSelectorImpl::GetBannedTRT72TuringTactics(); + + for (auto id : turing_tactics) { + EXPECT_TRUE(sel72.IsBannedTactic(id)); + } + + // Check that a bad shuffle format is banned. + EXPECT_FALSE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kFLOAT, + nvinfer1::TensorFormat::kCHW32)); + + // Check that other formats are not banned. + EXPECT_TRUE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kHALF, + nvinfer1::TensorFormat::kCHW32)); + EXPECT_TRUE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT32, + nvinfer1::TensorFormat::kCHW32)); + EXPECT_TRUE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kFLOAT, + nvinfer1::TensorFormat::kCHW16)); +} + +TEST(TestAlgorithmSelector, TensorRT8_0) { + // Verify that the algorithm selector for TRT 8.0 is required. + AlgorithmSelectorImpl sel80({8, 0, 1, 6}); + ASSERT_TRUE(sel80.IsAlgorithmSelectorRequired()); + + // Check that the turing 7.2 tactics are not banned. + auto turing_tactics = AlgorithmSelectorImpl::GetBannedTRT72TuringTactics(); + for (auto id : turing_tactics) { + EXPECT_FALSE(sel80.IsBannedTactic(id)); + } + + // Check that a bad shuffle format is banned. + EXPECT_FALSE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT8, + nvinfer1::TensorFormat::kLINEAR)); + + // Check that other formats are not banned. 
+ EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kHALF, + nvinfer1::TensorFormat::kLINEAR)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT32, + nvinfer1::TensorFormat::kLINEAR)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kFLOAT, + nvinfer1::TensorFormat::kLINEAR)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT8, + nvinfer1::TensorFormat::kCHW16)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT8, + nvinfer1::TensorFormat::kCHW32)); +} + +TEST(TestAlgorithmSelector, TensorRT8_2) { + // Verify that the algorithm selector for TRT 8.0 is required. + AlgorithmSelectorImpl sel({8, 2, 0, 0}); + ASSERT_FALSE(sel.IsAlgorithmSelectorRequired()); +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 01dcfba9c52..9c9fce4d30a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -25,7 +25,11 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/segment/segment.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" @@ -40,28 +44,30 @@ limitations under the License. 
#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/devices.h" -#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT -#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/tools/graph_transforms/transform_utils.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { namespace convert { + using absl::StrAppend; using absl::StrCat; +using ::tensorflow::tensorrt::segment::ClusterProperty; +using ::tensorflow::tensorrt::segment::NodePtrCompare; +using ::tensorflow::tensorrt::segment::Segment; namespace { @@ -76,7 +82,20 @@ Status BuildNodeMap(const Graph& graph, return Status::OK(); } -} // namespace +EngineInfo::EngineType GetEngineType( + const TRTOptimizationPass::ConversionParams& params) { + return (params.is_dynamic_op || params.use_calibration) + ? EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic; +} + +// Returns true when use_implicit_batch is false or when we are building dynamic +// engine, to allow unknown size for dimensions rather than dimension 0. +bool AllowDynamicNonBatchDimension( + const TRTOptimizationPass::ConversionParams& params) { + return !params.use_implicit_batch || + GetEngineType(params) == EngineInfo::EngineType::TRTDynamic; +} struct EdgePtrCompare { bool operator()(const Edge* lhs, const Edge* rhs) const { @@ -88,30 +107,48 @@ struct EdgePtrCompare { // a device name as one of the conversion parameter so users can control on // which device they want to run the conversion. std::pair GetFirstValidDeviceId() { - for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) { - TfGpuId tf_gpu_id(tf_gpu_id_value); - PlatformGpuId platform_gpu_id; - Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id); + for (int tf_device_id_value = 0; tf_device_id_value < 100; + ++tf_device_id_value) { + TfGpuId tf_device_id(tf_device_id_value); + PlatformGpuId platform_device_id; + Status s = + GpuIdManager::TfToPlatformGpuId(tf_device_id, &platform_device_id); if (s.ok()) { - VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " - << platform_gpu_id.value(); - return std::make_pair(tf_gpu_id, platform_gpu_id); + VLOG(1) << "Found TF GPU " << tf_device_id.value() << " at cuda device " + << platform_device_id.value(); + return std::make_pair(tf_device_id, platform_device_id); } } LOG(ERROR) << "Could not find any TF GPUs"; return std::make_pair(TfGpuId(-1), PlatformGpuId(-1)); } +// Returns false for const nodes (we intend to drop control edges from those). 
+bool ShallKeepControlEdgeFrom(const Node* input_node) { + if (!input_node) { + LOG(ERROR) << "Node pointer is null, this should not happen"; + return false; + } + return input_node->type_string() != "Const"; +} + // Function to get subsegment information structure. Status GetEngineInfo(const Graph* g, const grappler::GraphProperties& graph_properties, - const std::set& segment_nodes, - const std::unordered_map& node_map, + const Segment& segment, const std::vector& reverse_topo_order, EngineInfo* info) { std::vector subgraph_nodes; // Topologically sorted nodes. std::set added_const_nodes; // Used to prevent double insertion. - std::set segment_devices; + + const ClusterProperty& segment_property = segment.property; + const std::set& segment_nodes = segment.nodes; + + // The device assignment accumulated from the compatible device assignments + // for the nodes in the segment. + const DeviceNameUtils::ParsedName segment_device = + segment_property.DeviceName(); + info->max_batch_size = segment_property.BatchSize().GetOptionalMaxBatchSize(); // Map from src_node_name+port to the unique port numbers of the TRT op, where // the src_node_name is the name of the source node of the input/output @@ -124,52 +161,12 @@ Status GetEngineInfo(const Graph* g, ++it) { const Node* node = *it; if (segment_nodes.count(node) == 0) continue; - auto node_device = node->requested_device(); - if (!node_device.empty()) { - // If device is set, it means device placement may have been done before, - // so we need to assign a device for the TRTEngineOp to maintain the - // invariance. - // If the device is CPU in this case, it tries to find the first available - // GPU and use it as the device. - DeviceNameUtils::ParsedName parsed_name; - const bool parse_succeeded = - DeviceNameUtils::ParseFullName(node_device, &parsed_name); - if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { - string msg; - if (!parse_succeeded) { - msg = StrCat("Failed to parse assigned device of node ", node->name(), - ". "); - } else { - msg = StrCat("Node ", node->name(), " was assigned to the CPU. "); - } - VLOG(1) << msg << "Attempting to place on GPU."; - TfGpuId tf_gpu_id; - PlatformGpuId platform_gpu_id; - std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); - if (tf_gpu_id.value() >= 0) { - parsed_name.type = "GPU"; - parsed_name.id = tf_gpu_id.value(); - segment_devices.insert(DeviceNameUtils::FullName( - parsed_name.job, parsed_name.replica, parsed_name.task, - parsed_name.type, parsed_name.id)); - } - } else { - segment_devices.insert(node_device); - } - } else if (node->has_assigned_device_name()) { - // It appears that nodes will not have assigned devices at this point in - // execution. - segment_devices.insert(node->assigned_device_name()); - } else { - VLOG(2) << "Node " << node->name() - << " neither have requested device nor assigned device"; - } subgraph_nodes.push_back(node); const int node_id = node->id(); const string& node_name = node->name(); - // Create input connections. Sort edges first to make determnistic since + // Create input connections. Sort edges first to make deterministic since // in_edges is a set of pointers. std::vector in_edges(node->in_edges().begin(), node->in_edges().end()); @@ -180,7 +177,7 @@ Status GetEngineInfo(const Graph* g, continue; } if (edge->IsControlEdge()) { - if (input_node->type_string() != "Const") { + if (ShallKeepControlEdgeFrom(input_node)) { // Non-Const control input. 
info->connections.emplace_back(input_node->name(), input_node->id(), node_name, node_id, @@ -194,7 +191,7 @@ Status GetEngineInfo(const Graph* g, // If it doesn't have any edges, TF will prune it out. // // Note that the segmenter already ensure that the constant data input - // is valid and suppported by the engine. + // is valid and supported by the engine. if (!added_const_nodes.insert(input_node).second) { // Already added before. continue; @@ -217,7 +214,7 @@ Status GetEngineInfo(const Graph* g, node_id, edge->dst_input(), /*input_edge=*/true, port); } } - // Create output connections. Sort edges first to make determnistic since + // Create output connections. Sort edges first to make deterministic since // out_edges is a set of pointers. std::vector out_edges(node->out_edges().begin(), node->out_edges().end()); @@ -229,9 +226,11 @@ Status GetEngineInfo(const Graph* g, } if (edge->IsControlEdge()) { // Control output. - info->connections.emplace_back(output_node->name(), output_node->id(), - node_name, node_id, - /*input_edge=*/false); + if (ShallKeepControlEdgeFrom(node)) { + info->connections.emplace_back(output_node->name(), output_node->id(), + node_name, node_id, + /*input_edge=*/false); + } } else { // Data output. int port = Graph::kControlSlot - 1; @@ -254,22 +253,35 @@ Status GetEngineInfo(const Graph* g, // Construct the const nodes first. subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(), added_const_nodes.end()); - string scope_name; - TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef( - g, graph_properties, subgraph_nodes, &info->connections, - &info->segment_graph_def, &scope_name)); - info->engine_name = StrCat(scope_name, info->engine_name); + TF_RETURN_IF_ERROR( + ConvertSegmentToGraphDef(g, graph_properties, subgraph_nodes, info)); VLOG(1) << "Converted TensorRT candidate segment '" << info->engine_name << "' to a GraphDef"; - if (segment_devices.size() == 1) { - info->device = *segment_devices.begin(); - } else if (segment_devices.size() > 1) { - LOG(WARNING) << "Detected multiple (" << segment_devices.size() - << ") devices for the segment. Picking first one to continue."; - info->device = *segment_devices.begin(); + if (segment_device.has_type) { + // If the accumulated device assignment for the segment has a device type, + // the segmenter guarantees the device type is GPU. Use the device + // assignment in this case. + if (segment_device.type != "GPU") { + return errors::Internal( + "segment device is not GPU: ", + DeviceNameUtils::ParsedNameToString(segment_device)); + } + info->device = DeviceNameUtils::ParsedNameToString(segment_device); } else { - VLOG(1) << "No device is assigned to the segment. " - << "A device will be assigned during graph execution (inference)."; + TfGpuId tf_device_id; + PlatformGpuId platform_device_id; + std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId(); + if (tf_device_id.value() >= 0) { + DeviceNameUtils::ParsedName parsed_name; + parsed_name.type = "GPU"; + parsed_name.has_type = true; + parsed_name.id = tf_device_id.value(); + parsed_name.has_id = true; + info->device = DeviceNameUtils::ParsedNameToString(parsed_name); + } else { + VLOG(1) << "No device is assigned to the segment. 
A device will be " + "assigned during graph execution (inference)."; + } } return Status::OK(); } @@ -303,7 +315,22 @@ void UpdateToEngineNode(const std::vector& infos, } } } - LOG(FATAL) << "Node " << (**node).name() << " not found in any engine."; + LOG(FATAL) << "Node " << node_name << " not found in any engine."; +} + +tensorflow::TensorShapeProto ComputeTRTNodeIOShape( + std::vector& partial_tensorshape_vect, + std::vector& shape_proto_vect, + const PartialTensorShape& conn_shape, int port_number) { + tensorflow::TensorShapeProto tmp_shape_proto; + conn_shape.AsProto(&tmp_shape_proto); + + if (partial_tensorshape_vect.size() <= port_number) { + shape_proto_vect.resize(port_number + 1); + partial_tensorshape_vect.resize(port_number + 1); + } + + return tmp_shape_proto; } // Function to insert a TRT engine node into the graph. @@ -318,15 +345,16 @@ void UpdateToEngineNode(const std::vector& infos, // one). Connect to the pre-existing engine node instead. // 3. In this way, we ensure the graph is topologically sort-able after each // invocation of CreateTRTNode(). -Status CreateTRTNode(const ConversionParams& params, +Status CreateTRTNode(const TRTOptimizationPass::ConversionParams& params, const std::vector& infos, int pos, - int max_batch_size, Graph* graph, - nvinfer1::IGpuAllocator* alloc, - std::vector* engine_nodes) { + int default_max_batch_size, Graph* graph, + std::vector* engine_nodes, + grappler::Cluster* cluster) { const auto& info = infos.at(pos); std::vector input_shape_protos; std::vector output_shape_protos; std::vector input_shapes; + std::vector output_shapes; std::vector inputs; std::vector input_nodes; std::vector control_input_nodes; @@ -359,36 +387,42 @@ Status CreateTRTNode(const ConversionParams& params, } else { // Data edges if (!conn.is_input_edge) { - // Set the data types of output edge. + // Set the shapes and data types of the output edge. + tensorflow::TensorShapeProto out_shape = ComputeTRTNodeIOShape( + /*partial_tensorshape_vect=*/output_shapes, + /*shape_proto_vect=*/output_shape_protos, + /*conn_shape=*/conn.inside_shape, + /*port_number=*/conn.port_number); + + output_shape_protos.at(conn.port_number) = out_shape; + output_shapes.at(conn.port_number) = conn.inside_shape; + if (out_types.size() <= conn.port_number) { out_types.resize(conn.port_number + 1); } out_types.at(conn.port_number) = conn.connection_type; - if (output_shape_protos.size() <= conn.port_number) { - output_shape_protos.resize(conn.port_number + 1); - } - conn.inside_shape.AsProto(&output_shape_protos.at(conn.port_number)); VLOG(2) << "Collected output shape " << output_shape_protos.at(conn.port_number).DebugString(); } else { - // Set the shapes and data types of input edge. - tensorflow::TensorShapeProto in_shape; - conn.outside_shape.AsProto(&in_shape); - if (input_shapes.size() <= conn.port_number) { - input_shape_protos.resize(conn.port_number + 1); - input_shapes.resize(conn.port_number + 1); - } + // Set the shapes of the input edge. + tensorflow::TensorShapeProto in_shape = ComputeTRTNodeIOShape( + /*partial_tensorshape_vect=*/input_shapes, + /*shape_proto_vect=*/input_shape_protos, + /*conn_shape=*/conn.outside_shape, + /*port_number=*/conn.port_number); + input_shape_protos.at(conn.port_number) = in_shape; input_shapes.at(conn.port_number) = conn.outside_shape; + // Shape must be fully defined (excluding batch dimension) for static // mode. 
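The shape check itself follows below. As a standalone restatement (hypothetical helper name; assumes tensorflow/core/framework/tensor_shape.h), the condition enforced for implicit-batch static engines is simply that every dimension except dimension 0 has a known size:

#include "tensorflow/core/framework/tensor_shape.h"

// Returns true if every non-batch dimension of `shape` is known. Dimension 0
// is the implicit batch dimension and is allowed to stay unknown.
bool NonBatchDimsFullyDefined(const tensorflow::PartialTensorShape& shape) {
  for (int i = 1; i < shape.dims(); ++i) {
    if (shape.dim_size(i) <= 0) return false;  // unknown sizes are reported as -1
  }
  return true;
}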
- if (info.engine_type == EngineInfo::EngineType::TRTStatic) { + if (params.use_implicit_batch && + info.engine_type == EngineInfo::EngineType::TRTStatic) { for (int i = 1; i < conn.outside_shape.dims(); i++) { if (conn.outside_shape.dim_size(i) <= 0) { return errors::Internal( - "Input shapes must be fully defined when in static mode. " - "Please try is_dynamic_op=True (shape was ", - conn.outside_shape.DebugString(), ")"); + "Not fully defined input shape when in static mode which " + "should have been excluded by the segmenter. "); } } } @@ -421,25 +455,17 @@ Status CreateTRTNode(const ConversionParams& params, "Segment has no inputs (possible constfold failure)"); } - const bool calibrate_int8 = - (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration); // Build the engine and get its serialized representation. string segment_string; + + int max_batch_size = info.max_batch_size.has_value() + ? info.max_batch_size.value() + : default_max_batch_size; + if (info.engine_type == EngineInfo::EngineType::TRTStatic) { - // Create static engine for fp32/fp16 mode. - Logger trt_logger; - TrtUniquePtrType engine; - // TODO(sami): What happens if 1st dim is not batch? - TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( - info.segment_graph_def, - calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode, - max_batch_size, info.max_workspace_size_bytes, input_shapes, - &trt_logger, alloc, /*calibrator=*/nullptr, &engine, - info.use_calibration, - /*convert_successfully=*/nullptr)); - TrtUniquePtrType engine_data(engine->serialize()); - segment_string = string(static_cast(engine_data->data()), - engine_data->size()); + TF_RETURN_IF_ERROR(CreateStaticEngine(params, info, max_batch_size, + input_shapes, nullptr, + &segment_string, cluster)); } string prec_string; @@ -461,21 +487,31 @@ Status CreateTRTNode(const ConversionParams& params, NodeDef trt_node; NameAttrList function; function.set_name(StrCat(info.engine_name, "_native_segment")); - Status status = - node_builder - .Attr("input_shapes", input_shape_protos) - .Attr("output_shapes", output_shape_protos) - .Attr("static_engine", - info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_func", function) - .Attr("serialized_segment", segment_string) - .Attr("calibration_data", "") - .Attr("max_cached_engines_count", info.maximum_cached_engines) - .Attr("workspace_size_bytes", info.max_workspace_size_bytes) - .Attr("precision_mode", prec_string) - .Attr("use_calibration", info.use_calibration) - .Attr("OutT", out_types) - .Finalize(&trt_node); + + node_builder.Attr("input_shapes", input_shape_protos) + .Attr("output_shapes", output_shape_protos) + .Attr("static_engine", + info.engine_type == EngineInfo::EngineType::TRTStatic) + .Attr("segment_func", function) + .Attr("serialized_segment", segment_string) + .Attr("calibration_data", "") + .Attr("max_cached_engines_count", info.maximum_cached_engines) + .Attr("workspace_size_bytes", info.max_workspace_size_bytes) + .Attr("max_batch_size", max_batch_size) + .Attr("precision_mode", prec_string) + .Attr("use_calibration", info.use_calibration) + .Attr("_use_implicit_batch", params.use_implicit_batch) + .Attr("use_explicit_precision", params.use_explicit_precision) + .Attr("_allow_build_at_runtime", info.allow_build_at_runtime) + .Attr("OutT", out_types); + + if (!params.use_implicit_batch) { + node_builder.Attr("profile_strategy", + ProfileStrategyToName(params.profile_strategy)); + } + + Status status = node_builder.Finalize(&trt_node); + if (!status.ok()) { LOG(ERROR) << 
"Node construction failed with" << status; return status; @@ -488,10 +524,6 @@ Status CreateTRTNode(const ConversionParams& params, // instead of checking fail. Node* engine_node = graph->AddNode(trt_node, &status); (*engine_nodes)[pos] = engine_node; - if (!status.ok()) { - LOG(ERROR) << "Adding node failed " << status; - return status; - } // Add control input and input edges to the engine node. for (const auto in : control_input_nodes) { VLOG(1) << "Connecting control edge from " << in->name() << " to " @@ -536,6 +568,58 @@ Status CreateTRTNode(const ConversionParams& params, return Status::OK(); } +int64 GetNextGraphSequenceNumber() { + static std::atomic graph_sequence_num; + return graph_sequence_num++; +} + +constexpr char kCastInputTypeAttrName[] = "SrcT"; + +// Transforms node = cast(x, fp32) where datatype(x) != fp16 to: +// castToFp16 = cast(x, fp16) +// node = cast(castToFp16, fp32) +// +Status MaybeRewriteCastToFp32(GraphDef* graph_def, NodeDef* node_def) { + if (node_def->op() != "Cast") { + return Status::OK(); + } + + DataTypeVector input_types; + DataTypeVector output_types; + TF_RETURN_IF_ERROR( + graph_transforms::GetInOutTypes(*node_def, &input_types, &output_types)); + + if (input_types.size() != 1 || output_types.size() != 1) { + return errors::Internal("Bad cast operation"); + } + + if (input_types[0] == DT_HALF || output_types[0] != DT_FLOAT) { + return Status::OK(); + } + + VLOG(2) << "Rewriting cast to FP32 " << node_def->DebugString(); + + NodeDef* castToFp16 = graph_def->add_node(); + for (auto attr_value : node_def->attr()) { + (*castToFp16->mutable_attr())[attr_value.first] = attr_value.second; + } + castToFp16->set_name(node_def->name() + "_split"); + castToFp16->set_op("Cast"); + castToFp16->set_device(node_def->device()); + castToFp16->add_input(node_def->input(0)); + (*castToFp16->mutable_attr())[kCastOutputTypeAttrName].set_type(DT_HALF); + + node_def->set_input(0, castToFp16->name() + ":0"); + (*node_def->mutable_attr())[kCastInputTypeAttrName].set_type(DT_HALF); + + VLOG(2) << castToFp16->DebugString(); + VLOG(2) << node_def->DebugString(); + + return Status::OK(); +} + +} // namespace + Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, Graph* graph, const string& engine_name) { Graph segment_graph(graph->flib_def()); @@ -545,11 +629,6 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, auto segment_func = library.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( segment_graph, StrCat(engine_name, "_native_segment"), segment_func)); - // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on - // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32 - // would be on host if the op generating the tensor has host memory tag set. 
- (*segment_func->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] - .set_b(true); if (VLOG_IS_ON(7)) { VLOG(7) << engine_name << " Function_Def "; VLOG(7) << segment_func->DebugString(); @@ -560,30 +639,30 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, return Status::OK(); } -std::pair GetDeviceAndAllocator(const ConversionParams& params, - const EngineInfo& engine) { +std::pair GetDeviceAndAllocator( + const grappler::Cluster* cluster, const EngineInfo& engine) { int cuda_device_id = -1; Allocator* dev_allocator = nullptr; - if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr || + if (cluster == nullptr || cluster->GetDeviceSet() == nullptr || engine.device.empty()) { // If device is not set, use the first found GPU device for the conversion. - TfGpuId tf_gpu_id; - PlatformGpuId platform_gpu_id; - std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); - cuda_device_id = platform_gpu_id.value(); + TfGpuId tf_device_id; + PlatformGpuId platform_device_id; + std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId(); + cuda_device_id = platform_device_id.value(); if (cuda_device_id >= 0) { GPUOptions gpu_options; // If the TF to Cuda gpu id mapping exist, the device and corresponding // allocator must have been initialized already, so the // GetGPUAllocator() call won't create a new allocator. dev_allocator = GPUProcessState::singleton()->GetGPUAllocator( - gpu_options, tf_gpu_id, 1); + gpu_options, tf_device_id, /*total_bytes=*/1); } return std::make_pair(cuda_device_id, dev_allocator); } // Use the device requested by the engine. - auto device_set = params.cluster->GetDeviceSet(); + auto device_set = cluster->GetDeviceSet(); std::vector devices; DeviceNameUtils::ParsedName parsed_name; if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) && @@ -596,7 +675,7 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, StrAppend(&msg, engine.device, "': "); for (auto d : devices) StrAppend(&msg, d->name(), ", "); StrAppend(&msg, ". Will get the allocator from first one."); - LOG(WARNING) << msg; + LOG_WARNING_WITH_PREFIX << msg; } AllocatorAttributes alloc_attr; cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id; @@ -604,92 +683,182 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, VLOG(1) << "Using allocator " << dev_allocator->Name() << " and cuda_device_id " << cuda_device_id; } else { - LOG(WARNING) << "Cluster is set but device '" << engine.device - << "' is not found in the cluster"; + LOG_WARNING_WITH_PREFIX << "Cluster is set but device '" << engine.device + << "' is not found in the cluster"; } return std::make_pair(cuda_device_id, dev_allocator); } -// Entry function from optimization pass. -Status ConvertAfterShapes(const ConversionParams& params) { +Status CreateStaticEngine(const TRTOptimizationPass::ConversionParams& params, + const EngineInfo& info, int max_batch_size, + const std::vector& input_shapes, + TrtShapeOptimizationProfile* profile, + string* segment_string, grappler::Cluster* cluster) { + std::pair device_allocator = + GetDeviceAndAllocator(cluster, info); + int cuda_device_id = 0; + std::unique_ptr trt_allocator; + if (device_allocator.first >= 0) { + cuda_device_id = device_allocator.first; + trt_allocator.reset(new TRTDeviceAllocator(device_allocator.second)); + } else { + // The value in trt_allocator is a nullptr and cudamalloc will be used. + LOG_WARNING_WITH_PREFIX << "Can't identify the cuda device. 
Running on " + "device 0 and use cudamalloc as an allocator"; + } + cudaSetDevice(cuda_device_id); + + auto trt_logger = GetLoggerRegistry()->LookUp(params.trt_logger_name); + const bool calibrate_int8 = + (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration); + + // Create static engines with precision_mode fp32/fp16. + TrtUniquePtrType engine; + TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( + info.segment_graph_def, nullptr, + calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode, + max_batch_size, info.max_workspace_size_bytes, input_shapes, trt_logger, + trt_allocator.get(), /*calibrator=*/nullptr, &engine, + info.use_calibration, params.use_implicit_batch, + /*convert_successfully=*/nullptr, profile, info.engine_name, + /*use_explicit_precision=*/params.use_explicit_precision, cluster)); + TrtUniquePtrType engine_data(engine->serialize()); + *segment_string = string(static_cast(engine_data->data()), + engine_data->size()); + return Status::OK(); +} + +Status ConvertGraph(const TRTOptimizationPass::ConversionParams& params, + grappler::GrapplerItem& grappler_item, + const std::vector& input_output_names, + grappler::Cluster* cluster, GraphDef* output) { // Sanity checks. + TRT_ENSURE(output != nullptr) if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { return errors::InvalidArgument( "Calibration with FP32 or FP16 is not supported."); } + GraphDef& graph_def = grappler_item.graph; + + // When precision_mode is FP16, transform cast(x, fp32) to + // cast(cast(x, fp16), fp32). This creates cast(fp16, f32) that can be + // included in the TRTEngineOp as an TensorRT Identity layer for performance: + // . Avoid cast(fp32, fp16) in the TRT engine implementation for fp16 + // precision. + // . Changing the input to the TRTEngine from fp32 to fp16 may reduce data + // moving from the host to the GPU. + if (params.precision_mode == TrtPrecisionMode::FP16) { + for (int i = 0; i < graph_def.node_size(); i++) { + NodeDef* node_def = graph_def.mutable_node(i); + TF_RETURN_IF_ERROR(MaybeRewriteCastToFp32(&graph_def, node_def)); + } + } + + // Construct a GrapplerItem using the modified graph_def and the input + // grappler_item. + grappler::GraphProperties static_graph_properties(grappler_item); + TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); + // Convert graphdef to graph. 
- FunctionLibraryDefinition flib(OpRegistry::Global(), - params.input_graph_def->library()); + FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library()); Graph graph(flib); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - *params.input_graph_def, &graph)); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph)); // Segment the graph into subgraphs that can be converted to TensorRT segment::SegmentOptions segment_options; // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT) - for (auto node : *(params.output_names)) { + for (const auto& node : input_output_names) { segment_options.exclude_node_list.insert(node); } segment_options.minimum_segment_size = params.minimum_segment_size; - segment::SegmentNodesVector initial_segments; - TrtNodeValidator validator(*params.graph_properties, params.precision_mode, - params.use_calibration); + segment_options.use_implicit_batch = params.use_implicit_batch; + if (segment_options.use_implicit_batch) + segment_options.maximum_batch_size = params.max_batch_size; + segment_options.allow_dynamic_non_batch_dim = + AllowDynamicNonBatchDimension(params); + + segment::SegmentVector initial_segments; + TrtNodeValidator validator(static_graph_properties, params.precision_mode, + params.use_calibration, params.use_implicit_batch, + params.use_explicit_precision); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, + /*tf_graph=*/&graph, + /*graph_properties=*/&static_graph_properties, + /*candidate_fn=*/ std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't // need to check the input edges. - [](const Edge* edge) { return true; }, OutputEdgeValidator(), - segment_options, &initial_segments)); + /*input_candidate_fn=*/[](const Edge* edge) { return true; }, + /*output_candidate_fn=*/OutputEdgeValidator(), + /*options=*/segment_options, + /*segments=*/&initial_segments)); LOG(INFO) << "Number of TensorRT candidate segments: " << initial_segments.size(); // Get the EngineInfo for each segment. 
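The per-segment loop follows below. The engine names it produces combine a zero-padded per-graph sequence number with a zero-padded segment index, e.g. "TRTEngineOp_000_001"; this is also why the updated convert_graph test later in this patch strips the sequence number with the regex "TRTEngineOp_[0-9]+_". A minimal sketch of the format (hypothetical helper name, assuming absl):

#include <string>

#include "absl/strings/str_format.h"

// Mirrors the StrCat/StrFormat calls below: "%0*d" with a width of 3
// zero-pads both the graph sequence number and the segment index.
std::string EngineNameSketch(int graph_sequence_number, int segment_index) {
  return absl::StrFormat("TRTEngineOp_%03d_%03d", graph_sequence_number,
                         segment_index);
}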
std::unordered_map node_map; TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); - float total_num_nodes_in_segments = 0.; std::vector engine_segments; engine_segments.reserve(initial_segments.size()); std::vector reverse_topo_order; GetPostOrder(graph, &reverse_topo_order); - size_t total_engine_bytes_size = 0; - std::vector engine_bytes_size; - segment::SegmentNodesVector converted_segments; + segment::SegmentVector converted_segments; converted_segments.reserve(initial_segments.size()); + string engine_name_prefix = + StrCat("TRTEngineOp_", + absl::StrFormat("%0*d", 3, GetNextGraphSequenceNumber()), "_"); for (size_t t = 0; t < initial_segments.size(); t++) { auto& curr_segment = initial_segments.at(t); EngineInfo curr_engine; - curr_engine.engine_name = StrCat("TRTEngineOp_", t); - Status status = - GetEngineInfo(&graph, *params.graph_properties, curr_segment, node_map, - reverse_topo_order, &curr_engine); + curr_engine.engine_name = + StrCat(engine_name_prefix, absl::StrFormat("%0*d", 3, t)); + + bool int8_no_calib = (!params.use_calibration && + params.precision_mode == TrtPrecisionMode::INT8); + bool has_qdq = false; + if (int8_no_calib) { + has_qdq = absl::c_any_of(reverse_topo_order, IsQuantizeAndDequantizeOp); + } + + Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment, + reverse_topo_order, &curr_engine); if (!status.ok()) { - LOG(WARNING) << "Failed to get engine info for segment " << t << ": " - << status; + LOG_WARNING_WITH_PREFIX << "Failed to get engine info for segment " << t + << ": " << status; continue; } - curr_engine.precision_mode = params.precision_mode; - curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration) - ? EngineInfo::EngineType::TRTDynamic - : EngineInfo::EngineType::TRTStatic); + + curr_engine.engine_type = GetEngineType(params); curr_engine.use_calibration = params.use_calibration; + // Building cuda engines for INT8 without calibration and without dynamic + // range info cause TRT failure. Avoid this situation by setting the + // precision to FP16. + if (int8_no_calib && !has_qdq) { + LOG(WARNING) << "Set engine precision to FP16 due to missing QDQ OP"; + curr_engine.precision_mode = TrtPrecisionMode::FP16; + } else { + curr_engine.precision_mode = params.precision_mode; + } curr_engine.maximum_cached_engines = params.max_cached_engines; + curr_engine.allow_build_at_runtime = params.allow_build_at_runtime; + if (!curr_engine.max_batch_size.has_value()) { + curr_engine.max_batch_size = params.max_batch_size; + } status = RegisterGraphToFunctionLibrary(curr_engine.segment_graph_def, &graph, curr_engine.engine_name); if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef to the library " << t - << ": " << status; + LOG_WARNING_WITH_PREFIX + << "Failed to register segment graphdef to the library " << t << ": " + << status; continue; } - engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); - total_engine_bytes_size += engine_bytes_size.back(); - total_num_nodes_in_segments += curr_segment.size(); engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); @@ -703,56 +872,54 @@ Status ConvertAfterShapes(const ConversionParams& params) { } } - // Create a TRT node for each segment using its EngineInfo. 
- int old_cuda_device = 0; - auto err = cudaGetDevice(&old_cuda_device); - if (err != cudaSuccess) { - LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); + // Save the cuda device since we may need to switch to another cuda device to + // build static engines. + absl::optional old_cuda_device = absl::nullopt; + if (!params.is_dynamic_op) { + int cuda_device_id; + cudaError_t cuda_error = cudaGetDevice(&cuda_device_id); + if (cuda_error != cudaSuccess) { + LOG_WARNING_WITH_PREFIX << "Couldn't get current device: " + << cudaGetErrorString(cuda_error); + } else { + VLOG(1) << "Current cuda device is " << cuda_device_id; + old_cuda_device = cuda_device_id; + } } - VLOG(1) << "Current cuda device is " << old_cuda_device; + + auto restore_cuda_device = gtl::MakeCleanup([old_cuda_device] { + if (old_cuda_device.has_value()) { + cudaSetDevice(old_cuda_device.value()); + } + }); + std::vector engine_nodes; engine_nodes.resize(engine_segments.size()); for (int i = 0; i < engine_segments.size(); ++i) { auto& engine = engine_segments.at(i); - // Partition the workspace size by the average of node ratio and segment - // graphdef size - engine.max_workspace_size_bytes = - params.max_workspace_size_bytes * - (engine_bytes_size.at(i) / total_engine_bytes_size + - converted_segments.at(i).size() / total_num_nodes_in_segments) / - 2.0; + // TODO(b/170762693): implement the heuristic to calculate + // max_workspace_size_bytes. + engine.max_workspace_size_bytes = params.max_workspace_size_bytes; VLOG(1) << "Assigned " << engine.max_workspace_size_bytes << " bytes to " << engine.engine_name; - // The allocator is used to build the engine. The build and the built engine - // will be destroyed after we get the serialized engine string, so it's fine - // to use unique_ptr here. - std::unique_ptr alloc; - auto device_alloc = GetDeviceAndAllocator(params, engine); - int cuda_device_id = 0; - if (device_alloc.first >= 0) { - cuda_device_id = device_alloc.first; - alloc.reset(new TRTDeviceAllocator(device_alloc.second)); - } else { - // Setting allocator as nullptr should get revert to the cudamalloc - LOG(WARNING) << "Can't identify the cuda device. Running on device 0 "; - } - cudaSetDevice(cuda_device_id); auto status = CreateTRTNode(params, engine_segments, i, params.max_batch_size, &graph, - alloc.get(), &engine_nodes); + &engine_nodes, cluster); - string msg = - StrCat("TensorRT node ", engine.engine_name, " added for segment ", i, - " consisting of ", converted_segments.at(i).size(), " nodes"); + string msg = StrCat("segment ", i, " consisting of ", + converted_segments.at(i).nodes.size(), " nodes by ", + engine.engine_name); if (status.ok()) { - LOG(INFO) << msg << " succeeded."; + LOG(INFO) << "Replaced " << msg << "."; } else { // Graph is not modified. - LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF..."; + LOG_WARNING_WITH_PREFIX << "Cannot replace " << msg + << " reason: " << status.error_message() + << " (keeping original segment)."; } if (VLOG_IS_ON(1)) { msg = "Segment consists of nodes: "; - for (const Node* node : converted_segments.at(i)) { + for (const Node* node : converted_segments.at(i).nodes) { StrAppend(&msg, node->name(), ", "); } VLOG(1) << msg; @@ -761,14 +928,12 @@ Status ConvertAfterShapes(const ConversionParams& params) { // If status is ok, we successfully added the node to the graph and can // remove segment ops. Otherwise graph is not modified. 
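Removal of the replaced segment ops continues below. The save-and-restore of the active CUDA device above (cudaGetDevice plus a gtl::MakeCleanup that calls cudaSetDevice) can also be written as a small RAII guard; this is only an illustrative sketch with a hypothetical class name, using the CUDA runtime API directly:

#include <optional>

#include "third_party/gpus/cuda/include/cuda_runtime_api.h"

// Saves the current CUDA device on construction and restores it on
// destruction, but only if the initial query succeeded.
class CudaDeviceGuard {
 public:
  CudaDeviceGuard() {
    int device = -1;
    if (cudaGetDevice(&device) == cudaSuccess) saved_device_ = device;
  }
  ~CudaDeviceGuard() {
    if (saved_device_.has_value()) cudaSetDevice(*saved_device_);
  }

 private:
  std::optional<int> saved_device_;
};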
if (status.ok()) { - for (const Node* node : converted_segments.at(i)) { + for (const Node* node : converted_segments.at(i).nodes) { graph.RemoveNode(const_cast(node)); } } } - cudaSetDevice(old_cuda_device); - graph.ToGraphDef(params.output_graph_def); - VLOG(1) << "Returning from conversion"; + graph.ToGraphDef(output); return Status::OK(); } @@ -776,5 +941,4 @@ Status ConvertAfterShapes(const ConversionParams& params) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 9288829574e..0607fb85346 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,54 +18,53 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/grappler/clusters/cluster.h" -#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -struct ConversionParams { - const GraphDef* input_graph_def = nullptr; - const std::vector* output_names = nullptr; - size_t max_batch_size = 1; - size_t max_workspace_size_bytes = 1 << 30; - GraphDef* output_graph_def = nullptr; - TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; - int minimum_segment_size = 3; - const grappler::GraphProperties* graph_properties = nullptr; - const grappler::Cluster* cluster = nullptr; - // Whether to create engine on conversion or execution time - bool is_dyn_op = false; - // maximum number of cached engines - int max_cached_engines = 1; - bool use_calibration = true; -}; +// These functions are internal implementation functions for the +// TRTOptimizationPass. -// Method to call from optimization pass -Status ConvertAfterShapes(const ConversionParams& params); +// Performs segmentation and conversion on the given Grappler item. This method +// contains the core logic of the TRTOptimizationPass. +Status ConvertGraph(const TRTOptimizationPass::ConversionParams& params, + grappler::GrapplerItem& grappler_item, + const std::vector& input_output_names, + grappler::Cluster* cluster, GraphDef* output); // Helper method for the conversion, expose for testing. -std::pair GetDeviceAndAllocator(const ConversionParams& params, - const EngineInfo& engine); +std::pair GetDeviceAndAllocator( + const grappler::Cluster* cluster, const EngineInfo& engine); // Helper method that registers `segment_graph` as a function to the function // library in `graph`. Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, Graph* graph, const string& engine_name); +// Creates and serializes an ICudaEngine. Used only in is_dynamic_op=false, +// a.k.a. static engine mode. 
+Status CreateStaticEngine(const TRTOptimizationPass::ConversionParams& params, + const EngineInfo& info, int max_batch_size, + const std::vector& input_shapes, + TrtShapeOptimizationProfile* profile, + string* segment_string, grappler::Cluster* cluster); + } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 58fe39b08ba..ba74bd25528 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -15,12 +15,15 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" +#include // NOLINT + #include #include #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -32,24 +35,12 @@ limitations under the License. #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -// TODO(laigd): put this into some test utils file. -void ExpectStatus(Status status, error::Code code = error::OK, - const char* substr = nullptr) { - EXPECT_EQ(code, status.code()) - << status << " vs expected error code \"" << error::Code_Name(code) - << "\" and message \"" << substr << "\""; - if (substr) { - EXPECT_THAT(status.error_message(), ::testing::HasSubstr(substr)) << status; - } -} - class FakeCluster : public grappler::Cluster { public: FakeCluster() : Cluster(0) {} @@ -70,15 +61,15 @@ class FakeCluster : public grappler::Cluster { } private: - const DeviceSet* device_set_; + const DeviceSet* device_set_ = nullptr; }; -TEST(ConvertGraphTest, GetDeviceAndAllocator) { - ConversionParams params; +TEST(GetDeviceAndAllocatorTest, GetDeviceAndAllocator) { + TRTOptimizationPass::ConversionParams params; EngineInfo engine_info; { - // params.cluster is not set, and no gpu device is available. - auto result = GetDeviceAndAllocator(params, engine_info); + // cluster is not set, and no gpu device is available. + auto result = GetDeviceAndAllocator(nullptr, engine_info); EXPECT_EQ(-1, result.first); EXPECT_EQ(nullptr, result.second); } @@ -94,20 +85,19 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { std::unique_ptr session(NewSession(options)); { - // params.cluster is not set, should find and return first gpu id and + // cluster is not set, should find and return first gpu id and // corresponding allocator. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(nullptr, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_0_bfc", result.second->Name()); } FakeCluster cluster; - params.cluster = &cluster; { // params.cluster->GetDeviceSet() returns null, should find and return first // gpu id and corresponding allocator. 
- auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_0_bfc", result.second->Name()); @@ -124,7 +114,7 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { { // engine_info.device is not set, should find and return first gpu id and // corresponding allocator. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_0_bfc", result.second->Name()); @@ -133,7 +123,7 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { engine_info.device = "/GPU:1"; { // Set to use second device. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_1_bfc", result.second->Name()); @@ -142,15 +132,16 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { engine_info.device = "/GPU:3"; { // Set to use nonexistent device. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(-1, result.first); EXPECT_EQ(nullptr, result.second); } } -class ConvertAfterShapesTest : public ::testing::Test { +class ConvertGraphTest : public ::testing::Test { public: - Status RunConvertAfterShape(Scope s, GraphDef* output_graph_def) { + Status RunConvertGraph(Scope s, GraphDef* output_graph_def, + int maximum_batch_size = 1000) { // Create GraphProperties. grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); @@ -158,21 +149,19 @@ class ConvertAfterShapesTest : public ::testing::Test { TF_EXPECT_OK(graph_properties.InferStatically(true)); // Construct ConversionParams. - const std::vector output_names{"output"}; - ConversionParams params; - params.input_graph_def = &item.graph; - params.output_names = &output_names; + const std::vector input_output_names{"output"}; + TRTOptimizationPass::ConversionParams params; + params.max_batch_size = maximum_batch_size; params.max_workspace_size_bytes = 8 << 20; - params.output_graph_def = output_graph_def; params.minimum_segment_size = 1; - params.graph_properties = &graph_properties; params.use_calibration = false; - - return ConvertAfterShapes(params); + params.trt_logger_name = "DefaultLogger"; + return ConvertGraph(params, item, input_output_names, nullptr, + output_graph_def); } }; -TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { +TEST_F(ConvertGraphTest, DirectlyConnectedEngines) { // Create the graph. There will be two TRTEngineOps after the conversion, and // the upstream TRTEngineOp will have two output connections from the same // node:port inside the op to the downstream TRTEngineOp. 
Then, if it adds the @@ -200,17 +189,24 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { ops::Identity(s.WithOpName("output"), add3); GraphDef output_graph_def; - TF_EXPECT_OK(RunConvertAfterShape(s, &output_graph_def)); + TF_EXPECT_OK(RunConvertGraph(s, &output_graph_def)); + auto remove_graph_sequence_number = [](std::string node_name) { + const std::regex pattern("TRTEngineOp_[0-9]+_"); + return std::regex_replace(node_name, pattern, "TRTEngineOp_"); + }; int num_trt_ops = 0; for (const NodeDef& node : output_graph_def.node()) { - if (node.name() == "TRTEngineOp_1") { + std::string node_name = node.name(); + if (node.op() != "TRTEngineOp") continue; + node_name = remove_graph_sequence_number(node_name); + if (node_name == "TRTEngineOp_001") { EXPECT_EQ(1, node.input_size()); EXPECT_EQ("input", node.input(0)); ++num_trt_ops; - } else if (node.name() == "TRTEngineOp_0") { + } else if (node_name == "TRTEngineOp_000") { EXPECT_EQ(2, node.input_size()); - EXPECT_EQ("TRTEngineOp_1", node.input(0)); + EXPECT_EQ("TRTEngineOp_001", remove_graph_sequence_number(node.input(0))); EXPECT_EQ("reshape2", node.input(1)); ++num_trt_ops; } @@ -222,5 +218,4 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 459136d3eef..22799c00888 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include +#include #include #include #include @@ -25,21 +26,37 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" +#include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" +#include "tensorflow/compiler/tf2tensorrt/convert/timing_cache.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/optimizers/constant_folding.h" +#include "tensorflow/core/grappler/optimizers/generic_layout_optimizer.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -49,11 +66,14 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tensor_coding.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/strided_slice_op.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" #include "third_party/tensorrt/NvInferPlugin.h" @@ -61,33 +81,97 @@ limitations under the License. // would work! #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2) -#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ - do { \ - return errors::Internal("TFTRT::", __FUNCTION__, ":", __LINE__, \ - " failed to add TRT layer, at: ", node); \ - } while (0) +#define TFTRT_CHECK_INPUT_SIZE(size, exp_size, node_def) \ + if ((size) != (exp_size)) { \ + TFTRT_ERROR(errors::InvalidArgument, node_def.op(), " got ", (size), \ + " inputs but expected ", (exp_size)); \ + } -#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \ - do { \ - if (ptr == nullptr) { \ - TFTRT_INTERNAL_ERROR_AT_NODE(node); \ - } \ - } while (0) +// Max kernel volume copied from TRT's limits. 
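+// For an argument x this evaluates to int64_t(pow(100000, x / 2)), e.g.
+// 100000 for x == 2 and roughly 3.16e7 for x == 3.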
+#define MAX_KERNEL_DIMS_PRODUCT(x) (int64_t(std::pow(100000.0F, (x)*0.5F))) namespace tensorflow { namespace tensorrt { namespace convert { -bool IsEngineInput(absl::string_view name) { - return absl::StartsWith(name, IONamePrefixes::kInputPHName); -} -bool IsEngineOutput(absl::string_view name) { - return absl::StartsWith(name, IONamePrefixes::kOutputPHName); -} - using absl::StrAppend; using absl::StrCat; +namespace { + +#define ADD_LAYER(layer_name) \ + case nvinfer1::LayerType::k##layer_name: \ + return #layer_name; + +const char* LayerTypeToString(nvinfer1::LayerType layer_type) { + switch (layer_type) { + ADD_LAYER(CONVOLUTION) + ADD_LAYER(FULLY_CONNECTED) + ADD_LAYER(ACTIVATION) + ADD_LAYER(POOLING) + ADD_LAYER(LRN) + ADD_LAYER(SCALE) + ADD_LAYER(SOFTMAX) + ADD_LAYER(DECONVOLUTION) + ADD_LAYER(CONCATENATION) + ADD_LAYER(ELEMENTWISE) + ADD_LAYER(PLUGIN) + ADD_LAYER(UNARY) + ADD_LAYER(PADDING) + ADD_LAYER(SHUFFLE) + ADD_LAYER(REDUCE) + ADD_LAYER(TOPK) + ADD_LAYER(GATHER) +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + ADD_LAYER(GRID_SAMPLE) +#endif + ADD_LAYER(MATRIX_MULTIPLY) + ADD_LAYER(RAGGED_SOFTMAX) + ADD_LAYER(CONSTANT) + ADD_LAYER(RNN_V2) + ADD_LAYER(IDENTITY) + ADD_LAYER(PLUGIN_V2) + ADD_LAYER(SLICE) + ADD_LAYER(SHAPE) + ADD_LAYER(PARAMETRIC_RELU) + ADD_LAYER(RESIZE) + ADD_LAYER(TRIP_LIMIT) + ADD_LAYER(RECURRENCE) + ADD_LAYER(ITERATOR) + ADD_LAYER(LOOP_OUTPUT) + ADD_LAYER(SELECT) + ADD_LAYER(FILL) +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + ADD_LAYER(QUANTIZE) + ADD_LAYER(DEQUANTIZE) +#endif +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + ADD_LAYER(CONDITION) + ADD_LAYER(CONDITIONAL_INPUT) + ADD_LAYER(CONDITIONAL_OUTPUT) + ADD_LAYER(SCATTER) + ADD_LAYER(EINSUM) + ADD_LAYER(ASSERTION) +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + ADD_LAYER(ONE_HOT) + ADD_LAYER(NON_ZERO) + ADD_LAYER(NMS) +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + ADD_LAYER(REVERSE_SEQUENCE) +#endif +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + // The TRT IRNNv2Layer has been deprecated in favor of the loop API. + ADD_LAYER(RNN) +#endif + default: + return "UNKNOWN_LAYER"; + } +} + +#undef ADD_LAYER + inline Status TfDataTypeToTrt(DataType tf_dtype, nvinfer1::DataType* trt_dtype) { switch (tf_dtype) { @@ -126,6 +210,62 @@ inline Status TrtDataTypeToTf(nvinfer1::DataType trt_dtype, return Status::OK(); } +// Sets the ILayer name in the form of +// /:. +void SetLayerNameHelper(nvinfer1::ILayer* layer, absl::string_view engine_name, + absl::string_view tf_name) { + const char* trt_name = LayerTypeToString(layer->getType()); + layer->setName( + absl::StrCat(engine_name, "/", tf_name, ":", trt_name).c_str()); +} + +// Returns a string in the form of . 
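+// For example, a sub_op_name of "shuffle" with sub_op_instance 2 yields
+// "shuffle_2"; when no instance is given, sub_op_name is returned unchanged.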
+std::string GetLayerNameSuffix(absl::string_view sub_op_name, + absl::optional sub_op_instance) { + std::string op_suffix(sub_op_name); + if (sub_op_instance.has_value()) { + op_suffix = + absl::StrCat(op_suffix, "_", std::to_string(sub_op_instance.value())); + } + return op_suffix; +} + +} // namespace + +bool IsEngineInput(absl::string_view name) { + return absl::StartsWith(name, IONamePrefixes::kInputPHName); +} +bool IsEngineOutput(absl::string_view name) { + return absl::StartsWith(name, IONamePrefixes::kOutputPHName); +} + +void GetOutputProperties(const grappler::GraphProperties& graph_properties, + const Node* node, const int out_port, + PartialTensorShape* shape, DataType* dtype) { + if (graph_properties.HasOutputProperties(node->name())) { + auto output_params = graph_properties.GetOutputProperties(node->name()); + auto out_shape = output_params.at(out_port); + *dtype = out_shape.dtype(); + *shape = out_shape.shape(); + } else { + LOG(INFO) << "Unknown output shape at node: " << node->name(); + *dtype = node->output_type(out_port); + } +} + +void GetInputProperties(const grappler::GraphProperties& graph_properties, + const Node* node, const int in_port, + PartialTensorShape* shape, DataType* dtype) { + if (graph_properties.HasInputProperties(node->name())) { + auto input_params = graph_properties.GetInputProperties(node->name()); + auto in_shape = input_params.at(in_port); + *dtype = in_shape.dtype(); + *shape = in_shape.shape(); + } else { + *dtype = node->input_type(in_port); + } +} + class TFAttrs { public: explicit TFAttrs(const NodeDef& tf_node) { @@ -220,71 +360,53 @@ Status TensorShapeArrayToTrtDims(const Container& shape, nvinfer1::Dims* out, return Status::OK(); } -// TODO(laigd): use this utility function in more places. -Status RemoveBatchDimension(nvinfer1::Dims* dims) { - if (dims->nbDims < 2) { - return errors::InvalidArgument( - "Dropping batch dimension requires dims with rank>=2."); - } - std::copy(dims->d + 1, dims->d + dims->nbDims, dims->d); - dims->nbDims--; - return Status::OK(); -} - -void GetOutputProperties(const grappler::GraphProperties& graph_properties, - const Node* node, const int out_port, - PartialTensorShape* shape, DataType* dtype) { - if (graph_properties.HasOutputProperties(node->name())) { - auto output_params = graph_properties.GetOutputProperties(node->name()); - auto out_shape = output_params.at(out_port); - *dtype = out_shape.dtype(); - *shape = out_shape.shape(); - } else { - LOG(INFO) << "Unknown output shape" << node->name(); - *dtype = node->output_type(out_port); - } -} - -void GetInputProperties(const grappler::GraphProperties& graph_properties, - const Node* node, const int in_port, - PartialTensorShape* shape, DataType* dtype) { - if (graph_properties.HasInputProperties(node->name())) { - auto input_params = graph_properties.GetInputProperties(node->name()); - auto in_shape = input_params.at(in_port); - *dtype = in_shape.dtype(); - *shape = in_shape.shape(); - } else { - *dtype = node->input_type(in_port); - } -} - +// This function checks if a tensor is compatible with TRT. +// +// We check that the shape and datatype are compatible with TensorRT. We also +// return the corresponding trt_dtype, the trt_dims and the batch_size (latter +// is only needed in implicit batch mode). +// +// The return status indicates wether the tensor is compatible. +// +// For implicit batch mode, when validation_only == false, we also check that +// all input dimensions (besides the batch dimension) are known dimensions. 
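+//
+// For example, in implicit batch mode a TF tensor of shape [8, 224, 224, 3]
+// yields trt_dims (224, 224, 3) with batch_size 8, while in explicit batch
+// mode the full shape, including any unknown (-1) dimensions, becomes
+// trt_dims.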
Status ValidateTensorProperties(const string& producer_node_type, const DataType dtype, const PartialTensorShape& shape, + const bool use_implicit_batch, bool validation_only, nvinfer1::DataType* trt_dtype, nvinfer1::Dims* trt_dims, int* batch_size) { // Convert data type. - TF_RETURN_IF_ERROR(TfDataTypeToTrt(dtype, trt_dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, trt_dtype)); // Convert shape. if (shape.dims() < 0) { return errors::InvalidArgument("Input tensor rank is unknown."); } - if (shape.dims() > nvinfer1::Dims::MAX_DIMS + 1) { // +1 for batch dim - return errors::OutOfRange("Input tensor rank is greater than ", - nvinfer1::Dims::MAX_DIMS + 1); + // Add 1 to maximum rank for implicit batch dim. + const int max_rank = nvinfer1::Dims::MAX_DIMS + (use_implicit_batch ? 1 : 0); + if (shape.dims() > max_rank) { + return errors::OutOfRange("Input tensor rank is greater than ", max_rank); } - if (producer_node_type != "Const" && shape.dims() < 1) { + if (use_implicit_batch && (producer_node_type != "Const") && + (shape.dims() < 1)) { return errors::InvalidArgument( "Scalar input tensor is not supported since the first dimension " "is treated as batch dimension by TRT"); } - *trt_dims = TensorShapeToTrtDims(shape, /*ignore_first_dim=*/true); - *batch_size = shape.dim_size(0); + ::stream_executor::port::StatusOr dims = + DimsAdapter::Create(shape, use_implicit_batch); + TRT_ENSURE_OK(dims); + *trt_dims = dims.ValueOrDie().AsTrtDims(); + // Get batch size for tensor if it will not be included the shape. + if (use_implicit_batch) { + *batch_size = shape.dim_size(0); + } // Don't convert empty tensors (dim value of 0). - for (int d = 1; d < shape.dims(); ++d) { + const int first_trt_dim = use_implicit_batch ? 1 : 0; + for (int d = first_trt_dim; d < shape.dims(); ++d) { if (shape.dim_size(d) == 0) { return errors::Unimplemented( "Input tensor with shape ", shape.DebugString(), @@ -293,69 +415,24 @@ Status ValidateTensorProperties(const string& producer_node_type, } if (validation_only) return Status::OK(); - // Following are validations at runtime. - for (int d = 1; d < shape.dims(); ++d) { - if (shape.dim_size(d) < 0) { - return errors::InvalidArgument( - "Input tensor with shape ", shape.DebugString(), - " has an unknown non-batch dimension at dim ", d); + // Following checks are only used during TRT engine creation time. 
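+  // Unknown non-batch dimensions are only rejected here in implicit batch
+  // mode; in explicit batch mode they are allowed and handled later by the
+  // shape optimization profiles.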
+ if (use_implicit_batch) { + for (int d = first_trt_dim; d < shape.dims(); ++d) { + if (shape.dim_size(d) < 0) { + return errors::InvalidArgument( + "Input tensor with shape ", shape.DebugString(), + " has an unknown non-batch dimension at dim ", d); + } } } return Status::OK(); } -string DebugString(const nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return "kFLOAT"; - case nvinfer1::DataType::kHALF: - return "kHALF"; - case nvinfer1::DataType::kINT8: - return "kINT8"; - case nvinfer1::DataType::kINT32: - return "kINT32"; - default: - return "Invalid TRT data type"; - } -} - -string DebugString(const nvinfer1::Dims& dims) { - string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d="); - for (int i = 0; i < dims.nbDims; ++i) { - StrAppend(&out, dims.d[i]); - StrAppend(&out, ","); - } - StrAppend(&out, ")"); - return out; -} - -string DebugString(const nvinfer1::Permutation& permutation, int len) { - string out = "nvinfer1::Permutation("; - for (int i = 0; i < len; ++i) { - StrAppend(&out, permutation.order[i], ","); - } - StrAppend(&out, ")"); - return out; -} - -string DebugString(const ITensorProxyPtr& tensor) { - return StrCat("nvinfer1::ITensor(@", reinterpret_cast(tensor->trt_tensor()), - ", name=", tensor->trt_tensor()->getName(), - ", dtype=", DebugString(tensor->trt_tensor()->getType()), - ", dims=", DebugString(tensor->trt_tensor()->getDimensions()), ")"); -} - -string DebugString(const nvinfer1::ITensor& tensor) { - return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", name=", tensor.getName(), - ", dtype=", DebugString(tensor.getType()), - ", dims=", DebugString(tensor.getDimensions()), ")"); -} - Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, const bool check_feasibility, + const bool use_implicit_batch, nvinfer1::Dims* operand_l_new_dims, nvinfer1::Dims* operand_r_new_dims) { // TensorRT Elementwise op supports broadcast but requires both tensor to be @@ -382,19 +459,24 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, // -> W: 1 1 1 1 3 5 1 // *************************************************************************** if (!operand_l.is_tensor() && !operand_r.is_tensor()) { + // TODO(lsugy): remove this check in dynamic shapes mode. This should work + // if both inputs are weights. 
return errors::InvalidArgument( "Broadcasting requires at least one of the operands be tensors"); } - const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; - auto compute_output_dims = [](const TRT_TensorOrWeights& input, - int broadcast_num_dims, int* output_dims_array, - nvinfer1::Dims* output_dims) { + constexpr int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; + auto compute_output_dims = + [use_implicit_batch](const TRT_TensorOrWeights& input, + int broadcast_num_dims, + std::array* output_dims_array, + nvinfer1::Dims* output_dims) -> Status { const nvinfer1::Dims input_dims = input.GetTrtDims(); - std::fill(output_dims_array, output_dims_array + max_nb_dims, 1); - std::copy(input_dims.d, input_dims.d + input_dims.nbDims, - output_dims_array + broadcast_num_dims - input_dims.nbDims); - if (input.is_tensor()) { + absl::c_fill(*output_dims_array, 1); + absl::c_copy( + DimsAdapter(input_dims), + output_dims_array->begin() + broadcast_num_dims - input_dims.nbDims); + if (use_implicit_batch && input.is_tensor()) { const int true_input_dims = input_dims.nbDims + 1; if (true_input_dims < broadcast_num_dims) { return errors::InvalidArgument( @@ -404,28 +486,44 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, } // Set the batch dimension to -1, since batch size is not supposed to // be broadcasted. - output_dims_array[0] = -1; + (*output_dims_array)[0] = -1; } - // Copy to output dimensions (stripping the batch dimension). - output_dims->nbDims = broadcast_num_dims - 1; - std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims, - output_dims->d); + // Copy to output dimensions + auto offt = use_implicit_batch ? 1 : 0; + output_dims->nbDims = broadcast_num_dims - offt; + absl::c_copy( + absl::MakeSpan(*output_dims_array).subspan(offt, broadcast_num_dims), + output_dims->d); return Status::OK(); }; // Compute the output dimensions. const int broadcast_num_dims = - std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0), - operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 1 : 0)); - int output_l[max_nb_dims], output_r[max_nb_dims]; + std::max(operand_l.GetTrtDims().nbDims + + (use_implicit_batch && operand_l.is_tensor()), + operand_r.GetTrtDims().nbDims + + (use_implicit_batch && operand_r.is_tensor())); + std::array output_l, output_r; TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims, - output_l, operand_l_new_dims)); + &output_l, operand_l_new_dims)); TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims, - output_r, operand_r_new_dims)); + &output_r, operand_r_new_dims)); // Compare broadcast feasibility if (check_feasibility) { for (int i = 0; i < broadcast_num_dims; ++i) { + if (!use_implicit_batch && (output_l[i] == -1 || output_r[i] == -1)) { + // If the condition is true then we are in explicit batch mode and (at + // least) one of the input dimensions are unknown. In other words we + // are in dynamic shape mode. During conversion time we only see -1 for + // the unknown shapes, therefore we cannot decide on the feasibility of + // broadcast over the unknown dimensions. Therefore we just continue for + // the next dimension. In dynamic shape mode TRT can only check the + // feasibility of the broadcast when the actual input dimensions are + // specified by SetTrtEngineInputs and the inference job is launched by + // TrtEnque. 
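+        // For example, dims (-1, 3) vs. (5, 3) pass this check during
+        // conversion; an incompatible runtime shape such as (4, 3) vs. (5, 3)
+        // is only reported when the engine is executed.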
+ continue; + } if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { return errors::InvalidArgument("Infeasible broadcast scheme (", @@ -439,32 +537,141 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, return Status::OK(); } -ITensorProxyPtr Converter::CreateConstantLayer( - const TRT_ShapedWeights& weights, const nvinfer1::Dims& dims) { +// Prepares a dynamic shape tensor for broadcast by adding leading 1 dimensions. +Status DynamicBroadcast(ITensorProxyPtr operand, + const OpConverterParams* params, + ITensorProxyPtr* output, int broadcasted_nbDims, + absl::optional op_instance) { + int operand_nbDims = operand->getDimensions().nbDims; + if (broadcasted_nbDims > operand_nbDims) { + if (params->validation_only) return Status::OK(); + int n_extra_dims = broadcasted_nbDims - operand_nbDims; + VLOG(2) << "Dynamic broadcast adding " << n_extra_dims << " leading 1s"; + TF_RETURN_IF_ERROR(params->converter->DynamicReshape( + /*input=*/operand, + /*slices=*/{std::make_pair(0, operand_nbDims)}, + /*params=*/params, + /*output=*/output, + /*size_for_added_dims*/ {n_extra_dims}, + /*op_instance=*/op_instance)); + } else { + *output = operand; + } + return Status::OK(); +} + +Status BroadcastWeights(std::unique_ptr& p, + const DimsAdapter& broadcasted_dims) { + if (!p->is_weights()) return errors::Internal("Weight input expected"); + if (p->GetTrtDims().nbDims != broadcasted_dims.NumDims()) { + TRT_ShapedWeights weights(p->weights()); + TF_RETURN_IF_ERROR(weights.SetShape(broadcasted_dims)); + p = std::make_unique(weights); + } + return Status::OK(); +} + +Status ApplyBroadcast(std::unique_ptr& operand, + const DimsAdapter& broadcasted_dims, + const OpConverterParams* params, + absl::optional op_instance) { + if (operand->is_weights()) { + TF_RETURN_IF_ERROR(BroadcastWeights(operand, broadcasted_dims)); + } else { + ITensorProxyPtr tensor = nullptr; + auto is_static_shuffle_compatible = [](const auto& dims) { + return absl::c_count(dims, -1) <= 1; + }; + if (is_static_shuffle_compatible(broadcasted_dims)) { + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, *operand, broadcasted_dims, + params->validation_only, &tensor, params->node_def)); + } else { + TF_RETURN_IF_ERROR(DynamicBroadcast( + /*operand=*/operand->tensor(), + /*params=*/params, + /*output=*/&tensor, + /*broadcasted_nbDims*/ broadcasted_dims.NumDims(), + /*op_instance=*/op_instance)); + } + operand = std::make_unique(tensor); + } + return Status::OK(); +} + +// Inserts leading 1 dimensions so that both operands have the same rank. +// Note: In implicit batch mode, weights' shape can include an explicit 1 batch +// dimension. The broadcasted shape might loose this leading batch dim, because +// the broadcasted shape does not include the implicit batch dim. +// TODO(tfeher): Other code blocks that use GetTrtBroadcastShape need to be +// fixed to use this routine to handle dynamic inputs. Eventually, +// GetTrtBroadcastShape should only be used by this routine. 
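+//
+// For example, in explicit batch mode operands with dims (2, 3, 4) and (4)
+// are broadcast by padding the second operand to (1, 1, 4) so that both have
+// rank 3.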
+Status BroadcastTensors(std::unique_ptr& operand_l, + std::unique_ptr& operand_r, + bool check_feasibility, + const OpConverterParams* params) { + nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; + TF_RETURN_IF_ERROR(GetTrtBroadcastShape( + *operand_l, *operand_r, check_feasibility, params->use_implicit_batch, + &broadcasted_dims_l, &broadcasted_dims_r)); + + if (params->validation_only) return Status::OK(); + + TF_RETURN_IF_ERROR(ApplyBroadcast( + /*operand=*/operand_l, + /*broadcasted_dims=*/broadcasted_dims_l, + /*params=*/params, + /*op_instance=*/0)); + + TF_RETURN_IF_ERROR(ApplyBroadcast( + /*operand=*/operand_r, + /*broadcasted_dims=*/broadcasted_dims_r, + /*params=*/params, + /*op_instance=*/1)); + + return Status::OK(); +} + +ITensorProxyPtr Converter::CreateConstantLayer(const TRT_ShapedWeights& weights, + const nvinfer1::Dims& dims) { nvinfer1::Weights trt_weights = weights.GetTrtWeights(); nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights); if (!layer) return nullptr; + SetLayerName(layer, "_tftrt_constant_", + std::to_string(next_constant_layer_id_)); + next_constant_layer_id_++; ITensorProxyPtr trt_tensor = layer->getOutput(0); -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set - // the data type below, it will always be kFLOAT regardless what the data type - // of the weights is. Once NVIDIA fixes this bug, we should remove the data - // type setting logic below and test should still pass. - trt_tensor->setType(trt_weights.type); -#endif return trt_tensor; } -Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value, +// Creates a scalar constant and fills with value. +template +Status CreateScalarConstant( + const OpConverterParams* params, T value, ITensorProxyPtr* tensor, + nvinfer1::DataType trt_type = nvinfer1::DataType::kINT32, + const nvinfer1::Dims& dims = {1, {1}}) { + ::stream_executor::port::StatusOr weights = + params->weight_store->GetTempWeights(trt_type, dims); + TRT_ENSURE_OK(weights); + TF_RETURN_IF_ERROR(weights.ValueOrDie().SetValues(value)); + *tensor = params->converter->CreateConstantLayer(weights.ValueOrDie(), dims); + TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name()); + return Status::OK(); +} + +// Creates a constant with the same rank as dims, where each dimension has +// size = 1. +Status CreateBroadcastableScalarConstant(const OpConverterParams* params, + float value, const nvinfer1::Dims& dims, ITensorProxyPtr* tensor, const char* dtype_attr_name = "T") { - nvinfer1::DataType trt_dtype = - nvinfer1::DataType::kFLOAT; // Default to FP32. - TFAttrs attrs(params->node_def); - if (attrs.count(dtype_attr_name)) { - DataType dtype = attrs.get(dtype_attr_name); - TF_RETURN_IF_ERROR(TfDataTypeToTrt(dtype, &trt_dtype)); + nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT; // Default to FP32. + AttrSlice attrs(params->node_def); + if (attrs.Find(dtype_attr_name) != nullptr) { + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, dtype_attr_name, &dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, &trt_type)); } // In order to be broadcastable, the number of dims has to match. 
@@ -472,24 +679,29 @@ Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value, for (int i = 0; i < broadcastable_dims.nbDims; i++) { broadcastable_dims.d[i] = 1; } - TRT_ShapedWeights weights = - params->weight_store->GetTempWeights(trt_dtype, broadcastable_dims); - void* raw_ptr = weights.GetValues(); - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - static_cast(raw_ptr)[0] = value; - break; - case nvinfer1::DataType::kHALF: - static_cast(raw_ptr)[0] = Eigen::half(value); - break; - default: - return errors::InvalidArgument("Unsupported data type ", - DebugString(trt_dtype)); + return CreateScalarConstant(params, value, tensor, trt_type, + broadcastable_dims); +} + +// The function concatenates tensors on the first axis. This can be used to +// create a shape tensor from individual dimension sizes. +::stream_executor::port::StatusOr ConcatenateTensors( + const OpConverterParams* params, + const std::vector input_tensors, + absl::optional op_instance = absl::nullopt) { + std::vector trt_input_tensors; + for (const auto& t : input_tensors) { + trt_input_tensors.push_back(t->trt_tensor()); } - *tensor = params->converter->CreateConstantLayer(weights, broadcastable_dims); - TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name()); - params->converter->ProvideQuantizationRange(tensor, value, value); - return Status::OK(); + nvinfer1::IConcatenationLayer* layer = + params->converter->network()->addConcatenation( + static_cast(trt_input_tensors.data()), + input_tensors.size()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.op()); + params->converter->SetLayerName(layer, params->node_def.name(), + "concat_shapes", op_instance); + layer->setAxis(0); + return ITensorProxyPtr(layer->getOutput(0)); } // Convert an axis from TF format to TRT format while validating. TF format @@ -509,27 +721,13 @@ Status ConvertAxis(int tf_axis, int trt_nb_dims, absl::string_view node_name, // Don't allow axis to be the batch dimension. if (use_implicit_batch && tf_axis == 0) { return errors::Unimplemented( - "TensorRT does not allow manipulation of the batch dimension, at ", - node_name); + "TensorRT does not allow manipulation of the batch dimension"); } // Remove batch dimension if it is implicit. *trt_axis = use_implicit_batch ? tf_axis - 1 : tf_axis; return Status::OK(); } -inline bool DimsEqual(const nvinfer1::Dims& dim_l, - const nvinfer1::Dims& dim_r) { - if (dim_l.nbDims != dim_r.nbDims) { - return false; - } - for (int i = 0; i < dim_l.nbDims; i++) { - if (dim_l.d[i] != dim_r.d[i]) { - return false; - } - } - return true; -} - bool AllLengthsEqual(const std::vector>& inputs) { if (inputs.size() == 0) return true; int length = inputs.at(0).size(); @@ -539,69 +737,21 @@ bool AllLengthsEqual(const std::vector>& inputs) { return true; } -inline nvinfer1::Dims GetTrtDimsForTensor(const Tensor& tensor) { - nvinfer1::Dims dims; - dims.nbDims = tensor.dims(); - for (int i = 0; i < dims.nbDims; i++) { - dims.d[i] = tensor.dim_size(i); - } - return dims; -} - -inline bool HasStaticShape(const nvinfer1::Dims& dims) { - if (dims.nbDims < 0) return false; - for (int d = 0; d < dims.nbDims; ++d) { - if (dims.d[d] < 0) return false; - } - return true; -} - -int64_t Prod(const nvinfer1::Dims& dims) { - int64_t count = 1; - for (int d = 0; d < dims.nbDims; ++d) { - count *= dims.d[d]; - } - return count; -} - -// Returns total number of elements in a TensorRT weights dimensions. 
-// Returning 0 means either some dim is 0 or the number of dims is 0 (TensorRT -// doesn't allow scalar weights). -// Note that for TF scalar constant, we always convert to dims [1]. -int64_t TrtWeightDimsNumElements(const nvinfer1::Dims& dims) { - if (dims.nbDims == 0) return 0; - return Prod(dims); -} - -// Returns total number of elements in an ITensor dimension. -// Returns 1 if the number of dims is 0 (the total number is fully determined by -// the batch size). -// Returns -1 if any dimension is known. -int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims) { - if (!HasStaticShape(dims)) return -1; - return Prod(dims); -} - -bool DimsHaveSameSize(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs, - bool is_tensor) { - if (is_tensor) { - return TrtTensorDimsNumElements(lhs) == TrtTensorDimsNumElements(rhs); - } - return TrtWeightDimsNumElements(lhs) == TrtWeightDimsNumElements(rhs); +bool DimsHaveSameSize(const DimsAdapter& lhs, const DimsAdapter& rhs) { + return lhs.Volume() == rhs.Volume(); } // Returns whether both dimensions are fully specified and the total number of // elements equals. -bool AreDimsStaticWithSameSize(const nvinfer1::Dims& lhs, - const nvinfer1::Dims& rhs, bool is_tensor) { - if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false; - return DimsHaveSameSize(lhs, rhs, is_tensor); +bool AreDimsStaticWithSameSize(const DimsAdapter& lhs, const DimsAdapter& rhs) { + if (!lhs.IsStatic() || !rhs.IsStatic()) return false; + return DimsHaveSameSize(lhs, rhs); } -bool AreDimsStaticWithDifferentSize(const nvinfer1::Dims& lhs, - const nvinfer1::Dims& rhs, bool is_tensor) { - if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false; - return !DimsHaveSameSize(lhs, rhs, is_tensor); +bool AreDimsStaticWithDifferentSize(const DimsAdapter& lhs, + const DimsAdapter& rhs) { + if (!lhs.IsStatic() || !rhs.IsStatic()) return false; + return !DimsHaveSameSize(lhs, rhs); } static std::vector> CreateSamePadding( @@ -653,6 +803,8 @@ Status VerifyShapesMatch(absl::Span inputs, "Received inputs with inconsistent rank, at ", node_name); } for (size_t j = 0; j < dims_0.nbDims; ++j) { + // Dynamic dimensions will be verified at runtime. 
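+      // For example, dims (-1, 28) and (4, 28) are accepted here; a genuine
+      // mismatch in the first dimension only surfaces when the engine runs.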
+ if (dim_i.d[j] == -1 || dims_0.d[j] == -1) continue; if (dim_i.d[j] != dims_0.d[j] && j != masked_dim) { return errors::InvalidArgument( "Received inputs with inconsistent shape, at ", node_name); @@ -662,115 +814,6 @@ Status VerifyShapesMatch(absl::Span inputs, return Status::OK(); } -TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type) : type_(type) { - shape_.nbDims = 0; -} - -TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type, - nvinfer1::Dims dims, Tensor tensor) - : shape_(dims), type_(type), tensor_(tensor) {} - -TRT_ShapedWeights::TRT_ShapedWeights(const TRT_ShapedWeights& rhs) - : shape_(rhs.shape_), type_(rhs.type_), tensor_(rhs.tensor_) {} - -int64_t TRT_ShapedWeights::count() const { - return TrtWeightDimsNumElements(shape_); -} - -nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const { - return nvinfer1::Weights{type_, GetValues(), count()}; -} - -size_t TRT_ShapedWeights::size_bytes() const { - size_t data_type_size = -1; - switch (type_) { - case nvinfer1::DataType::kFLOAT: - case nvinfer1::DataType::kINT32: - data_type_size = 4; - break; - case nvinfer1::DataType::kHALF: - data_type_size = 2; - break; - case nvinfer1::DataType::kINT8: - data_type_size = 1; - break; - } - return this->count() * data_type_size; -} - -string TRT_ShapedWeights::DebugString() const { - return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", convert::DebugString(type_), - ", values=", reinterpret_cast(GetValues()), ")"); -} - -TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor) - : tensor_proxy_ptr_(tensor), initialized_(true), is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size) - : tensor_proxy_ptr_(tensor), - batch_size_(batch_size), - initialized_(true), - is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor, - int batch_size) - : tensor_proxy_ptr_(tensor), - batch_size_(batch_size), - initialized_(true), - is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, - const nvinfer1::Dims& trt_dims, - int batch_size) - : tensor_proxy_ptr_(new SimpleITensor(trt_dtype, trt_dims)), - batch_size_(batch_size), - initialized_(true), - is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights) - : weights_(weights), initialized_(true), is_tensor_(false) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs) - : tensor_proxy_ptr_(rhs.tensor_proxy_ptr_), - batch_size_(rhs.batch_size_), - weights_(rhs.weights_), - initialized_(rhs.initialized_), - is_tensor_(rhs.is_tensor_) {} - -void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) { - tensor_proxy_ptr_ = rhs.tensor_proxy_ptr_; - batch_size_ = rhs.batch_size_; - weights_ = rhs.weights_; - initialized_ = rhs.initialized_; - is_tensor_ = rhs.is_tensor_; -} - -ITensorProxyPtr TRT_TensorOrWeights::tensor() const { - CHECK(is_tensor()); - return tensor_proxy_ptr_; -} - -nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { - if (is_tensor()) { - return tensor()->getDimensions(); - } else { - return weights().shape_; - } -} - -string TRT_TensorOrWeights::DebugString() const { - string output = "TRT_TensorOrWeights(type="; - if (is_tensor()) { - StrAppend(&output, "tensor=", convert::DebugString(tensor()), - ", batch_size=", batch_size_); - } else { - StrAppend(&output, "weights=", weights_.DebugString()); - } - StrAppend(&output, ")"); - return output; -} - // Perform 5 dimensional reorder of 
data on CPU // This is done once at convert time and does not affect GPU inference perf // Example: reorder NDHWC (Tensorflow) -> NCDHW (TensorRT) @@ -830,22 +873,21 @@ void Reorder2(const nvinfer1::DimsHW& shape, const T* idata, // TODO(jie): fallback to tensorflow!! void ReorderCKtoKC(const TRT_ShapedWeights& iweights, TRT_ShapedWeights* oweights) { - const int c = iweights.shape_.d[0]; - const int k = iweights.shape_.d[1]; - oweights->shape_.d[0] = k; - oweights->shape_.d[1] = c; + const int c = iweights.Shape().dim(0); + const int k = iweights.Shape().dim(1); + oweights->Shape().dim(0) = k; + oweights->Shape().dim(1) = c; const nvinfer1::DimsHW istrides = {1, k}; const nvinfer1::DimsHW ostrides = {c, 1}; switch (iweights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), ostrides); + Reorder2({k, c}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } case nvinfer1::DataType::kHALF: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), - ostrides); + Reorder2({k, c}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } default: @@ -860,31 +902,30 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); // K indexes over output channels, C over input channels, and R and S over the // height and width of the convolution - const int r = iweights.shape_.d[0]; - const int s = iweights.shape_.d[1]; + const int r = iweights.Shape().dim(0); + const int s = iweights.Shape().dim(1); // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G - const int c = iweights.shape_.d[2] / num_groups; - const int k = iweights.shape_.d[3] * num_groups; - VLOG(2) << "num_groups: " << num_groups << "c" << iweights.shape_.d[2] - << " then " << c << "k" << iweights.shape_.d[3] << " then " << k - << "r" << iweights.shape_.d[0] << " then " << r << "s" - << iweights.shape_.d[1] << " then " << s; - oweights->shape_.d[0] = k / num_groups; - oweights->shape_.d[1] = c * num_groups; - oweights->shape_.d[2] = r; - oweights->shape_.d[3] = s; + const int c = iweights.Shape().dim(2) / num_groups; + const int k = iweights.Shape().dim(3) * num_groups; + VLOG(2) << "num_groups: " << num_groups << "c" << iweights.Shape().dim(2) + << " then " << c << "k" << iweights.Shape().dim(3) << " then " << k + << "r" << iweights.Shape().dim(0) << " then " << r << "s" + << iweights.Shape().dim(1) << " then " << s; + oweights->Shape().dim(0) = k / num_groups; + oweights->Shape().dim(1) = c * num_groups; + oweights->Shape().dim(2) = r; + oweights->Shape().dim(3) = s; const nvinfer1::Dims4 istrides = {1, k, s * k * c, c * k}; const nvinfer1::Dims4 ostrides = {c * r * s, r * s, s, 1}; switch (iweights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - Reorder4({k, c, r, s}, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), ostrides); + Reorder4({k, c, r, s}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } case nvinfer1::DataType::kHALF: { - Reorder4({k, c, r, s}, - static_cast(iweights.GetValues()), istrides, - static_cast(oweights->GetValues()), ostrides); + Reorder4({k, c, r, s}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } @@ -909,22 +950,22 @@ void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); // K indexes over 
output channels, C over input channels, and R, S, D over the // height, width, depth - const int d = iweights.shape_.d[0]; - const int r = iweights.shape_.d[1]; - const int s = iweights.shape_.d[2]; + const int d = iweights.Shape().dim(0); + const int r = iweights.Shape().dim(1); + const int s = iweights.Shape().dim(2); // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G - const int c = iweights.shape_.d[3] / num_groups; - const int k = iweights.shape_.d[4] * num_groups; + const int c = iweights.Shape().dim(3) / num_groups; + const int k = iweights.Shape().dim(4) * num_groups; - VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[3] - << " becomes " << c << ", k: " << iweights.shape_.d[4] << " becomes " - << k << ", d: " << d << ", r: " << r << ", s: " << s; + VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.Shape().dim(3) + << " becomes " << c << ", k: " << iweights.Shape().dim(4) + << " becomes " << k << ", d: " << d << ", r: " << r << ", s: " << s; - oweights->shape_.d[0] = iweights.shape_.d[4]; // k / num_groups; - oweights->shape_.d[1] = iweights.shape_.d[3]; // c * num_groups; - oweights->shape_.d[2] = d; - oweights->shape_.d[3] = r; - oweights->shape_.d[4] = s; + oweights->Shape().dim(0) = iweights.Shape().dim(4); // k / num_groups; + oweights->Shape().dim(1) = iweights.Shape().dim(3); // c * num_groups; + oweights->Shape().dim(2) = d; + oweights->Shape().dim(3) = r; + oweights->Shape().dim(4) = s; nvinfer1::Dims shape = InitDimsN({k, c, d, r, s}); // KCDRS shape (same as output) @@ -939,14 +980,13 @@ void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, switch (iweights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - Reorder5(shape, static_cast(iweights.GetValues()), istrides, - static_cast(oweights->GetValues()), ostrides); + Reorder5(shape, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } case nvinfer1::DataType::kHALF: { - Reorder5(shape, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), - ostrides); + Reorder5(shape, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } default: @@ -955,31 +995,20 @@ void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, } } -TRT_ShapedWeights TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype, - const nvinfer1::Dims& dims) { - TensorShape shape; - DataType tf_dtype; - // TODO(laigd): make it return a status. - TF_CHECK_OK(TensorShapeUtils::MakeShape(dims.d, dims.nbDims, &shape)); - TF_CHECK_OK(TrtDataTypeToTf(trt_dtype, &tf_dtype)); - // TODO(jie): check weights size_bytes. 
0 means type error - Tensor tensor(tf_dtype, shape); - TRT_ShapedWeights weights(trt_dtype, dims, tensor); - store_.emplace_back(std::move(tensor)); - return weights; -} - OpConverterParams::OpConverterParams( const NodeDef& node_def, const std::vector& inputs, std::vector* outputs, TrtWeightStore* weight_store, - TrtPrecisionMode precision_mode, bool use_calibration) + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision) : node_def(node_def), inputs(inputs), outputs(outputs), validation_only(true), weight_store(weight_store), precision_mode(precision_mode), - use_calibration(use_calibration) {} + use_calibration(use_calibration), + use_implicit_batch(use_implicit_batch), + use_explicit_precision(use_explicit_precision) {} OpConverterParams::OpConverterParams( Converter* converter, const NodeDef& node_def, @@ -992,31 +1021,44 @@ OpConverterParams::OpConverterParams( validation_only(false), weight_store(weight_store), precision_mode(converter->precision_mode()), - use_calibration(converter->use_calibration()) {} - -const std::set* TrtNodeValidator::quantize_ops = new std::set{ - "QuantizeAndDequantizeV2", - "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", - "FakeQuantWithMinMaxArgs", -}; + use_calibration(converter->use_calibration()), + use_implicit_batch(converter->use_implicit_batch()), + use_explicit_precision(converter->UseExplicitPrecision()) {} TrtNodeValidator::TrtNodeValidator( const grappler::GraphProperties& graph_properties, - TrtPrecisionMode precision_mode, bool use_calibration) + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision) : graph_properties_(graph_properties), precision_mode_(precision_mode), - use_calibration_(use_calibration) { - RegisterOpValidators(); + use_calibration_(use_calibration), + use_implicit_batch_(use_implicit_batch), + use_explicit_precision_(use_explicit_precision) {} + +::stream_executor::port::StatusOr TrtNodeValidator::GetValidator( + const std::string& op) { + return GetOpConverterRegistry()->LookUp(op); } Status TrtNodeValidator::ConvertToTensorOrWeights( const NodeDef& node_def, int output_port, TRT_TensorOrWeights* tensor_or_weights) { - if (node_def.op() == "Const") { - if (output_port != 0) { - return errors::InvalidArgument("Const node should only have one output."); + // Treat handles separately. + if (node_def.op() == "VarHandleOp" || node_def.op() == "Placeholder") { + AttrSlice attrs(node_def); + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dtype", &dtype)); + if (dtype == DataType::DT_RESOURCE) { + // The converter doesn't use the input resource at the validation stage + // (it gets the dtype and shape from attributes). A fake resource can be + // used. + ResourceHandle fake_resource; + *tensor_or_weights = TRT_TensorOrWeights(fake_resource); + return Status::OK(); } + } + + if (node_def.op() == "Const" || node_def.op() == "VariableV2") { // The output of the conversion will be used as input to other nodes to // determine whether TRT supports those nodes. If it cannot convert the // Const, it's very likely we cannot treat it as a tensor and make it an @@ -1024,9 +1066,22 @@ Status TrtNodeValidator::ConvertToTensorOrWeights( // treats it as batch size. Also, it's not likely that the converter can // support the op, and performance may suffer even if it can, so we just // simply return error if the conversion fails. 
+ if (output_port != 0) { + return errors::InvalidArgument(node_def.op(), + " node should only have one output."); + } std::vector inputs; return ConvertConstToWeights(node_def, inputs, tensor_or_weights); } + if (node_def.op() == "ReadVariableOp") { + // Similar treatment to Const and VariableV2, but we provide a fake + // resource input to the converter. + const std::vector inputs{ + TRT_TensorOrWeights(ResourceHandle())}; + + // Convert the variable to weights. + return ConvertConstToWeights(node_def, inputs, tensor_or_weights); + } if (!graph_properties_.HasOutputProperties(node_def.name())) { return errors::InvalidArgument("Shape and data type are unknown"); } @@ -1041,8 +1096,8 @@ Status TrtNodeValidator::ConvertToTensorOrWeights( nvinfer1::Dims trt_dims; int batch_size = -1; TF_RETURN_IF_ERROR(ValidateTensorProperties( - node_def.op(), dtype, shape, /*validation_only_=*/true, &trt_dtype, - &trt_dims, &batch_size)); + node_def.op(), dtype, shape, use_implicit_batch_, + /*validation_only_=*/true, &trt_dtype, &trt_dims, &batch_size)); // Adds a fake ITensor. This is fine since op converter operates in // validation-only mode and it won't (and shouldn't) use the tensor to do @@ -1057,11 +1112,12 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { // these ops to the relevant tensors. This happens regardless of the value of // use_calibration. bool is_supported_op = false; - if (quantize_ops->count(op)) { + if (absl::c_find(kQuantizationOpNames, op) != kQuantizationOpNames.end()) { is_supported_op = (precision_mode_ == TrtPrecisionMode::INT8); } else { - is_supported_op = op_validators_.count(op); + is_supported_op = GetValidator(op).ok(); } + if (!is_supported_op) { return errors::Unimplemented("Op type ", op, " is not supported."); } @@ -1072,22 +1128,35 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { std::vector input_edges; TF_RETURN_IF_ERROR(node->input_edges(&input_edges)); for (const Edge* edge : input_edges) { + // Go up the chain of Identity nodes. 
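+    // For example, an input that reaches this node as Const -> Identity ->
+    // Identity is validated against the original Const definition.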
+ Node* src_node = edge->src(); + while (src_node->def().op() == "Identity") { + std::vector input_edges_temp; + TF_RETURN_IF_ERROR(src_node->input_edges(&input_edges_temp)); + src_node = input_edges_temp[0]->src(); + } + const NodeDef& src_def = src_node->def(); + TRT_TensorOrWeights tensor_or_weights; - const NodeDef& src_def = edge->src()->def(); Status status = ConvertToTensorOrWeights(src_def, edge->src_output(), &tensor_or_weights); if (!status.ok()) { + VLOG(2) << "Failed to convert input `" << src_def.name() << "` to a " + << "TRT_TensorOrWeights: " << status.error_message(); + return errors::Internal( - "Failed to convert input ", src_def.name(), - " to a TRT_TensorOrWeights: ", status.error_message()); + "Failed to convert at least one input to a TRT_TensorOrWeights: ", + status.error_message()); } inputs.push_back(tensor_or_weights); } - OpConverter validator = op_validators_[op]; + auto validator = GetValidator(op); + TF_RETURN_IF_ERROR(validator.status()); OpConverterParams params(node->def(), inputs, /*arg_outputs=*/nullptr, - &weight_store_, precision_mode_, use_calibration_); - return validator(¶ms); + &weight_store_, precision_mode_, use_calibration_, + use_implicit_batch_, use_explicit_precision_); + return validator.ValueOrDie()(¶ms); } Status TrtNodeValidator::ConvertConstToWeights( @@ -1096,71 +1165,83 @@ Status TrtNodeValidator::ConvertConstToWeights( TRT_TensorOrWeights* output) { std::vector outputs; OpConverterParams params(const_node_def, inputs, &outputs, &weight_store_, - precision_mode_, use_calibration_); - Status status = op_validators_["Const"](¶ms); - if (status.ok() && output) *output = outputs[0]; + precision_mode_, use_calibration_, + use_implicit_batch_, use_explicit_precision_); + auto const_val = GetValidator(const_node_def.op()); + TF_RETURN_IF_ERROR(const_val.status()); + Status status = const_val.ValueOrDie()(¶ms); + if (status.ok() && (output != nullptr)) { + *output = outputs[0]; + } return status; } -static void InitializeTrtPlugins() { - static mutex plugin_mutex(LINKER_INITIALIZED); - static bool plugin_initialized = false; - static Logger trt_logger; - mutex_lock lock(plugin_mutex); - if (plugin_initialized) return; - - plugin_initialized = initLibNvInferPlugins(&trt_logger, ""); - if (!plugin_initialized) { - LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " - "fail later."; - } - - int num_trt_plugins = 0; - nvinfer1::IPluginCreator* const* trt_plugin_creator_list = - getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); - if (!trt_plugin_creator_list) { - LOG(WARNING) << "Can not find any TensorRT plugins in registry."; - } else { - VLOG(1) << "Found the following " << num_trt_plugins - << " TensorRT plugins in registry:"; - for (int i = 0; i < num_trt_plugins; ++i) { - if (!trt_plugin_creator_list[i]) { - LOG(WARNING) << "TensorRT plugin at index " << i - << " is not accessible (null pointer returned by " - "getPluginCreatorList for this plugin)"; - } else { - VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); - } - } - } -} - -Converter::Converter(nvinfer1::INetworkDefinition* trt_network, - TrtPrecisionMode precision_mode, bool use_calibration) - : trt_network_(trt_network), +// static +::stream_executor::port::StatusOr> Converter::Create( + TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx) { + std::unique_ptr converter = 
absl::WrapUnique(new Converter( + precision_mode, use_calibration, trt_logger, use_implicit_batch, + engine_name, use_explicit_precision, ctx)); + TF_RETURN_IF_ERROR(converter->Init(trt_logger)); + return converter; +} + +Converter::Converter(TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, + const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx) + : ctx_(ctx), precision_mode_(precision_mode), - use_calibration_(use_calibration) { - InitializeTrtPlugins(); - this->RegisterOpConverters(); + use_calibration_(use_calibration), + use_implicit_batch_(use_implicit_batch), + engine_name_(engine_name), + use_explicit_precision_(use_explicit_precision) { + MaybeInitializeTrtPlugins(trt_logger); +} + +Status Converter::Init(nvinfer1::ILogger* trt_logger) { + VLOG(1) << "Creating TensorRT builder"; + trt_builder_.reset(nvinfer1::createInferBuilder(*trt_logger)); + + VLOG(1) << "Creating TensorRT network"; + uint32_t flags = + use_implicit_batch_ + ? 0U + : (1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + if (use_explicit_precision_) { + flags |= + (1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_PRECISION)); + } + trt_network_.reset(trt_builder_->createNetworkV2(flags)); + if (!trt_network_) { + return errors::Internal("Failed to create TensorRT network object"); + } + return Status::OK(); } Status Converter::ConvertNode(const NodeDef& node_def) { - std::vector inputs, outputs; + std::vector inputs; + std::vector outputs; TF_RETURN_IF_ERROR(this->GetInputs(node_def, &inputs)); OpConverterParams params(this, node_def, inputs, &outputs, &weight_store_); const string& op = node_def.op(); - auto itr = op_registry_.find(op); - if (itr == op_registry_.end()) { - return errors::Unimplemented("No converter registered for op: ", op); - } - OpConverter op_converter = itr->second; - TF_RETURN_IF_ERROR(op_converter(¶ms)); + auto op_converter = GetOpConverterRegistry()->LookUp(op); + TF_RETURN_IF_ERROR(op_converter.status()); + TF_RETURN_IF_ERROR(op_converter.ValueOrDie()(¶ms)); for (size_t i = 0; i < outputs.size(); ++i) { TRT_TensorOrWeights& output = outputs[i]; string output_name = node_def.name(); - if (i != 0) absl::StrAppend(&output_name, ":", i); + if (i != 0) { + StrAppend(&output_name, ":", i); + } // We need to check the name before setting it. If the input is one of the // engine input, setting the name here will overwrite engine input // bindings which will cause runtime error. @@ -1182,9 +1263,9 @@ Status Converter::ConvertNode(const NodeDef& node_def) { << output.DebugString(); Status status = AddTensorOrWeights(output_name, output); if (!status.ok()) { - return Status(status.code(), - StrCat("Failed to add output for node ", node_def.name(), - ": ", status.error_message())); + return errors::InvalidArgument( + StrCat("Failed to add output for node: ", node_def.name(), ": ", + status.error_message())); } } return Status::OK(); @@ -1195,10 +1276,13 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, // We verify the batch size only for the input nodes, and rely on individual // op converter to ensure the batch size of the outputs is not changed. // TODO(laigd): we need to test this properties. 
- Status status = MaybeUpdateBatchSize(batch_size); - if (!status.ok()) { - return Status(status.code(), StrCat("Batch size doesn't match for tensor ", - name, ": ", status.error_message())); + Status status; + if (use_implicit_batch_) { + status = MaybeUpdateBatchSize(batch_size); + if (!status.ok()) { + return Status(status.code(), + batch_size_error(name, status.error_message())); + } } ITensorProxyPtr tensor = network()->addInput(name.c_str(), dtype, dims); if (*tensor == nullptr) { @@ -1213,8 +1297,19 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, return Status::OK(); } +Status Converter::AddInputResource(const string& name, + const ResourceHandle& resource) { + Status status = AddTensorOrWeights(name, TRT_TensorOrWeights(resource)); + if (!status.ok()) { + return Status(status.code(), StrCat("Failed to add input resource ", name, + ": ", status.error_message())); + } + return Status::OK(); +} + Status Converter::RenameAndMarkOutputTensors( const std::vector& output_tensors) { + int output_index = 0; for (const auto& output : output_tensors) { TRT_TensorOrWeights tensor_or_weights; TF_RETURN_IF_ERROR( @@ -1240,22 +1335,248 @@ Status Converter::RenameAndMarkOutputTensors( // in ConvertIdentity. if (IsEngineInput(tensor->getName()) || IsEngineOutput(tensor->getName())) { // Using shuffle layer for identity by not setting reshape or transpose. - nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor->trt_tensor()); + nvinfer1::IShuffleLayer* layer = + network()->addShuffle(*tensor->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR( layer, StrCat("Output Copy for ", tensor->getName())); - ITensorProxyPtr output_tensor = layer->getOutput(0); - MarkQuantizationRangesAsInferrable(&tensor, &output_tensor); - tensor = output_tensor; + SetLayerName(layer, tensor->getName(), "shuffle", output_index); + tensor = layer->getOutput(0); } tensor->setName(output.dest_node_name.c_str()); network()->markOutput(*tensor->trt_tensor()); // Set type after marking as output. TRT only supports setType for engine // outputs and inputs (type is inferred otherwise). tensor->setType(output.trt_dtype); + output_index++; VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name << " with data type " << DebugString(output.trt_dtype) << ", which feeds TF node " << output.dest_node_name; } + if (VLOG_IS_ON(2)) { + VLOG(2) << "Created TensorRT network with the following layers:"; + for (int i = 0; i < network()->getNbLayers(); i++) { + auto layer = network()->getLayer(i); + VLOG(2) << " " << layer->getName() << " (" + << "type: " << static_cast(layer->getType()) + << ", precision: " << static_cast(layer->getPrecision()) + << ")"; + } + } + return Status::OK(); +} + +// Returns the value of TF_TRT_ABORT_CUDA_ENGINE_BUILD environment variable. +// This variable can be used to abort CUDA engine construction, therefore it +// provides a way to test and debug the native segment fallback of TF-TRT. 
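+// For example, running with TF_TRT_ABORT_CUDA_ENGINE_BUILD=1 makes
+// BuildCudaEngine return an Aborted status, so execution falls back to the
+// native TF segment.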
+bool AbortCudaEngineBuild() { + bool value; + Status status = ReadBoolFromEnvVar("TF_TRT_ABORT_CUDA_ENGINE_BUILD", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; +} + +Status Converter::BuildCudaEngine( + TrtUniquePtrType* engine, int max_batch_size, + size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, TrtShapeOptimizationProfile* profiles) { + if (AbortCudaEngineBuild()) { + return errors::Aborted( + "Engine creation aborted by TF_TRT_ABORT_CUDA_ENGINE_BUILD variable"); + } + + VLOG(1) << "Configuring TensorRT builder"; + trt_builder_->setMaxBatchSize(max_batch_size); + trt_builder_->setGpuAllocator(allocator); + + // Create a network configuration and use it to build a TRT engine. + TrtUniquePtrType builder_config( + trt_builder_->createBuilderConfig()); + builder_config->setMaxWorkspaceSize(max_workspace_size_bytes); + + // Create the algorithm selector. For TensorRT 7.x, the algorithm selector + // cannot be used when building with INT8 calibration. + std::unique_ptr trt_algorithm_selector{nullptr}; + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + if (!use_calibration_ || precision_mode_ != TrtPrecisionMode::INT8) { + trt_algorithm_selector = MaybeCreateAlgorithmSelector(); + } + } else { + trt_algorithm_selector = MaybeCreateAlgorithmSelector(); + } + + if (trt_algorithm_selector != nullptr) { + builder_config->setAlgorithmSelector(trt_algorithm_selector.get()); + } + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + enum class SparseComputeMode { DISABLED, ENABLED, SIMULATED }; + + static SparseComputeMode sparse_compute_mode = []() { + SparseComputeMode _sparse_compute_mode; + int64 _sparse_mode; + /*TF_TRT_SPARSE_MODE environment variable controls if sparse compute is + enabled. It also allows to simulate the performance benefits of training a + model with sparse compute in mind. + Possible Values: + - 1 [Default]: Sparse compute is enabled if the model was trained with + sparse weights. Otherwise it has no effect. + - < 1: Sparse compute is explicitly disabled regardless on how the model was + trained. + - > 1: Sparse compute is forced. This mode is only to be used for + benchmarking or debugging purpose. This feature artificially introduces a + sparse weight pattern compatible with Sparse TensorCores introduced in + NVIDIA Ampere GPU architecture. As a side effect, it will completely corrupt + the numerical values of the computation. 
Therefore shall only be used to + evaluate the benefit of using sparse computation for inference.*/ + TF_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_TRT_SPARSE_MODE", + /*default_val=*/1, + &_sparse_mode)); + + string sparse_log_msg = "[TF-TRT] Sparse compute capability: "; + if (_sparse_mode == 1) { + sparse_log_msg = StrCat(sparse_log_msg, "enabled."); + _sparse_compute_mode = SparseComputeMode::ENABLED; + } else if (_sparse_mode < 1) { + sparse_log_msg = StrCat(sparse_log_msg, "disabled."); + _sparse_compute_mode = SparseComputeMode::DISABLED; + } else { + sparse_log_msg = StrCat( + sparse_log_msg, "simulated.", + "It shall only be used for sparse computing benchmark and debug."); + _sparse_compute_mode = SparseComputeMode::SIMULATED; + } + LOG(INFO) << sparse_log_msg; + + return _sparse_compute_mode; + }(); + + if (sparse_compute_mode == SparseComputeMode::ENABLED || + sparse_compute_mode == SparseComputeMode::SIMULATED) { + builder_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + } +#endif + + if (tensorflow::tensor_float_32_execution_enabled()) { + builder_config->setFlag(nvinfer1::BuilderFlag::kTF32); + } else { + builder_config->clearFlag(nvinfer1::BuilderFlag::kTF32); + } + + if (precision_mode_ == TrtPrecisionMode::FP16) { + builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (precision_mode_ == TrtPrecisionMode::INT8) { + // FP16 is not available in Explicit Precision mode with TensorRT 7. + if (IS_TRT_VERSION_GE(8, 0, 0, 0) || !use_explicit_precision_) { + builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else { + LOG_WARNING_WITH_PREFIX << "With explicit precision mode, FP16 is not " + "allowed before TensorRT 8. TRT will consider " + "INT8 and FP32 tactics."; + } + builder_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } + if (!use_implicit_batch_ && profiles) { + TF_RETURN_IF_ERROR(profiles->ConfigureBuilder( + trt_builder_.get(), builder_config.get(), network())); + } + if (precision_mode_ == TrtPrecisionMode::INT8) { + builder_config->setInt8Calibrator(use_calibration_ ? calibrator : nullptr); + } + + std::unique_ptr timing_cache = nullptr; + // We only use a timing cache if the algorithm selector is not used. If we + // are using TRT version >= 8.0, then we can try to deserialize an existing + // cache. + if (trt_algorithm_selector == nullptr) { +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + TimingCacheRegistry* registry = GetTimingCacheRegistry(); + + auto cache = registry->LookUp("default_cache", builder_config.get()); + if (!cache.ok()) { + LOG(WARNING) << "failed to create a timing cache: " + << cache.status().error_message(); + } else { + timing_cache = std::move(cache.ValueOrDie()); + builder_config->setTimingCache(*timing_cache, /*ignoreMismatch*/ false); + } +#endif // IS_TRT_VERSION_GE(8, 0, 0, 0) + } else { + // Disabling the timing cache is recommended when using the algorithm + // selector. 
+ builder_config->setFlag(nvinfer1::BuilderFlag::kDISABLE_TIMING_CACHE); + } + + string precision_mode_str; + TF_RETURN_IF_ERROR( + TrtPrecisionModeToName(precision_mode_, &precision_mode_str)); + string trt_network_name = StrCat( + "TF:", TF_VERSION_STRING, ", ", + "TRT:", absl::StrJoin(GetLoadedTensorRTVersion(), "."), "-", + "Precision:", precision_mode_str, ", ", "Calibration:", use_calibration_, + ", ", "Max-Batch-Size:", max_batch_size, ", ", + "Max-Workspace-Size:", max_workspace_size_bytes); + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + trt_network_name = StrCat(trt_network_name, ", Sparse Compute: "); + + switch (sparse_compute_mode) { + case SparseComputeMode::SIMULATED: + trt_network_name = StrCat(trt_network_name, "Simulated"); + break; + case SparseComputeMode::ENABLED: + trt_network_name = StrCat(trt_network_name, "Enabled"); + break; + case SparseComputeMode::DISABLED: + trt_network_name = StrCat(trt_network_name, "Disabled"); + break; + } +#endif + + VLOG(1) << "Setting TensorRT network name to " << trt_network_name; + network()->setName(trt_network_name.c_str()); + + VLOG(1) << "Building TensorRT engine"; + if (VLOG_IS_ON(2)) { + VLOG(2) << "Network inputs"; + int n_inputs = network()->getNbInputs(); + for (int i = 0; i < n_inputs; i++) { + const ITensorProxyPtr input = network()->getInput(i); + if (*input) { + VLOG(2) << " " << i << " " << input->getName(); + } else { + VLOG(2) << "Could not find input " << i; + } + } + } + engine->reset( + trt_builder_->buildEngineWithConfig(*network(), *builder_config)); + if (engine->get() == nullptr) { + return errors::Internal("Failed to build TensorRT engine"); + } + if (VLOG_IS_ON(2)) { + VLOG(2) << "TRT engine created"; + int nbBindings = (*engine)->getNbBindings(); + VLOG(2) << "Number of engine bindings: " << nbBindings; + for (int i = 0; i < nbBindings; i++) { + auto get_location_string = [&engine](int i) { + if ((*engine)->getLocation(i) == nvinfer1::TensorLocation::kDEVICE) + return " on device"; + else + return " on host"; + }; + VLOG(2) << "Binding " << i << " name: " << (*engine)->getBindingName(i) + << get_location_string(i); + } + } + + // Write back the new timing cache results to the registry. + if (timing_cache) { + GetTimingCacheRegistry()->Upsert("default_cache", timing_cache.get()); + } + return Status::OK(); } @@ -1280,7 +1601,9 @@ Status Converter::AddTensorOrWeights(const string& name, // We rely on the individual op converter to understand the semantics of the // TF node, and make sure it doesn't change the batch size nor introduce // intra-element dependency inside the batch. - if (input.is_tensor()) input.set_batch_size(batch_size_); + if (use_implicit_batch_ && input.is_tensor()) { + input.set_batch_size(batch_size_); + } if (trt_tensors_.insert({name, std::move(input)}).second) return Status::OK(); return errors::AlreadyExists("tensor/weights ", name, " already exist."); } @@ -1297,26 +1620,34 @@ Status Converter::GetTensorOrWeights(const string& name, Status Converter::TransposeTensor(ITensorProxyPtr input_tensor, const std::vector& order_with_batch_dim, - ITensorProxyPtr* output_tensor) { + ITensorProxyPtr* output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name) { const auto dims = input_tensor->getDimensions(); - - if (order_with_batch_dim.size() - 1 != size_t(dims.nbDims)) { + const int order_size = use_implicit_batch_ ? 
order_with_batch_dim.size() - 1 + : order_with_batch_dim.size(); + if (order_size != size_t(dims.nbDims)) { return errors::InvalidArgument( "Rank of perm for transpose does not match with that of the input."); } - if (order_with_batch_dim[0] != 0) { + if (use_implicit_batch_ && order_with_batch_dim[0] != 0) { return errors::Unimplemented( "Transpose at batch dimension is not supported."); } - nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor->trt_tensor()); + nvinfer1::IShuffleLayer* layer = + this->network()->addShuffle(*input_tensor->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose"); - ITensorProxyPtr shuffle_tensor = layer->getOutput(0); - MarkQuantizationRangesAsInferrable(&input_tensor, &shuffle_tensor); + SetLayerName(layer, node_def, sub_op_name); nvinfer1::Permutation permutation; - for (int32_t i = 0; i < dims.nbDims; ++i) { - permutation.order[i] = order_with_batch_dim[i + 1] - 1; + if (use_implicit_batch_) { + for (int32_t i = 0; i < dims.nbDims; ++i) { + permutation.order[i] = order_with_batch_dim[i + 1] - 1; + } + } else { + std::copy(order_with_batch_dim.begin(), order_with_batch_dim.end(), + permutation.order); } VLOG(1) << "TransposeTensor permutation: " << DebugString(permutation, dims.nbDims); @@ -1337,21 +1668,21 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, float* out_max) const { switch (weights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - auto inp = static_cast(weights.GetValues()); + auto inp = weights.GetPointer(); auto result = std::minmax_element(inp, inp + weights.count()); *out_min = *result.first; *out_max = *result.second; break; } case nvinfer1::DataType::kHALF: { - auto inp = static_cast(weights.GetValues()); + auto inp = weights.GetPointer(); auto result = std::minmax_element(inp, inp + weights.count()); - *out_min = Eigen::half_impl::half_to_float(*result.first); - *out_max = Eigen::half_impl::half_to_float(*result.second); + *out_min = static_cast(*result.first); + *out_max = static_cast(*result.second); break; } case nvinfer1::DataType::kINT32: { - auto inp = static_cast(weights.GetValues()); + auto inp = weights.GetPointer(); auto result = std::minmax_element(inp, inp + weights.count()); *out_min = static_cast(*result.first); *out_max = static_cast(*result.second); @@ -1365,84 +1696,106 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, return Status::OK(); } -Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, - const nvinfer1::Dims& dims, - const bool validation_only, - ITensorProxyPtr* tensor) { - const nvinfer1::Dims input_dims = input.GetTrtDims(); - // If one of input_dims and dims doesn't have static shape, it means some of - // the dims are unknown or need to be inferred. And we don't do further checks - // but rely on the caller to not make mistakes. - // Otherwise we do simple check to make sure the total sizes are the same. +// Constructs for the ILayer name as +// __ and callSetLayerNameHelper +// to set the name for the ILayer. +// +// If the operation represented by the ILayer is generated by the converter to +// support the conversion of node_def, callers need to specify a non-empty +// sub_op_name to be appended to the name of node_def to avoid layer name +// conflicts. If the operation is generated multiple times, callers also need +// to specify sub_op_instance to be appended to the name of the layers to avoid +// layer name conflicts. 
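// --- Editorial sketch, not part of the patch ---------------------------------
// The comment above describes how ILayer names are assembled from the engine
// name, the TF node name, and an optional sub-op suffix. GetLayerNameSuffix and
// SetLayerNameHelper are defined elsewhere in this patch; the helper below is
// only a guess at the suffix-building step, shown to make the naming scheme
// concrete, e.g. ("shuffle", 2) -> "shuffle_2" and ("shuffle", nullopt) ->
// "shuffle".
#include <string>
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"

std::string GetLayerNameSuffixSketch(absl::string_view sub_op_name,
                                     absl::optional<int> sub_op_instance) {
  if (!sub_op_instance.has_value()) return std::string(sub_op_name);
  return absl::StrCat(sub_op_name, "_", sub_op_instance.value());
}
// --- End editorial sketch -----------------------------------------------------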
+void Converter::SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name, + absl::optional sub_op_instance, + absl::optional origin_node_name) { + std::string sub_op_suffix = GetLayerNameSuffix(sub_op_name, sub_op_instance); + if (sub_op_suffix.empty()) { + SetLayerNameHelper(layer, engine_name_, node_def.name()); + } else if (origin_node_name.has_value()) { + auto layer_name = absl::StrCat(node_def.name(), "-", + absl::string_view(origin_node_name.value()), + "-", sub_op_suffix); + SetLayerNameHelper(layer, engine_name_, layer_name); + } else { + SetLayerNameHelper(layer, engine_name_, + absl::StrCat(node_def.name(), "-", sub_op_suffix)); + } +} + +// Constructs for the ILayer name as +// __ and callSetLayerNameHelper to +// set the name for the ILayer. +void Converter::SetLayerName(nvinfer1::ILayer* layer, + absl::string_view main_op_name, + absl::string_view sub_op_name, + absl::optional sub_op_instance) { + std::string layer_name_suffix = + GetLayerNameSuffix(sub_op_name, sub_op_instance); + SetLayerNameHelper(layer, engine_name_, + absl::StrCat(main_op_name, "-", layer_name_suffix)); +} + +// Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims' +// (which doesn't contain the batch dimension). +// +// If validation_only is true, it doesn't do the conversion but only do some +// minimum validation for the eligibility of the conversion, and *tensor will +// be set to nullptr. +Status PrepareTensorForShape(Converter* converter, + const TRT_TensorOrWeights& input, + const DimsAdapter& dims, + const bool validation_only, + ITensorProxyPtr* tensor, const NodeDef& node_def, + absl::optional op_instance, + absl::optional origin_node_name) { + DimsAdapter input_dims(input.GetTrtDims()); + // The input shape may have -1s for dynamic shape. The target shape may have + // 0s representing copy over the corresponding input dimensions. It may also + // have at most one -1 representing a dimension value that needs to be + // inferred. If none of those special values present, we verify that the total + // sizes of the input and output shape are the same. + // TODO(tfeher): Verify that the total sizes of the input and output shape are + // the same in the present of 0s but no -1 in the target shape. // If an input is a weight, it is going to become a tensor via // CreateConstantLayer. So we can treat it as a tensor for // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors. - if (AreDimsStaticWithDifferentSize(input_dims, dims, /*is_tensor=*/true)) { + if (dims.Volume() > 0 && AreDimsStaticWithDifferentSize(input_dims, dims)) { return errors::InvalidArgument( - "Incompatible shapes: ", DebugString(input_dims), " vs. ", - DebugString(dims)); + "Incompatible shapes: ", input_dims.DebugString(), " vs. ", + dims.DebugString()); } // ConstantLayer requires static shapes (cannot infer -1). 
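// --- Editorial sketch, not part of the patch ---------------------------------
// The comment at the top of PrepareTensorForShape above allows a target shape
// with 0s (copy the corresponding input dimension) and at most one -1 (inferred
// from the remaining volume). The standalone helper below is only an example of
// that rule for static shapes; the converter itself leaves the actual
// resolution to TensorRT's shuffle layer.
#include <cstdint>
#include <vector>

std::vector<int64_t> ResolveReshapeDimsSketch(const std::vector<int64_t>& input,
                                              std::vector<int64_t> target) {
  int64_t known_volume = 1;
  int infer_index = -1;
  for (int i = 0; i < static_cast<int>(target.size()); ++i) {
    if (target[i] == 0) target[i] = input[i];  // 0 copies the input dimension.
    if (target[i] == -1) {
      infer_index = i;  // At most one -1 may be inferred.
      continue;
    }
    known_volume *= target[i];
  }
  if (infer_index >= 0) {
    int64_t input_volume = 1;
    for (int64_t d : input) input_volume *= d;
    target[infer_index] = input_volume / known_volume;
  }
  return target;
}
// For example, input {4, 6, 8} with target {0, -1, 2} resolves to {4, 24, 2}.
// --- End editorial sketch -----------------------------------------------------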
- if (input.is_weights() && !HasStaticShape(dims)) { + if (input.is_weights() && !dims.IsStatic()) { return errors::InvalidArgument("Shape is not fully defined: ", - DebugString(dims)); + dims.DebugString()); } if (validation_only) { *tensor = nullptr; return Status::OK(); } + TFTRT_RETURN_ERROR_IF_NULLPTR(converter, "converter is nullptr"); if (input.is_tensor()) { - if (DimsEqual(input_dims, dims)) { + if (input_dims == dims) { *tensor = input.tensor(); } else { nvinfer1::IShuffleLayer* layer = - this->network()->addShuffle(*input.tensor()->trt_tensor()); + converter->network()->addShuffle(*input.tensor()->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); - layer->setReshapeDimensions(dims); - ITensorProxyPtr input_tensor = input.tensor(); - ITensorProxyPtr output_tensor = layer->getOutput(0); - this->MarkQuantizationRangesAsInferrable(&input_tensor, - &output_tensor); - *tensor = output_tensor; + converter->SetLayerName(layer, node_def, "shuffle", op_instance, + origin_node_name); + layer->setReshapeDimensions(dims.AsTrtDims()); + *tensor = layer->getOutput(0); } } else { - *tensor = CreateConstantLayer(input.weights(), dims); + *tensor = converter->CreateConstantLayer(input.weights(), dims.AsTrtDims()); TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape"); - if (precision_mode() == TrtPrecisionMode::INT8 && !use_calibration()) { - // If we are in int8 mode and not calibrating, we need to explicitly set a - // quantization range for the output tensor of the IConstantLayer. Here we - // set the range to [min(weights), max(weights)]. - float min_range = 0.0f; - float max_range = 0.0f; - TF_RETURN_IF_ERROR( - GetWeightRange(input.weights(), &min_range, &max_range)); - // Avoid setting range to 0 because TRT will throw an error. If the - // weights are zero then the range doesn't matter: using 127.0f should - // ensure the quantized weight will be exactly zero. 
- if (min_range == 0.0f && max_range == 0.0f) { - min_range = -127.0f; - max_range = 127.0f; - } - ProvideQuantizationRange(tensor, min_range, max_range); - } } return Status::OK(); } -void Converter::MarkQuantizationRangesAsInferrable(ITensorProxyPtr* input, - ITensorProxyPtr* output) { - if ((*input)->is_trt_tensor()) { - quantization_infer_.push_back( - {(*input)->trt_tensor(), (*output)->trt_tensor()}); - quantization_infer_.push_back( - {(*output)->trt_tensor(), (*input)->trt_tensor()}); - } else if ((*input)->is_simple_tensor()) { - quantization_infer_proxy_.push_back({input, output}); - quantization_infer_proxy_.push_back({output, input}); - } -} - void Converter::ProvideQuantizationRange(ITensorProxyPtr* tensor, float min_range, float max_range) { float symmetric_range = std::max(std::abs(min_range), std::abs(max_range)); @@ -1453,48 +1806,10 @@ void Converter::ProvideQuantizationRange(ITensorProxyPtr* tensor, } } -namespace { - -bool IsConvolution(const nvinfer1::ILayer* layer) { - return layer->getType() == nvinfer1::LayerType::kCONVOLUTION; -} - -bool IsScale(const nvinfer1::ILayer* layer) { - return layer->getType() == nvinfer1::LayerType::kSCALE; -} - -bool IsClipOrRelu(const nvinfer1::ILayer* layer) { - if (layer->getType() != nvinfer1::LayerType::kACTIVATION) { - return false; - } - auto activation_type = static_cast(layer) - ->getActivationType(); -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - return activation_type == nvinfer1::ActivationType::kRELU || - activation_type == nvinfer1::ActivationType::kCLIP; -#else - return activation_type == nvinfer1::ActivationType::kRELU; -#endif -} - -bool IsAdd(const nvinfer1::ILayer* layer) { - if (layer->getType() != nvinfer1::LayerType::kELEMENTWISE) { - return false; - } - auto operation = - static_cast(layer)->getOperation(); - return operation == nvinfer1::ElementWiseOperation::kSUM; -} - -} // namespace - void Converter::MaybeApplyQuantizationRanges() { if (precision_mode() != TrtPrecisionMode::INT8) return; - // Infer ranges across marked ops. - PropagateQuantizationRanges(); // Apply ranges. -#if IS_TRT_VERSION_GE(5, 0, 0, 0) for (auto pair : quantization_ranges_) { nvinfer1::ITensor* tensor = pair.first; const float range = pair.second; @@ -1511,206 +1826,6 @@ void Converter::MaybeApplyQuantizationRanges() { // 'range', it should report error. tensor->setDynamicRange(-range, range); } -#endif - - if (use_calibration()) return; -#if !IS_TRT_VERSION_GE(6, 0, 0, 0) - // Attempt to find tensors that are missing ranges, and set the corresponding - // layer's precision to FP16 to avoid Builder::buildCudaEngine() failing. - // This is only needed for TensorRT 5 and before because - // TensorRT6 falls to FP16 internally. - // TensorRT doesn't need ranges for intermediate tensors when layers are fused - // so find fused layers first. - // Get all tensors from network and deduce fused ops. 
- std::map> layer_consumers; - std::map tensor_layer; - std::set all_tensors; - for (int i = 0; i < this->network()->getNbLayers(); i++) { - nvinfer1::ILayer* layer = this->network()->getLayer(i); - layer_consumers[layer] = {}; - for (int j = 0; j < layer->getNbInputs(); j++) { - ITensorProxyPtr input_tensor = layer->getInput(j); - all_tensors.insert(&input_tensor); - } - for (int j = 0; j < layer->getNbOutputs(); j++) { - ITensorProxyPtr output_tensor = layer->getOutput(j); - tensor_layer[&output_tensor] = layer; - all_tensors.insert(&output_tensor); - } - } - for (int i = 0; i < this->network()->getNbLayers(); i++) { - nvinfer1::ILayer* layer = this->network()->getLayer(i); - layer_consumers[layer] = {}; - for (int j = 0; j < layer->getNbInputs(); j++) { - ITensorProxyPtr input_tensor = layer->getInput(j); - auto input_layer = tensor_layer.find(&input_tensor); - if (input_layer != tensor_layer.end()) { - auto consumed_layer = layer_consumers.find(input_layer->second); - if (consumed_layer != layer_consumers.end()) { - consumed_layer->second.push_back(layer); - } - } - all_tensors.insert(&input_tensor); - } - } - // Identify fused tensors. - // Conv+BiasAdd+Add+Activation(Clip or Relu), Conv+BiasAdd+Add, - // Conv+BiasAdd+Activation(Clip or Relu), Conv+BiasAdd, - // Conv+Activation(Clip or Relu) are fused. - std::set fused_tensors; - typedef std::function matcher; - const std::vector>> fused_patterns = { - {"Fused Conv+Bias+Add+Activation", - { - IsConvolution, - IsScale, - IsAdd, - IsClipOrRelu, - }}, - {"Fused Conv+Bias+Add", - { - IsConvolution, - IsScale, - IsAdd, - }}, - {"Fused Conv+Bias+Activation", - { - IsConvolution, - IsScale, - IsClipOrRelu, - }}, - {"Fused Conv+Bias", - { - IsConvolution, - IsScale, - }}, - {"Fused Conv+Activation", - { - IsConvolution, - IsClipOrRelu, - }}, - }; - for (int i = 0; i < this->network()->getNbLayers(); i++) { - for (const auto& pattern : fused_patterns) { - size_t last_matcher = pattern.second.size() - 1; - nvinfer1::ILayer* layer = this->network()->getLayer(i); - // We should skip this layer if its outputs are already marked as fused, - // but all the current patterns start with a convolution and are ordered - // in decreasing pattern length, so that is not necessary (yet). - std::vector fused_candidates; - for (size_t index = 0; index <= last_matcher; ++index) { - if ((!pattern.second[index](layer)) || - (index < last_matcher && layer_consumers[layer].size() != 1)) { - fused_candidates.clear(); - break; - } - if (index < last_matcher) { - fused_candidates.push_back(layer); - layer = layer_consumers[layer].front(); - } - } - if (!fused_candidates.empty()) { - VLOG(1) << pattern.first; - for (const auto& fused_layer : fused_candidates) { - for (int i = 0; i < fused_layer->getNbOutputs(); i++) { - VLOG(1) << " Fused output tensor:" - << fused_layer->getOutput(i)->getName(); - ITensorProxyPtr output_tensor = fused_layer->getOutput(i); - fused_tensors.insert(&output_tensor); - } - } - break; // Don't try other patterns on this layer. - } - } - } - // Find tensors with no ranges that are not fused and force their layers to - // not be quantized. - for (auto tensor : all_tensors) { - if (!quantization_ranges_proxy_.count(tensor) && - fused_tensors.find(tensor) == fused_tensors.end()) { - // Note: there may be some warnings for "(Unnamed ITensor* N)". These - // are tensors which are created internally by TF-TRT. 
The ranges for - // these unnamed ITensors are always inferred from user provided ranges, - // thus there will also be a warning for the range(s) the user missed. - LOG(WARNING) << "Quantization range was not found for " - << (*tensor)->getName() << ". " - << "Setting invalid quantization range."; - // Set the range to something unusable so the engine will fail if it - // tries to actually use the tensor's range. - (*tensor)->setDynamicRange(0, 0); - auto layer = tensor_layer.find(tensor); - // If the tensor is the output of a layer, set the layer's precision - // to fp16 so that it isn't quantized. - // Shuffle doesn't support setting precision. - if (layer != tensor_layer.end() && - layer->second->getType() != nvinfer1::LayerType::kSHUFFLE) { - VLOG(1) << "And setting layer " << layer->second->getName() - << " precision to fp16."; - layer->second->setPrecision(nvinfer1::DataType::kHALF); - } - } - } -#endif -} - -void Converter::PropagateQuantizationRanges() { - // Propagate ranges across edges in quantization_infer_ until no new - // information is added. - // Note: this function modifies quantization_infer_, it might be better to - // modify a copy instead if we for some reason need quantization_infer_ - // later. - bool information_added = true; - while (information_added) { - // Propogate for real tensors. - information_added = false; - for (auto it = quantization_infer_.begin(); - it != quantization_infer_.end();) { - auto input_tensor_range = quantization_ranges_.find(it->first); - auto output_tensor_range = quantization_ranges_.find(it->second); - if (input_tensor_range != quantization_ranges_.end() && - output_tensor_range == quantization_ranges_.end()) { - // Input has range but output doesn't: copy range - // TODO(laigd): consider reporting error if it a different range is - // already set. - quantization_ranges_[it->second] = input_tensor_range->second; - information_added = true; - VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> " - << it->second->getName(); - } - // We can remove edges when the output range is known - if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) { - it = quantization_infer_.erase(it); - } else { - ++it; - } - } - // Propogate for proxy. - information_added = false; - for (auto it = quantization_infer_proxy_.begin(); - it != quantization_infer_proxy_.end();) { - auto input_tensor_range = quantization_ranges_proxy_.find(it->first); - auto output_tensor_range = quantization_ranges_proxy_.find(it->second); - if (input_tensor_range != quantization_ranges_proxy_.end() && - output_tensor_range == quantization_ranges_proxy_.end()) { - // Input has range but output doesn't: copy range - // TODO(laigd): consider reporting error if it a different range is - // already set. 
- quantization_ranges_proxy_[it->second] = input_tensor_range->second; - information_added = true; - VLOG(1) << "Copy quantization range: " << (*it->first)->getName() - << " -> " << (*it->second)->getName(); - std::cout << "Copy quantization range: " << (*it->first)->getName() - << " -> " << (*it->second)->getName(); - } - // We can remove edges when the output range is known - if (quantization_ranges_proxy_.find(it->second) != - quantization_ranges_proxy_.end()) { - it = quantization_infer_proxy_.erase(it); - } else { - ++it; - } - } - } } Status Converter::GetInputs(const NodeDef& node_def, @@ -1755,173 +1870,127 @@ Status Converter::GetInputs(const NodeDef& node_def, } // Checks that the number of inputs match, and enforces that the inputs marked -// as true are constant weights. true means that the input must be a weight, -// while false means the input must be a tensor. In the future, false will mean -// the input can be a tensor or weight. +// as weights are constant. Inputs are allowed to be both weight and tensor. Status CheckInputsWeights( const OpConverterParams& params, - const std::vector>& inputs_is_weight) { + const std::vector>& expected_inputs) { const auto& inputs = params.inputs; const auto& node_def = params.node_def; - if (inputs.size() != inputs_is_weight.size()) { - return errors::InvalidArgument( - node_def.op(), " got ", inputs.size(), " inputs but expected ", - inputs_is_weight.size(), ", at ", node_def.name()); - } + TFTRT_CHECK_INPUT_SIZE(inputs.size(), expected_inputs.size(), node_def); for (int i = 0; i < inputs.size(); i++) { - if (inputs_is_weight[i].second && inputs.at(i).is_tensor()) { - return errors::Unimplemented("The input \"", inputs_is_weight[i].first, + if (expected_inputs[i].second == TrtInputArg::kWeight && + !inputs.at(i).is_weights()) { + return errors::Unimplemented("The input \"", expected_inputs[i].first, "\" for ", node_def.op(), - " must be a constant, at ", node_def.name()); + " must be a constant"); } - // TODO(tmorris): Remove this check and provide a method to automatically + // TODO(tfeher): Remove this check and provide a method to automatically // retrieve an input as a tensor, converting via CreateConstantLayer if it // was originally a weight. We will want a caching mechanism to prevent many // duplicate constants from being created. - if (!inputs_is_weight[i].second && inputs.at(i).is_weights()) { - return errors::Unimplemented("The input \"", inputs_is_weight[i].first, + if (expected_inputs[i].second == TrtInputArg::kTensor && + !inputs.at(i).is_tensor()) { + return errors::Unimplemented("The input \"", expected_inputs[i].first, + "\" for ", node_def.op(), + " must be a tensor"); + } + if (expected_inputs[i].second == TrtInputArg::kResource && + !inputs.at(i).is_resource()) { + return errors::Unimplemented("The input \"", expected_inputs[i].first, "\" for ", node_def.op(), - " must be a tensor, at ", node_def.name()); + " must be a resource handle"); } } return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; - TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, - " not found."); - } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. 
- std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); +// Checks that the number of inputs match, and enforces that the inputs marked +// as true are constant weights. true means that the input must be a weight, +// while false means the input must be a tensor. +Status CheckInputsWeights( + const OpConverterParams& params, + const std::vector>& inputs_is_weight) { + std::vector> expected_inputs; + expected_inputs.reserve(inputs_is_weight.size()); + std::transform( + inputs_is_weight.begin(), inputs_is_weight.end(), + std::back_inserter(expected_inputs), [](std::pair x) { + return std::make_pair( + x.first, x.second ? TrtInputArg::kWeight : TrtInputArg::kTensor); + }); + return CheckInputsWeights(params, expected_inputs); +} + +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const string type_attr_name_in = "") { + string type_attr_name; + if (type_attr_name_in.empty()) { + if (node_def.op() == "ReadVariableOp" || + node_def.op() == "ResourceGather") { + type_attr_name = "dtype"; + } else { + type_attr_name = "T"; } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), - " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + } else { + type_attr_name = type_attr_name_in; } + + AttrSlice attrs(node_def); + if (attrs.Find(type_attr_name) == nullptr) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, + " not found."); + } + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, type_attr_name, tf_type)); return Status::OK(); } -// **************************************************************************** -// Constant folding functions for weights. -// TODO(laigd): we should probably use eigen directly. 
-// ***************************************************************************** -struct LambdaFactory { - enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP }; - OP_CATEGORY op; - - template - std::function unary() { - switch (op) { - case OP_CATEGORY::RSQRT: { - VLOG(2) << "RSQRT GETS DONE"; - return [](T t) -> T { return 1.0 / std::sqrt(t); }; - } - case OP_CATEGORY::NEG: - return [](T t) -> T { return -t; }; - case OP_CATEGORY::RECIP: - return [](T t) -> T { return 1.0 / t; }; - default: - LOG(ERROR) << "Not supported op for unary: " << static_cast(op); - return nullptr; - } +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); } -}; -template <> -std::function LambdaFactory::unary() { - switch (op) { - case OP_CATEGORY::RSQRT: { - VLOG(2) << "RSQRT GETS DONE"; - return [](Eigen::half t) { - return Eigen::half(1.0 / std::sqrt(static_cast(t))); - }; - } - case OP_CATEGORY::NEG: - return [](Eigen::half t) { return -t; }; - case OP_CATEGORY::RECIP: - return [](Eigen::half t) { - return Eigen::half(1.0 / static_cast(t)); - }; - default: - LOG(ERROR) << "Not supported op for unary: " << static_cast(op); - return nullptr; - } + return inputs[pos].GetTfType(tf_type); } -Status UnaryCompute(const TRT_ShapedWeights& iweights, - TRT_ShapedWeights* oweights, LambdaFactory unary_op) { - CHECK(iweights.TrtDType() == oweights->TrtDType()); - switch (iweights.TrtDType()) { - case nvinfer1::DataType::kFLOAT: { - auto inp = static_cast(iweights.GetValues()); - auto oup = static_cast(oweights->GetValues()); - std::transform(inp, inp + iweights.count(), oup, unary_op.unary()); - break; - } - case nvinfer1::DataType::kHALF: { - auto inp = static_cast(iweights.GetValues()); - auto oup = static_cast(oweights->GetValues()); - std::transform(inp, inp + iweights.count(), oup, - unary_op.unary()); - break; - } - default: - return errors::Unimplemented("Data type not supported: ", - DebugString(iweights.TrtDType())); +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = "") { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + const auto error = + convert_not_supported_dtype_msg(allowed_types, tf_type, node_def); + return errors::Unimplemented(error); } return Status::OK(); } -// Before TRT 5.1.3, we have to calculate padding for convolutions ourselves. -Status Conv2DPaddingHelper(OpConverterParams* params, const TFAttrs& attrs, - const nvinfer1::DimsHW& kernel_size, - const nvinfer1::DimsHW& dilation, - const nvinfer1::DimsHW& stride, - const std::vector& input_dims, - ITensorProxyPtr tensor, - std::vector>* padding, - ITensorProxyPtr* padded_tensor) { - if (attrs.get("padding") == "SAME") { - nvinfer1::DimsHW effective_kernel_size = kernel_size; - effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1); - effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1); - *padding = CreateSamePadding(stride, effective_kernel_size, input_dims); - } else { - *padding = {{0, 0}, {0, 0}}; - } - - // Handle asymmetric padding. 
TensorRT 5.1 added support for asymmetric - // padding via setPrePadding and setPostPadding. Due to a bug in 5.1.2, we can - // only use asymmetric padding in convolutions with 5.1.3+. But in 5.1.3, we - // will always use setPaddingMode for simplicity. - if ((*padding)[0].first != (*padding)[0].second || - (*padding)[1].first != (*padding)[1].second) { - auto pad_layer = params->converter->network()->addPadding( - *tensor->trt_tensor(), nvinfer1::DimsHW((*padding)[0].first, (*padding)[1].first), - nvinfer1::DimsHW((*padding)[0].second, (*padding)[1].second)); - TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, params->node_def.name()); - ITensorProxyPtr output_tensor = pad_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, - &output_tensor); - *padding = {{0, 0}, {0, 0}}; - tensor = output_tensor; - } - *padded_tensor = tensor; - return Status::OK(); +namespace { +// Extracts the spatial dimensions from `output_sizes` and returns them as a +// vector of size 2. +std::vector GetSpatialDimsFromOutputSizes( + const TRT_TensorOrWeights& output_sizes, const int h_index, + const int w_index) { + // We use h_index and w_index instead of 1 and 2 because we haven't + // transposed output_sizes along with the input. + const TRT_ShapedWeights& weights = output_sizes.weights(); + const int output_sizes_length = weights.count(); + auto output_sizes_values = weights.GetPointer(); + // The length of output_sizes can be 2 or 4. When the length is 4, + // output_sizes represents . + return {output_sizes_values[output_sizes_length == 4 ? h_index : 0], + output_sizes_values[output_sizes_length == 4 ? w_index : 1]}; } +} // namespace -Status ConvertConv2DHelper(OpConverterParams* params, int group, +Status ConvertConv2DHelper(const OpConverterParams* params, int group, bool is_conv2d_backprop_input) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -1930,56 +1999,99 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, if (is_conv2d_backprop_input) { // In the case when Conv2dBackpropInput is used for conv2d_transpose, these // inputs correspond to: output size, filter, and input. - TF_RETURN_IF_ERROR(CheckInputsWeights( - *params, - {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}})); + // TODO(cbate): refine this check when moving to structured op converter. 
+ if (!params->use_explicit_precision) { + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}})); + } + backprop_output_size = inputs.at(0); tensor = inputs.at(2).tensor(); + bool has_dynamic_hw_shape{false}; + int start_idx{0}; + auto dims = tensor->getDimensions(); + if (params->use_implicit_batch) { + if (dims.nbDims != 3) { + return errors::Internal( + "In implicit batch mode, input nbDims should be 3"); + } + start_idx = 1; + } else { + if (dims.nbDims != 4) { + return errors::Internal( + "In explicit batch mode, input nbDims should be 4"); + } + start_idx = 2; + } + for (int i = start_idx; i < dims.nbDims; ++i) { + if (dims.d[i] < 0) { + has_dynamic_hw_shape = true; + } + } + if (has_dynamic_hw_shape) { + return errors::Unimplemented( + "Conv2dBackpropInput does not support input with unknown spatial " + "shape"); + } } else { - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"input", false}, {"filter", true}})); + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input", false}, {"filter", !params->use_explicit_precision}})); tensor = inputs.at(0).tensor(); } TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TRT_ShapedWeights weights_rsck = inputs.at(1).weights(); - if (weights_rsck.shape_.nbDims != 4) { - return errors::InvalidArgument("Conv2D expects kernel of dimension 4, at " + - node_def.name()); + + if (inputs.at(1).GetTrtDims().nbDims != 4) { + return errors::InvalidArgument("Conv2D expects kernel of dimension 4"); } - TFAttrs attrs(node_def); - auto data_format = attrs.get("data_format"); + + string data_format, padding_type; + std::vector tf_dilations, tf_stride; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dilations", &tf_dilations)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + int c_index = (data_format == "NHWC") ? 3 : 1; int h_index = (data_format == "NHWC") ? 1 : 2; int w_index = (data_format == "NHWC") ? 
2 : 3; - auto tf_dilations = attrs.get>("dilations"); + if (tf_dilations.size() != 4) { return errors::InvalidArgument( - "Convolution dilations field must specify 4 dimensions, at ", - node_def.name()); + "Convolution dilations field must specify 4 dimensions"); } if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { return errors::Unimplemented( - "Dilation rate must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Dilation rate must be 1 for batch and channel dimensions"); } const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]); if (is_conv2d_backprop_input && (dilation.d[0] != 1 || dilation.d[1] != 1)) { return errors::Unimplemented( - "Dilation with Conv2DBackpropInput (conv2d_transpose) is not supported", - ", at ", node_def.name()); + "Dilation with Conv2DBackpropInput (conv2d_transpose) is not" + " supported"); } - const auto tf_stride = attrs.get>("strides"); if (tf_stride.size() != 4) { return errors::InvalidArgument( - "Convolution strides field must specify 4 dimensions, at ", - node_def.name()); + "Convolution strides field must specify 4 dimensions"); } if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { return errors::Unimplemented( - "Stride must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Stride must be 1 for batch and channel dimensions"); + } + // Channel dim must be static for DepthwiseConv2dNative since we use that + // value for num_groups at build time. + if (!params->use_implicit_batch && tensor->getDimensions().d[c_index] == -1) { + return errors::InvalidArgument("Channel dimension must be static"); + } + + if (padding_type != "SAME" && padding_type != "VALID") { + return errors::Unimplemented(padding_type + + " padding type not implemented, " + "only VALID and SAME are supported"); } const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); if (params->validation_only) return Status::OK(); @@ -1987,139 +2099,206 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, // Transpose to NCHW (NCHW is required for IConvLayer). const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } // Dimensions of transposed tensor. const auto tensor_dim = tensor->getDimensions(); + const int c_dim_size = tensor_dim.d[params->use_implicit_batch ? 0 : 1]; // group == 0 signifies that this is a depthwise convolution, so set // num_groups to size of input's channel dim. For a non-depthwise conv, // num_groups will be 1. - const int num_groups = (group == 0) ? tensor_dim.d[0] : group; + const int num_groups = (group == 0) ? c_dim_size : group; // For conv, TF weights are RSCK, and TRT expects KCRS. // For backprop, TF weights are RSKC, and TRT expects CKRS. // Therefore, this reorder will work for both cases. - TRT_ShapedWeights weights = - params->weight_store->GetTempWeights(weights_rsck); - ReorderRSCKToKCRS(weights_rsck, &weights, num_groups); - TRT_ShapedWeights biases(weights.TrtDType()); - const int output_axis = is_conv2d_backprop_input ? 1 : 0; - const int noutput = weights.shape_.d[output_axis] * num_groups; + const int output_axis = is_conv2d_backprop_input ? 
2 : 3; + auto weights_shape = inputs.at(1).GetTrtDims(); + const int noutput = weights_shape.d[output_axis] * num_groups; nvinfer1::DimsHW kernel_size; - kernel_size.h() = weights.shape_.d[2]; - kernel_size.w() = weights.shape_.d[3]; + kernel_size.h() = weights_shape.d[0]; + kernel_size.w() = weights_shape.d[1]; -// Before TRT 5.1.3, we have to calculate padding ourselves. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - std::vector> padding; - std::vector input_dims; - if (is_conv2d_backprop_input) { - // For backprop, calculate padding based on "input_sizes" input, which - // actually corresponds to output size. ("input_sizes" makes sense in the - // context of Conv2DBackpropInput). - // We use h_index and w_index instead of 1 and 2 because we havent - // transposed backprop_output_size along with the input. - auto output_size_weights = - static_cast(backprop_output_size.weights().GetValues()); - input_dims = {output_size_weights[h_index], output_size_weights[w_index]}; + TRT_ShapedWeights weights_rsck; + if (inputs.at(1).is_weights()) { + weights_rsck = inputs.at(1).weights(); } else { - // Use 1 and 2 because tensor_dim has the dimensions of the transposed - // input. - input_dims = {static_cast(tensor_dim.d[1]), - static_cast(tensor_dim.d[2])}; - } - ITensorProxyPtr padded_tensor = nullptr; - TF_RETURN_IF_ERROR(Conv2DPaddingHelper(params, attrs, kernel_size, dilation, - stride, input_dims, tensor, &padding, - &padded_tensor)); - tensor = padded_tensor; -#endif + ::stream_executor::port::StatusOr tmp = + params->weight_store->GetTempWeights(nvinfer1::DataType::kFLOAT, + weights_shape); + TRT_ENSURE_OK(tmp); + weights_rsck = std::move(tmp).ValueOrDie(); + } + + // In explcit precision mode, trace the input back to the constant while also + // verifying that QDQ scale layers are present. 
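// --- Editorial sketch, not part of the patch ---------------------------------
// The explicit-precision path below walks from the filter tensor back through
// its dequantize and quantize producers to the constant that holds the weights,
// via TRTNetworkBuilder::FindProducerOf / UniqueParentOf. Those helpers are
// defined elsewhere in the patch; the function below is only a guess at what a
// producer lookup looks like in terms of the public TensorRT network API.
#include "NvInfer.h"  // Already included by the surrounding file.

nvinfer1::ILayer* FindProducerOfSketch(nvinfer1::INetworkDefinition* network,
                                       nvinfer1::ITensor* tensor) {
  for (int i = 0; i < network->getNbLayers(); ++i) {
    nvinfer1::ILayer* layer = network->getLayer(i);
    for (int j = 0; j < layer->getNbOutputs(); ++j) {
      if (layer->getOutput(j) == tensor) return layer;
    }
  }
  return nullptr;  // No producing layer: the tensor is a network input.
}
// --- End editorial sketch -----------------------------------------------------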
+ if (!inputs.at(1).is_weights()) { + TRT_ENSURE(params->use_explicit_precision); + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + TRT_ENSURE_OK(builder); + auto dequant_layer = builder.ValueOrDie().FindProducerOf( + inputs.at(1).tensor()->trt_tensor()); + TRT_ENSURE_PTR_OK(dequant_layer); + + // TODO(cbate): corresponding TRT layer name check + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(dequant_layer.ValueOrDie()->getType() == + nvinfer1::LayerType::kSCALE); + } + + auto quant_layer = + builder.ValueOrDie().UniqueParentOf(dequant_layer.ValueOrDie(), 0); + TRT_ENSURE_PTR_OK(quant_layer); + + // TODO(cbate): corresponding TRT layer name check + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(quant_layer.ValueOrDie()->getType() == + nvinfer1::LayerType::kSCALE); + } + + auto weights_layer = + builder.ValueOrDie().UniqueParentOf(quant_layer.ValueOrDie(), 0); + TRT_ENSURE_PTR_OK(weights_layer); + TRT_ENSURE(weights_layer.ValueOrDie()->getType() == + nvinfer1::LayerType::kCONSTANT); + auto const_weights_rsck = + reinterpret_cast(weights_layer.ValueOrDie()) + ->getWeights(); + + TRT_ENSURE(weights_rsck.count() == weights_rsck.count()); + const auto* weights_ptr = + static_cast(const_weights_rsck.values); + std::copy_n(weights_ptr, const_weights_rsck.count, + weights_rsck.GetPointer()); + } + + ::stream_executor::port::StatusOr weights = + params->weight_store->GetTempWeights(weights_rsck); + TRT_ENSURE_OK(weights); + ::stream_executor::port::StatusOr biases = + params->weight_store->GetTempWeights(nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{1, {noutput}}); + TRT_ENSURE_OK(biases); + std::fill_n(biases.ValueOrDie().GetPointer(), noutput, 0.0f); + ReorderRSCKToKCRS(weights_rsck, &weights.ValueOrDie(), num_groups); // Add convolution. nvinfer1::ILayer* conv_layer = nullptr; if (is_conv2d_backprop_input) { nvinfer1::IDeconvolutionLayer* layer = params->converter->network()->addDeconvolution( - *tensor->trt_tensor(), noutput, kernel_size, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size, + weights.ValueOrDie().GetTrtWeights(), + biases.ValueOrDie().GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStride(stride); -// TensorRT 5.1.3 added support for padding modes. -#if IS_TRT_VERSION_GE(5, 1, 3, 0) // VALID padding is the default TRT behavior. - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { // SAME_UPPER means that post padding is preferred. layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#else - layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); conv_layer = layer; } else { + const nvinfer1::Weights empty_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; nvinfer1::IConvolutionLayer* layer = params->converter->network()->addConvolution( - *tensor->trt_tensor(), noutput, kernel_size, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size, + params->use_explicit_precision + ? 
empty_weights + : weights.ValueOrDie().GetTrtWeights(), + empty_weights); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStride(stride); -#if IS_TRT_VERSION_GE(5, 1, 3, 0) - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#else - layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); layer->setDilation(dilation); conv_layer = layer; } + + // After creating the conv layer, if we are in explicit precision mode and the + // weights input is a tensor, then we need to override the weights input by + // calling setInput() on the layer. + if (params->use_explicit_precision) { + TRT_ENSURE(inputs.at(1).is_tensor()); + + nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle( + *inputs.at(1).tensor()->trt_tensor()); + layer->setFirstTranspose({3, 2, 0, 1}); + layer->setReshapeDimensions({4, {0, 0, 0, 0}}); + conv_layer->setInput(1, *layer->getOutput(0)); + } + + params->converter->SetLayerName(conv_layer, node_def, "conv"); ITensorProxyPtr output_tensor = conv_layer->getOutput(0); // Add an extra padding for Deconv because TRT doesn't accept the // argument output_shape and thus the TRT output shape could be wrong // in case of strides>1. if (is_conv2d_backprop_input) { - auto tf_output_shape = - static_cast(backprop_output_size.weights().GetValues()); + std::vector output_spatial_dims = + GetSpatialDimsFromOutputSizes(backprop_output_size, h_index, w_index); + const int output_height = output_spatial_dims[0]; + const int output_width = output_spatial_dims[1]; nvinfer1::Dims trt_output_shape = output_tensor->getDimensions(); // What determines the padding size is the difference between the given // input_sizes (tf_output_shape) and TRT computed size. - const int height_diff = tf_output_shape[h_index] - trt_output_shape.d[1]; - const int width_diff = tf_output_shape[w_index] - trt_output_shape.d[2]; + int out_h_idx = params->use_implicit_batch ? 1 : 2; + int out_w_idx = params->use_implicit_batch ? 2 : 3; + const int height_diff = output_height - trt_output_shape.d[out_h_idx]; + const int width_diff = output_width - trt_output_shape.d[out_w_idx]; if ((height_diff < 0) || (width_diff < 0)) { return errors::InvalidArgument( "input_sizes argument of Conv2DBackprop (i.e. output_shape argument " "of conv2d_transpose) ", "is too small for the given out_backprop argument of Conv2DBackprop " "(i.e. input argument of conv2d_transpose). Expect: ", - "(", tf_output_shape[h_index], ", ", tf_output_shape[w_index], - ") >= ", "(", trt_output_shape.d[1], ", ", trt_output_shape.d[2], - ") for op ", node_def.name()); + "(", output_height, ", ", output_width, ") >= ", "(", + trt_output_shape.d[out_h_idx], ", ", trt_output_shape.d[out_w_idx], + ")"); } // Only add a padding layer if padding sizes are larger than 0 if ((height_diff > 0) || (width_diff > 0)) { nvinfer1::DimsHW pre_padding(0, 0); nvinfer1::DimsHW post_padding(height_diff, width_diff); nvinfer1::IPaddingLayer* padding_layer = - params->converter->network()->addPadding(*output_tensor->trt_tensor(), pre_padding, - post_padding); + params->converter->network()->addPadding(*output_tensor->trt_tensor(), + pre_padding, post_padding); output_tensor = padding_layer->getOutput(0); + params->converter->SetLayerName(padding_layer, node_def, "pad"); } } // Restore transpose. 
if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertTranspose(OpConverterParams* params) { +bool AllowInefficientTranspose() { + static bool result = [] { + bool value; + Status status = + ReadBoolFromEnvVar("TF_DEBUG_TRT_ALLOW_INEFFICIENT_TRANSPOSE", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; + }(); + + return result; +} + +Status ConvertTranspose(const OpConverterParams* params) { const auto& inputs = params->inputs; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"x", false}, {"perm", true}})); @@ -2127,143 +2306,208 @@ Status ConvertTranspose(OpConverterParams* params) { *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); // Get the permutation from weights. TRT_ShapedWeights weights = inputs.at(1).weights(); - const int* weights_ptr = static_cast(weights.GetValues()); + const int* weights_ptr = weights.GetPointer(); std::vector perm(weights_ptr, weights_ptr + weights.count()); // Verify the permutation. ITensorProxyPtr input_tensor = inputs.at(0).tensor(); - if (perm.size() - 1 != size_t(input_tensor->getDimensions().nbDims)) { + const int perm_size = + params->use_implicit_batch ? perm.size() - 1 : perm.size(); + if (perm_size != size_t(input_tensor->getDimensions().nbDims)) { return errors::InvalidArgument( "Rank of perm for transpose does not match with that of the input."); } - if (perm[0] != 0) { + if (params->use_implicit_batch && perm[0] != 0) { return errors::Unimplemented( "Transpose at batch dimension is not supported."); } + if (!IS_TRT_VERSION_GE(7, 1, 3, 4)) { + // TensorRT versions before 7.1.3.4 is slow transposing large tensors. + // So check tensor size, and don't convert if it is too large. + constexpr int64_t kMaxEfficientTranspose = 2500000; + int64_t tensor_size = DimsAdapter(input_tensor->getDimensions()).Volume(); + if (!AllowInefficientTranspose() && tensor_size > kMaxEfficientTranspose) { + return errors::Unimplemented(StrCat("Transpose too large:", tensor_size)); + } + } + if (params->validation_only) return Status::OK(); // Start conversion. 
ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(input_tensor, perm, &output_tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + input_tensor, perm, &output_tensor, params->node_def)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertReshape(OpConverterParams* params) { +Status ConvertShape(const OpConverterParams* params) { const auto& inputs = params->inputs; - const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"tensor", false}, {"shape", true}})); - TF_RETURN_IF_ERROR(AllowDataTypes( - *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); - const TRT_TensorOrWeights& input_tensor = inputs.at(0); - TRT_ShapedWeights weights = inputs.at(1).weights(); - if (weights.count() == 0) { - return errors::Unimplemented("Reshape to shape=[] is not supported, at ", - node_def.name()); + CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}})); + if (params->use_implicit_batch) { + return errors::Unimplemented( + "Shape is only supported for explicit batch mode."); } + DimsAdapter input_dims(inputs.at(0).GetTrtDims()); + if (params->validation_only) return Status::OK(); - const int* weights_ptr = static_cast(weights.GetValues()); + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + TRT_ENSURE_OK(builder); + if (input_dims.IsStatic()) { + // Create a const node with the value of the shape. + ::stream_executor::port::StatusOr const_layer = + builder.ValueOrDie().ConstantShape(input_dims); + TRT_ENSURE_PTR_OK(const_layer); + params->outputs->push_back( + TRT_TensorOrWeights(const_layer.ValueOrDie()->getOutput(0))); + return Status::OK(); + } + ::stream_executor::port::StatusOr shape_layer = + builder.ValueOrDie().Shape(inputs.at(0).tensor()->trt_tensor()); + TRT_ENSURE_PTR_OK(shape_layer); + params->converter->SetLayerName(shape_layer.ValueOrDie(), params->node_def, + "shape"); + params->outputs->push_back( + TRT_TensorOrWeights(shape_layer.ValueOrDie()->getOutput(0))); + return Status::OK(); +} - // Check that it doesn't change the batch dimension. This check is - // conservative, for example, when the first dim of the shape is -1 and input - // tensor shape is not fixed, it is still possible that the reshape doesn't - // change the batch dim, but as long as there is a possibility that it could - // change the batch dim, it reject the conversion. 
The parameters are: - // - // * reshape_batch_dim: the value of the first dim of the input shape constant - // * reshape_dims: all other dims of the input shape constant - // * input_batch_dim: the value of the first dim of the input tensor to - // reshape - // * input_dims: all other dims of the input tensor to reshape - // - // The validation logic is: - // - // if input_batch_dim is fixed: - // if reshape_batch_dim == input_batch_dim: - // ok - // elif reshape_batch_dim == -1 (meaning reshape_dims are fixed) and - // input_dims are fixed and - // prod(input_dims) == prod(reshape_dims) - // ok - // else: - // not ok - // elif input_dims are fixed: - // if reshape_dims are fixed and - // prod(input_dims) == prod(reshape_dims): - // ok - // else: - // not ok - // else: - // not ok - // - // Note that the following is ok no matter whether reshape_batch_dim is fixed - // or not: - // - // ``` - // input_batch_dim is not fixed && - // reshape_dims are fixed && - // prod(input_dims) == prod(reshape_dims), - // ``` - // - // because the non-batch dims of the new and old shapes match, and TF runtime - // should make sure the batch dim is not changed. +Status ExpectShapeTensor(const TRT_TensorOrWeights& tensor) { + if (tensor.tensor()->getType() != nvinfer1::DataType::kINT32) { + return errors::InvalidArgument("Expected a shape tensor with INT32 type"); + } + if (tensor.GetTrtDims().nbDims > 1) { + return errors::InvalidArgument("Expected a 0D or 1D shape tensor"); + } + return Status::OK(); +} - const int input_batch_dim = input_tensor.batch_size(); - const int reshape_batch_dim = weights_ptr[0]; - const nvinfer1::Dims input_dims = input_tensor.GetTrtDims(); +// Converts Reshape op if the input has dynamic (unknown) dims. +Status ConvertDynamicReshape(const OpConverterParams* params) { + if (params->use_implicit_batch) { + return errors::InvalidArgument( + "The input \"shape\" for Reshape must be a constant in implicit batch" + " mode."); + } + if (!IS_TRT_VERSION_GE(7, 1, 3, 0)) { + // While officially TRT supports shape value input , there are problems with + // shape input handling that cause networks converted with + // ConvertDynamicReshape fail. Here we conservatively switch off the + // converter before TRT 7.1.3. + return errors::InvalidArgument( + "Non constant shape input tensor for Reshape requires minimum TRT " + "7.1.3"); + } + const auto& inputs = params->inputs; + const TRT_TensorOrWeights& input_tensor = inputs.at(0); - nvinfer1::Dims reshape_dims; - reshape_dims.nbDims = weights.count() - 1; - for (int i = 1; i < weights.count(); i++) { - reshape_dims.d[i - 1] = weights_ptr[i]; + // If the input is a tensor it must be a shape tensor. + TF_RETURN_IF_ERROR(ExpectShapeTensor(inputs.at(1))); + if (inputs.at(1).tensor()->getDimensions().nbDims == 0) { + // Dynamic reshape requires a 1D shape tensor. + return errors::Unimplemented( + "Reshape with dynamic input requires 1D input tensor"); } + if (params->validation_only) return Status::OK(); + nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle( + *input_tensor.tensor()->trt_tensor()); + VLOG(2) << "ConvertReshape setInput (1) " + << DebugString(inputs.at(1).tensor()->getDimensions()); + layer->setInput(1, *inputs.at(1).tensor()->trt_tensor()); + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + +// Converts Reshape in explicit batch mode if the input has static (known) dims. 
+Status ConvertStaticReshapeForExplicitBatchMode( + const OpConverterParams* params, DimsAdapter output_dims, + ITensorProxyPtr* output_tensor) { + return PrepareTensorForShape(params->converter, params->inputs.at(0), + output_dims, params->validation_only, + output_tensor, params->node_def); +} + +// Converts Reshape in implicit batch mode. The input has static (known) dims. +Status ConvertStaticReshapeForImplicitBatchMode( + const OpConverterParams* params, DimsAdapter output_dims, + ITensorProxyPtr* output_tensor) { + const auto& inputs = params->inputs; + const TRT_TensorOrWeights& input_tensor = inputs.at(0); + const int input_batch_dim = input_tensor.batch_size(); + const int64_t output_batch_dim = output_dims.dim(0); + + DimsAdapter input_nonbatch_dims(input_tensor.GetTrtDims()); + DimsAdapter output_nonbatch_dims(output_dims); + TF_RETURN_IF_ERROR(output_nonbatch_dims.RemoveBatchDimension()); - // Check that it doesn't change the batch dimension according to the logic - // mentioned above. + VLOG(1) << "input_batch_dim=" << input_batch_dim + << ", input_nonbatch_dims=" << input_nonbatch_dims.DebugString() + << "\nresult_batch_dim=" << output_batch_dim + << ", result_nonbatch_dims=" << output_nonbatch_dims.DebugString(); + + // Check whether input_batch_dim and output_batch_dim will have the same + // static value. bool reshape_may_change_batch_dim = false; - if (input_batch_dim > 0) { // Batch size is fixed. - if (reshape_batch_dim == -1) { // Other dims of the shape must be fixed. - if (!AreDimsStaticWithSameSize(input_dims, reshape_dims, - /*is_tensor=*/true)) { - reshape_may_change_batch_dim = true; - } - } else if (reshape_batch_dim != input_batch_dim) { - reshape_may_change_batch_dim = true; - } else { - // This means (input_batch_dim>0 && input_batch_dim==reshape_batch_dim), - // and TF runtime should make sure non-batch dims are matched. - } - } else if (!AreDimsStaticWithSameSize(input_dims, reshape_dims, - /*is_tensor=*/true)) { - reshape_may_change_batch_dim = true; + if (input_batch_dim != -1 && output_batch_dim != -1) { + reshape_may_change_batch_dim = (input_batch_dim != output_batch_dim); + } else { + reshape_may_change_batch_dim = + !AreDimsStaticWithSameSize(input_nonbatch_dims, output_nonbatch_dims); } - VLOG(1) << "input_batch_dim=" << input_batch_dim - << ", input_dims=" << DebugString(input_dims) - << "\nreshape_batch_dim=" << reshape_batch_dim - << ", reshape_dims=" << DebugString(reshape_dims); if (reshape_may_change_batch_dim) { - const string msg = StrCat( - "Reshape on batch dimension is not supported, at ", node_def.name(), - ". input_batch_dim=", input_batch_dim, ", ", DebugString(input_dims), - "; reshape_batch_dim=", reshape_batch_dim, ", ", - DebugString(reshape_dims)); - return errors::Unimplemented(msg); + return errors::Unimplemented("Reshape on batch dimension is not supported"); + } + // Perform the conversion. + return PrepareTensorForShape(params->converter, input_tensor, + output_nonbatch_dims, params->validation_only, + output_tensor, params->node_def); +} + +Status ConvertReshape(const OpConverterParams* params) { + const auto& inputs = params->inputs; + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"tensor", TrtInputArg::kTensor}, {"shape", TrtInputArg::kBoth}})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); + if (inputs.at(1).is_tensor()) { + return ConvertDynamicReshape(params); } - // Start conversion. 
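// [Editor's note; illustrative sketch, not part of the patch] The static-shape
// path below delegates to ConvertStaticReshapeForImplicitBatchMode (defined
// above), whose batch-dimension check reduces to the rule sketched here; -1
// denotes an unknown dimension, and the helper name is hypothetical.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// True if the reshape could alter the batch dimension, in which case the
// conversion must be rejected in implicit batch mode.
static bool ReshapeMayChangeBatchDim(int64_t input_batch,
                                     const std::vector<int64_t>& input_rest,
                                     int64_t output_batch,
                                     const std::vector<int64_t>& output_rest) {
  auto is_static = [](const std::vector<int64_t>& dims) {
    for (int64_t d : dims) {
      if (d < 0) return false;
    }
    return true;
  };
  auto volume = [](const std::vector<int64_t>& dims) {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  // If both batch dims are statically known, they simply have to match.
  if (input_batch != -1 && output_batch != -1) {
    return input_batch != output_batch;
  }
  // Otherwise both non-batch volumes must be static and equal, so that the
  // runtime batch dimension is necessarily preserved.
  return !(is_static(input_rest) && is_static(output_rest) &&
           volume(input_rest) == volume(output_rest));
}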
+ // TODO(bixia): we can't use inputs.at(1).weights().ToVector() for two + // reasons: (1) When weights.count()==0, TRT_ShapedWeights::tensor_ dtype is + // not properly set to INT32. (2) I tried a fix for the first problem, I got + // shared pointer related error in convert_nodes_test. We should fix the + // problems and switch to use inputs.at(1).weights().ToVector(), a type + // safe method to access the content of the tensor. + TRT_ShapedWeights weights = inputs.at(1).weights(); + if (weights.count() == 0 && params->use_implicit_batch) { + return errors::Unimplemented("Reshape to shape=[] is not supported"); + } + + DimsAdapter output_shape_dims( + absl::MakeSpan(weights.GetPointer(), weights.count())); ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, reshape_dims, params->validation_only, &output_tensor)); + + if (!params->use_implicit_batch) { + TF_RETURN_IF_ERROR(ConvertStaticReshapeForExplicitBatchMode( + params, output_shape_dims, &output_tensor)); + } else { + TF_RETURN_IF_ERROR(ConvertStaticReshapeForImplicitBatchMode( + params, output_shape_dims, &output_tensor)); + } if (params->validation_only) return Status::OK(); + // Record the conversion result. params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertExpandDims(OpConverterParams* params) { +Status ConvertExpandDims(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( @@ -2277,29 +2521,161 @@ Status ConvertExpandDims(OpConverterParams* params) { // Get axis to expand on. auto axis = inputs.at(1).weights().GetSpan(); if (axis.size() != 1) { - return errors::InvalidArgument("ExpandDims axis must be a scalar, at ", - node_def.name()); + return errors::InvalidArgument("ExpandDims axis must be a scalar"); } // Use rank = nbDims + 1 for ConvertAxis's bounds checking to account for // ExpandDim's ability to add an axis at end of the shape. int trt_axis; TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dims.nbDims + 1, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + params->use_implicit_batch, &trt_axis)); if (params->validation_only) return Status::OK(); - - // ExpandDims: Insert new dim of size 1. - input_dims.insert(input_dims.begin() + trt_axis, 1); - // Reshape tensor. - nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, new_dims, /*validation_only=*/false, &output_tensor)); + + if (!params->use_implicit_batch && !HasStaticShape(input_dims)) { + TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims( + /*input=*/input_tensor.tensor(), + /*dims=*/dims, + /*axis=*/trt_axis, + /*params=*/params, + /*output=*/&output_tensor)); + } else { + // ExpandDims: Insert new dim of size 1. + input_dims.insert(input_dims.begin() + trt_axis, 1); + // Reshape tensor. 
+ DimsAdapter dims(input_dims); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, input_tensor, dims, + /*validation_only=*/false, &output_tensor, params->node_def)); + } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertSqueeze(OpConverterParams* params) { +Status Converter::DynamicReshape(ITensorProxyPtr input, + std::vector> slices, + const OpConverterParams* params, + ITensorProxyPtr* output, + std::vector size_for_added_dims, + absl::optional op_instance) { + *output = nullptr; + // DynamicReshape relies on INetworkDefinition::addShape + if (params->validation_only) { + return errors::Internal( + "DynamicReshape should not be used during validation"); + } + ITensorProxyPtr shape = + network()->addShape(*input->trt_tensor())->getOutput(0); + // Build new shape = shape[:trt_axis] + [1] + shape[trt_axis:] + std::vector concat_inputs; + int max_num_slices = std::max(slices.size(), size_for_added_dims.size()); + int op_instance_value = op_instance.has_value() ? op_instance.value() : 0; + + for (int i = 0; i < max_num_slices; i++) { + ITensorProxyPtr tensor; + // maybe_add_a_dimension(i); + if (i < size_for_added_dims.size() && size_for_added_dims[i] >= 0) { + nvinfer1::Dims dims{1, {1}}; + if (size_for_added_dims[i] > 0) { + dims.d[0] = size_for_added_dims[i]; + } + TF_RETURN_IF_ERROR( + CreateScalarConstant(params, std::min(size_for_added_dims[i], 1), + &tensor, nvinfer1::DataType::kINT32, dims)); + concat_inputs.push_back(tensor); + } + if (i < slices.size()) { + nvinfer1::ISliceLayer* slice_layer = network()->addSlice( + *shape->trt_tensor(), {1, {slices[i].first}}, + {1, {slices[i].second - slices[i].first}}, {1, {1}}); + concat_inputs.push_back(slice_layer->getOutput(0)); + string slice_name = StrCat("slice_", op_instance_value); + SetLayerName(slice_layer, params->node_def, slice_name, + /*op_instance=*/i); + } + } + std::vector trt_concat_inputs; + for (const auto& t : concat_inputs) { + trt_concat_inputs.push_back(t->trt_tensor()); + } + nvinfer1::IConcatenationLayer* concat_layer = network()->addConcatenation( + static_cast(trt_concat_inputs.data()), + concat_inputs.size()); + SetLayerName(concat_layer, params->node_def, "concat", op_instance); + concat_layer->setAxis(0); + ITensorProxyPtr new_shape = concat_layer->getOutput(0); + // Reshape input using new shape + nvinfer1::IShuffleLayer* shuffle = + network()->addShuffle(*input->trt_tensor()); + SetLayerName(shuffle, params->node_def, "shuffle", op_instance); + shuffle->setInput(1, *new_shape->trt_tensor()); + *output = shuffle->getOutput(0); + return Status::OK(); +} + +Status Converter::DynamicExpandDims(ITensorProxyPtr input, + const nvinfer1::Dims& dims, int axis, + const OpConverterParams* params, + ITensorProxyPtr* output, + absl::optional op_instance) { + if (params->validation_only) { + *output = nullptr; + return errors::Internal( + "DynamicExpandDims should not be used during validation"); + } + std::vector> slices; + std::vector extra_dims; + if (axis != 0) { + slices.push_back(std::pair{0, axis}); + extra_dims.push_back(-1); + } + extra_dims.push_back(1); + if (axis != dims.nbDims) { + slices.push_back(std::pair{axis, dims.nbDims}); + } + return DynamicReshape( + /*input=*/input, + /*slices=*/slices, + /*params=*/params, + /*output=*/output, + /*size_for_added_dims=*/extra_dims, + /*op_instance=*/op_instance); +} + +Status Converter::SqueezeTensor(ITensorProxyPtr input, + std::vector* input_dims, + const OpConverterParams* params, + ITensorProxyPtr* 
output, + absl::optional op_instance) { + // If the remaining dimensions of a squeeze operation have dynamic sizes, we + // need to use TRT ops to build the result shape for the squeeze operation. + // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special + // value. + if (!params->use_implicit_batch && !HasStaticShape(*input_dims)) { + std::vector> slices; + for (int i = 0; i < input_dims->size(); i++) { + if (input_dims->at(i) != 0) { + slices.push_back(std::pair(i, i + 1)); + } + } + return DynamicReshape( + /*input=*/input, + /*slices=*/slices, + /*params=*/params, + /*output=*/output, + /*size_for_added_dims=*/{}, + /*op_instance=*/op_instance); + } + // Remove all dims which are equal to 0. + input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0), + input_dims->end()); + // Reshape tensor. + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(input), DimsAdapter(*input_dims), + /*validation_only=*/false, output, params->node_def, op_instance)); + return Status::OK(); +} + +Status ConvertSqueeze(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); @@ -2309,45 +2685,59 @@ Status ConvertSqueeze(OpConverterParams* params) { const TRT_TensorOrWeights& input_tensor = inputs.at(0); const nvinfer1::Dims dims = input_tensor.GetTrtDims(); std::vector input_dims(dims.d, dims.d + dims.nbDims); - // Mark axes to remove by setting them to 0. - TFAttrs attrs(node_def); - auto squeeze_dims = attrs.get>("squeeze_dims"); + std::vector squeeze_dims; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "squeeze_dims", &squeeze_dims)); if (squeeze_dims.empty()) { - return errors::Unimplemented( - "Squeeze is only implemented for explicit dims, at ", node_def.name()); - } - for (int tf_axis : squeeze_dims) { - // Make sure axis is valid. - int trt_axis; - TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); - // Make sure target dimension is size 1. - if (input_dims[trt_axis] != 1) { - return errors::InvalidArgument( - "Dimension ", tf_axis, " with size ", input_dims[trt_axis], - " cannot be squeezed because it must be size 1, at ", - node_def.name()); + if (params->use_implicit_batch || !HasStaticShape(dims)) { + return errors::Unimplemented( + "Squeeze is not implemented for empty squeeze_dims"); + } else { + // explicit batch mode with static input shape we squeeze all singleton + // dimensions + for (int& dim : input_dims) { + if (dim == 1) { + // Mark it for removal by setting it to 0 + dim = 0; + } + } + } + } else { + std::vector trt_axes; + trt_axes.reserve(squeeze_dims.size()); + for (int tf_axis : squeeze_dims) { + // If the axis is valid, then convert it to TRT axis, otherwise abort + // conversion. + int trt_axis; + TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), + params->use_implicit_batch, &trt_axis)); + // Make sure target dimension is size 1 or unknown size (-1) + if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) { + return errors::InvalidArgument( + "Dimension ", tf_axis, " with size ", input_dims[trt_axis], + " cannot be squeezed because it must be size 1"); + } + trt_axes.push_back(trt_axis); + } + // Mark axes to remove by setting them to 0. + for (int axis : trt_axes) { + input_dims[axis] = 0; } - // Mark dim for removal by setting to 0. 
- input_dims[trt_axis] = 0; } if (params->validation_only) return Status::OK(); - // Remove all dims which are equal to 0. - input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0), - input_dims.end()); - // Reshape tensor. - nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, new_dims, /*validation_only=*/false, &output_tensor)); + TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( + /*input=*/input_tensor.tensor(), + /*input_dims=*/&input_dims, + /*params=*/params, + /*output=*/&output_tensor)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } template -Status ConvertStridedSliceHelper(OpConverterParams* params, +Status ConvertStridedSliceHelper(const OpConverterParams* params, const TRT_TensorOrWeights& input, Container begin, Container size, const Container& stride, @@ -2393,9 +2783,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, ITensorProxyPtr tensor = layer->getOutput(0); // Reshape for shrink_axis. if (final_shape) { - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor), *final_shape, /*validation_only=*/false, - &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(tensor), *final_shape, + /*validation_only=*/false, &tensor, params->node_def)); } params->outputs->push_back(TRT_TensorOrWeights(tensor)); return Status::OK(); @@ -2496,8 +2886,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, // Start conversion. nvinfer1::ITensor* tensor = input.tensor(); if (need_reshape) { - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input, reshape_dims, /*validation_only=*/false, &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, input, reshape_dims, /*validation_only=*/false, + &tensor, params->node_def)); } if (need_transpose) { TF_RETURN_IF_ERROR( @@ -2517,9 +2908,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, } // Reshape for shrink_axis. if (final_shape) { - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor), *final_shape, /*validation_only=*/false, - &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(tensor), *final_shape, + /*validation_only=*/false, &tensor, params->node_def)); } else if (need_reshape) { // Restore reshape. 
// Calculate output dimensions @@ -2540,9 +2931,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, nvinfer1::Dims new_dims; TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims, /*ignore_first_dim=*/true)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor), new_dims, /*validation_only=*/false, - &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(tensor), new_dims, + /*validation_only=*/false, &tensor, params->node_def)); } params->outputs->push_back(TRT_TensorOrWeights(tensor)); @@ -2550,7 +2941,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, #endif } -Status ConvertSlice(OpConverterParams* params) { +Status ConvertSlice(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights( @@ -2595,7 +2986,7 @@ Status ConvertSlice(OpConverterParams* params) { return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride); } -Status ConvertStridedSlice(OpConverterParams* params) { +Status ConvertStridedSlice(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights( @@ -2664,7 +3055,7 @@ Status ConvertStridedSlice(OpConverterParams* params) { // If batch dimension is covered by the ellipsis mask, it means it's left // untouched. Otherwise we check whether it modifies the batch dimension here. if (!(ellipsis_mask & 1) || - begin_weights.shape_.nbDims >= input_dims.size()) { + begin_weights.Shape().NumDims() >= input_dims.size()) { // Check that batch dimension is unmodified. We need to use the expanded // begin/end/strides array since the original array may be incorrect when // (ellipsis_mask&1)==1. 
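// [Editor's note; illustrative sketch, not part of the patch] For reference on
// the strided-slice hunks above: ConvertStridedSliceHelper receives per-axis
// begin/size/stride values derived from the TF op and maps them onto TRT's
// ISliceLayer (start/size/stride). The only arithmetic worth calling out is the
// per-axis extent of a strided slice; a minimal sketch assuming positive
// strides, with a hypothetical function name:
#include <cstdint>

// Number of elements produced along one axis when slicing [begin, end) with a
// positive stride, i.e. ceil((end - begin) / stride).
inline int64_t StridedSliceExtent(int64_t begin, int64_t end, int64_t stride) {
  return (end - begin + stride - 1) / stride;
}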
@@ -2708,20 +3099,19 @@ Status ConvertStridedSlice(OpConverterParams* params) { final_shape_dims_ptr); } -Status ConvertConv2D(OpConverterParams* params) { +Status ConvertConv2D(const OpConverterParams* params) { return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/false); } -Status ConvertConv2DDepthwise(OpConverterParams* params) { +Status ConvertConv2DDepthwise(const OpConverterParams* params) { return ConvertConv2DHelper(params, 0, /*is_conv2d_backprop_input=*/false); } -Status ConvertConv2DBackpropInput(OpConverterParams* params) { +Status ConvertConv2DBackpropInput(const OpConverterParams* params) { return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true); } -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertConv3DHelper(OpConverterParams* params, int group, +Status ConvertConv3DHelper(const OpConverterParams* params, int group, bool is_conv3d_backprop_input = false) { const int kNumDims = 5; const auto& inputs = params->inputs; @@ -2744,27 +3134,30 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); const TRT_ShapedWeights weights_drsck = inputs.at(1).weights(); - if (weights_drsck.shape_.nbDims != kNumDims) { - return errors::InvalidArgument("Conv3D expects kernel of dimension 5, at ", - node_def.name()); + if (weights_drsck.Shape().NumDims() != kNumDims) { + return errors::InvalidArgument("Conv3D expects kernel of dimension 5"); } - TFAttrs attrs(node_def); - auto data_format = attrs.get("data_format"); + + string data_format, padding_type; + std::vector tf_dilations, tf_stride; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dilations", &tf_dilations)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + const bool is_ndhwc = (data_format == "NDHWC"); // Or NCDHW 01234 - > 02341 const int d_index = is_ndhwc ? 1 : 2; const int h_index = is_ndhwc ? 2 : 3; const int w_index = is_ndhwc ? 3 : 4; const int c_index = is_ndhwc ? 
4 : 1; - auto tf_dilations = attrs.get>("dilations"); if (tf_dilations.size() != kNumDims) { return errors::InvalidArgument( - "Convolution dilations field must specify 5 dimensions, at ", - node_def.name()); + "Convolution dilations field must specify 5 dimensions"); } if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { return errors::Unimplemented( - "Dilation rate must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Dilation rate must be 1 for batch and channel dimensions"); } const nvinfer1::Dims3 dilation_dhw( @@ -2774,20 +3167,16 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, dilation_dhw.d[2] != 1)) { return errors::Unimplemented( "Dilation with Conv3DBackpropInputV2 (conv3d_transpose) is not " - "supported", - ", at ", node_def.name()); + "supported"); } - const auto tf_stride = attrs.get>("strides"); if (tf_stride.size() != kNumDims) { return errors::InvalidArgument( - "Convolution strides field must specify 5 dimensions, at ", - node_def.name()); + "Convolution strides field must specify 5 dimensions"); } if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { return errors::Unimplemented( - "Stride must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Stride must be 1 for batch and channel dimensions"); } const nvinfer1::Dims3 stride_dhw(tf_stride[d_index], tf_stride[h_index], @@ -2795,24 +3184,24 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, const auto tensor_dim = tensor->getDimensions(); // Asymmetric padding on Deconv not supported for now - if (is_conv3d_backprop_input && attrs.get("padding") == "SAME") { - const int tensor_c_idx = c_index - 1; - const int num_groups = (group == 0) ? tensor_dim.d[tensor_c_idx] : group; - - TRT_ShapedWeights weights = + if (is_conv3d_backprop_input && padding_type == "SAME") { + ::stream_executor::port::StatusOr weights = params->weight_store->GetTempWeights(weights_drsck); - + TRT_ENSURE_OK(weights); nvinfer1::Dims3 effective_kernel_size( - weights.shape_.d[0] + - (weights.shape_.d[0] - 1) * (dilation_dhw.d[0] - 1), // D - weights.shape_.d[1] + - (weights.shape_.d[1] - 1) * (dilation_dhw.d[1] - 1), // R - weights.shape_.d[2] + - (weights.shape_.d[2] - 1) * (dilation_dhw.d[2] - 1) // S + weights.ValueOrDie().Shape().dim(0) + + (weights.ValueOrDie().Shape().dim(0) - 1) * + (dilation_dhw.d[0] - 1), // D + weights.ValueOrDie().Shape().dim(1) + + (weights.ValueOrDie().Shape().dim(1) - 1) * + (dilation_dhw.d[1] - 1), // R + weights.ValueOrDie().Shape().dim(2) + + (weights.ValueOrDie().Shape().dim(2) - 1) * + (dilation_dhw.d[2] - 1) // S ); const auto output_size_weights = - static_cast(backprop_output_size.weights().GetValues()); + backprop_output_size.weights().GetPointer(); const std::vector input_dims = {output_size_weights[d_index], output_size_weights[h_index], output_size_weights[w_index]}; @@ -2825,19 +3214,26 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, padding[2].first != padding[2].second) { return errors::Unimplemented( "Asymmetric padding with Conv3DBackpropInputV2 (conv3d_transpose) is " - "not supported, at ", - node_def.name()); + "not supported"); } } - if (params->validation_only) - return Status::OK(); // Finished validation checks + // Channel dim must be static for Conv3D since we use that value for + // num_groups at build time. + // TODO: Allow conversion if kImplicitBatchModeCompatible||kOptimal is used. + int implicit_batch_offset = params->use_implicit_batch ? 
-1 : 0; + if (tensor->getDimensions().d[c_index + implicit_batch_offset] == -1) { + return errors::InvalidArgument("Channel dimension must be static"); + } + + // Finished validation checks + if (params->validation_only) return Status::OK(); // Transpose to NCDHW (NCDHW is required for IConvLayer). const bool need_transpose = is_ndhwc; if (need_transpose) { - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 4, 1, 2, 3}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); } // group == 0 signifies that this is a depthwise convolution, so set @@ -2848,15 +3244,17 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, // For conv, TF weights are DRSCK, and TRT expects KCDRS. // For backprop, TF weights are DRSKC, and TRT expects KCDRS. // Therefore, this reorder will work for both cases. - TRT_ShapedWeights weights = + ::stream_executor::port::StatusOr weights = params->weight_store->GetTempWeights(weights_drsck); - ReorderDRSCKToKCDRS(weights_drsck, &weights, num_groups); - TRT_ShapedWeights biases(weights.TrtDType()); + TRT_ENSURE_OK(weights); + ReorderDRSCKToKCDRS(weights_drsck, &weights.ValueOrDie(), num_groups); + TRT_ShapedWeights biases(weights.ValueOrDie().TrtDType()); const int output_axis = is_conv3d_backprop_input ? 1 : 0; - const int noutput = weights.shape_.d[output_axis] * num_groups; - nvinfer1::Dims3 kernel_size_drs(weights.shape_.d[2], // D - weights.shape_.d[3], // R - weights.shape_.d[4] // S + const int noutput = + weights.ValueOrDie().Shape().dim(output_axis) * num_groups; + nvinfer1::Dims3 kernel_size_drs(weights.ValueOrDie().Shape().dim(2), // D + weights.ValueOrDie().Shape().dim(3), // R + weights.ValueOrDie().Shape().dim(4) // S ); // Add convolution. @@ -2864,60 +3262,151 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, if (is_conv3d_backprop_input) { nvinfer1::IDeconvolutionLayer* layer = params->converter->network()->addDeconvolutionNd( - *tensor->trt_tensor(), noutput, kernel_size_drs, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size_drs, + weights.ValueOrDie().GetTrtWeights(), biases.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStrideNd(stride_dhw); // change to nd set stride - // TensorRT 5.1.3 added support for padding modes. - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { VLOG(2) << "Using SAME padding"; // SAME_UPPER means that post padding is preferred. 
layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); conv_layer = layer; } else { nvinfer1::IConvolutionLayer* layer = params->converter->network()->addConvolutionNd( - *tensor->trt_tensor(), noutput, kernel_size_drs, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size_drs, + weights.ValueOrDie().GetTrtWeights(), biases.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStrideNd(stride_dhw); - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { VLOG(2) << "Using SAME padding"; layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); layer->setDilationNd(dilation_dhw); conv_layer = layer; } + params->converter->SetLayerName(conv_layer, node_def, "conv"); ITensorProxyPtr output_tensor = conv_layer->getOutput(0); // Restore transpose. if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 4, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertConv3D(OpConverterParams* params) { +Status ConvertConv3D(const OpConverterParams* params) { return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/false); } -Status ConvertConv3DBackpropInputV2(OpConverterParams* params) { +Status ConvertConv3DBackpropInputV2(const OpConverterParams* params) { return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/true); } -#endif // #if IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { +Status ConvertPool3D(const OpConverterParams* params) { + const int kNumDims = 5; + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); + TF_RETURN_IF_ERROR( + AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + nvinfer1::PoolingType type; + if (node_def.op() == "MaxPool3D") { + type = nvinfer1::PoolingType::kMAX; + } else if (node_def.op() == "AvgPool3D") { + type = nvinfer1::PoolingType::kAVERAGE; + } else { + return errors::Unimplemented("Unsupported pooling type: ", node_def.op()); + } + + string data_format, padding_type; + std::vector tf_stride, tf_kernel; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &tf_kernel)); + + if ((padding_type != "SAME") && (padding_type != "VALID")) { + return errors::Unimplemented("Unsupported padding type: ", padding_type); + } + + const bool is_ndhwc = (data_format == "NDHWC"); + const int c_index = is_ndhwc ? 4 : 1; + const int d_index = is_ndhwc ? 1 : 2; + const int h_index = is_ndhwc ? 2 : 3; + const int w_index = is_ndhwc ? 
3 : 4; + + if (tf_stride.size() != kNumDims) { + return errors::InvalidArgument( + "Pooling strides field must specify 5 dimensions"); + } + if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { + return errors::Unimplemented( + "stride must be 1 for batch and channel dimensions"); + } + + if (tf_kernel.size() != kNumDims) { + return errors::InvalidArgument( + "Pooling ksize field must specify 5 dimensions"); + } + if (tf_kernel[0] != 1 || tf_kernel[c_index] != 1) { + return errors::Unimplemented( + "ksize must be 1 for batch and channel dimensions"); + } + + const nvinfer1::Dims3 stride(tf_stride[d_index], tf_stride[h_index], + tf_stride[w_index]); + const nvinfer1::Dims3 ksize(tf_kernel[d_index], tf_kernel[h_index], + tf_kernel[w_index]); + + if (!(ksize.nbDims >= 3 && + (ksize.d[0] >= 1 && ksize.d[1] >= 1 && ksize.d[2] >= 1) && + (ksize.d[0] * ksize.d[1] * ksize.d[2] < MAX_KERNEL_DIMS_PRODUCT(3)))) { + return errors::InvalidArgument("Window dimensions are not within bounds"); + } + if (params->validation_only) return Status::OK(); + + ITensorProxyPtr tensor = inputs.at(0).tensor(); + if (data_format == "NDHWC") { + // NDHWC => NCDHW + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); + } + + nvinfer1::IPoolingLayer* layer = params->converter->network()->addPoolingNd( + *tensor->trt_tensor(), type, ksize); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + + layer->setStrideNd(stride); + // VALID padding is the default TRT behavior. + if (padding_type == "SAME") { + // SAME_UPPER means that post padding is preferred. + layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + params->converter->SetLayerName(layer, node_def, "pooling"); + + ITensorProxyPtr output_tensor = layer->getOutput(0); + if (data_format == "NDHWC") { + // NCDHW => NDHWC + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); + } + + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); +} + +Status ConvertFusedConv2DBiasActivation(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -2931,107 +3420,88 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); TRT_ShapedWeights weights = inputs.at(1).weights(); - if (weights.shape_.nbDims != 4) { + if (weights.Shape().NumDims() != 4) { return errors::InvalidArgument( - "FusedConv2DBiasActivation expects kernel of dimension 4, at " + - node_def.name()); + "FusedConv2DBiasActivation expects kernel of dimension 4"); } - TFAttrs attrs(node_def); - auto data_format = attrs.get("data_format"); + + string data_format, filter_format, activation_mode, padding_type; + std::vector tf_dilations, tf_stride; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "filter_format", &filter_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "activation_mode", &activation_mode)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dilations", &tf_dilations)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + if (data_format != "NHWC" && data_format != "NCHW") { - return errors::InvalidArgument("Unsupported data_format:", data_format, - " at ", node_def.name()); + return 
errors::InvalidArgument("Unsupported data_format:", data_format); } - int c_index = (data_format == "NHWC") ? 3 : 1; int h_index = (data_format == "NHWC") ? 1 : 2; int w_index = (data_format == "NHWC") ? 2 : 3; - auto tf_dilations = attrs.get>("dilations"); + if (tf_dilations.size() != 4) { return errors::InvalidArgument( - "Convolution dilations field must specify 4 dimensions, at ", - node_def.name()); + "Convolution dilations field must specify 4 dimensions"); } if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { return errors::Unimplemented( - "Dilation rate must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Dilation rate must be 1 for batch and channel dimensions"); } const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]); - const auto tf_stride = attrs.get>("strides"); if (tf_stride.size() != 4) { return errors::InvalidArgument( - "Convolution strides field must specify 4 dimensions, at ", - node_def.name()); + "Convolution strides field must specify 4 dimensions"); } if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { return errors::Unimplemented( - "Stride must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Stride must be 1 for batch and channel dimensions"); } const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); - const auto activation_mode = attrs.get("activation_mode"); auto op_pair = ActivationTypeMap()->find(activation_mode); if (op_pair == ActivationTypeMap()->end() && activation_mode != "None") { - return errors::Unimplemented("Activation mode: ", activation_mode, - " not supported at: ", node_def.name()); + return errors::Unimplemented("Activation mode not supported: ", + activation_mode); } - const auto filter_format = attrs.get("filter_format"); if (filter_format != "HWIO" && filter_format != "OIHW") { - return errors::InvalidArgument("Unsupported filter_format:", filter_format, - " at ", node_def.name()); + return errors::InvalidArgument("Unsupported filter_format:", filter_format); } // Check that there's no side_input or conv_input_scale. TRT_ShapedWeights side_input = inputs.at(3).weights(); if (side_input.count() != 0) { return errors::InvalidArgument( - "FusedConv2DBiasActivation doesn't yet support side_input, at " + - node_def.name()); + "FusedConv2DBiasActivation doesn't yet support side_input"); } TRT_ShapedWeights conv_input_scale = inputs.at(4).weights(); if (conv_input_scale.count() != 1 || conv_input_scale.TrtDType() != nvinfer1::DataType::kFLOAT || conv_input_scale.GetSpan()[0] != 1.0) { return errors::InvalidArgument( - "FusedConv2DBiasActivation doesn't yet support conv_input_scale, at " + - node_def.name()); + "FusedConv2DBiasActivation doesn't yet support conv_input_scale"); } if (params->validation_only) return Status::OK(); // Transpose to NCHW (NCHW is required for IConvLayer). const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } nvinfer1::DimsHW kernel_size; if (filter_format == "OIHW") { - kernel_size.h() = weights.shape_.d[2]; - kernel_size.w() = weights.shape_.d[3]; + kernel_size.h() = weights.Shape().dim(2); + kernel_size.w() = weights.Shape().dim(3); } else { // HWIO. 
DCHECK_EQ(filter_format, "HWIO"); - kernel_size.h() = weights.shape_.d[0]; - kernel_size.w() = weights.shape_.d[1]; + kernel_size.h() = weights.Shape().dim(0); + kernel_size.w() = weights.Shape().dim(1); } -// Before TRT 5.1.3, we have to calculate padding ourselves. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - const auto tensor_dim = tensor->getDimensions(); - std::vector input_dims; - // Use 1 and 2 because tensor_dim has the dimensions of the transposed - // input. - input_dims = {static_cast(tensor_dim.d[1]), - static_cast(tensor_dim.d[2])}; - std::vector> padding; - ITensorProxyPtr padded_tensor = nullptr; - TF_RETURN_IF_ERROR(Conv2DPaddingHelper(params, attrs, kernel_size, dilation, - stride, input_dims, tensor, &padding, - &padded_tensor)); - tensor = padded_tensor; -#endif // Add convolution. TRT_ShapedWeights biases = inputs.at(2).weights(); @@ -3039,28 +3509,25 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { if (filter_format == "OIHW") { // Weights are already in the right order. conv_layer = params->converter->network()->addConvolution( - *tensor->trt_tensor(), weights.shape_.d[0], kernel_size, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), weights.Shape().dim(0), kernel_size, + weights.GetTrtWeights(), biases.GetTrtWeights()); } else { // For conv, TF weights are RSCK, and TRT expects KCRS. - DCHECK_EQ(filter_format, "HWIO"); - TRT_ShapedWeights weights_kcrs = + TRT_ENSURE(filter_format == "HWIO"); + ::stream_executor::port::StatusOr weights_kcrs = params->weight_store->GetTempWeights(weights); - ReorderRSCKToKCRS(weights, &weights_kcrs, 1); + TRT_ENSURE_OK(weights_kcrs); + ReorderRSCKToKCRS(weights, &weights_kcrs.ValueOrDie(), 1); conv_layer = params->converter->network()->addConvolution( - *tensor->trt_tensor(), weights.shape_.d[3], kernel_size, weights_kcrs.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), weights.Shape().dim(3), kernel_size, + weights_kcrs.ValueOrDie().GetTrtWeights(), biases.GetTrtWeights()); } TFTRT_RETURN_ERROR_IF_NULLPTR(conv_layer, node_def.name()); conv_layer->setStride(stride); -#if IS_TRT_VERSION_GE(5, 1, 3, 0) - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#else - conv_layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - conv_layer->setName(node_def.name().c_str()); + params->converter->SetLayerName(conv_layer, node_def, "conv"); conv_layer->setNbGroups(1); conv_layer->setDilation(dilation); ITensorProxyPtr output_tensor = conv_layer->getOutput(0); @@ -3068,182 +3535,94 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { // Add activation if there is one. if (op_pair != ActivationTypeMap()->end()) { nvinfer1::IActivationLayer* activation_layer = - params->converter->network()->addActivation(*output_tensor->trt_tensor(), - op_pair->second); + params->converter->network()->addActivation( + *output_tensor->trt_tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(activation_layer, node_def.name()); + params->converter->SetLayerName(activation_layer, node_def, "activation"); output_tensor = activation_layer->getOutput(0); } // Restore transpose. 
if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertPool(OpConverterParams* params) { +Status ConvertPool(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8}; + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); nvinfer1::PoolingType type; if (node_def.op() == "MaxPool") { type = nvinfer1::PoolingType::kMAX; } else if (node_def.op() == "AvgPool") { type = nvinfer1::PoolingType::kAVERAGE; } else { - return errors::Unimplemented("Unsupported pooling type: ", node_def.op(), - ", at ", node_def.name()); + return errors::Unimplemented("Unsupported pooling type: ", node_def.op()); } - TFAttrs attrs(node_def); - const string padding_type = attrs.get("padding"); + + string data_format, padding_type; + std::vector tf_stride, tf_kernel; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &tf_kernel)); + if ((padding_type != "SAME") && (padding_type != "VALID")) { - return errors::Unimplemented("Unsupported padding type: ", padding_type, - ", at ", node_def.name()); + return errors::Unimplemented("Unsupported padding type: ", padding_type); } - if (params->validation_only) return Status::OK(); ITensorProxyPtr tensor = inputs.at(0).tensor(); int h_index = 2; int w_index = 3; - const auto data_format = attrs.get("data_format"); if (data_format == "NHWC") { h_index = 1; w_index = 2; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); } - const auto tf_stride = attrs.get>("strides"); const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); - - const auto tf_kernel = attrs.get>("ksize"); const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]); -// Before TRT 5.1.3, we have to calculate padding ourselves. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - auto tensor_dim = tensor->getDimensions(); - std::vector> padding; - if (padding_type == "SAME") { - // This is NCHW tensor with no batch dimension. - // 1 -> h - // 2 -> w - padding = CreateSamePadding( - stride, ksize, - {static_cast(tensor_dim.d[1]), static_cast(tensor_dim.d[2])}); - } else if (padding_type == "VALID") { - padding = {{0, 0}, {0, 0}}; + if (!((ksize.h() >= 1 && ksize.w() >= 1) && + (ksize.h() * ksize.w() < MAX_KERNEL_DIMS_PRODUCT(2)))) { + return errors::InvalidArgument("Window dimensions are not within bounds"); } -#endif -// TensorRT 5.1 added support for asymmetric padding. Before that, we need an -// extra padding layer. -#if !IS_TRT_VERSION_GE(5, 1, 0, 0) - // Asymmetric padding case. 
- if (padding[0].first != padding[0].second || - padding[1].first != padding[1].second) { - auto pad_layer = params->converter->network()->addPadding( - *tensor->trt_tensor(), nvinfer1::DimsHW(padding[0].first, padding[1].first), - nvinfer1::DimsHW(padding[0].second, padding[1].second)); - TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); - ITensorProxyPtr out_tensor = pad_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &out_tensor); - padding = {{0, 0}, {0, 0}}; - tensor = out_tensor; + + if (params->validation_only) return Status::OK(); + + if (data_format == "NHWC") { + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } -#endif - nvinfer1::IPoolingLayer* layer = - params->converter->network()->addPooling(*tensor->trt_tensor(), type, ksize); + nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling( + *tensor->trt_tensor(), type, ksize); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // TODO(tmorris): Average pooling may not be entirely safe to infer - // quantization range through (at least forwards - backwards should be fine). - // Max pooling is okay. - ITensorProxyPtr out_tensor = layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &out_tensor); layer->setStride(stride); -#if IS_TRT_VERSION_GE(5, 1, 3, 0) // VALID padding is the default TRT behavior. - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { // SAME_UPPER means that post padding is preferred. layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#elif IS_TRT_VERSION_GE(5, 1, 0, 0) - layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); - layer->setPostPadding(nvinfer1::DimsHW{padding[0].second, padding[1].second}); -#else - layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - layer->setName(node_def.name().c_str()); + params->converter->SetLayerName(layer, node_def, "pooling"); ITensorProxyPtr output_tensor = layer->getOutput(0); if (data_format == "NHWC") { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertLeakyRelu(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - const float alpha = attrs.get("alpha"); - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - // Use IActivationLayer when available. - if (params->validation_only) return Status::OK(); - - nvinfer1::IActivationLayer* layer = - params->converter->network()->addActivation( - *inputs.at(0).tensor()->trt_tensor(), nvinfer1::ActivationType::kLEAKY_RELU); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setAlpha(alpha); - params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); - return Status::OK(); -#else - // Use elementwise ops when IActivationLayer is not available. 
- if (alpha < 0.0f || alpha > 1.0f) { - return errors::Unimplemented( - "Alpha value for LeakyRelu must be between 0 and 1, at ", - node_def.name()); - } - if (params->validation_only) return Status::OK(); - - ITensorProxyPtr tensor = inputs.at(0).tensor(); - // Create const for alpha. - ITensorProxyPtr const_alpha_tensor = nullptr; - TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant( - params, alpha, tensor->getDimensions(), &const_alpha_tensor)); - // alpha * x - nvinfer1::IElementWiseLayer* mul_layer = - params->converter->network()->addElementWise( - *tensor->trt_tensor(), *const_alpha_tensor->trt_tensor(), nvinfer1::ElementWiseOperation::kPROD); - TFTRT_RETURN_ERROR_IF_NULLPTR(mul_layer, node_def.name()); - // max(x, alpha * x) - nvinfer1::IElementWiseLayer* max_layer = - params->converter->network()->addElementWise( - *tensor->trt_tensor(), *mul_layer->getOutput(0), - nvinfer1::ElementWiseOperation::kMAX); - TFTRT_RETURN_ERROR_IF_NULLPTR(max_layer, node_def.name()); - ITensorProxyPtr max_tensor = max_layer->getOutput(0); - ITensorProxyPtr mul_tensor = mul_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&max_tensor, - &mul_tensor); - - params->outputs->push_back(TRT_TensorOrWeights(max_tensor)); - return Status::OK(); -#endif -} - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) -Status ConvertClipByValue(OpConverterParams* params) { +Status ConvertClipByValue(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; // TODO(tmorris): We can also allow the case where min and max are tensors by @@ -3255,8 +3634,9 @@ Status ConvertClipByValue(OpConverterParams* params) { AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); if (params->validation_only) return Status::OK(); - TFAttrs attrs(node_def); - const DataType dtype = attrs.get("T"); + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "T", &dtype)); + float clip_value_min = 0.0f; float clip_value_max = 0.0f; // TODO(tmorris): Add a templated helper function to get scalar weights of @@ -3265,324 +3645,32 @@ Status ConvertClipByValue(OpConverterParams* params) { clip_value_min = inputs.at(1).weights().GetSpan()[0]; clip_value_max = inputs.at(2).weights().GetSpan()[0]; } else if (dtype == DataType::DT_HALF) { - clip_value_min = Eigen::half_impl::half_to_float( - inputs.at(1).weights().GetSpan()[0]); - clip_value_max = Eigen::half_impl::half_to_float( - inputs.at(2).weights().GetSpan()[0]); + clip_value_min = + static_cast(inputs.at(1).weights().GetSpan()[0]); + clip_value_max = + static_cast(inputs.at(2).weights().GetSpan()[0]); } nvinfer1::IActivationLayer* layer = params->converter->network()->addActivation( - *inputs.at(0).tensor()->trt_tensor(), nvinfer1::ActivationType::kCLIP); + *inputs.at(0).tensor()->trt_tensor(), + nvinfer1::ActivationType::kCLIP); layer->setAlpha(clip_value_min); layer->setBeta(clip_value_max); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); - params->converter->ProvideQuantizationRange(&output_tensor, clip_value_min, - clip_value_max); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} -#endif - -const std::unordered_map* -ActivationTypeMap() { - static auto* const m = - new std::unordered_map({ - {"Relu", nvinfer1::ActivationType::kRELU}, - {"Sigmoid", nvinfer1::ActivationType::kSIGMOID}, - {"Tanh", nvinfer1::ActivationType::kTANH}, -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - {"Elu", 
nvinfer1::ActivationType::kELU}, - {"Selu", nvinfer1::ActivationType::kSELU}, - {"Softsign", nvinfer1::ActivationType::kSOFTSIGN}, - {"Softplus", nvinfer1::ActivationType::kSOFTPLUS}, -#endif - }); - return m; -} - -Status ConvertActivation(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - auto op_pair = ActivationTypeMap()->find(node_def.op()); - if (op_pair == ActivationTypeMap()->end()) { - return errors::Unimplemented("Activation op: ", node_def.op(), - " not supported at: ", node_def.name()); - } - if (params->validation_only) return Status::OK(); - - // Start conversion. - nvinfer1::IActivationLayer* layer = - params->converter->network()->addActivation(*inputs.at(0).tensor()->trt_tensor(), - op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Set parameters. -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - if (node_def.op() == "Elu") { - layer->setAlpha(1.0f); - } else if (node_def.op() == "Selu") { - // From tensorflow/core/kernels/relu_op_functor.h - layer->setAlpha(1.7580993408473768599402175208123f); - layer->setBeta(1.0507009873554804934193349852946f); - } else if (node_def.op() == "Softplus") { - layer->setAlpha(1.0f); - layer->setBeta(1.0f); - } -#endif - ITensorProxyPtr output_tensor = layer->getOutput(0); - // Set quantization range for output when known. - if (node_def.op() == "Sigmoid") { - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 1.0f); - } else if (node_def.op() == "Tanh") { - params->converter->ProvideQuantizationRange(&output_tensor, -1.0f, 1.0f); - } else if (node_def.op() == "Softsign") { - params->converter->ProvideQuantizationRange(&output_tensor, -1.0f, 1.0f); - } - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} - -Status ConvertQuantize(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - if (node_def.op() == "FakeQuantWithMinMaxArgs") { - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - } else if (node_def.op() == "FakeQuantWithMinMaxVars") { - TF_RETURN_IF_ERROR(CheckInputsWeights( - *params, {{"input", false}, {"min", true}, {"max", true}})); - } else if (node_def.op() == "QuantizeAndDequantizeV2") { - TF_RETURN_IF_ERROR(CheckInputsWeights( - *params, {{"input", false}, {"input_min", true}, {"input_max", true}})); - } else if (node_def.op() == "QuantizeAndDequantizeV3") { - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}, - {"input_min", true}, - {"input_max", true}, - {"num_bits", true}})); - } - float min_range = 0.0f; - float max_range = 0.0f; - if (node_def.op() == "FakeQuantWithMinMaxArgs") { - // Get ranges via node attributes. - TFAttrs attrs(node_def); - if (attrs.count("min") == 0 || attrs.count("max") == 0) { - return errors::InvalidArgument("Min or max attribute not found for ", - node_def.op(), " at ", node_def.name()); - } - min_range = attrs.get("min"); - max_range = attrs.get("max"); - } else if (node_def.op() == "FakeQuantWithMinMaxVars" || - node_def.op() == "QuantizeAndDequantizeV2" || - node_def.op() == "QuantizeAndDequantizeV3") { - // Get ranges via inputs. 
- auto get_weights_value = [&inputs](int index) { - auto raw_weights = - static_cast(inputs.at(index).weights().GetValues()); - return raw_weights[0]; - }; - min_range = get_weights_value(1); - max_range = get_weights_value(2); - } else { - return errors::InvalidArgument("Unknown quantization op ", node_def.op(), - ", at ", node_def.name()); - } - if (params->validation_only) return Status::OK(); - - // Store ranges for tensor - ITensorProxyPtr input0 = inputs.at(0).tensor(); - params->converter->ProvideQuantizationRange(&input0, min_range, max_range); - // Sometimes, TRT may not quantize a tensor, either because it chooses to - // execute a higher precision kernel or because of op fusion. In these cases, - // accuracy will suffer if the model was trained to expect quantization at - // that tensor. We should consider adding a clip(tensor, min_range, max_range) - // operation here to ensure that any arbitrarily placed quantize node will - // execute as expected. However, this will negatively affect performance. If - // users train their models in a way which models inference as close as - // possible (i.e. not quantizing in place where fusion will occur), then there - // is no problem with the current implementation. - params->outputs->push_back(inputs.at(0)); - return Status::OK(); -} - -Status ConvertRelu6(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - if (params->validation_only) return Status::OK(); - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - // Use IActivationLayer for TRT >= 5.1 - nvinfer1::IActivationLayer* layer = - params->converter->network()->addActivation( - *inputs.at(0).tensor()->trt_tensor(), nvinfer1::ActivationType::kCLIP); - layer->setAlpha(0.0f); - layer->setBeta(6.0f); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 6.0f); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -#else - // Convert using min(Relu(x), 6) before TRT 5.1 - // Input Tensor - ITensorProxyPtr tensor = inputs.at(0).tensor(); - - // Relu operation i.e. Relu(x) = max(0, x) - nvinfer1::IActivationLayer* relu_layer = - params->converter->network()->addActivation( - *tensor->trt_tensor(), nvinfer1::ActivationType::kRELU); - TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name()); - - // Large range of relu is problematic during quantization in INT8 precision - // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization. - // TRT only uses dynamic ranges in INT8 precision mode, - // and this does not affect the FP32 path. - params->converter->ProvideQuantizationRange(&relu_layer->getOutput(0), 0.0f, - 6.0f); - - // Create a constant layer to store the floating point weight i.e. 6.0f - ITensorProxyPtr const6_tensor = nullptr; - TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant( - params, 6.0f, relu_layer->getOutput(0)->getDimensions(), &const6_tensor)); - - // ElementWise Min Operation - // Min op is a nop for INT8 execution path, as the input tensor - // to this layer will only have values in range [0.f, 6.0f]. 
- nvinfer1::IElementWiseLayer* relu6_layer = - params->converter->network()->addElementWise( - *relu_layer->getOutput(0), *const6_tensor->trt_tensor(), - nvinfer1::ElementWiseOperation::kMIN); - TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name()); - ITensorProxyPtr output_tensor = relu6_layer->getOutput(0); - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 6.0f); - - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -#endif -} - -Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"value", false}, {"bias", true}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - if (params->validation_only) return Status::OK(); - - ITensorProxyPtr tensor = inputs.at(0).tensor(); - const nvinfer1::Dims original_dims = tensor->getDimensions(); - TFAttrs attrs(node_def); - const string data_format = attrs.get("data_format"); - const int channel_index = - (data_format == "NHWC" ? original_dims.nbDims - 1 : 0); - - nvinfer1::Permutation permutation; - if (channel_index != 0) { - // Permute the dimensions so that the channel dimension is the first - // dimension. - for (int i = 0; i < original_dims.nbDims; ++i) { - permutation.order[i] = i; - } - permutation.order[0] = channel_index; - permutation.order[channel_index] = 0; - VLOG(1) << "ConvertBiasAdd permutation: " - << DebugString(permutation, original_dims.nbDims); - } - - // TensorRT addScale requires input to be of rank 3, we need to apply - // transpose as well as reshape. - // TODO(laigd): this doesn't match what the TRT doc says, fix the doc? - if (channel_index != 0 || original_dims.nbDims != 3) { - nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle(*tensor->trt_tensor()); - TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); - ITensorProxyPtr out_tensor = shuffle_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &out_tensor); - - // NOTE(laigd): for some reason we need to apply the reshape - // unconditionally. The default shape has nbDims==-1 and it seems the - // behavior is undefined in some cases. - nvinfer1::Dims reshape_dims; - reshape_dims.nbDims = 3; - // 0 means copying from input; -1 means inferring from the rest. - reshape_dims.d[0] = 0; - reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1; - reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1; - shuffle_layer->setReshapeDimensions(reshape_dims); - - if (channel_index != 0) { - shuffle_layer->setFirstTranspose(permutation); - } - tensor = out_tensor; - } - - TRT_ShapedWeights weights = inputs.at(1).weights(); - nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; - if (weights.shape_.d[0] == 1) { - mode = nvinfer1::ScaleMode::kUNIFORM; - } - - TRT_ShapedWeights empty_weights(weights.TrtDType()); - nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *tensor->trt_tensor(), mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(), - empty_weights.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - - ITensorProxyPtr output_tensor = layer->getOutput(0); - - // Restore transpose & reshape. 
- if (channel_index != 0 || original_dims.nbDims != 3) { - nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle(*output_tensor->trt_tensor()); - TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); - // NOTE: for same reason as mentioned above we need to apply the reshape - // unconditionally. - nvinfer1::Dims reshape_dims = original_dims; - if (channel_index != 0) { - // NOTE: according to NVIDIA dimension types are deprecated, so we don't - // need to copy them back. - reshape_dims.d[channel_index] = original_dims.d[0]; - reshape_dims.d[0] = original_dims.d[channel_index]; - } - shuffle_layer->setReshapeDimensions(reshape_dims); - - if (channel_index != 0) { - shuffle_layer->setSecondTranspose(permutation); - } - ITensorProxyPtr shuffle_tensor = shuffle_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&output_tensor, - &shuffle_tensor); - output_tensor = shuffle_tensor; - } - - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + params->converter->SetLayerName(layer, node_def, "activation"); + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } -Status ConvertBiasAdd(OpConverterParams* params) { - if (params->precision_mode == TrtPrecisionMode::INT8 && - !params->use_calibration) { - // NOTE(laigd): based on some observation, it seems TensorRT cannot fuse - // IConvolutionLayer and IElementwiseLayer and will require range - // information for the output of Conv2D. Using IScaleLayer will fix the - // problem. - return ConvertBiasAddInt8WithoutCalibration(params); - } +Status ConvertBiasAdd(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - - if (inputs.size() != 2) { - return errors::InvalidArgument( - "BiasAdd expects exactly 2 inputs, but received ", inputs.size()); - } + TFTRT_CHECK_INPUT_SIZE(inputs.size(), 2, node_def); if (inputs[0].is_weights() && inputs[1].is_weights()) { + // TODO(lsugy): don't assume that if all inputs are weights, grappler + // should fold them, because variables are weights. return errors::InvalidArgument( "All inputs are weights, but Grappler is expected to fold them."); } @@ -3590,60 +3678,69 @@ Status ConvertBiasAdd(OpConverterParams* params) { TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - const string& data_format = attrs.get("data_format"); + string data_format; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "data_format", &data_format)); nvinfer1::Dims input_shape = inputs.at(0).GetTrtDims(); nvinfer1::Dims bias_shape = inputs.at(1).GetTrtDims(); - // If the input is NCHW, then we need to unsqueeze the bias such that its last - // dimensions are 1s (and the first dimension is C). + // The bias input arg is a 1-D tensor with length C. If the input is NCHW, + // then we need to unsqueeze the bias such that its shape is [1, C, 1, 1]. if (data_format == "NCHW") { - bias_shape.nbDims = input_shape.nbDims; - std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + if (params->use_implicit_batch) { + // The batch dim is not included in implicit batch mode, so the shape of + // the bias tensor is [C, 1, 1]. + bias_shape.nbDims = input_shape.nbDims; + std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + } else { + // In explicit batch mode we create a tensor with shape [1, C, 1, 1]. 
+ std::vector bias_shape_vec(bias_shape.d, + bias_shape.d + bias_shape.nbDims); + // Insert 1 before for batch dim + bias_shape_vec.insert(bias_shape_vec.begin(), 1); + // Trail with 1s to match input_shape size + bias_shape_vec.insert(bias_shape_vec.end(), + input_shape.nbDims - bias_shape_vec.size(), 1); + DimsAdapter(bias_shape_vec).TrtDims(&bias_shape); + } } else { // Next, broadcast the bias across the input. TF_RETURN_IF_ERROR(GetTrtBroadcastShape(inputs.at(0), inputs.at(1), /*check_feasibility=*/true, + params->use_implicit_batch, &input_shape, &bias_shape)); } // Convert input to a TRT tensor ITensorProxyPtr input_tensor{nullptr}; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), input_shape, params->validation_only, &input_tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, inputs.at(0), DimsAdapter(input_shape), + params->validation_only, &input_tensor, node_def, + /*op_instance=*/0)); // Finally, reshape bias. Since the bias is usually a constant, this will // normally happen at conversion-time. ITensorProxyPtr bias_tensor{nullptr}; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), bias_shape, params->validation_only, &bias_tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, inputs.at(1), DimsAdapter(bias_shape), + params->validation_only, &bias_tensor, node_def, + /*op_instance=*/1)); VLOG(2) << "Bias shape adjusted to " << DebugString(bias_shape); if (params->validation_only) return Status::OK(); nvinfer1::IElementWiseLayer* layer = params->converter->network()->addElementWise( - *input_tensor->trt_tensor(), *bias_tensor->trt_tensor(), nvinfer1::ElementWiseOperation::kSUM); + *input_tensor->trt_tensor(), *bias_tensor->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, "sum"); ITensorProxyPtr output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) { - if (tensor.dims() > 0) { - *dims = GetTrtDimsForTensor(tensor); - } else { - dims->nbDims = 1; - // No dimension provided. Flatten it. - dims->d[0] = tensor.NumElements(); - for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) { - dims->d[i] = 0; - } - } -} - template inline bool IsIntegerInInt32Bounds(const Input& inp) { static_assert(std::is_integral::value, @@ -3691,7 +3788,7 @@ Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, // Verify that the dtype is supported by TensorRT. Otherwise, return an error. nvinfer1::DataType trt_dtype; - TF_RETURN_IF_ERROR(TfDataTypeToTrt(converted_dtype, &trt_dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(converted_dtype, &trt_dtype)); if (tensor.NumElements() == 0) { // Return empty weights. 
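// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] Shape handling on the NCHW
// BiasAdd path above: the 1-D bias of length C is padded with 1s so that it
// broadcasts against the input. In implicit batch mode the batch dim is not
// part of the TRT shape, so the result is [C, 1, 1]; in explicit batch mode a
// leading 1 is inserted as well, giving [1, C, 1, 1]. A standalone sketch of
// that arithmetic (plain C++, illustrative names):
#include <vector>

namespace biasadd_sketch {
// input_rank is the rank of the TRT input tensor (3 for implicit-batch NCHW
// images, 4 in explicit batch mode). Returns the broadcastable bias shape.
inline std::vector<int> ExpandBiasShapeNCHW(int channels, int input_rank,
                                            bool use_implicit_batch) {
  std::vector<int> bias_shape;
  if (!use_implicit_batch) bias_shape.push_back(1);  // leading batch dim
  bias_shape.push_back(channels);                    // C
  while (static_cast<int>(bias_shape.size()) < input_rank)
    bias_shape.push_back(1);                         // trailing 1s
  return bias_shape;
}
}  // namespace biasadd_sketch
// Example: ExpandBiasShapeNCHW(64, 3, true)  -> {64, 1, 1}
//          ExpandBiasShapeNCHW(64, 4, false) -> {1, 64, 1, 1}
// [End of editorial sketch]
// ---------------------------------------------------------------------------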
@@ -3699,21 +3796,26 @@ Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, return Status::OK(); } - nvinfer1::Dims weight_dims; - GetTensorDimsWithProtoShape(tensor, &weight_dims); - *weights = weight_store->GetTempWeights(trt_dtype, weight_dims); + ::stream_executor::port::StatusOr weight_dims = + DimsAdapter::Create(tensor.shape()); + TRT_ENSURE_OK(weight_dims); + + auto tmp = weight_store->GetTempWeights(trt_dtype, + weight_dims.ValueOrDie().AsTrtDims()); + TRT_ENSURE_OK(tmp); + *weights = std::move(tmp).ValueOrDie(); // Copy the tensor directly if the tensor does not require cast to the // supported type. if (converted_dtype == dtype) { - char* dst = static_cast(weights->GetValues()); - memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes()); + std::copy_n(tensor.tensor_data().data(), tensor.TotalBytes(), + weights->GetPointer()); return Status::OK(); } Status status = Status::OK(); // Copy tensor elements after casting them to the converted DataType. - int32* dst = static_cast(weights->GetValues()); + int32* dst = weights->GetPointer(); switch (dtype) { case DT_INT8: status = CopyToTrtInt32Array(tensor, dst); @@ -3747,13 +3849,12 @@ Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, // weights to params->outputs. We did this since TrtNodeValidator needs the // weights as input to other nodes, and use it to determine whether those nodes // are supported by TRT. -Status ConvertConst(OpConverterParams* params) { +Status ConvertConst(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; if (!inputs.empty()) { return errors::InvalidArgument( - "Constant node is expected to have empty input list: ", - node_def.name()); + "Constant node is expected to have empty input list"); } // Create shaped weights as output @@ -3764,8 +3865,9 @@ Status ConvertConst(OpConverterParams* params) { node_def.name()); } - TFAttrs attrs(node_def); - const DataType dtype = attrs.get("dtype"); + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "dtype", &dtype)); + if (dtype != tensor.dtype()) { return errors::InvalidArgument("DataType mismatch between attr (", DataTypeString(dtype), ") and tensor (", @@ -3782,206 +3884,42 @@ Status ConvertConst(OpConverterParams* params) { return Status::OK(); } -Status ConvertIdentity(OpConverterParams* params) { +Status ConvertIdentity(const OpConverterParams* params) { // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT // 5.0, however once we know that it does it would be nice to use that // instead. if (params->validation_only) return Status::OK(); - params->outputs->push_back(params->inputs.at(0)); - return Status::OK(); -} - -const std::unordered_map* -BinaryOperationMap() { - static auto* const m = - new std::unordered_map { - {"Add", nvinfer1::ElementWiseOperation::kSUM}, - {"AddV2", nvinfer1::ElementWiseOperation::kSUM}, - {"Mul", nvinfer1::ElementWiseOperation::kPROD}, - {"Sub", nvinfer1::ElementWiseOperation::kSUB}, - {"Div", nvinfer1::ElementWiseOperation::kDIV}, -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - // This op applies Floor after Div. 
- {"FloorDiv", nvinfer1::ElementWiseOperation::kDIV}, -#endif - {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, - {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, - {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, - {"Pow", nvinfer1::ElementWiseOperation::kPOW}, - }; - return m; -} - -Status ConvertBinary(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - if (inputs.size() != 2) { - return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), - " inputs but expected 2, at ", - node_def.name()); - } - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - - // Constant folding should have been done by TensorFlow - if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { - return errors::Unimplemented( - "Constant folding is falled back to TensorFlow, binary op received " - "both input as constant at: ", - node_def.name()); - } - const TRT_TensorOrWeights& operand_l = inputs.at(0); - const TRT_TensorOrWeights& operand_r = inputs.at(1); - - auto op_pair = BinaryOperationMap()->find(node_def.op()); - if (op_pair == BinaryOperationMap()->end()) { - return errors::Unimplemented("Binary op ", node_def.op(), - " not supported at: ", node_def.name()); - } - - nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; - TF_RETURN_IF_ERROR( - GetTrtBroadcastShape(operand_l, operand_r, /*check_feasibility=*/true, - &broadcasted_dims_l, &broadcasted_dims_r)); - ITensorProxyPtr tensor_l = nullptr; - ITensorProxyPtr tensor_r = nullptr; - // This will also convert constants to tensors, and set quantization ranges. - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - operand_l, broadcasted_dims_l, params->validation_only, &tensor_l)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - operand_r, broadcasted_dims_r, params->validation_only, &tensor_r)); - if (params->validation_only) return Status::OK(); - - // Add ElementWise layer. - nvinfer1::ILayer* layer = params->converter->network()->addElementWise( - *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr trt_tensor = layer->getOutput(0); - -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - if (node_def.op() == "FloorDiv") { - layer = params->converter->network()->addUnary( - *trt_tensor->trt_tensor(), nvinfer1::UnaryOperation::kFLOOR); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - trt_tensor = layer->getOutput(0); - } -#endif - params->outputs->push_back(TRT_TensorOrWeights(trt_tensor)); - return Status::OK(); -} - -Status ConvertRsqrt(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - if (params->validation_only) return Status::OK(); - // TODO(tmorris): params->converter is null during validation. Allow - // precision_mode and use_calibration to be accessed during validation and - // include this check in validation. - // We will need a quantization range for intermediate tensor if not using - // calibration. 
- // - // x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x) - // ^ - // need range here - if (params->converter->precision_mode() == TrtPrecisionMode::INT8 && - !params->converter->use_calibration()) { - return errors::Unimplemented( - "Intermediate quantization range cannot be determined without" - " calibration for Rsqrt, consider replacing with " - "Sqrt -> FakeQuant -> Reciprocal ops, at ", - node_def.name()); + for (int i = 0; i < params->inputs.size(); i++) { + params->outputs->push_back(params->inputs.at(i)); } - // Start conversion. - ITensorProxyPtr tensor = inputs.at(0).tensor(); - // Sqrt - nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary( - *tensor->trt_tensor(), nvinfer1::UnaryOperation::kSQRT); - TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name()); - // Recip - nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary( - *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP); - TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name()); - params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0))); return Status::OK(); } -const std::unordered_map* -UnaryOperationMap() { - static auto* const m = - new std::unordered_map({ - {"Neg", nvinfer1::UnaryOperation::kNEG}, - {"Exp", nvinfer1::UnaryOperation::kEXP}, - {"Log", nvinfer1::UnaryOperation::kLOG}, - {"Sqrt", nvinfer1::UnaryOperation::kSQRT}, - {"Abs", nvinfer1::UnaryOperation::kABS}, - {"Reciprocal", nvinfer1::UnaryOperation::kRECIP}, -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - {"Sin", nvinfer1::UnaryOperation::kSIN}, - {"Cos", nvinfer1::UnaryOperation::kCOS}, - {"Tan", nvinfer1::UnaryOperation::kTAN}, - {"Sinh", nvinfer1::UnaryOperation::kSINH}, - {"Cosh", nvinfer1::UnaryOperation::kCOSH}, - {"Asin", nvinfer1::UnaryOperation::kASIN}, - {"Acos", nvinfer1::UnaryOperation::kACOS}, - {"Atan", nvinfer1::UnaryOperation::kATAN}, - {"Asinh", nvinfer1::UnaryOperation::kASINH}, - {"Acosh", nvinfer1::UnaryOperation::kACOSH}, - {"Atanh", nvinfer1::UnaryOperation::kATANH}, - {"Ceil", nvinfer1::UnaryOperation::kCEIL}, - {"Floor", nvinfer1::UnaryOperation::kFLOOR}, -#endif - }); - return m; -} - -Status ConvertUnary(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - auto op_pair = UnaryOperationMap()->find(node_def.op()); - if (op_pair == UnaryOperationMap()->end()) { - return errors::Unimplemented("Unary op: ", node_def.op(), - " not supported at: ", node_def.name()); - } +// This converter is a debug-only feature designed to allow graph segmentation +// experiments. Its use is being controled by +// `TF_TRT_OP_FAKELIST=OpName1,OpName2,...`. +// See `op_converter_registry.cc` for further details. +// +// This converter is designed as followed: +// - always succeed at graph segmentation time. +// - always fail at TRT Engine build time. +Status ConvertFake(const OpConverterParams* params) { if (params->validation_only) return Status::OK(); - // Start conversion. - ITensorProxyPtr tensor = inputs.at(0).tensor(); - nvinfer1::IUnaryLayer* layer = - params->converter->network()->addUnary(*tensor->trt_tensor(), op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); - - // Set quantization ranges. 
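// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] The table-driven dispatch being
// deleted above (UnaryOperationMap + ConvertUnary) follows a simple pattern: a
// static map from TF op name to a TRT enum value, plus a find-or-fail lookup.
// In this patch that role moves to the per-op converters added under
// convert/ops/ (e.g. unary_ops.cc) and the op_converter_registry. A standalone
// sketch of the lookup pattern itself (plain C++, illustrative enum and names):
#include <string>
#include <unordered_map>

namespace unary_lookup_sketch {
enum class UnaryOp { kNeg, kExp, kLog, kSqrt };  // stand-in for the TRT enum

inline const std::unordered_map<std::string, UnaryOp>& UnaryOpMap() {
  static const auto* m = new std::unordered_map<std::string, UnaryOp>{
      {"Neg", UnaryOp::kNeg},
      {"Exp", UnaryOp::kExp},
      {"Log", UnaryOp::kLog},
      {"Sqrt", UnaryOp::kSqrt},
  };
  return *m;
}

// Returns true and sets *op if the TF op name is supported.
inline bool LookupUnaryOp(const std::string& tf_op, UnaryOp* op) {
  auto it = UnaryOpMap().find(tf_op);
  if (it == UnaryOpMap().end()) return false;
  *op = it->second;
  return true;
}
}  // namespace unary_lookup_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------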
- if (node_def.op() == "Sin" || node_def.op() == "Cos") { - params->converter->ProvideQuantizationRange(&output_tensor, -1.0f, 1.0f); - } else if (node_def.op() == "Asin" || node_def.op() == "Atan") { - params->converter->ProvideQuantizationRange(&output_tensor, -M_PI_2, M_PI_2); - } else if (node_def.op() == "Acos") { - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, M_PI); - } else if (node_def.op() == "Neg" || node_def.op() == "Abs") { - // Neg and Abs will have same range as input since TRT uses symmetric - // quantization. - // TODO(tmorris): Should we infer ranges for Ceil and Floor as well? - params->converter->MarkQuantizationRangesAsInferrable(&tensor, - &output_tensor); - } - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); + return errors::Unimplemented( + "This converter is not valid after graph " + "segmentation. Building an engine using this " + "converter will trigger a native segment " + "fallback."); } -Status ConvertSquare(OpConverterParams* params) { +Status ConvertSquare(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); if (params->validation_only) return Status::OK(); // Constant 2 with same rank as input @@ -3995,40 +3933,45 @@ Status ConvertSquare(OpConverterParams* params) { *inputs.at(0).tensor()->trt_tensor(), *const2_tensor->trt_tensor(), nvinfer1::ElementWiseOperation::kPOW); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertReduce(OpConverterParams* params) { +Status ConvertReduce(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"input", false}, {"axis", true}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); ITensorProxyPtr tensor = inputs.at(0).tensor(); auto tf_axes_list = inputs.at(1).weights().GetSpan(); - TFAttrs attrs(node_def); + DataType idx_dtype{DataType::DT_INT32}; + bool keep_dims{false}; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "Tidx", &idx_dtype)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "keep_dims", &keep_dims)); + // Only expect to handle INT32 as attributes for now - if (attrs.get("Tidx") != DataType::DT_INT32) { + if (idx_dtype != DataType::DT_INT32) { return errors::Unimplemented("Tidx supports only DT_INT32"); } int axes = 0; if (tf_axes_list.size() == 0) { return errors::InvalidArgument( - "TRT cannot support reduce on all (batch) dimensions, at", - node_def.name()); + "TRT cannot support reduce on all (batch) dimensions"); } for (int i = 0; i < tf_axes_list.size(); i++) { int trt_axis; TF_RETURN_IF_ERROR( ConvertAxis(tf_axes_list[i], tensor->getDimensions().nbDims, - node_def.name(), /*use_implicit_batch=*/true, &trt_axis)); + node_def.name(), params->use_implicit_batch, &trt_axis)); axes |= (1 << trt_axis); } @@ -4044,15 +3987,14 @@ Status 
ConvertReduce(OpConverterParams* params) { } else if (node_def.op() == "Mean") { reduce_operation = nvinfer1::ReduceOperation::kAVG; } else { - return errors::Unimplemented("Op not supported ", node_def.op(), ", at ", - node_def.name()); + return errors::Unimplemented("Op not supported ", node_def.op()); } if (params->validation_only) return Status::OK(); - const auto keep_dims = attrs.get("keep_dims"); nvinfer1::ILayer* layer = params->converter->network()->addReduce( *tensor->trt_tensor(), reduce_operation, axes, keep_dims); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); @@ -4062,55 +4004,97 @@ Status ConvertReduce(OpConverterParams* params) { // converted by first expanding input tensors by adding a new dimension of size // one at the specified axis and then concatenating the tensors at the same // axis. -Status ConvertPack(OpConverterParams* params) { +Status ConvertPack(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - TFAttrs attrs(node_def); - const int num_inputs = attrs.get("N"); + int num_inputs{0}; + int64 tf_axis{0}; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "N", &num_inputs)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "axis", &tf_axis)); + if (num_inputs != inputs.size()) { return errors::InvalidArgument( - "Number of inputs for Pack is inconsistent with N attribute, at ", - node_def.name()); - } - - // Validate inputs. Values must be tensors for now. - std::vector> inputs_is_weight; + "Number of inputs for Pack is inconsistent with N attribute"); + } + + // In implicit batch mode we do not allow weight input. An input tensor with + // dims NCHW is represented with dims CHW during conversion time, and N is + // defined only during runtime. A weight is represented with dims NCHW. We + // cannot be sure that the runtime N will agree with the conversion time N, + // therefore we do not convert the pack op if it has both tensor and weight + // inputs. This restriction does not apply in explicit batch mode, in that + // case the input tensors are also represented with full dims that include the + // batch size. + TrtInputArg expected_arg = + params->use_implicit_batch ? TrtInputArg::kTensor : TrtInputArg::kBoth; + + std::vector> inputs_is_weight; + inputs_is_weight.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { - inputs_is_weight.push_back({StrCat("values_", i), false}); + inputs_is_weight.push_back({StrCat("values_", i), expected_arg}); } TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_is_weight)); - // TODO(hinsu): Enable INT32 with TensorRT version 5.1.3 after testing. - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT32}; + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); if (num_inputs > 1) { // Verify that inputs are compatible for concatenation after the expansion. TF_RETURN_IF_ERROR( VerifyShapesMatch(inputs, /*masked_dim=*/-1, node_def.name())); } + // Find the dimension of the inputs. In general inputs can have dynamic shape, + // in that case we have to use DynamicExpandDims to calculate the expanded + // dimensions. To avoid that, we try to find a weight input which is + // guaranteed to have known static shape. 
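// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] Both ConvertReduce and
// ConvertPack above now pass params->use_implicit_batch to ConvertAxis instead
// of hard-coding implicit batch. The intent of that helper, simplified: wrap
// negative TF axes, reject the batch axis in implicit batch mode, and shift
// the index down by one because TRT dims omit the batch dimension there. A
// standalone approximation (plain C++; the real helper reports errors via
// Status rather than a bool):
#include <cassert>

namespace axis_sketch {
// tf_rank includes the batch dim. Returns false for out-of-range axes or for
// the batch axis in implicit batch mode.
inline bool TfAxisToTrtAxis(int tf_axis, int tf_rank, bool use_implicit_batch,
                            int* trt_axis) {
  if (tf_axis < 0) tf_axis += tf_rank;               // wrap negative axes
  if (tf_axis < 0 || tf_axis >= tf_rank) return false;
  if (use_implicit_batch) {
    if (tf_axis == 0) return false;                  // cannot touch batch dim
    *trt_axis = tf_axis - 1;                         // TRT dims drop batch dim
  } else {
    *trt_axis = tf_axis;
  }
  return true;
}
inline void Check() {
  int a = -1;
  assert(TfAxisToTrtAxis(-1, 4, /*use_implicit_batch=*/true, &a) && a == 2);
  assert(!TfAxisToTrtAxis(0, 4, /*use_implicit_batch=*/true, &a));
}
}  // namespace axis_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------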
+ int idx = 0; + for (int i = 1; i < inputs.size(); i++) { + if (HasStaticShape(inputs.at(i).GetTrtDims())) { + idx = i; + } + } + DimsAdapter dims(inputs.at(idx).GetTrtDims()); // Convert axis from the TensorFlow format to TensorRT format. - const nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); - const int64 tf_axis = attrs.get("axis"); - int trt_axis; - TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims + 1, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + int trt_axis{0}; + TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.NumDims() + 1, node_def.name(), + params->use_implicit_batch, &trt_axis)); // Compute expanded dimensions and then reshape input tensors. - std::vector tensor_dims(dims.d, dims.d + dims.nbDims); + std::vector tensor_dims(dims.begin(), dims.end()); tensor_dims.insert(tensor_dims.begin() + trt_axis, 1); - nvinfer1::Dims expanded_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(tensor_dims, &expanded_dims)); std::vector expanded_tensors; - for (const TRT_TensorOrWeights& tensor : inputs) { + + int input_index = 0; + for (const TRT_TensorOrWeights& input : inputs) { ITensorProxyPtr expanded_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - tensor, expanded_dims, params->validation_only, &expanded_tensor)); + if (input.is_tensor() && !params->use_implicit_batch && + !HasStaticShape(dims)) { + if (!params->validation_only) { + TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims( + /*input=*/input.tensor(), + /*dims=*/dims.AsTrtDims(), + /*axis=*/trt_axis, + /*params=*/params, + /*output=*/&expanded_tensor, + /*op_instance=*/input_index)); + } + } else { + TF_RETURN_IF_ERROR(PrepareTensorForShape( + /*converter=*/params->converter, + /*input=*/input, + /*dims=*/DimsAdapter(tensor_dims), + /*validation_only=*/params->validation_only, + /*tensor=*/&expanded_tensor, + /*node_def=*/node_def, + /*op_instance=*/input_index)); + } if (!params->validation_only) { expanded_tensors.push_back(expanded_tensor); } + input_index++; } if (params->validation_only) return Status::OK(); @@ -4130,118 +4114,152 @@ Status ConvertPack(OpConverterParams* params) { static_cast(trt_expanded_tensors.data()), expanded_tensors.size()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, "concat"); // Note that trt_axis stays the same even after expanding tensors at the axis. layer->setAxis(trt_axis); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } -Status ConvertPad(OpConverterParams* params) { +Status ConvertPad(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"tensor", false}, {"paddings", true}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT8})); // Implement tensor binaryOp weight [channel wise] for now; ITensorProxyPtr tensor = inputs.at(0).tensor(); const auto dims = tensor->getDimensions(); // Restore implicit batch dimension - const int nb_dims = dims.nbDims + 1; + const int nb_dims = + params->use_implicit_batch ? dims.nbDims + 1 : dims.nbDims; + // TODO(tfeher): Support nb_dims < 4 by inserting extra dimensions to the + // original input. 
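// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] As the comment at the top of
// this converter says, Pack is lowered as ExpandDims on every input followed
// by a concatenation along the new axis. The shape arithmetic, standalone
// (plain C++, illustrative names):
#include <vector>

namespace pack_sketch {
// Shape of one input after inserting a 1 at trt_axis.
inline std::vector<int> ExpandDims(const std::vector<int>& dims, int trt_axis) {
  std::vector<int> out(dims);
  out.insert(out.begin() + trt_axis, 1);
  return out;
}
// Shape of Pack over num_inputs identically shaped tensors along trt_axis.
inline std::vector<int> PackedShape(const std::vector<int>& dims, int trt_axis,
                                    int num_inputs) {
  std::vector<int> out = ExpandDims(dims, trt_axis);
  out[trt_axis] = num_inputs;  // concatenation grows the new axis
  return out;
}
}  // namespace pack_sketch
// Example: PackedShape({2, 3}, /*trt_axis=*/1, /*num_inputs=*/4) -> {2, 4, 3}.
// [End of editorial sketch]
// ---------------------------------------------------------------------------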
+ if (nb_dims < 4) { + return errors::InvalidArgument("Convertpad requires at least 4D input"); + } TRT_ShapedWeights pads = inputs.at(1).weights(); - TFAttrs attrs(node_def); - // Padding type here is done through TF type - // so I can leverage their EnumToDataType for my cast - auto padding_type = attrs.get("Tpaddings"); // TODO(jie): handle data type conversion for TRT? + DataType padding_dtype{DataType::DT_INT32}; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "Tpaddings", &padding_dtype)); - if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2) { - return errors::InvalidArgument( - "Pad only supports explicit padding on 4 dimensional tensor, at ", - node_def.name()); + if (pads.Shape().dim(0) != nb_dims || pads.Shape().dim(1) != 2) { + return errors::InvalidArgument("Paddings must be a weight with shape ", + "[n, 2], where n is the rank of input ", + "tensor"); } // Only expect to handle INT32 as attributes for now - if (padding_type != DataType::DT_INT32) { + if (padding_dtype != DataType::DT_INT32) { return errors::Unimplemented("Tpaddings supports only DT_INT32"); } - auto pad_data = static_cast(pads.GetValues()); + auto pad_data = pads.GetPointer(); - std::vector pad_index; + std::vector tf_pad_index; for (int i = 0; i < nb_dims; i++) { if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0) { - pad_index.push_back(i); + tf_pad_index.push_back(i); } } // No padding at all, we should exit - if (pad_index.empty()) { + if (tf_pad_index.empty()) { params->outputs->push_back(inputs.at(0)); return Status::OK(); } - // Only supports padding on less than 2 axis GIE-2579 - if (pad_index.size() > 2) { + // TRT pad layer can only support padding on up to 2 dimensions (TRT-2579). + // TODO(tfeher): Use multiple TRT pad layers to support padding on more than 2 + // dimensions. + if (tf_pad_index.size() > 2) { return errors::InvalidArgument( "Padding layer does not support padding on > 2"); } // Padding on batch dimension is not supported - if (pad_index[0] == 0) { + if (params->use_implicit_batch && tf_pad_index[0] == 0) { return errors::InvalidArgument( "Padding layer does not support padding on batch dimension"); } - // Not doing the legit thing here. ignoring padding on dim 1 and 3; - // TODO(jie): implement pad as uff parser - if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3) { - return errors::Unimplemented( - "Padding layer does not support padding on dimension 1 and 3 yet"); - } if (params->validation_only) return Status::OK(); - bool legit_pad = true; + // TRT can only do the padding at the last two dimensions. We transpose the + // input tensor if needed. + bool transposed_pad = false; + std::vector transpose_idx(nb_dims); + std::iota(transpose_idx.begin(), transpose_idx.end(), 0); + + // trt_pad_index denotes the actual idx where the padding is performed by TRT. + std::vector trt_pad_index{nb_dims - 2, nb_dims - 1}; + + // How many zeros are padded at the last two dimensions. 
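// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] TRT's IPaddingLayer only pads
// the last two dimensions, so the index bookkeeping that follows maps the (at
// most two) TF axes that carry padding onto those last two axes, transposing
// the input first when needed. A standalone version of that bookkeeping
// (plain C++, mirroring the converter's logic in simplified form):
#include <numeric>
#include <utility>
#include <vector>

namespace pad_sketch {
struct PadPlan {
  std::vector<int> transpose_idx;         // permutation applied before padding
  int pre[2] = {0, 0}, post[2] = {0, 0};  // H/W padding for IPaddingLayer
  bool needs_transpose = false;
};

// pad_data holds [before, after] pairs per dim; tf_pad_index lists the dims
// (at most two, in ascending order) that actually have nonzero padding.
inline PadPlan MakePadPlan(int nb_dims, const std::vector<int>& pad_data,
                           const std::vector<int>& tf_pad_index) {
  PadPlan plan;
  plan.transpose_idx.resize(nb_dims);
  std::iota(plan.transpose_idx.begin(), plan.transpose_idx.end(), 0);
  std::vector<int> trt_pad_index = {nb_dims - 2, nb_dims - 1};
  std::vector<int> pre_post_index = {0, 1};
  if (tf_pad_index.size() == 1 && tf_pad_index[0] == nb_dims - 1) {
    trt_pad_index[0] = nb_dims - 1;  // pad only the last dim, no swap needed
    pre_post_index[0] = 1;
  }
  if (tf_pad_index.size() == 2 && tf_pad_index[1] == nb_dims - 2) {
    std::swap(trt_pad_index[0], trt_pad_index[1]);  // avoid a second swap
    std::swap(pre_post_index[0], pre_post_index[1]);
  }
  for (size_t i = 0; i < tf_pad_index.size(); ++i) {
    const int tf_index = tf_pad_index[i];
    const int trt_index = trt_pad_index[i];
    const int k = pre_post_index[i];
    plan.pre[k] = pad_data[tf_index * 2];
    plan.post[k] = pad_data[tf_index * 2 + 1];
    if (tf_index != trt_index) {
      plan.needs_transpose = true;
      std::swap(plan.transpose_idx[tf_index], plan.transpose_idx[trt_index]);
    }
  }
  return plan;
}
}  // namespace pad_sketch
// Example (matches the comment below): nb_dims=4, tf_pad_index={1, 2} gives
// trt_pad_index={3, 2} and transpose_idx={0, 3, 2, 1}.
// [End of editorial sketch]
// ---------------------------------------------------------------------------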
nvinfer1::DimsHW pre_padding(0, 0); nvinfer1::DimsHW post_padding(0, 0); - std::vector permuted_pad_index(pad_index); - if (pad_index[0] == 1) { - legit_pad = false; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 2, 1}, &tensor)); - permuted_pad_index[0] = 3; - } - - for (size_t i = 0; i < pad_index.size(); i++) { - int index = pad_index[i]; - if (permuted_pad_index[i] == 2) { - pre_padding.h() = pad_data[index * 2]; - post_padding.h() = pad_data[index * 2 + 1]; - } else if (permuted_pad_index[i] == 3) { - pre_padding.w() = pad_data[index * 2]; - post_padding.w() = pad_data[index * 2 + 1]; + // Dimension to set in the pre_padding and post_padding array. + std::vector trt_pre_post_padding_index{0, 1}; + + // Two special cases where we can avoid permutations. + if (tf_pad_index.size() == 1 && tf_pad_index[0] == nb_dims - 1) { + // Only one dimension needs to be padded. We store its index at + // trt_pad_index[0]. We ignore trt_pad_index[1]. + trt_pad_index[0] = nb_dims - 1; + trt_pre_post_padding_index[0] = 1; + } + if (tf_pad_index.size() == 2 && tf_pad_index[1] == nb_dims - 2) { + // tf_pad_index only has two values that are in ascending order. If + // tf_pad_index[1] is nb_dims-2, then swapping the two values in + // trt_pad_index here makes it possible to only swap one pair of dimensions + // (swap tf_pad_index[0] with nb_dims-1) in the input tensor. Otherwise, we + // would have to swap two pairs of dimensions in the input tensor: + // (tf_pad_index[0] with nb_dims-2) and (tf_pad_index[1], with nb_dims-1). + // Here is an example for a 4D input tensor: + // tf_pad_index = [1, 2] + // trt_pad_index = [3, 2] + // transpose_idx = [0, 3, 2, 1] + std::swap(trt_pad_index[0], trt_pad_index[1]); + std::swap(trt_pre_post_padding_index[0], trt_pre_post_padding_index[1]); + } + + for (int i = 0; i < tf_pad_index.size(); i++) { + const int tf_index = tf_pad_index[i]; + const int trt_index = trt_pad_index[i]; + const int k = trt_pre_post_padding_index[i]; + pre_padding.d[k] = pad_data[tf_index * 2]; + post_padding.d[k] = pad_data[tf_index * 2 + 1]; + if (tf_index != trt_index) { + transposed_pad = true; + std::swap(transpose_idx[tf_index], transpose_idx[trt_index]); } } + if (transposed_pad) { + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, transpose_idx, &tensor, node_def, "to_pad")); + } + nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding( *tensor->trt_tensor(), pre_padding, post_padding); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_tensor = layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &output_tensor); - if (!legit_pad) { + if (transposed_pad) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 3, 2, 1}, &output_tensor)); + output_tensor, transpose_idx, &output_tensor, node_def, "from_pad")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertSplitHelper(OpConverterParams* params, +Status ConvertSplitHelper(const OpConverterParams* params, const TRT_TensorOrWeights& input, int tf_axis, int num_splits, bool squeeze_after) { const auto& node_def = params->node_def; @@ -4299,7 +4317,7 @@ Status ConvertSplitHelper(OpConverterParams* params, return Status::OK(); } -Status ConvertSplit(OpConverterParams* params) { +Status ConvertSplit(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = 
params->node_def; TF_RETURN_IF_ERROR( @@ -4317,60 +4335,101 @@ Status ConvertSplit(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(1), tf_axis, num_split, false); } -Status ConvertUnpack(OpConverterParams* params) { +Status ConvertUnpack(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"value", false}})); - TF_RETURN_IF_ERROR(AllowDataTypes(*params, { - DataType::DT_FLOAT, DataType::DT_HALF, -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - DataType::DT_INT32, -#endif - })); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); // Input must be rank 1 or higher, since we can't unpack on axis 0. if (inputs.at(0).GetTrtDims().nbDims == 0) { return errors::Unimplemented( - "Input \"value\" for Unpack must be rank 2 or greater, at ", - node_def.name()); + "Input \"value\" for Unpack must be rank 2 or greater"); } - TFAttrs attrs(node_def); - const int tf_axis = attrs.get("axis"); - const int num = attrs.get("num"); + + int tf_axis = 0, num = 0; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "axis", &tf_axis)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "num", &num)); return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } -Status ConvertConcat(OpConverterParams* params) { +Status ConvertCast(const OpConverterParams* params) { + auto unsupport_cast_error = [&](string msg) { + return errors::Unimplemented("Cast op is not supported - ", msg); + }; + + if (isExperimentalFeatureActivated("reject_all_fp_cast_ops")) { + LOG(WARNING) << "`TF_TRT_EXPERIMENTAL_FEATURES=reject_all_fp_cast_ops`is " + << "meant as a workaround. If the Cast converter leads to any " + << "performance or accuracy regression, please open an issue " + << "on GitHub."; + return unsupport_cast_error( + "TF_TRT_EXPERIMENTAL_FEATURES=reject_all_fp_cast_ops has been defined"); + } + + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF}; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + + if (allowed_types.find(input_type) == allowed_types.end()) { + return unsupport_cast_error( + StrCat("Allowed input dtypes: [", DataTypeString(DataType::DT_FLOAT), + ", ", DataTypeString(DataType::DT_HALF), + "]. Received: ", DataTypeString(input_type))); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(params->node_def, &output_type, + kCastOutputTypeAttrName)); + + if (allowed_types.find(output_type) == allowed_types.end()) { + return unsupport_cast_error( + StrCat("Allowed output dtypes: [", DataTypeString(DataType::DT_FLOAT), + ", ", DataTypeString(DataType::DT_HALF), + "]. Received: ", DataTypeString(output_type))); + } + + return ConvertIdentity(params); +} + +Status ConvertConcat(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - TFAttrs attrs(node_def); - // Get number of tensor inputs. - const int num_inputs = attrs.get("N"); + + int num_inputs{0}; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "N", &num_inputs)); + if (num_inputs != static_cast(inputs.size()) - 1) { return errors::InvalidArgument( - "Number of inputs for ConcatV2 is inconsistent with N attribute, at ", - node_def.name()); + "Number of inputs for ConcatV2 is inconsistent with N attributes."); } - // Validate inputs. Values must be tensors for now. - std::vector> inputs_is_weight; + // Validate inputs. 
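// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] ConcatV2 carries N value inputs
// followed by a trailing scalar "axis" input, so the validation that follows
// checks that the N attribute matches inputs.size() - 1 and records the
// expected kind for each argument: in implicit batch mode the values must be
// tensors, in explicit batch mode weights are accepted too, and "axis" must
// always be a constant. A standalone sketch of that bookkeeping (plain C++,
// illustrative enum and names):
#include <string>
#include <utility>
#include <vector>

namespace concat_sketch {
enum class Kind { kTensor, kWeight, kBoth };  // stand-in for TrtInputArg

inline bool BuildExpectedInputs(
    int n_attr, int total_inputs, bool use_implicit_batch,
    std::vector<std::pair<std::string, Kind>>* out) {
  if (n_attr != total_inputs - 1) return false;  // N inconsistent with inputs
  const Kind value_kind = use_implicit_batch ? Kind::kTensor : Kind::kBoth;
  out->clear();
  out->reserve(total_inputs);
  for (int i = 0; i < n_attr; ++i) {
    out->push_back({"values_" + std::to_string(i), value_kind});
  }
  out->push_back({"axis", Kind::kWeight});  // axis must be a constant
  return true;
}
}  // namespace concat_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------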
+ std::vector> inputs_kinds; + TrtInputArg expected_input = + params->use_implicit_batch ? TrtInputArg::kTensor : TrtInputArg::kBoth; + + inputs_kinds.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { - inputs_is_weight.push_back({StrCat("values_", i), false}); + inputs_kinds.push_back({StrCat("values_", i), expected_input}); } - inputs_is_weight.push_back({"axis", true}); - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_is_weight)); - // TODO(tmorris): There is a bug with Concat and INT32 in TRT - it is supposed - // to be supported. - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + inputs_kinds.push_back({"axis", TrtInputArg::kWeight}); + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_kinds)); + + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT32}; + + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); const auto axis = inputs.at(num_inputs).weights().GetSpan(); if (axis.size() != 1) { - return errors::InvalidArgument("Axis for ConcatV2 must be a scalar, at ", - node_def.name()); + return errors::InvalidArgument("Axis for ConcatV2 must be a scalar"); } int trt_axis = 0; const auto dim = inputs.at(0).GetTrtDims(); TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dim.nbDims, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + params->use_implicit_batch, &trt_axis)); // Check that dimensions match on non-concatenate axis. TF_RETURN_IF_ERROR(VerifyShapesMatch( absl::Span(inputs).first(num_inputs), trt_axis, @@ -4379,8 +4438,15 @@ Status ConvertConcat(OpConverterParams* params) { // Gather inputs as tensors std::vector input_tensors; + input_tensors.reserve(num_inputs); + for (int i = 0; i < num_inputs; i++) { - input_tensors.push_back(inputs.at(i).tensor()); + if (inputs.at(i).is_tensor()) { + input_tensors.push_back(inputs.at(i).tensor()); + } else { + input_tensors.push_back(params->converter->CreateConstantLayer( + inputs.at(i).weights(), inputs.at(i).GetTrtDims())); + } } std::vector trt_input_tensors; for (const auto& t : input_tensors) { @@ -4391,12 +4457,13 @@ Status ConvertConcat(OpConverterParams* params) { static_cast(trt_input_tensors.data()), input_tensors.size()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); layer->setAxis(trt_axis); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } -Status ConvertFusedBatchNorm(OpConverterParams* params) { +Status ConvertFusedBatchNorm(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, @@ -4406,41 +4473,54 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { {"variance", true}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - float epsilon = attrs.get("epsilon"); - auto data_format = attrs.get("data_format"); - if (data_format != "NCHW") { - return errors::Unimplemented( - node_def.op(), " only supports data_format=NCHW, at ", node_def.name()); - } - bool is_training = attrs.get("is_training"); + + float epsilon{0.1f}; + string data_format; + bool is_training{false}; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "epsilon", &epsilon)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "is_training", &is_training)); + if (is_training) { // Trying to use 
batchnorm in training mode is a very common problem. // Because the error message will only be printed in VLOG(1) by the // segmenter, we issue a special warning so that users will actually see it. - LOG(WARNING) << node_def.op() << " only supports is_training=false. If you " - << "are using Keras, please call " - << "keras.backend.set_learning_phase(0) before constructing " - << "your model. At " << node_def.name(); + LOG_WARNING_WITH_PREFIX + << node_def.op() << " only supports is_training=false. If you " + << "are using Keras, please call " + << "keras.backend.set_learning_phase(0) before constructing " + << "your model. At " << node_def.name(); return errors::Unimplemented(node_def.op(), - " only supports is_training=false, at ", - node_def.name()); + " only supports is_training=false"); } ITensorProxyPtr tensor = inputs.at(0).tensor(); - + if (!params->use_implicit_batch) { + // This check is to make sure that channel dimension is known during + // conversion. + // + // We check this only in explicit batch mode and reject an op with unknown + // channel dimension during segmentation. In implicit batch mode we have + // known shapes during conversion even though the shapes may not be known + // during segmentation (see the actual argument for input_shapes when + // ConvertGraphDefToEngine is called from TRTEngineOp::BuildEngine). + int channel_dim = (data_format == "NCHW" ? 1 : 3); + if (tensor->getDimensions().d[channel_dim] == -1) { + return errors::InvalidArgument("Channel dimension must be static"); + } + } // Check parameter types auto parameter_type = inputs.at(1).weights().TrtDType(); if ((parameter_type != nvinfer1::DataType::kFLOAT) && (parameter_type != nvinfer1::DataType::kHALF)) { return errors::Unimplemented( - "Only float32 or float16 weight data type is supported, for node ", - node_def.name(), " got ", DebugString(parameter_type)); + "Only float32 or float16 weight data type is supported,", " got ", + DebugString(parameter_type)); } for (int i = 1; i < 5; i++) { if (inputs.at(i).weights().TrtDType() != parameter_type) { return errors::Unimplemented( - "Inconsistent parameter type for batchnorm is not supported, at: " + - node_def.name()); + "Inconsistent parameter type for batchnorm is not supported"); } } @@ -4454,35 +4534,34 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { if (inputs.at(i).weights().count() == nweight) { ptr_shape_weights = &(inputs.at(i).weights()); } else if (inputs.at(i).weights().count() != 1) { - return errors::InvalidArgument( - "Inconsistent batchnorm parameter count, at: " + node_def.name()); + return errors::InvalidArgument("Inconsistent batchnorm parameter count"); } } if (params->validation_only) return Status::OK(); // We could technically have two weights with different shape. 
// that requires two addScale op, arguably less performant - TRT_ShapedWeights combined_scale_weights = + ::stream_executor::port::StatusOr combined_scale_weights = params->weight_store->GetTempWeights(*ptr_shape_weights); - TRT_ShapedWeights combined_offset_weights = + TRT_ENSURE_OK(combined_scale_weights); + ::stream_executor::port::StatusOr combined_offset_weights = params->weight_store->GetTempWeights(*ptr_shape_weights); + TRT_ENSURE_OK(combined_offset_weights); const Eigen::half* cast_vals_array[4]; const float* vals_array[4]; for (int j = 0; j < 4; j++) { - cast_vals_array[j] = - static_cast(inputs.at(j + 1).weights().GetValues()); - vals_array[j] = - static_cast(inputs.at(j + 1).weights().GetValues()); + cast_vals_array[j] = inputs.at(j + 1).weights().GetPointer(); + vals_array[j] = inputs.at(j + 1).weights().GetPointer(); } Eigen::half* cast_combined_scale_vals = - static_cast(combined_scale_weights.GetValues()); + combined_scale_weights.ValueOrDie().GetPointer(); Eigen::half* cast_combined_offset_vals = - static_cast(combined_offset_weights.GetValues()); + combined_offset_weights.ValueOrDie().GetPointer(); float* combined_scale_vals = - static_cast(combined_scale_weights.GetValues()); + combined_scale_weights.ValueOrDie().GetPointer(); float* combined_offset_vals = - static_cast(combined_offset_weights.GetValues()); + combined_offset_weights.ValueOrDie().GetPointer(); for (size_t i = 0; i < nweight; ++i) { float batchnorm_data[4]; @@ -4491,15 +4570,13 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { if (parameter_type == nvinfer1::DataType::kFLOAT) { batchnorm_data[j] = vals_array[j][i]; } else if (parameter_type == nvinfer1::DataType::kHALF) { - batchnorm_data[j] = - Eigen::half_impl::half_to_float(cast_vals_array[j][i]); + batchnorm_data[j] = static_cast(cast_vals_array[j][i]); } } else { if (parameter_type == nvinfer1::DataType::kFLOAT) { batchnorm_data[j] = vals_array[j][0]; } else if (parameter_type == nvinfer1::DataType::kHALF) { - batchnorm_data[j] = - Eigen::half_impl::half_to_float(cast_vals_array[j][0]); + batchnorm_data[j] = static_cast(cast_vals_array[j][0]); } } } @@ -4518,41 +4595,64 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { } } - nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM - : nvinfer1::ScaleMode::kCHANNEL; - nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *tensor->trt_tensor(), mode, combined_offset_weights.GetTrtWeights(), - combined_scale_weights.GetTrtWeights(), - dummy_power_weights.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); + ITensorProxyPtr output_tensor; + + if (data_format == "NCHW") { + // IScaleLayer CHANNEL mode requires NCHW format. + nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; + nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( + *tensor->trt_tensor(), mode, + combined_offset_weights.ValueOrDie().GetTrtWeights(), + combined_scale_weights.ValueOrDie().GetTrtWeights(), + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); + output_tensor = layer->getOutput(0); + } + if (data_format == "NHWC") { + // nweight is the number of channels. TensorRT IElementWiseLayer supports + // implicit broadcasting for dimensions of size 1. 
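// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] The loop above gathers the four
// per-channel parameters (scale/gamma, offset/beta, mean, variance) so that
// they can be folded into a single affine transform. The standard folding,
// which is what the combined_scale/combined_offset buffers end up holding, is
//   combined_scale  = gamma / sqrt(variance + epsilon)
//   combined_offset = beta - mean * combined_scale
// so that batchnorm(x) == x * combined_scale + combined_offset. Standalone
// check of that algebra (plain C++):
#include <cassert>
#include <cmath>

namespace batchnorm_sketch {
inline void FoldBatchNorm(float gamma, float beta, float mean, float variance,
                          float epsilon, float* combined_scale,
                          float* combined_offset) {
  *combined_scale = gamma / std::sqrt(variance + epsilon);
  *combined_offset = beta - mean * (*combined_scale);
}
inline void Check() {
  const float gamma = 1.5f, beta = -0.25f, mean = 0.3f, var = 4.0f, eps = 1e-3f;
  float scale = 0.0f, offset = 0.0f;
  FoldBatchNorm(gamma, beta, mean, var, eps, &scale, &offset);
  const float x = 2.0f;
  const float reference = gamma * (x - mean) / std::sqrt(var + eps) + beta;
  assert(std::abs((x * scale + offset) - reference) < 1e-5f);
}
}  // namespace batchnorm_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------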
+ nvinfer1::Dims dims = tensor->getDimensions(); + for (int i = 0; i < dims.nbDims - 1; i++) { + dims.d[i] = 1; + } + dims.d[dims.nbDims - 1] = nweight; + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + TRT_ENSURE_OK(builder); + auto scale_constant_layer = builder.ValueOrDie().WeightsToConstant( + combined_scale_weights.ValueOrDie().GetTrtWeights(), dims); + ITensorProxyPtr scale_constant = + scale_constant_layer.ValueOrDie()->getOutput(0); + auto scale_layer = builder.ValueOrDie().Mul(tensor->trt_tensor(), + scale_constant->trt_tensor()); + auto offset_constant_layer = builder.ValueOrDie().WeightsToConstant( + combined_offset_weights.ValueOrDie().GetTrtWeights(), dims); + ITensorProxyPtr offset_constant = + offset_constant_layer.ValueOrDie()->getOutput(0); + auto offset_layer = builder.ValueOrDie().Add( + scale_layer.ValueOrDie()->getOutput(0), offset_constant->trt_tensor()); + output_tensor = offset_layer.ValueOrDie()->getOutput(0); + } + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertGather(OpConverterParams* params) { +Status ConvertGather(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; // TODO(tmorris): Use CheckInputsWeights by changing bool to enum with an // option for an input to be either tensor or weight. - if (inputs.size() != 3) { - return errors::InvalidArgument("GatherV2 got ", inputs.size(), - " inputs but expected 3, at ", - node_def.name()); - } + TF_RETURN_IF_ERROR( + CheckInputsWeights(*params, {{"params", TrtInputArg::kBoth}, + {"indices", TrtInputArg::kBoth}, + {"axis", TrtInputArg::kWeight}})); + const auto& params_input = inputs.at(0); const auto& indices_input = inputs.at(1); const auto& axis_input = inputs.at(2); - if (!axis_input.is_weights()) { - return errors::Unimplemented( - "The input \"axis\" for GatherV2 must be a constant, at ", - node_def.name()); - } - if (!indices_input.is_tensor()) { - return errors::Unimplemented( - "The input \"indices\" for GatherV2 must be a tensor, at ", - node_def.name()); - } TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}, @@ -4562,280 +4662,407 @@ Status ConvertGather(OpConverterParams* params) { absl::Span axis = axis_input.weights().GetSpan(); if (axis.size() != 1) { - return errors::InvalidArgument("Axis for GatherV2 must be a scalar, at ", - node_def.name()); + return errors::InvalidArgument("Axis for GatherV2 must be a scalar"); } + int trt_axis = 0; - TF_RETURN_IF_ERROR(ConvertAxis(axis[0], params_input.GetTrtDims().nbDims, - node_def.name(), params_input.is_tensor(), - &trt_axis)); - if (params_input.is_weights() && trt_axis != 0) { + TF_RETURN_IF_ERROR(ConvertAxis( + axis[0], params_input.GetTrtDims().nbDims, node_def.name(), + params->use_implicit_batch && params_input.is_tensor(), &trt_axis)); + if (params->use_implicit_batch && params_input.is_weights() && + trt_axis != 0) { return errors::Unimplemented( "The input axis must be zero when params is a weight."); } - if (params_input.is_tensor() && indices_input.batch_size() != 1) { + if (params->use_implicit_batch && + (params_input.is_tensor() == indices_input.is_tensor()) && + (indices_input.batch_size() != 1 || params_input.batch_size() != 1)) { return errors::Unimplemented( - "Indices must have a batch size of 1 when params is a tensor."); + "Params and indices must have a batch size of 1 when params and 
indices" + " are both tensors or both constants."); } + + auto get_rank = [params](const auto& input) { + return input.GetTrtDims().nbDims + + (params->use_implicit_batch && input.is_tensor() ? 1 : 0); + }; // Both input are tensors, and the TF gather result will have rank: // (params.nbDims + 1) + (indices.nbDims + 1) - 1, // where "+ 1" adds the batch dim. If params is a weight, the TRT rank matches // the TF rank so we don't have to add + 1. - const int params_tf_rank = - params_input.GetTrtDims().nbDims + (params_input.is_tensor() ? 1 : 0); - const int indices_tf_rank = indices_input.GetTrtDims().nbDims + 1; + const int params_tf_rank = get_rank(params_input); + const int indices_tf_rank = get_rank(indices_input); const int tf_gather_output_rank = params_tf_rank + indices_tf_rank - 1; - if (tf_gather_output_rank > nvinfer1::Dims::MAX_DIMS + 1) { + if (tf_gather_output_rank > + nvinfer1::Dims::MAX_DIMS + (params->use_implicit_batch ? 1 : 0)) { return errors::InvalidArgument( "Result of gather has dimension greater than ", nvinfer1::Dims::MAX_DIMS + 1); } - if (params->validation_only) return Status::OK(); - // Convert params to tensor is it is a weight. - ITensorProxyPtr params_tensor = nullptr; - if (params_input.is_weights()) { - params_tensor = params->converter->CreateConstantLayer( - params_input.weights(), params_input.GetTrtDims()); - } else { - params_tensor = params_input.tensor(); + int32 batch_dims; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "batch_dims", &batch_dims)); + if (params->use_implicit_batch && batch_dims) { + return errors::InvalidArgument( + "batch_dims must be zero in implicit batch mode"); + } + if (!params->use_implicit_batch && batch_dims > 1) { + return errors::InvalidArgument( + "batch_dims cannot exceed 1 in dynamic shape mode"); } + if (params->validation_only) return Status::OK(); + + // Convert input or indices to tensor if it is a constant. + auto populate_tensor = [params](const auto& input) -> ITensorProxyPtr { + ITensorProxyPtr result_tensor = nullptr; + + if (input.is_weights()) { + result_tensor = params->converter->CreateConstantLayer( + input.weights(), input.GetTrtDims()); + } else { + result_tensor = input.tensor(); + } + + return result_tensor; + }; + + ITensorProxyPtr params_tensor = populate_tensor(params_input); + ITensorProxyPtr indices_tensor = populate_tensor(indices_input); + // Note on how IGatherLayer works: if both the data and indices tensors have // a batch size dimension of size N, it performs: // for batchid in xrange(N): // output[batchid, a0, ..., an, i, ..., j, b0, ..., bn] = ( // data[batchid, a0, ..., an, indices[batchid, i, ..., j] b0, ..., bn]) nvinfer1::IGatherLayer* layer = params->converter->network()->addGather( - *params_tensor->trt_tensor(), *indices_input.tensor()->trt_tensor(), trt_axis); + *params_tensor->trt_tensor(), *indices_tensor->trt_tensor(), trt_axis); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); + layer->setNbElementWiseDims(batch_dims); ITensorProxyPtr output_tensor = layer->getOutput(0); nvinfer1::Dims trt_gather_output_dims = output_tensor->getDimensions(); - // Note for the "- 2": one is for the output batch dim encapsulated by TF-TRT, - // and the other is for the output dimension that is squeezed by IGatherLayer - // because of the implicit batch dim in the indices (see the above note). - const int expected_trt_output_rank = - tf_gather_output_rank - (params_input.is_tensor() ? 
2 : 1); - if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { - return errors::Internal( - "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", - expected_trt_output_rank, - ", actual nbDims: ", trt_gather_output_dims.nbDims); + + if (params->use_implicit_batch) { + // Note for the "- 2": one is for the output batch dim encapsulated by + // TF-TRT, and the other is for the output dimension that is squeezed by + // IGatherLayer because of the implicit batch dim in the indices (see the + // above note). + const int expected_trt_output_rank = tf_gather_output_rank - + (params_input.is_tensor() ? 1 : 0) - + (indices_input.is_tensor() ? 1 : 0); + + if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { + return errors::Internal( + "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", + expected_trt_output_rank, + ", actual nbDims: ", trt_gather_output_dims.nbDims); + } } // Reshape the output so after adding the implicit batch dim it'll match the // output shape of TF GatherV2. - if (params_input.is_tensor()) { + if (params->use_implicit_batch && params_input.is_tensor() && + indices_input.is_tensor()) { for (int i = trt_gather_output_dims.nbDims; i > trt_axis; --i) { trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i - 1]; } trt_gather_output_dims.d[trt_axis] = 1; ++trt_gather_output_dims.nbDims; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(output_tensor), trt_gather_output_dims, - /*validation_only=*/false, &output_tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(output_tensor), + trt_gather_output_dims, + /*validation_only=*/false, &output_tensor, node_def)); + } + + // When input and indices are both constants, for the supported cases, reshape + // output so that after removing the implicit batch dim it will match the + // output shape of TF GatherV2 op. + if (params->use_implicit_batch && params_input.is_weights() && + indices_input.is_weights()) { + for (int i = trt_axis; i < trt_gather_output_dims.nbDims - 1; ++i) { + trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i + 1]; + } + + // Squeeze the implicit batch dimension out. Note: this works only + // when batch size for both inputs and indices are 1. + --trt_gather_output_dims.nbDims; + + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(output_tensor), + trt_gather_output_dims, + /*validation_only=*/false, &output_tensor, node_def)); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertFullyConnectedHelper(OpConverterParams* params, - ITensorProxyPtr tensor_a, - TRT_ShapedWeights weights_b, - bool transpose_b, const string& node_name) { - // Reshape input to 3D - this will be a no-op unless using int8 precision. - auto input_dim = tensor_a->getDimensions(); - while (input_dim.nbDims < 3) { - input_dim.d[input_dim.nbDims++] = 1; +// Converts the input matrix multiplication node to a fully connected (FC) layer +// if possible, as the FC layer has more tactics and INT implementations. +// Returns the output ITensor* if the node is converted or nullptr if conversion +// is not possible. An error status indicates internal problems during +// conversion. 
+::stream_executor::port::StatusOr ConvertFullyConnectedImpl( + const OpConverterParams* params, TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, bool transpose_b) { + if (!(!transpose_a && input_a.is_tensor() && input_b.is_weights())) { + VLOG(2) << "Not FC compatible, A must be non transposed tensor, and B " + "must be constant."; + return ITensorProxyPtr(nullptr); + } + + if (!params->use_implicit_batch && input_b.GetTrtDims().nbDims > 2 && + input_b.GetTrtDims().d[0] != 1) { + // Implicit broadcasting, if needed, has already been considered to + // transform the inputs and ensure the two operands have the same rank here. + // If the inputs have rank >= 3, then d[0] is the explicit batch dimension. + // The weight (input_b) must have batch size 1 in implicit batch mode. + VLOG(2) << "Not FC compatible, if B has an explicit batch dimension, then " + "it must be 1."; + return ITensorProxyPtr(nullptr); + } + + nvinfer1::Dims input_dim = input_a.GetTrtDims(); + if (input_dim.d[input_dim.nbDims - 1] == -1) { + VLOG(2) << "Not FC compatible, last dim of A must be static."; + return ITensorProxyPtr(nullptr); + } + + if (input_dim.nbDims + 2 > nvinfer1::Dims::MAX_DIMS) { + VLOG(2) << "Not FC compatible, cannot expand A's shape."; + return ITensorProxyPtr(nullptr); + } + + // Add two trailing 1's because FC layer combines the last three dims. + ITensorProxyPtr tensor_a = nullptr; + + // Initialize the elements of reshap_dim to 0. A value 0 in + // reshape_dim(i) will preserve the i-th dimension value from the shape of + // input_a. Add two trailing dimensions of size 1. + auto reshape_dim = DimsAdapter(input_dim.nbDims, + DimsAdapter::StorageType(input_dim.nbDims, 0)) + .Append(1) + .Append(1); + + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, input_a, reshape_dim, + /*validation_only=*/false, &tensor_a, node_def, /*op_instance=*/0, + /*origin_node_name=*/"FULLY_CONNECTED")); + + VLOG(2) << "New shape of A " << DebugString(tensor_a->getDimensions()); + + TRT_ShapedWeights weights_b = input_b.weights(); + TRT_ShapedWeights weights_2D(weights_b); + if (weights_b.Shape().NumDims() > 2) { + // Combine first nbDims-1 dims into a single dim, e.g. for a 4D tensor we + // transform [N, H, W, C] -> [N*H*W, C]. This is only valid if all batch + // dimensions are 1. + if (std::any_of(weights_b.Shape().begin(), + weights_b.Shape().begin() + weights_b.Shape().NumDims() - 2, + [](int d) { return d != 1; })) { + VLOG(2) << "Not FC compatible, B has a batch dim larger than 1"; + return ITensorProxyPtr(nullptr); + } + int k = weights_b.Shape().dim(weights_b.Shape().NumDims() - 1); + nvinfer1::Dims dims{2, {static_cast(weights_b.count() / k), k}}; + TF_RETURN_IF_ERROR(weights_2D.SetShape(dims)); } - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor_a), input_dim, /*validation_only=*/false, - &tensor_a)); // FC layer will transpose weights, so we need to pre-transpose. 
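// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] The ReorderCKtoKC call below is
// a plain row-major 2-D transpose: the TF weight is laid out as [C, K]
// (inputs x outputs) while the fully connected layer expects [K, C]. A
// standalone sketch of the reorder (plain C++, illustrative name):
#include <vector>

namespace fc_sketch {
// src is row-major with shape [c, k]; returns row-major data with shape [k, c].
inline std::vector<float> TransposeCKtoKC(const std::vector<float>& src, int c,
                                          int k) {
  std::vector<float> dst(src.size());
  for (int i = 0; i < c; ++i) {
    for (int j = 0; j < k; ++j) {
      dst[j * c + i] = src[i * k + j];
    }
  }
  return dst;
}
}  // namespace fc_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------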
- TRT_ShapedWeights weights(weights_b.TrtDType()); + TRT_ShapedWeights weights(weights_2D.TrtDType()); if (!transpose_b) { - weights = params->weight_store->GetTempWeights(weights_b); - ReorderCKtoKC(weights_b, &weights); + auto tmp = params->weight_store->GetTempWeights(weights_2D); + TRT_ENSURE_OK(tmp); + weights = std::move(tmp).ValueOrDie(); + ReorderCKtoKC(weights_2D, &weights); } else { - weights = weights_b; + weights = weights_2D; } TRT_ShapedWeights biases(weights.TrtDType()); - const int noutput = weights.shape_.d[0]; + int k = weights.Shape().dim(weights.Shape().NumDims() - 1); + const int noutput = weights.count() / k; + VLOG(2) << "Using fully connected layer with k=" << k + << ", n_output=" << noutput + << " weights shape: " << weights.Shape().DebugString() + << " to convert " << node_def.op(); nvinfer1::IFullyConnectedLayer* layer = params->converter->network()->addFullyConnected( - *tensor_a->trt_tensor(), noutput, weights.GetTrtWeights(), biases.GetTrtWeights()); + *tensor_a->trt_tensor(), noutput, weights.GetTrtWeights(), + biases.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_tensor = layer->getOutput(0); - // Reshape output to 1D - this will be a no-op unless using int8 precision. + // A fully connected layer produces output with two trailing singleton + // dimensions. We remove these. auto output_dim = output_tensor->getDimensions(); - output_dim.nbDims = 1; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(output_tensor), output_dim, /*validation_only=*/false, - &output_tensor)); - - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} - -Status ConvertMatMulHelper(OpConverterParams* params, - TRT_TensorOrWeights input_a, - TRT_TensorOrWeights input_b, bool transpose_a, - bool transpose_b, string node_name) { - // TODO: ReorderCKtoKC is currently not general enough to transpose weights - // that are not 2D. - if ((transpose_a && input_a.is_weights() && - input_a.GetTrtDims().nbDims != 2) || - (transpose_b && input_b.is_weights() && - input_b.GetTrtDims().nbDims != 2)) { - return errors::InvalidArgument( - "Cannot currently transpose constant input if it is not 2 dimensional"); - } - - // If A is a tensor, we can only transpose if it is at least 3D in TF, - // or TRT will not do the correct transposition. - if (transpose_a && input_a.is_tensor() && input_a.GetTrtDims().nbDims < 2) { - return errors::InvalidArgument( - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions."); + output_dim.nbDims -= 2; + // A zero in output_dim indicates copying the corresponding input dimension + // value during reshape. + std::fill(output_dim.d, output_dim.d + output_dim.nbDims, 0); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(output_tensor), output_dim, + /*validation_only=*/false, &output_tensor, node_def, + /*op_instance=*/1, /*origin_node_name=*/"FULLY_CONNECTED")); + return output_tensor; +} + +::stream_executor::port::StatusOr ConvertMatMulImpl( + const OpConverterParams* params, TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, bool transpose_b) { + if (params->use_implicit_batch) { + // In implicit batch mode we are very limited when can we multiply 2D + // matrices. If input_A is a 2D tensor, then nbDims==1 (implicit batch dim + // not counted). 
If A is not transposed and B is weight, then we can convert + // this treating A as a batch of vectors. This is the only possibility + // to implement MatMul with 2D input in implicit batch mode. + if ((input_a.GetTrtDims().nbDims < 2 && + (transpose_a || !input_b.is_weights())) || + (input_b.GetTrtDims().nbDims < 2)) { + return errors::InvalidArgument( + "MatMul with 2D tensors requires explicit batch mode, or that tensor" + " A is not transposed and B is a constant tensor."); + } } - // If B is a tensor, then it must be at least 3D in TF, - // or TRT won't be able to handle the multiply correctly. - if (input_b.is_tensor() && input_b.GetTrtDims().nbDims < 2) { - return errors::InvalidArgument( - "Second input must either be a constant, or contain at least 2 " - "non-batch dimensions."); - } - if (params->validation_only) return Status::OK(); + if (params->validation_only) return ITensorProxyPtr(nullptr); - // If an FC layer can be used and would be faster, use that instead. - const bool can_use_fc = - !transpose_a && input_a.is_tensor() && input_b.is_weights(); - const bool should_use_fc = can_use_fc && input_a.GetTrtDims().nbDims >= 3 && - input_b.GetTrtDims().nbDims == 2; - // If int8 is specified, FC must be used unless it is not compatible, as MM - // does not support int8 at this time. - if (should_use_fc || (can_use_fc && params->converter->precision_mode() == - TrtPrecisionMode::INT8)) { - return ConvertFullyConnectedHelper( - params, input_a.tensor()->trt_tensor(), input_b.weights(), transpose_b, node_name); + ::stream_executor::port::StatusOr result = + ConvertFullyConnectedImpl(params, input_a, input_b, transpose_a, + transpose_b); + TF_RETURN_IF_ERROR(result.status()); + ITensorProxyPtr output = result.ValueOrDie(); + if (*output) { + // FC conversion was successful, we can return. + return output; } - - const auto get_matrix_op = [](ITensorProxyPtr in, - bool transpose) -> nvinfer1::MatrixOperation { - return (in->getDimensions().nbDims < 2) - ? nvinfer1::MatrixOperation::kVECTOR - : (transpose) ? nvinfer1::MatrixOperation::kTRANSPOSE - : nvinfer1::MatrixOperation::kNONE; - }; - - // If the MatMul operand is a constant, applies transposes at conversion-time - // as necessary. If the operand is a tensor, does nothing. If required - // transposes were applied, sets transpose to false. - const auto prepare_matmul_operand = - [¶ms](TRT_TensorOrWeights operand, - bool* transpose) -> ITensorProxyPtr { + const auto convert_to_itensor = + [¶ms](TRT_TensorOrWeights operand) -> ITensorProxyPtr { if (operand.is_tensor()) { return operand.tensor(); } else { - TRT_ShapedWeights weights(operand.weights().TrtDType()); - if (*transpose) { - weights = params->weight_store->GetTempWeights(operand.weights()); - ReorderCKtoKC(operand.weights(), &weights); - // Weights have been transposed, can set transpose to false - *transpose = false; - } else { - weights = operand.weights(); - } - return params->converter->CreateConstantLayer(weights, weights.shape_); + return params->converter->CreateConstantLayer(operand.weights(), + operand.GetTrtDims()); } }; - ITensorProxyPtr tensor_a = prepare_matmul_operand(input_a, &transpose_a); - ITensorProxyPtr tensor_b = prepare_matmul_operand(input_b, &transpose_b); + ITensorProxyPtr tensor_a = convert_to_itensor(input_a); + ITensorProxyPtr tensor_b = convert_to_itensor(input_b); + + const auto get_matrix_op = [](ITensorProxyPtr in, + bool transpose) -> nvinfer1::MatrixOperation { + return (transpose) ? 
nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + }; + nvinfer1::MatrixOperation op_a, op_b; + // Note: In implicit batch mode kTRANSPOSE and kNONE are only valid if the + // matrix has at least 2 non-batch dimension. In implicit batch mode, if a has + // 1 dim (excluding batch dim), then we can only use kVECTOR, which will treat + // matrix A as a batch of vectors. + op_a = (tensor_a->getDimensions().nbDims < 2) + ? nvinfer1::MatrixOperation::kVECTOR + : get_matrix_op(tensor_a, transpose_a); + // In implicit batch mode, if B has only 1 dims (excluding batch dim) then we + // already reject the case and don't convert. One could consider using the + // kVECTOR flag to express C = MatMul(A, B.T) if A is weight, but the result + // will not have the correct shape: in TRT's implicit batch implementation, + // the result is a batch of vectors D_ji = A_ik * B_jk, where j is the batch + // dimension. In contrast, the TF MatMul op produces C = D.T, and we cannot + // transpose over the batch dimension (implicit batch mode). + op_b = get_matrix_op(tensor_b, transpose_b); nvinfer1::IMatrixMultiplyLayer* layer = params->converter->network()->addMatrixMultiply( - *tensor_a->trt_tensor(), get_matrix_op(tensor_a, transpose_a), *tensor_b->trt_tensor(), - get_matrix_op(tensor_b, transpose_b)); + *tensor_a->trt_tensor(), op_a, *tensor_b->trt_tensor(), op_b); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); - ITensorProxyPtr output_tensor = layer->getOutput(0); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + const auto& node_def = params->node_def; + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); + return ITensorProxyPtr(layer->getOutput(0)); +} + +Status ConvertMatMulHelper(const OpConverterParams* params, + TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, + bool transpose_b) { + ::stream_executor::port::StatusOr result = + ConvertMatMulImpl(params, input_a, input_b, transpose_a, transpose_b); + TF_RETURN_IF_ERROR(result.status()); + if (!params->validation_only) { + params->outputs->push_back(TRT_TensorOrWeights(result.ValueOrDie())); + } return Status::OK(); } // inputs are both two dimensional (ops::MatMul) -Status ConvertMatMul(OpConverterParams* params) { +Status ConvertMatMul(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - if (inputs.size() != 2) { - return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), - " inputs but expected 2, at ", - node_def.name()); - } + TFTRT_CHECK_INPUT_SIZE(inputs.size(), 2, node_def); + TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - bool transpose_a = attrs.get("transpose_a"); - bool transpose_b = attrs.get("transpose_b"); + bool transpose_a = false, transpose_b = false; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "transpose_a", &transpose_a)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "transpose_b", &transpose_b)); return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1), transpose_a, - transpose_b, node_def.name()); + transpose_b); } -Status ConvertBatchMatMul(OpConverterParams* params) { +Status ConvertBatchMatMul(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - if (inputs.size() != 2) { - return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), - " inputs but expected 2, at ", - 
node_def.name()); - } - // TODO(tmorris): Enable once false is updated to mean either tensor or weight - // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", - // false}})); + TFTRT_CHECK_INPUT_SIZE(inputs.size(), 2, node_def); + + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, {{"x", TrtInputArg::kBoth}, {"y", TrtInputArg::kBoth}})); + // TODO(tfeher): Consider adding INT8 type because FC layer can support it. TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { + // TODO(lsugy): don't assume that if all inputs are weights, grappler + // should fold them, because variables are weights. return errors::InvalidArgument( "All inputs are weights, but Grappler is expected to fold them."); } - if (inputs.at(0).is_tensor() && inputs.at(1).is_tensor() && - inputs.at(0).GetTrtDims().nbDims != inputs.at(1).GetTrtDims().nbDims) { - return errors::Unimplemented( - "Inputs must have the same rank if they are both tensors."); - } - TFAttrs attrs(node_def); - const bool transpose_a = attrs.get("adj_x"); - const bool transpose_b = attrs.get("adj_y"); + bool transpose_a = false, transpose_b = false; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "adj_x", &transpose_a)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "adj_y", &transpose_b)); - // There is no way to batch constants in TRT. Example: - // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 - // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 - // It is not possible to treat the weight input as a batched [3, 6] tensor. + // In case input_l is weight, check whether input_l has implicit batch mode + // compatible batch dim. const auto check_weight_is_not_batched = [](const TRT_TensorOrWeights& input_l, const TRT_TensorOrWeights& input_r) { - // If input_l is a weight, then input_r must be a tensor because - // otherwise the op would be handled by Grappler. + // There is no way to batch constants in TRT using implicit batch mode. + // Example: + // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 + // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 + // It is not possible to treat the weight input as a batched [3, 6] + // tensor. Batched weight tensors must have batch dim = 1 (after the + // broadcast). if (input_l.is_weights() && input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && input_l.GetTrtDims().d[0] != 1) { return errors::Unimplemented( - "TensorRT does not support batched constants."); + "TensorRT does not support batched constants in implicit batch " + "mode."); } return Status::OK(); }; - TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); - TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); + if (params->use_implicit_batch) { + TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); + TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); + } // Broadcast inputs. We don't check feasibility since the dimensions in a // MatMul don't need to match. For example, consider a valid set of inputs @@ -4843,71 +5070,38 @@ Status ConvertBatchMatMul(OpConverterParams* params) { // input 0: [N, T, C] // input 1: [1, C, K] // Since C != K and T != C, check feasiblity would fail. 
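To make the no-feasibility-check choice concrete, here is a minimal standalone sketch of the rank-alignment step performed by the broadcast call that follows (the helper name and main() below are illustrative, not the patch's BroadcastTensors API): the lower-rank operand is left-padded with 1's so both MatMul operands have equal rank, and per-dimension compatibility is deliberately not verified because MatMul contracts dimensions that need not match.

// Illustrative sketch only; mirrors /*check_feasibility=*/false above.
#include <algorithm>
#include <cstdio>
#include <vector>

static void AlignRanks(std::vector<int>& a, std::vector<int>& b) {
  // Left-pad the shorter shape with 1's; no per-dimension feasibility check.
  const size_t rank = std::max(a.size(), b.size());
  a.insert(a.begin(), rank - a.size(), 1);
  b.insert(b.begin(), rank - b.size(), 1);
}

int main() {
  std::vector<int> lhs = {8, 50, 64};  // [N, T, C]
  std::vector<int> rhs = {64, 32};     // [C, K]
  AlignRanks(lhs, rhs);                // lhs: [8, 50, 64], rhs: [1, 64, 32]
  for (int d : rhs) std::printf("%d ", d);  // prints: 1 64 32
  std::printf("\n");
  return 0;
}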
- nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; - TF_RETURN_IF_ERROR(GetTrtBroadcastShape( - inputs.at(0), inputs.at(1), /*check_feasibility=*/false, - &broadcasted_dims_l, &broadcasted_dims_r)); - ITensorProxyPtr tensor_l = nullptr; - ITensorProxyPtr tensor_r = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); - if (params->validation_only) return Status::OK(); - - return ConvertMatMulHelper(params, TRT_TensorOrWeights(tensor_l), - TRT_TensorOrWeights(tensor_r), transpose_a, - transpose_b, node_def.name()); -} - -Status ConvertSoftmax(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"logits", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - ITensorProxyPtr tensor = inputs.at(0).tensor(); + auto input_l = std::make_unique(inputs.at(0)); + auto input_r = std::make_unique(inputs.at(1)); + TF_RETURN_IF_ERROR(BroadcastTensors(input_l, input_r, + /*check_feasibility=*/false, params)); - const int num_trt_dims = tensor->getDimensions().nbDims; - if (num_trt_dims == 0) { - return errors::InvalidArgument( - "TensorRT Softmax cannot apply on batch dimension, at", - node_def.name()); - } if (params->validation_only) return Status::OK(); - nvinfer1::ISoftMaxLayer* layer = - params->converter->network()->addSoftMax(*tensor->trt_tensor()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Tensorflow SoftMax assumes applying softmax on the last dimension. - layer->setAxes(1 << (num_trt_dims - 1)); - - ITensorProxyPtr output_tensor = layer->getOutput(0); - // Quantization range for SoftMax is always (0, 1) - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 1.0f); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); + return ConvertMatMulHelper(params, *input_l, *input_r, transpose_a, + transpose_b); } -Status ConvertArgMinMax(OpConverterParams* params) { +Status ConvertArgMinMax(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"input", false}, {"dimension", true}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - // INT64 outputs are not supported by TRT. 
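For reference, the axis remapping that ConvertAxis performs for ConvertArgMinMax just below can be sketched standalone as follows (assumed semantics with simplified error handling; the function name and return type here are illustrative): negative TF axes are normalized, the batch axis is rejected in implicit batch mode, and the implicit batch dimension is dropped from the index.

// Illustrative sketch of the TF-axis -> TRT-axis mapping (assumed behavior).
#include <cstdio>
#include <optional>

static std::optional<int> RemapAxis(int tf_axis, int trt_nb_dims,
                                    bool use_implicit_batch) {
  const int tf_nb_dims = trt_nb_dims + (use_implicit_batch ? 1 : 0);
  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) return std::nullopt;
  if (tf_axis < 0) tf_axis += tf_nb_dims;                       // e.g. -1 -> last dim
  if (use_implicit_batch && tf_axis == 0) return std::nullopt;  // batch dim not allowed
  return use_implicit_batch ? tf_axis - 1 : tf_axis;
}

int main() {
  // TF tensor [N, H, W, C] in implicit batch mode: TRT sees rank 3.
  std::printf("%d\n", *RemapAxis(-1, /*trt_nb_dims=*/3, true));   // prints 2
  std::printf("%d\n", *RemapAxis(3, /*trt_nb_dims=*/4, false));   // prints 3
  return 0;
}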
- TFAttrs attrs(node_def); - DataType output_dtype = attrs.get("output_type"); + + DataType output_dtype{DataType::DT_INT32}; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "output_type", &output_dtype)); + if (output_dtype != DataType::DT_INT32) { return errors::Unimplemented("Output type ", DataTypeString(output_dtype), - " is not supported, at ", node_def.name()); + " is not supported"); } int tf_axis = inputs.at(1).weights().GetSpan()[0]; int trt_axis; nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + params->use_implicit_batch, &trt_axis)); nvinfer1::TopKOperation topk_op; if (node_def.op() == "ArgMin") { topk_op = nvinfer1::TopKOperation::kMIN; @@ -4916,6 +5110,18 @@ Status ConvertArgMinMax(OpConverterParams* params) { } else { return errors::InvalidArgument("Unsupported ArgMin/Max operation"); } + +#if !IS_TRT_VERSION_GE(7, 0, 0, 11) + const nvinfer1::Dims trt_dims = params->inputs.at(0).GetTrtDims(); + if (trt_dims.nbDims >= 4) { + string trt_dim_str = DebugString(trt_dims); + + return errors::Unimplemented(node_def.op(), "op is not able to support", + " tensors with 4+ dimensions (excluding batch", + " size). Received: ", trt_dim_str); + } +#endif + if (params->validation_only) return Status::OK(); // Use TopK with k = 1. Only indices output is needed (output 1). @@ -4923,51 +5129,63 @@ Status ConvertArgMinMax(OpConverterParams* params) { nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK( *inputs.at(0).tensor()->trt_tensor(), topk_op, 1, reduce_axes); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, "topk"); ITensorProxyPtr output_indices_tensor = layer->getOutput(1); // Squeeze on axis. - std::vector size(dims.d, dims.d + dims.nbDims); - size.erase(size.begin() + trt_axis); - nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(size, &new_dims)); + std::vector input_dims(dims.d, dims.d + dims.nbDims); + input_dims[trt_axis] = 0; ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(output_indices_tensor), new_dims, - /*validation_only=*/false, &output_tensor)); - + TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( + /*input=*/output_indices_tensor, + /*input_dims=*/&input_dims, + /*params=*/params, + /*output=*/&output_tensor)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); } -Status ConvertTopK(OpConverterParams* params) { +Status ConvertTopK(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"input", false}, {"k", true}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + bool sorted{false}; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "sorted", &sorted)); + + if (!sorted) { + // TensorRT only supports sorted output. Although TensorFlow API + // doesn't specify the order of output elements in case sorted=false, + // but it's safer to not convert because the output of TensorRT might + // be different with TensorFlow which can cause confusion. 
+ return errors::InvalidArgument("Only sorted=True is supported"); + } + ITensorProxyPtr tensor = inputs.at(0).tensor(); const int num_dims = tensor->getDimensions().nbDims; if (num_dims == 0) { return errors::InvalidArgument( - "TensorRT TopK cannot apply on batch dimension, at", node_def.name()); + "TensorRT TopK cannot apply on batch dimension"); } TRT_ShapedWeights k_w = inputs.at(1).weights(); if (k_w.count() != 1) { - return errors::InvalidArgument("k value of TopK should be a scalar, at", - node_def.name()); + return errors::InvalidArgument("k value of TopK should be a scalar"); } // Note that ITopKLayer always have sorted outputs, so we don't need to handle // the 'sorted' attribute of the node. if (params->validation_only) return Status::OK(); const nvinfer1::TopKOperation op = nvinfer1::TopKOperation::kMAX; - const int k = *(static_cast(k_w.GetValues())); + const int k = *(k_w.GetPointer()); const uint32_t reduce_axes = 1 << (num_dims - 1); - nvinfer1::ITopKLayer* layer = - params->converter->network()->addTopK(*tensor->trt_tensor(), op, k, reduce_axes); + nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK( + *tensor->trt_tensor(), op, k, reduce_axes); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_value_tensor = layer->getOutput(0); ITensorProxyPtr output_indices_tensor = layer->getOutput(1); @@ -4976,40 +5194,176 @@ Status ConvertTopK(OpConverterParams* params) { return Status::OK(); } -Status ConvertDepthSpaceShuffle(OpConverterParams* params) { +::stream_executor::port::StatusOr> +CalcDepthSpaceDynamicShape(const OpConverterParams* params, int block_size, + string data_format) { + // Instead we use a shape layer and shape arithmetic to calculate the reshape + // dimensions. + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + + const int channels_axis = data_format == "NCHW" ? 1 : 3; + const int h_axis = data_format == "NCHW" ? 2 : 1; + const int w_axis = data_format == "NCHW" ? 3 : 2; + + // Get shapes. + ITensorProxyPtr shape = params->converter->network() + ->addShape(*inputs.at(0).tensor()->trt_tensor()) + ->getOutput(0); + ITensorProxyPtr batch_size = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr num_channels = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {channels_axis}}, {1, {1}}, + {1, {1}}) + ->getOutput(0); + ITensorProxyPtr h = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {h_axis}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr w = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {w_axis}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr r; + TF_RETURN_IF_ERROR(CreateScalarConstant(params, block_size, &r)); + ITensorProxyPtr r_squared; + TF_RETURN_IF_ERROR( + CreateScalarConstant(params, block_size * block_size, &r_squared)); + // Get shuffle parameters. + std::vector first_shuffle_tensors(6, nullptr); + std::vector second_shuffle_tensors(4, nullptr); + if (node_def.op() == "DepthToSpace") { + // First Reshape [N, C, H, W] - > [N, r, r, C/(r*r), H, W]. 
+ first_shuffle_tensors[0] = batch_size; + first_shuffle_tensors[1] = r; + first_shuffle_tensors[2] = r; + first_shuffle_tensors[3] = + params->converter->network() + ->addElementWise(*num_channels->trt_tensor(), + *r_squared->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + first_shuffle_tensors[4] = h; + first_shuffle_tensors[5] = w; + // Second Reshape [N, C/(r*r), H, r, W, r] -> [N, C/(r*r), H * r, W * r]. + second_shuffle_tensors[0] = batch_size; + second_shuffle_tensors[1] = + params->converter->network() + ->addElementWise(*num_channels->trt_tensor(), + *r_squared->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + second_shuffle_tensors[2] = + params->converter->network() + ->addElementWise(*h->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + second_shuffle_tensors[3] = + params->converter->network() + ->addElementWise(*w->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + } else if (node_def.op() == "SpaceToDepth") { + // First Reshape [N, C, H, W] -> [N, C, H/r, r, W/r, r]. + first_shuffle_tensors[0] = batch_size; + first_shuffle_tensors[1] = num_channels; + first_shuffle_tensors[2] = + params->converter->network() + ->addElementWise(*h->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + first_shuffle_tensors[3] = r; + first_shuffle_tensors[4] = + params->converter->network() + ->addElementWise(*w->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + first_shuffle_tensors[5] = r; + + // Second Reshape [N, r, r, C, H/r, W/r] -> [N, C*r*r, H/r, W/r]. + second_shuffle_tensors[0] = batch_size; + second_shuffle_tensors[1] = + params->converter->network() + ->addElementWise(*num_channels->trt_tensor(), + *r_squared->trt_tensor(), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + second_shuffle_tensors[2] = + params->converter->network() + ->addElementWise(*h->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + second_shuffle_tensors[3] = + params->converter->network() + ->addElementWise(*w->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + } + + ::stream_executor::port::StatusOr result = + ConcatenateTensors(params, first_shuffle_tensors, 0); + TF_RETURN_IF_ERROR(result.status()); + ITensorProxyPtr first_shuffle_shape = result.ValueOrDie(); + + result = ConcatenateTensors(params, second_shuffle_tensors, 1); + TF_RETURN_IF_ERROR(result.status()); + ITensorProxyPtr second_shuffle_shape = result.ValueOrDie(); + + return std::make_pair(first_shuffle_shape, second_shuffle_shape); +} + +Status ConvertDepthSpaceShuffle(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); - TFAttrs attrs(node_def); - const int block_size = attrs.get("block_size"); + + string data_format; + int block_size; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "block_size", &block_size)); + if (block_size < 2) { - return errors::InvalidArgument("Block size must be 2 or greater, at ", - node_def.name()); + return errors::InvalidArgument("Block size must be 2 or greater"); } - const string 
data_format = attrs.get("data_format"); + if (data_format != "NCHW" && data_format != "NHWC") { return errors::Unimplemented("Data format ", data_format, - " is not supported, at ", node_def.name()); + " is not supported"); } + int idx_offset = params->use_implicit_batch ? 0 : 1; nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); - if (dims.nbDims != 3) { + const int required_rank = 3 + idx_offset; + if (dims.nbDims != required_rank) { return errors::InvalidArgument("The input to ", node_def.op(), - " must be rank 4, at ", node_def.name()); - } - const int num_channels = data_format == "NCHW" ? dims.d[0] : dims.d[2]; - const int h = data_format == "NCHW" ? dims.d[1] : dims.d[0]; - const int w = data_format == "NCHW" ? dims.d[2] : dims.d[1]; + " must be rank 4"); + } + const int num_channels = + data_format == "NCHW" ? dims.d[0 + idx_offset] : dims.d[2 + idx_offset]; + const int h = + data_format == "NCHW" ? dims.d[1 + idx_offset] : dims.d[0 + idx_offset]; + const int w = + data_format == "NCHW" ? dims.d[2 + idx_offset] : dims.d[1 + idx_offset]; // Get shuffle parameters. nvinfer1::Dims first_shuffle_shape; nvinfer1::Permutation transpose_perm; nvinfer1::Dims second_shuffle_shape; + + // We define all the shuffle and transpose dimensions assuming implicit batch + // mode. Afterwards we will update them to explicit batch mode if needed. + // Additionally, an NCHW layout is assumed, and this assumption is corrected + // afterwards with an initial transpose op. TODO(tfeher): Get rid of the + // layout_transpose ops by defining shuffle shape specifically for NCHW and + // NHCW. if (node_def.op() == "DepthToSpace") { - if (num_channels % (block_size * block_size) != 0) { + if (num_channels != -1 && num_channels % (block_size * block_size) != 0) { return errors::InvalidArgument( - "Number of channels must be divisible by block_size*block_size, at ", - node_def.name()); + "Number of channels must be divisible by block_size*block_size"); } // First Reshape [C, H, W] - > [r, r, C/(r*r), H, W] first_shuffle_shape = { @@ -5021,12 +5375,13 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { // Second Reshape [C/(r*r), H, r, W, r] -> [C/(r*r), H * r, W * r] second_shuffle_shape = nvinfer1::Dims3(num_channels / (block_size * block_size), - h * block_size, w * block_size); - } else if (node_def.op() == "SpaceToDepth") { - if (h % block_size != 0 || w % block_size != 0) { + h * block_size, w * block_size); + } else { + if (node_def.op() != "SpaceToDepth") + return errors::InvalidArgument("Incorrect op type ", node_def.op()); + if ((h != -1 && h % block_size != 0) || (w != -1 && w % block_size != 0)) { return errors::InvalidArgument( - "Width and height must be divisible by block_size, at ", - node_def.name()); + "Width and height must be divisible by block_size"); } // First Reshape [C, H, W] -> [C, H/r, r, W/r, r] first_shuffle_shape = {/*nbDims=*/5, @@ -5041,34 +5396,90 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { if (params->validation_only) return Status::OK(); nvinfer1::IShuffleLayer* first_shuffle = - params->converter->network()->addShuffle(*inputs.at(0).tensor()->trt_tensor()); + params->converter->network()->addShuffle( + *inputs.at(0).tensor()->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(first_shuffle, node_def.name()); + params->converter->SetLayerName(first_shuffle, node_def, "shuffle", + /*op_instance=*/0); + + ITensorProxyPtr second_shuffle_shape_tensor; + + if (HasStaticShape(inputs.at(0).GetTrtDims())) { + // Adjust a reshape constructed at implicit batch mode 
for explicit batch + // mode. In particular, we need to insert the batch dimension size to the + // beginning of all the dimension sizes. Example: reshape {20,10,30} for + // implicit batch mode becomes reshape {N,20,10,30} for explicit batch mode. + auto adjust_reshape = [](int N, nvinfer1::Dims dims, + bool use_implicit_batch) { + if (use_implicit_batch) return dims; + for (int i = dims.nbDims; i > 0; i--) { + dims.d[i] = dims.d[i - 1]; + } + dims.d[0] = N; + dims.nbDims++; + return dims; + }; + + first_shuffle_shape = adjust_reshape(dims.d[0], first_shuffle_shape, + params->use_implicit_batch); + second_shuffle_shape = adjust_reshape(dims.d[0], second_shuffle_shape, + params->use_implicit_batch); + + first_shuffle->setReshapeDimensions(first_shuffle_shape); + } else { + ::stream_executor::port::StatusOr< + std::pair> + result = CalcDepthSpaceDynamicShape(params, block_size, data_format); + TF_RETURN_IF_ERROR(result.status()); + first_shuffle->setInput(1, *result.ValueOrDie().first->trt_tensor()); + second_shuffle_shape_tensor = result.ValueOrDie().second; + } + + // Adjust a transpose constructed assuming implicit batch mode for explicit + // batch mode. In particular, we need to add the batch dimension to d0 and + // add 1 to all the dimension id in the transpose. Example: permutation + // for implicit batch mode becomes permutation {0,3,2,1} for explicit batch + // mode. + auto adjust_perm = [](int n, nvinfer1::Permutation perm, + bool use_implicit_batch) { + if (use_implicit_batch) return perm; + for (int i = n; i > 0; i--) { + perm.order[i] = perm.order[i - 1] + 1; + } + perm.order[0] = 0; + return perm; + }; + transpose_perm = adjust_perm(5, transpose_perm, params->use_implicit_batch); + if (data_format == "NHWC") { - first_shuffle->setFirstTranspose({2, 0, 1}); + nvinfer1::Permutation layout_transpose = + adjust_perm(3, {2, 0, 1}, params->use_implicit_batch); + first_shuffle->setFirstTranspose(layout_transpose); } - first_shuffle->setReshapeDimensions(first_shuffle_shape); first_shuffle->setSecondTranspose(transpose_perm); nvinfer1::IShuffleLayer* second_shuffle = params->converter->network()->addShuffle(*first_shuffle->getOutput(0)); TFTRT_RETURN_ERROR_IF_NULLPTR(second_shuffle, node_def.name()); - second_shuffle->setReshapeDimensions(second_shuffle_shape); + params->converter->SetLayerName(second_shuffle, node_def, "shuffle", + /*op_instance=*/1); + + if (HasStaticShape(inputs.at(0).GetTrtDims())) { + second_shuffle->setReshapeDimensions(second_shuffle_shape); + } else { + second_shuffle->setInput(1, *second_shuffle_shape_tensor->trt_tensor()); + } if (data_format == "NHWC") { - second_shuffle->setSecondTranspose({1, 2, 0}); + nvinfer1::Permutation layout_transpose = + adjust_perm(3, {1, 2, 0}, params->use_implicit_batch); + second_shuffle->setSecondTranspose(layout_transpose); } - ITensorProxyPtr input_tensor = inputs.at(0).tensor(); - ITensorProxyPtr first_shuffle_tensor = first_shuffle->getOutput(0); - ITensorProxyPtr second_shuffle_tensor = second_shuffle->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&input_tensor, - &first_shuffle_tensor); - params->converter->MarkQuantizationRangesAsInferrable(&first_shuffle_tensor, - &second_shuffle_tensor); - params->outputs->push_back(TRT_TensorOrWeights(second_shuffle_tensor)); + params->outputs->push_back(TRT_TensorOrWeights(second_shuffle->getOutput(0))); return Status::OK(); } -Status ConvertSquaredDifference(OpConverterParams* params) { +Status ConvertSquaredDifference(const OpConverterParams* params) { 
TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", false}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); @@ -5078,231 +5489,314 @@ Status ConvertSquaredDifference(OpConverterParams* params) { nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; TF_RETURN_IF_ERROR(GetTrtBroadcastShape( inputs.at(0), inputs.at(1), /*check_feasibility=*/true, - &broadcasted_dims_l, &broadcasted_dims_r)); + params->use_implicit_batch, &broadcasted_dims_l, &broadcasted_dims_r)); ITensorProxyPtr tensor_l = nullptr; ITensorProxyPtr tensor_r = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); + TF_RETURN_IF_ERROR( + PrepareTensorForShape(params->converter, inputs.at(0), broadcasted_dims_l, + params->validation_only, &tensor_l, node_def)); + TF_RETURN_IF_ERROR( + PrepareTensorForShape(params->converter, inputs.at(1), broadcasted_dims_r, + params->validation_only, &tensor_r, node_def)); if (params->validation_only) return Status::OK(); // Subtract x - y. nvinfer1::IElementWiseLayer* sub = params->converter->network()->addElementWise( - *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), nvinfer1::ElementWiseOperation::kSUB); + *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUB); TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name()); + params->converter->SetLayerName(sub, node_def, "sub"); + // Multiply (x - y) * (x - y). nvinfer1::IElementWiseLayer* mul = params->converter->network()->addElementWise( *sub->getOutput(0), *sub->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); TFTRT_RETURN_ERROR_IF_NULLPTR(mul, node_def.name()); + params->converter->SetLayerName(mul, node_def, "mul"); params->outputs->push_back(TRT_TensorOrWeights(mul->getOutput(0))); return Status::OK(); } -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -Status ConvertCombinedNMS(OpConverterParams* params) { - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"boxes", false}, - {"scores", false}, - {"max_output_size_per_class", true}, - {"max_total_size", true}, - {"iou_threshold", true}, - {"score_threshold", true}})); +#if IS_TRT_VERSION_GE(7, 1, 3, 0) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) +Status ConvertCombinedNMS(const OpConverterParams* params) { + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, {{"boxes", TrtInputArg::kTensor}, + {"scores", TrtInputArg::kTensor}, + {"max_output_size_per_class", TrtInputArg::kWeight}, + {"max_total_size", TrtInputArg::kWeight}, + {"iou_threshold", TrtInputArg::kWeight}, + {"score_threshold", TrtInputArg::kWeight}})); const auto& inputs = params->inputs; const auto& node_def = params->node_def; + const auto& node_name = node_def.name(); - ITensorProxyPtr boxes_tensor = inputs.at(0).tensor(); - ITensorProxyPtr scores_tensor = inputs.at(1).tensor(); - TRT_ShapedWeights output_size_per_class = inputs.at(2).weights(); - TRT_ShapedWeights total_size = inputs.at(3).weights(); - TRT_ShapedWeights iou_threshold = inputs.at(4).weights(); - TRT_ShapedWeights score_threshold = inputs.at(5).weights(); - - // Validate tensors and weights (also set some of the needed plugin fields) + const ITensorProxyPtr boxes_tensor = inputs.at(0).tensor(); + const ITensorProxyPtr scores_tensor = inputs.at(1).tensor(); const auto boxes_dims = boxes_tensor->getDimensions(); const auto scores_dims = 
scores_tensor->getDimensions(); - if (boxes_dims.nbDims != 3) { + +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + const auto flag = true; + const auto* plugin_name = "NMS TRT Plugin "; + const auto* pluginName = "EfficientNMS_TFTRT_TRT"; +#else // IS_TRT_VERSION_GE(7, 1, 3, 0) + const auto flag = false; + const auto* plugin_name = "TensorRT BatchedNMS Plugin "; + const auto* pluginName = "BatchedNMS_TRT"; + + auto AllowNmsTopkOverride = []() { + static bool result = [] { + bool value; + const Status status = ReadBoolFromEnvVar("TF_TRT_ALLOW_NMS_TOPK_OVERRIDE", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; + }(); + return result; + }; +#endif + + if (params->use_implicit_batch == flag) { + if (flag) { + return errors::Unimplemented( + convert_not_supported_implicit(node_def.op(), node_name)); + } else { + if (!HasStaticShape(boxes_dims) || !HasStaticShape(scores_dims)) { + return errors::Unimplemented(plugin_name, + "requires input with static shape"); + } + } + } + + const auto& output_size_per_class = inputs.at(2).weights(); + const auto& total_size = inputs.at(3).weights(); + const auto& iou_threshold = inputs.at(4).weights(); + const auto& score_threshold = inputs.at(5).weights(); + + const int offset = params->use_implicit_batch ? 0 : 1; + if (boxes_dims.nbDims != 3 + offset) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin input boxes must be 3-D excluding batch ", - node_def.name()); + plugin_name, "input boxes must be 4-D including batch, at ", node_name); } - const int num_classes = scores_dims.d[1]; - bool box_check = boxes_dims.d[1] == 1 || boxes_dims.d[1] == num_classes; + + AttrSlice attrs(node_def); + bool clip_boxes = false, pad_per_class = false; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "clip_boxes", &clip_boxes)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "pad_per_class", &pad_per_class)); + + const int class_idx = 1 + offset; + const int num_classes = scores_dims.d[class_idx]; + const bool box_check = + boxes_dims.d[class_idx] == 1 || boxes_dims.d[class_idx] == num_classes; if (!box_check) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin third dimension of boxes must be either 1 " - "or num_classes ", - node_def.name()); + plugin_name, + "third dimension of boxes must be either 1" + "or match the num_classes dimension of scores, at ", + node_name); } - if (output_size_per_class.shape_.nbDims != 1) { + + if (output_size_per_class.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin max_output_size_per_class must be 0-D ", - node_def.name()); + plugin_name, "max_output_size_per_class must be scalar, at ", + node_name); } - int max_size_per_class = - *(static_cast(output_size_per_class.GetValues())); + + const int max_size_per_class = *(output_size_per_class.GetPointer()); if (max_size_per_class <= 0) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin max_output_size_per_class should be > 0", - node_def.name()); + plugin_name, "max_output_size_per_class should be > 0, at ", node_name); } - if (total_size.shape_.nbDims != 1) { + + if (total_size.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin max_total_size must be 0-D ", - node_def.name()); + plugin_name, "max_total_size must be scalar, at ", node_name); } - int max_total_size = *(static_cast(total_size.GetValues())); + + int max_total_size = *(total_size.GetPointer()); if (max_total_size <= 0) { return errors::InvalidArgument( - "TensorRT 
BatchedNMS Plugin max_total_size should be > 0", - node_def.name()); + plugin_name, "max_total_size should be > 0, at ", node_name); } - if (iou_threshold.shape_.nbDims != 1) { + + if (iou_threshold.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin iou_threshold must be 0-D ", - node_def.name()); + plugin_name, "iou_threshold must be scalar, at ", node_name); } - float iou_thresh = *(static_cast(iou_threshold.GetValues())); + + const auto iou_thresh = *(iou_threshold.GetPointer()); if (iou_thresh < 0.0 || iou_thresh > 1.0) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin iou_threshold must be in [0, 1]", - node_def.name()); + plugin_name, "iou_threshold must be in [0, 1], at", node_name); } - if (score_threshold.shape_.nbDims != 1) { + + if (score_threshold.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin score_threshold must be 0-D ", - node_def.name()); + plugin_name, "score_threshold must be scalar, at ", node_name); + } + +#if !IS_TRT_VERSION_GE(8, 2, 1, 6) && !defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + // TRT op is_normalized=False treats input coordinates as pixels and + // calculates width/height as (max - min + 1). + // + // TF op CombinedNonMaxSuppression doesn't care about the normalization and + // calculates width/height as (max-min). + // + // We set is_normalized = true to be consistent with TF IOU calculaton. + const bool is_normalized = true; + const int backgrnd_id = -1; + const bool share_location = (boxes_dims.d[class_idx] == 1); + int keep_top_k = + pad_per_class ? std::min(max_size_per_class * num_classes, max_total_size) + : max_total_size; + + // According to the batchedNMS plugin description we need to set top_k so that + // keep_top_k <= top_k + // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin + // Before the NMS step, TRT selects top_k candidate from each class and + // discards the rest. The NMS step is performed only among the top_k + // candidates. To be strictly compatible with the TF op, we need that top_k is + // greater equal to num_boxes. + const int num_boxes = boxes_dims.d[offset]; + int top_k = std::max(num_boxes, keep_top_k); + // TRT has a limitation: top_k <=4096. + if (top_k > 4096) { + if (AllowNmsTopkOverride()) { + top_k = 4096; + keep_top_k = std::min(top_k, keep_top_k); + } else { + return errors::InvalidArgument( + "TRT NMS plugin allow top_k<=4096, where top_k = max(num_boxes, " + "max_total_size). You can override this by setting " + "TF_TRT_ALLOW_NMS_TOPK_OVERRIDE=1 environment variable, but this can " + "result in a loss of accuracy."); + } } +#endif if (params->validation_only) return Status::OK(); - // TF op CombinedNonMaxSuppression doesn't have the option of - // not normalizing coordinates. 
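The keepTopK/topK arithmetic for the BatchedNMS plugin path above can be illustrated with a small standalone sketch (names are illustrative; the TF_TRT_ALLOW_NMS_TOPK_OVERRIDE environment variable is modeled here as a plain boolean): keepTopK follows pad_per_class, topK must cover all candidate boxes, and the plugin's topK <= 4096 limit is enforced, either by rejecting the conversion or by clamping when the override is allowed.

// Illustrative sketch of the keepTopK/topK selection above.
#include <algorithm>
#include <cstdio>

struct TopKConfig { int top_k; int keep_top_k; bool ok; };

static TopKConfig SelectTopK(int num_boxes, int num_classes,
                             int max_size_per_class, int max_total_size,
                             bool pad_per_class, bool allow_override) {
  int keep_top_k =
      pad_per_class ? std::min(max_size_per_class * num_classes, max_total_size)
                    : max_total_size;
  int top_k = std::max(num_boxes, keep_top_k);
  if (top_k > 4096) {
    if (!allow_override) return {top_k, keep_top_k, false};  // reject conversion
    top_k = 4096;                                            // clamp, may lose accuracy
    keep_top_k = std::min(top_k, keep_top_k);
  }
  return {top_k, keep_top_k, true};
}

int main() {
  TopKConfig c = SelectTopK(/*num_boxes=*/8192, /*num_classes=*/90,
                            /*max_size_per_class=*/100, /*max_total_size=*/200,
                            /*pad_per_class=*/false, /*allow_override=*/true);
  std::printf("top_k=%d keep_top_k=%d ok=%d\n", c.top_k, c.keep_top_k, c.ok);
  return 0;
}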
- const bool is_normalized = true; - // Set plugin fields and the field collection - TFAttrs attrs(node_def); - bool share_location = (boxes_dims.d[1] == 1); - const bool pad_per_class = attrs.get("pad_per_class"); - const int top_k = boxes_dims.d[0]; - int keep_top_k = 0; - if (pad_per_class) { - keep_top_k = std::min(max_size_per_class * num_classes, max_total_size); - } else { - keep_top_k = max_total_size; - } - float score_thresh = *(static_cast(score_threshold.GetValues())); - const int background_id = -1; - nvinfer1::PluginField fields[8] = { - nvinfer1::PluginField{"shareLocation", &share_location, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"backgroundLabelId", &background_id, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"numClasses", &num_classes, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"topK", &top_k, nvinfer1::PluginFieldType::kINT32, - 1}, - nvinfer1::PluginField{"keepTopK", &keep_top_k, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"scoreThreshold", &score_thresh, - nvinfer1::PluginFieldType::kFLOAT32, 1}, - nvinfer1::PluginField{"iouThreshold", &iou_thresh, - nvinfer1::PluginFieldType::kFLOAT32, 1}, - nvinfer1::PluginField{"isNormalized", &is_normalized, - nvinfer1::PluginFieldType::kINT32, 1}, + // Create plugin. + float score_thresh = *(score_threshold.GetPointer()); + nvinfer1::PluginField fields[] = { +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + {"max_output_size_per_class", &max_size_per_class, + nvinfer1::PluginFieldType::kINT32, 1}, + {"max_total_size", &max_total_size, nvinfer1::PluginFieldType::kINT32, 1}, + {"iou_threshold", &iou_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"score_threshold", &score_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"pad_per_class", &pad_per_class, nvinfer1::PluginFieldType::kINT32, 1}, + {"clip_boxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, +#else // IS_TRT_VERSION_GE(7, 1, 3, 0) + {"shareLocation", &share_location, nvinfer1::PluginFieldType::kINT32, 1}, + {"backgroundLabelId", &backgrnd_id, nvinfer1::PluginFieldType::kINT32, 1}, + {"numClasses", &num_classes, nvinfer1::PluginFieldType::kINT32, 1}, + {"topK", &top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"scoreThreshold", &score_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"iouThreshold", &iou_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"isNormalized", &is_normalized, nvinfer1::PluginFieldType::kINT32, 1}, + {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, +#endif }; - nvinfer1::PluginFieldCollection fc{8, fields}; - // Get plugin creator - auto creator = - getPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1", ""); - TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_def.name()); + nvinfer1::PluginFieldCollection fc{sizeof(fields) / sizeof(fields[0]), + fields}; + + // Get plugin creator. + auto creator = getPluginRegistry()->getPluginCreator(pluginName, "1", ""); + TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_name); - // Create plugin TrtUniquePtrType plugin( - creator->createPlugin(node_def.name().c_str(), &fc)); - TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_def.name()); + creator->createPlugin(node_name.c_str(), &fc)); + TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_name); - // Set plugin inputs + // Set plugin inputs. 
std::vector trt_plugin_inputs; trt_plugin_inputs.push_back(boxes_tensor->trt_tensor()); trt_plugin_inputs.push_back(scores_tensor->trt_tensor()); - // Add plugin to network + // Add plugin to network. nvinfer1::IPluginV2Layer* layer = params->converter->network()->addPluginV2( &trt_plugin_inputs[0], static_cast(trt_plugin_inputs.size()), *plugin); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); + params->converter->SetLayerName(layer, node_def, "plugin"); - // Set plugin outputs - nvinfer1::ITensor* output_nmsed_boxes = layer->getOutput(1); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - // TRT6 fixes (removes) the extra last dimension in CombinedNMS outputs + // Set plugin outputs. + const ITensorProxyPtr output_detection_boxes = layer->getOutput(1); + const ITensorProxyPtr output_detection_scores = layer->getOutput(2); ITensorProxyPtr output_num_detections = layer->getOutput(0); - ITensorProxyPtr output_nmsed_scores = layer->getOutput(2); - ITensorProxyPtr output_nmsed_classes = layer->getOutput(3); -#else - ITensorProxyPtr output_num_detections = nullptr; - ITensorProxyPtr output_nmsed_scores = nullptr; - ITensorProxyPtr output_nmsed_classes = nullptr; - - auto shrink_last_dim = [params](nvinfer1::ITensor* in_tensor, - nvinfer1::ITensor** out_tensor) { - nvinfer1::Dims dims = in_tensor->getDimensions(); - if (dims.d[dims.nbDims - 1] != 1) { - return errors::Internal("Expect last dims to be 1, for tensor ", - DebugString(*in_tensor)); - } - --dims.nbDims; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(in_tensor), dims, - /*validation_only=*/false, out_tensor)); - return Status::OK(); - }; - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(2), &output_nmsed_scores)); - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(3), &output_nmsed_classes)); - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(0), &output_num_detections)); -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) + ITensorProxyPtr output_detection_classes = layer->getOutput(3); + +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + // Cast the classes output from int32 to float32. + nvinfer1::IIdentityLayer* layer_detection_classes = + params->converter->network()->addIdentity( + *output_detection_classes->trt_tensor()); + layer_detection_classes->setOutputType(0, nvinfer1::DataType::kFLOAT); + output_detection_classes = layer_detection_classes->getOutput(0); + + // The plugin produces a [N, 1] tensor for the num output, squeeze it to [N] + std::vector input_dims{output_num_detections->getDimensions().d[0], 0}; + TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( + /*input=*/output_num_detections, + /*input_dims=*/&input_dims, + /*params=*/params, + /*output=*/&output_num_detections)); +#endif - params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_boxes)); - params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_scores)); - params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_classes)); + // Final outputs. 
+ params->outputs->push_back(TRT_TensorOrWeights(output_detection_boxes)); + params->outputs->push_back(TRT_TensorOrWeights(output_detection_scores)); + params->outputs->push_back(TRT_TensorOrWeights(output_detection_classes)); params->outputs->push_back(TRT_TensorOrWeights(output_num_detections)); - return Status::OK(); } -#endif // CombinedNonMaxSuppression +#endif -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertResize(OpConverterParams* params) { +Status ConvertResize(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"input", false}, {"size", true}})); + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input", TrtInputArg::kTensor}, {"size", TrtInputArg::kBoth}})); TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); - // Get input tensor. Transpose it from NHWC to NCHW. - ITensorProxyPtr tensor = inputs.at(0).tensor(); - TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, params->node_def.name()); + // Get input tensor. + ITensorProxyPtr inputs_tensor = inputs.at(0).tensor(); + TFTRT_RETURN_ERROR_IF_NULLPTR(inputs_tensor, params->node_def.name()); - // Get output size. It must constain two values i.e. [H_out, W_out] - TRT_ShapedWeights weights = inputs.at(1).weights(); - if (weights.count() != 2) { - return errors::Unimplemented("Resize to shape=[] is not supported, at ", - node_def.name()); + // Check output size. It must constain two values i.e. [H_out, W_out] + const bool const_output_size = inputs.at(1).is_weights(); + if (const_output_size) { + // Output size is given as a constant. + if (inputs.at(1).weights().count() != 2) { + return errors::Unimplemented("Resize requires 2D values for the size"); + } + } else { + // Output size is given as a tensor, possibly as the result of shape + // calculation ops in the graph. + if (params->use_implicit_batch) { + return errors::Unimplemented( + "Resize requires constant size in implicit batch mode"); + } + TF_RETURN_IF_ERROR(ExpectShapeTensor(inputs.at(1))); + if (inputs.at(1).tensor()->getDimensions().d[0] != 2) { + return errors::Unimplemented("Resize requires 2D values for the size"); + } } - const int* weights_ptr = static_cast(weights.GetValues()); // Verify and consume node attributes. - TFAttrs attrs(node_def); - bool align_corners = attrs.get("align_corners"); + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "align_corners", &align_corners)); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); @@ -5319,67 +5813,123 @@ Status ConvertResize(OpConverterParams* params) { } else if (node_def.op() == "ResizeNearestNeighbor") { resize_mode = nvinfer1::ResizeMode::kNEAREST; } else { - return errors::Unimplemented(node_def.op(), " is not yet implemented at ", - node_def.name()); + return errors::Unimplemented(node_def.op(), " is not yet implemented"); } // return after validation if only validation is requested. if (params->validation_only) return Status::OK(); // Transpose tensor from NHWC to NCHW format. - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + inputs_tensor, {0, 3, 1, 2}, &inputs_tensor, node_def, "to_NCHW")); + + // Calculate the output shape as static dimensions or a shape tensor: + // Given input shape [N, C, H, W] and output size [H_out, W_out], + // output shape equals [N, C, H_out, W_out]. 
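A minimal standalone sketch of the static-shape branch described in the comment above and implemented just below (types here are illustrative): copy the input's NCHW dims and overwrite the last two entries with the requested [H_out, W_out]. The dynamic branch builds the same shape at runtime from Shape/Slice/Concat layers instead.

// Illustrative sketch of the static resize output-shape computation.
#include <array>
#include <cstdio>

static std::array<int, 4> ResizeOutputShape(const std::array<int, 4>& nchw,
                                            int h_out, int w_out) {
  std::array<int, 4> out = nchw;  // keep N and C
  out[2] = h_out;
  out[3] = w_out;
  return out;
}

int main() {
  const std::array<int, 4> in = {1, 3, 224, 224};
  const std::array<int, 4> out = ResizeOutputShape(in, 512, 512);
  std::printf("[%d, %d, %d, %d]\n", out[0], out[1], out[2], out[3]);  // [1, 3, 512, 512]
  return 0;
}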
+ nvinfer1::Dims output_shape_dims; + ITensorProxyPtr output_shape_tensor; + const bool static_output_shape = + HasStaticShape(inputs_tensor->getDimensions()) && const_output_size; + if (static_output_shape) { + // If the output shape can be fully determined at build time, calculate it + // as a set of dimensions. + output_shape_dims.nbDims = inputs_tensor->getDimensions().nbDims; + for (int i = 0; i < output_shape_dims.nbDims; ++i) { + output_shape_dims.d[i] = inputs_tensor->getDimensions().d[i]; + } + const int* weights_ptr = inputs.at(1).weights().GetPointer(); + output_shape_dims.d[output_shape_dims.nbDims - 2] = weights_ptr[0]; + output_shape_dims.d[output_shape_dims.nbDims - 1] = weights_ptr[1]; + } else { + // Otherwise, build the output shape as a shape tensor that will be computed + // at run time. + // The batch size and num of channels will be copied from the input shape. + ITensorProxyPtr shape = params->converter->network() + ->addShape(*inputs_tensor->trt_tensor()) + ->getOutput(0); + ITensorProxyPtr batch_size = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr num_channels = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {1}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + + // The height and width will be obtained from the requested output size. + ITensorProxyPtr height, width; + if (const_output_size) { + // If the output size is constant, the height and width dimensions can be + // created as constants from the size values. + const int* weights_ptr = inputs.at(1).weights().GetPointer(); + TF_RETURN_IF_ERROR(CreateScalarConstant(params, weights_ptr[0], &height)); + TF_RETURN_IF_ERROR(CreateScalarConstant(params, weights_ptr[1], &width)); + } else { + // Otherwise, the size is a tensor which can be sliced, and each element + // used directly as the output height and width dimensions. + ITensorProxyPtr size = inputs.at(1).tensor(); + height = params->converter->network() + ->addSlice(*size->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + width = params->converter->network() + ->addSlice(*size->trt_tensor(), {1, {1}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + } - // Calculate output dimensions. - // Given input dimensions [N, C, H, W] and output size [H_out, W_out], - // output dimensions equals [N, C, H_out, W_out] - nvinfer1::Dims output_dimensions; - output_dimensions.nbDims = tensor->getDimensions().nbDims; - for (int i = 0; i < output_dimensions.nbDims; ++i) { - output_dimensions.d[i] = tensor->getDimensions().d[i]; + ::stream_executor::port::StatusOr result = + ConcatenateTensors(params, {batch_size, num_channels, height, width}, + 0); + TF_RETURN_IF_ERROR(result.status()); + output_shape_tensor = result.ValueOrDie(); } - output_dimensions.d[output_dimensions.nbDims - 2] = weights_ptr[0]; - output_dimensions.d[output_dimensions.nbDims - 1] = weights_ptr[1]; // Add resize layer. nvinfer1::IResizeLayer* layer = - params->converter->network()->addResize(*tensor->trt_tensor()); + params->converter->network()->addResize(*inputs_tensor->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); // Set layer parameters. layer->setResizeMode(resize_mode); - layer->setOutputDimensions(output_dimensions); layer->setAlignCorners(align_corners); + // Set output shape. 
+ if (static_output_shape) { + // If the shapes are fully known at build time, pass the static output shape + // to the resize layer as expected output dimensions. + layer->setOutputDimensions(output_shape_dims); + } else { + // Otherwise, pass the output shape tensor to the resize layer as an input. + layer->setInput(1, *output_shape_tensor->trt_tensor()); + } + // Get output tensor. Transpose it from NCHW to NHWC. ITensorProxyPtr output = layer->getOutput(0); - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(output, {0, 2, 3, 1}, &output)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + output, {0, 2, 3, 1}, &output, node_def, "to_NHWC")); params->outputs->push_back(TRT_TensorOrWeights(output)); // Success return Status::OK(); } // ConvertResize -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertAddN(OpConverterParams* params) { +Status ConvertAddN(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - const int num_inputs = attrs.get("N"); + + int num_inputs; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "N", &num_inputs)); + if (num_inputs < 2) { - return errors::InvalidArgument("AddN requires at least two inputs, at ", - node_def.name()); - } - if (inputs.size() != num_inputs) { - return errors::InvalidArgument("Got ", inputs.size(), - " inputs but expected ", num_inputs, ", at ", - node_def.name()); + return errors::InvalidArgument("AddN requires at least two inputs"); } + + TFTRT_CHECK_INPUT_SIZE(inputs.size(), num_inputs, node_def); + for (const auto& input : inputs) { - if (!input.is_tensor() && input.weights().shape_.d[0] != 1) { + if (!input.is_tensor() && input.weights().Shape().dim(0) != 1) { return errors::InvalidArgument( "Weights input to AddN is required to have batch dimension 1."); } @@ -5388,195 +5938,165 @@ Status ConvertAddN(OpConverterParams* params) { // AddN doesn't support broadcast. 
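  // The loop below lowers AddN as a left-associated chain of IElementWiseLayer
  // kSUM operations, i.e. ((in0 + in1) + in2) + ..., which is why every input
  // (tensor or constant weight) must already have an identical shape.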
std::vector tensor_inputs; + tensor_inputs.reserve(inputs.size()); for (const auto& input : inputs) { if (input.is_tensor()) { tensor_inputs.push_back(input.tensor()); } else { - auto dims = input.weights().shape_; - TF_RETURN_IF_ERROR(RemoveBatchDimension(&dims)); - tensor_inputs.push_back( - params->converter->CreateConstantLayer(input.weights(), dims)); + auto dims = input.weights().Shape(); + if (params->use_implicit_batch) { + TF_RETURN_IF_ERROR(dims.RemoveBatchDimension()); + } + tensor_inputs.push_back(params->converter->CreateConstantLayer( + input.weights(), dims.AsTrtDims())); } } ITensorProxyPtr lhs = tensor_inputs[0]; for (int i = 1; i < num_inputs; ++i) { ITensorProxyPtr rhs = tensor_inputs[i]; nvinfer1::ILayer* layer = params->converter->network()->addElementWise( - *lhs->trt_tensor(), *rhs->trt_tensor(), nvinfer1::ElementWiseOperation::kSUM); + *lhs->trt_tensor(), *rhs->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, std::to_string(i)); lhs = layer->getOutput(0); } params->outputs->push_back(TRT_TensorOrWeights(lhs)); return Status::OK(); } -static void RegisterValidatableOpConverters( - std::unordered_map* registration) { - (*registration)["BiasAdd"] = ConvertBiasAdd; -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - (*registration)["ClipByValue"] = ConvertClipByValue; -#endif - -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -// TODO: @mconley @jdekhtiar - Removed when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS - (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; -#endif //TF2TENSORRT_BYPASS_NMS_RESIZE_OPS +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertBiasAdd, "BiasAdd"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertClipByValue, "ClipByValue"); +#if IS_TRT_VERSION_GE(7, 1, 3, 0) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertCombinedNMS, + "CombinedNonMaxSuppression"); #endif - (*registration)["AddN"] = ConvertAddN; - (*registration)["ConcatV2"] = ConvertConcat; - (*registration)["Const"] = ConvertConst; - (*registration)["Conv2D"] = ConvertConv2D; - (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput; - (*registration)["DepthToSpace"] = ConvertDepthSpaceShuffle; - (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise; - (*registration)["ExpandDims"] = ConvertExpandDims; - (*registration)["FusedConv2DBiasActivation"] = - ConvertFusedConv2DBiasActivation; - (*registration)["GatherV2"] = ConvertGather; - (*registration)["LeakyRelu"] = ConvertLeakyRelu; - (*registration)["MatMul"] = ConvertMatMul; - (*registration)["Pack"] = ConvertPack; - (*registration)["Pad"] = ConvertPad; - (*registration)["Relu6"] = ConvertRelu6; - (*registration)["Reshape"] = ConvertReshape; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - (*registration)["Conv3D"] = ConvertConv3D; - (*registration)["Conv3DBackpropInputV2"] = ConvertConv3DBackpropInputV2; -// TODO: @mconley @jdekhtiar - Removed when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS - for (auto resize_mode : {"ResizeBilinear", "ResizeNearestNeighbor"}) { - (*registration)[resize_mode] = ConvertResize; - } -#endif // TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -#endif - (*registration)["Rsqrt"] = ConvertRsqrt; - (*registration)["Slice"] = ConvertSlice; - (*registration)["Softmax"] = ConvertSoftmax; - (*registration)["SpaceToDepth"] = ConvertDepthSpaceShuffle; - (*registration)["Split"] = ConvertSplit; - (*registration)["Square"] = ConvertSquare; - (*registration)["SquaredDifference"] = 
ConvertSquaredDifference; - (*registration)["Squeeze"] = ConvertSqueeze; - (*registration)["StridedSlice"] = ConvertStridedSlice; - (*registration)["TopKV2"] = ConvertTopK; - (*registration)["Transpose"] = ConvertTranspose; - (*registration)["Unpack"] = ConvertUnpack; - - for (auto quantization_op_type : - {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) { - (*registration)[quantization_op_type] = ConvertQuantize; - } - for (const auto& binary_op_pair : *BinaryOperationMap()) { - (*registration)[binary_op_pair.first] = ConvertBinary; - } - for (const auto& activation_op_pair : *ActivationTypeMap()) { - (*registration)[activation_op_pair.first] = ConvertActivation; - } - for (auto pool_op_type : {"AvgPool", "MaxPool"}) { - (*registration)[pool_op_type] = ConvertPool; - } - for (auto normalization_op_type : - {"FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3"}) { - (*registration)[normalization_op_type] = ConvertFusedBatchNorm; - } - for (auto unary_op_pair : *UnaryOperationMap()) { - (*registration)[unary_op_pair.first] = ConvertUnary; - } - for (auto reduce_op_type : {"Sum", "Prod", "Max", "Min", "Mean"}) { - (*registration)[reduce_op_type] = ConvertReduce; +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertAddN, "AddN"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertCast, "Cast"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConcat, "ConcatV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConst, "Const"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv2D, "Conv2D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv2DBackpropInput, + "Conv2DBackpropInput"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertDepthSpaceShuffle, "DepthToSpace"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv2DDepthwise, + "DepthwiseConv2dNative"); + +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertExpandDims, "ExpandDims"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertFusedConv2DBiasActivation, + "FusedConv2DBiasActivation"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertGather, "GatherV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertMatMul, "MatMul"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPack, "Pack"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPad, "Pad"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertReshape, "Reshape"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv3D, "Conv3D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv3DBackpropInputV2, + "Conv3DBackpropInputV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertResize, "ResizeBilinear"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertResize, "ResizeNearestNeighbor"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPool3D, "AvgPool3D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPool3D, "MaxPool3D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertShape, "Shape"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSlice, "Slice"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertDepthSpaceShuffle, "SpaceToDepth"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSplit, "Split"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSquare, "Square"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSquaredDifference, + "SquaredDifference"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSqueeze, "Squeeze"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertStridedSlice, "StridedSlice"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertTopK, "TopKV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertTranspose, "Transpose"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertUnpack, "Unpack"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPool, {"MaxPool", "AvgPool"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertFusedBatchNorm, + 
{"FusedBatchNorm", "FusedBatchNormV2", + "FusedBatchNormV3"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertReduce, + {"Sum", "Prod", "Max", "Min", "Mean"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertArgMinMax, {"ArgMin", "ArgMax"}); +// The following are no-ops during inference and will not be mapped to any +// TRT layer. +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertIdentity, + {"Identity", "IdentityN", "Snapshot", + "StopGradient", "_CopyFromHostToGpu"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertBatchMatMul, + {"BatchMatMul", "BatchMatMulV2"}); +// Debug converter only accessible via `TF_TRT_OP_FAKELIST=OpName1,OpName2,...` +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertFake, "FakeOp"); + +static Status SetDeviceInfoInNodes(GraphDef* graph_def, const string& device) { + for (auto& node : *(graph_def->mutable_node())) { + *node.mutable_device() = device; } - for (auto arg_minmax_type : {"ArgMin", "ArgMax"}) { - (*registration)[arg_minmax_type] = ConvertArgMinMax; - } - // The following are no-ops during inference and will not be mapped to any TRT - // layer. - for (auto identity_op_type : {"Identity", "Snapshot", "StopGradient"}) { - (*registration)[identity_op_type] = ConvertIdentity; - } - for (auto batch_matmul_type : {"BatchMatMul", "BatchMatMulV2"}) { - (*registration)[batch_matmul_type] = ConvertBatchMatMul; - } -} - -void TrtNodeValidator::RegisterOpValidators() { - RegisterValidatableOpConverters(&op_validators_); -} - -void Converter::RegisterOpConverters() { - RegisterValidatableOpConverters(&op_registry_); + return Status::OK(); } Status ConvertGraphDefToEngine( - const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size, - size_t max_workspace_size_bytes, - const std::vector& input_shapes, Logger* logger, - nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, + const GraphDef& gdef, OpKernelContext* ctx, TrtPrecisionMode precision_mode, + int max_batch_size, size_t max_workspace_size_bytes, + const std::vector& input_shapes, + nvinfer1::ILogger* trt_logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool use_calibration, - bool* convert_successfully) { + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles, absl::string_view engine_name, + bool use_explicit_precision, tensorflow::grappler::Cluster* cluster, + const string& device) { engine->reset(); if (convert_successfully) *convert_successfully = false; - // Create the builder. - TrtUniquePtrType builder( - nvinfer1::createInferBuilder(*logger)); - builder->setMaxBatchSize(max_batch_size); - builder->setGpuAllocator(allocator); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config( - builder->createBuilderConfig()); - builder_config->setMaxWorkspaceSize(max_workspace_size_bytes); - if (precision_mode == TrtPrecisionMode::FP16) { - builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (precision_mode == TrtPrecisionMode::INT8) { - builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); - builder_config->setFlag(nvinfer1::BuilderFlag::kINT8); - if (use_calibration) { - builder_config->setInt8Calibrator(calibrator); - } else { - builder_config->setInt8Calibrator(nullptr); - } - } - const uint32_t flags = 0U; // Implicit Batch Mode - // Create the network. 
- auto trt_network = - TrtUniquePtrType(builder->createNetworkV2(flags)); -#else // IS_TRT_VERSION_GE(6, 0, 0, 0) - builder->setMaxWorkspaceSize(max_workspace_size_bytes); - if (precision_mode == TrtPrecisionMode::FP16) { - builder->setFp16Mode(true); - } else if (precision_mode == TrtPrecisionMode::INT8) { - // Setting FP16 mode as well allows TRT to also consider FP16 kernels and - // use them in situations where they are faster than INT8 or where INT8 is - // not supported for a given layer. - builder->setFp16Mode(true); - builder->setInt8Mode(true); - if (use_calibration) { - builder->setInt8Calibrator(calibrator); - } else { - builder->setInt8Calibrator(nullptr); + // Creating converter, TensorRT builder and network + auto statusor = Converter::Create(precision_mode, use_calibration, trt_logger, + use_implicit_batch, engine_name, + use_explicit_precision, ctx); + + TF_RETURN_IF_ERROR(statusor.status()); + std::unique_ptr converter = std::move(statusor.ValueOrDie()); + + GraphDef graph = gdef; + if (cluster != nullptr) { + bool apply_layout_optim; + Status status = + ReadBoolFromEnvVar("TF_TRT_ENABLE_LAYOUT_OPTIMIZER", + /*default_value=*/true, &apply_layout_optim); + if (!status.ok()) { + LOG(ERROR) << status; } - } - // Create the network. - auto trt_network = - TrtUniquePtrType(builder->createNetwork()); -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) + if (apply_layout_optim) { + tensorflow::grappler::GrapplerItem grappler_item; + grappler_item.graph = gdef; - if (!trt_network) { - return errors::Internal("Failed to create TensorRT network object"); - } + // Add device information to each node in the graphdef for successful + // execution of the layout optimizer + TF_RETURN_IF_ERROR(SetDeviceInfoInNodes(&grappler_item.graph, device)); + + // TensorRT API requires the input for convolution to be in NCHW. 
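  // Assumed behavior of the flag read above: setting the environment variable
  // TF_TRT_ENABLE_LAYOUT_OPTIMIZER=0 (or "false") skips this NCHW rewrite and
  // the constant folding pass that follows; it defaults to enabled.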
+ tensorflow::grappler::GenericLayoutOptimizer layout_optimizer("NCHW"); + TF_RETURN_IF_ERROR( + layout_optimizer.Optimize(cluster, grappler_item, &graph)); + + grappler_item.graph = graph; + + tensorflow::grappler::ConstantFolding const_optimizer(nullptr); + TF_RETURN_IF_ERROR( + const_optimizer.Optimize(cluster, grappler_item, &graph)); - // Build the network - if (VLOG_IS_ON(1)) { - string mode_str; - TF_RETURN_IF_ERROR(TrtPrecisionModeToName(precision_mode, &mode_str)); - VLOG(1) << "Starting engine conversion, precision mode: " << mode_str; + // The optimizers may break the topological order + // so we need these steps to restore it + Graph g(OpRegistry::Global()); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(GraphConstructorOptions(), graph, &g)); + g.ToGraphDef(&graph); + } } - Converter converter(trt_network.get(), precision_mode, use_calibration); + VLOG(1) << "Starting to convert TensorFlow ops to TensorRT layers"; std::vector output_tensors; + int num_layers = converter->network()->getNbLayers(); + absl::flat_hash_set layer_names; // Graph nodes are already topologically sorted during construction - for (const auto& node_def : gdef.node()) { + for (const auto& node_def : graph.node()) { const string& node_name = node_def.name(); VLOG(2) << "Converting node " << node_name << ", op=" << node_def.op(); if (IsEngineInput(node_name)) { @@ -5600,27 +6120,45 @@ Status ConvertGraphDefToEngine( "Node ", node_name, " with is neither Placeholder nor Arg, instead ", node_def.op()); } - nvinfer1::DataType trt_dtype; - nvinfer1::Dims trt_dims; - int batch_size = -1; - auto shape = input_shapes.at(slot_number); - auto status = ValidateTensorProperties( - node_def.op(), node_def.attr().at(type_key).type(), shape, - /*validation_only=*/false, &trt_dtype, &trt_dims, &batch_size); - if (!status.ok()) { - const string error_message = - StrCat("Validation failed for ", node_name, " and input slot ", - slot_number, ": ", status.error_message()); - LOG(WARNING) << error_message; - return Status(status.code(), error_message); + DataType tf_dtype = node_def.attr().at(type_key).type(); + if (tf_dtype == DT_RESOURCE) { + VLOG(2) << "Adding engine input resource " << node_name; + if (ctx == nullptr) { + return errors::InvalidArgument( + "Variable resource type conversion requires a valid ctx"); + } + + if (ctx->input(slot_number).NumElements() == 0) { + return errors::InvalidArgument("Resource input ", node_name, + " is empty."); + } + TF_RETURN_IF_ERROR(converter->AddInputResource( + node_name, ctx->input(slot_number).flat()(0))); + } else { + nvinfer1::DataType trt_dtype; + nvinfer1::Dims trt_dims; + int batch_size = -1; + const auto shape = input_shapes.at(slot_number); + const auto status = ValidateTensorProperties( + node_def.op(), node_def.attr().at(type_key).type(), shape, + use_implicit_batch, /*validation_only=*/false, &trt_dtype, + &trt_dims, &batch_size); + if (!status.ok()) { + const string error_message = + StrCat("Validation failed for ", node_name, " and input slot ", + slot_number, ": ", status.error_message()); + LOG_WARNING_WITH_PREFIX << error_message; + return Status(status.code(), error_message); + } + VLOG(2) << "Adding engine input tensor " << node_name << " with shape " + << DebugString(trt_dims); + // TODO(laigd): the conversion should always happen at runtime where all + // the shapes are known, and we can provide a mode to generate the + // engines offline, by calling sess.run() and cache/serialize the + // engines. 
+ TF_RETURN_IF_ERROR(converter->AddInputTensor(node_name, trt_dtype, + trt_dims, batch_size)); } - VLOG(2) << "Adding engine input tensor " << node_name << " with shape " - << DebugString(trt_dims); - // TODO(laigd): the conversion should always happen at runtime where all - // the shapes are known, and we can provide a mode to generate the - // engines offline, by calling sess.run() and cache/serialize the engines. - TF_RETURN_IF_ERROR( - converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size)); } else if (IsEngineOutput(node_name)) { int32 slot_number = -1; if (node_def.op() == "Identity") { @@ -5640,36 +6178,59 @@ Status ConvertGraphDefToEngine( node_def.op()); } // Get output type that TensorFlow expects - TFAttrs attrs(node_def); - DataType tf_dtype = attrs.get("T"); + string out_type_key; + if (node_def.op() == "ReadVariableOp" || + node_def.op() == "ResourceGather") { + out_type_key = "dtype"; + } else { + out_type_key = "T"; + } + DataType tf_dtype; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), out_type_key, &tf_dtype)); nvinfer1::DataType trt_dtype; - TF_RETURN_IF_ERROR(TfDataTypeToTrt(tf_dtype, &trt_dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(tf_dtype, &trt_dtype)); if (output_tensors.size() <= slot_number) { output_tensors.resize(slot_number + 1); } output_tensors.at(slot_number) = {node_def.input(0), node_name, trt_dtype}; } else { - TF_RETURN_IF_ERROR(converter.ConvertNode(node_def)); + TF_RETURN_IF_ERROR(converter->ConvertNode(node_def)); + } + + // To support TF-TRT profiling, we ensure each ILayer has a non-empty name. + // BuildCudaEngine returns an error if there is any ILayer name collision. + // We want to report the error here before BuildCudaEngine in a more + // meaningful way. + int new_num_layers = converter->network()->getNbLayers(); + for (int i = num_layers; i < new_num_layers; i++) { + auto layer = converter->network()->getLayer(i); + if (layer->getName() == nullptr || + !layer_names.insert(layer->getName()).second) { + std::string error_message = absl::StrCat( + "Converting node ", node_name, ", op=", node_def.op(), + layer->getName() ? " creates a layer with name collision" + : " creates a layer without a name"); + LOG_WARNING_WITH_PREFIX << error_message; + return errors::Internal(error_message); + } } + num_layers = new_num_layers; } - TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_tensors)); + TF_RETURN_IF_ERROR(converter->RenameAndMarkOutputTensors(output_tensors)); if (convert_successfully) *convert_successfully = true; // Apply user provided quantization ranges to tensors - converter.MaybeApplyQuantizationRanges(); + if (!use_explicit_precision) { + converter->MaybeApplyQuantizationRanges(); + } // Build the engine. 
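  // BuildCudaEngine (called below) is expected to take over the builder-side
  // setup that the removed code handled inline here: workspace size, FP16/INT8
  // precision flags, the INT8 calibrator, and the shape optimization profiles.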
- VLOG(1) << "Starting engine creation"; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - engine->reset( - builder->buildEngineWithConfig(*converter.network(), *builder_config)); -#else - engine->reset(builder->buildCudaEngine(*converter.network())); -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) - if (engine->get() == nullptr) { - return errors::Internal("Failed to build TensorRT engine"); - } + TF_RETURN_IF_ERROR(converter->BuildCudaEngine( + engine, max_batch_size, max_workspace_size_bytes, allocator, calibrator, + profiles)); + VLOG(1) << "Finished conversion"; return Status::OK(); } @@ -5677,12 +6238,21 @@ Status ConvertGraphDefToEngine( Status ConvertSegmentToGraphDef( const Graph* graph, const grappler::GraphProperties& graph_properties, const std::vector& subgraph_nodes, // In topological order - std::vector* connections, GraphDef* segment_def, - string* scope_name) { + EngineInfo* engine_info) { + tensorflow::profiler::TraceMe activity( + "ConvertSegmentToGraphDef", tensorflow::profiler::TraceMeLevel::kInfo); + std::vector* connections = &engine_info->connections; + GraphDef* segment_def = &engine_info->segment_graph_def; std::set marker_nodes; // Update connection shapes/data types and add corresponding input/output // nodes in the segment graphdef. for (size_t i = 0; i < connections->size(); ++i) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Constructing TRTEngine IO: ", i + 1, "/", + connections->size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); auto& connection = connections->at(i); if (connection.is_control_edge()) continue; auto outside_node = graph->FindNodeId(connection.outside_id); @@ -5757,7 +6327,14 @@ Status ConvertSegmentToGraphDef( std::unordered_map old_to_new_id_map; // Copy internal nodes to new graphdef string local_scope = subgraph_nodes.front()->name(); + int i = 0; for (const Node* node : subgraph_nodes) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Copy Node to Subgraph: ", ++i, "/", + subgraph_nodes.size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); local_scope = GetCommonNameScope(local_scope, node->name()); old_to_new_id_map[node->id()] = segment_def->node_size(); auto snode = segment_def->add_node(); @@ -5766,6 +6343,13 @@ Status ConvertSegmentToGraphDef( } // Update the inputs of the new input nodes to point to placeholder nodes. for (int i = 0; i < connections->size(); ++i) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Updating Subgraph Input: ", i + 1, "/", + connections->size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); + auto& connection = connections->at(i); if (connection.is_control_edge() || !connection.is_input_edge) continue; auto snode = @@ -5777,13 +6361,26 @@ Status ConvertSegmentToGraphDef( << arg_name; snode->set_input(connection.inside_port, arg_name); } + std::set subgraph_node_names; - for (const Node* node : subgraph_nodes) { - subgraph_node_names.insert(node->name()); + { + tensorflow::profiler::TraceMe activity( + "Constructing subgraph_node_names set: ", + tensorflow::profiler::TraceMeLevel::kInfo); + + for (const Node* node : subgraph_nodes) { + subgraph_node_names.insert(node->name()); + } } // Remove control inputs that are not inside the segment. 
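  // For example, a control input such as "^outside_node" whose source is not
  // in subgraph_node_names is dropped by the loop below; data edges crossing
  // the segment boundary were already rewritten above to read from the _Arg
  // placeholder inputs.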
for (int i = 0; i < segment_def->node_size(); ++i) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Removing outside to subgraph control inputs: ", i + 1, + "/", segment_def->node_size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); auto snode = segment_def->mutable_node(i); const int input_size = snode->input_size(); int input_idx = 0; @@ -5798,12 +6395,8 @@ Status ConvertSegmentToGraphDef( << " from subgraph."; ++input_idx; continue; - } else { - return errors::InvalidArgument( - "Found non control input outside the segment that is not an " - "engine connection to ", - snode->name(), ": ", input.first); } + /// TODO(lsugy): throw error when it's not a resource input. } if (actual_input_idx != input_idx) { snode->set_input(actual_input_idx, snode->input(input_idx)); @@ -5815,7 +6408,6 @@ Status ConvertSegmentToGraphDef( snode->mutable_input()->RemoveLast(); } } - *scope_name = local_scope; return Status::OK(); } @@ -5829,9 +6421,48 @@ bool OutputEdgeValidator::operator()(const Edge* out_edge) const { return true; } +ITensorProxyPtr TRT_TensorOrWeights::as_tensor( + const OpConverterParams* params) { + if (is_tensor()) { + return tensor(); + } else { + return params->converter->CreateConstantLayer(weights(), GetTrtDims()); + } +} + +std::string unexpected_type_error_msg(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, + const NodeDef& node_def, int idx) { + return "The '" + node_def.input(idx) + "' parameter of " + node_def.op() + + " operation in " + node_def.name() + " is expected to be of type " + + DebugString(type_expected) + " type, got " + + DebugString(type_being_checked) + "."; +} + +string batch_size_error(absl::string_view name, absl::string_view comment) { + return StrCat("Batch size doesn't match for tensor '", name, "' : ", comment); +} + +Status check_type(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, const NodeDef& node_def, + int idx) { + if (type_being_checked == type_expected) return Status::OK(); + + return errors::InvalidArgument(unexpected_type_error_msg( + type_being_checked, type_expected, node_def, idx)); +} + +std::string convert_not_supported_implicit(const std::string& pOpName, + const std::string& pNodeName, + const char* pOpType) { + const auto oper = pOpType ? absl::StrCat(pOpType, " ") : string(""); + return absl::StrCat("Convertion for ", oper, "op: '", pOpName, + "' is not supported in implicit batch mode, at ", + pNodeName); +} + } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 192c8f1e6d0..02c7148b842 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -22,35 +22,28 @@ limitations under the License. 
#include #include +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" -// TODO: @mconley @jdekhtiar - Removed when fixed -#define TF2TENSORRT_BYPASS_NMS_RESIZE_OPS - namespace tensorflow { namespace tensorrt { namespace convert { - -#define IS_TRT_VERSION_GE(major, minor, patch, build) \ - ((NV_TENSORRT_MAJOR > major) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH > patch) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) +using ::stream_executor::port::StatusOr; struct EngineConnection { // Constructs a non-control edge. @@ -101,8 +94,13 @@ struct EngineInfo { EngineInfo() : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), + max_batch_size(absl::nullopt), + maximum_cached_engines(0), precision_mode(TrtPrecisionMode::FP32), - use_calibration(true) {} + use_calibration(true), + + allow_build_at_runtime(true), + use_explicit_precision(false) {} string engine_name; string device; @@ -116,45 +114,55 @@ struct EngineInfo { enum class EngineType { TRTStatic = 0, TRTDynamic = 1 }; EngineType engine_type; int64 max_workspace_size_bytes; + absl::optional max_batch_size; int maximum_cached_engines; TrtPrecisionMode precision_mode; bool use_calibration; + bool allow_build_at_runtime; + bool use_explicit_precision; }; -// Constructs a graphdef from the segment in the given graph. Adds _Arg -// nodes for input edges (InputPH_*) and _Retval nodes for output edges -// (OutputPH_*). This function needs to be called before TensorRT nodes -// inserted in order to correctly get sizes from the original graph. +// Constructs a graphdef from the segment in the given graph and stores it to +// the engine_info. Adds _Arg nodes for input edges (InputPH_*) and _Retval +// nodes for output edges (OutputPH_*). Maintains the topological order of the +// non-input/output nodes in the graphdef. This function needs to be called +// before TensorRT layers are created because it prepares the original graph +// for TensorRT conversion. // // - subgraph_node_names: the node names of the subgraph. // - subgraph_node_ids: the node ids of the subgraph, must be sorted in // topological order. -// - segment_def: the output GraphDef, whose non-input/output nodedefs will be -// sorted in topological order. -// - scope_name: the name of the scope where the TRTEngineOp will be placed. +// - engine_info: a data structure that records the information about the +// engine containing the subgraph. // // TODO(aaroey): add tests to validate these properties. 
Status ConvertSegmentToGraphDef( const Graph* graph, const grappler::GraphProperties& graph_properties, - const std::vector& subgraph_nodes, - std::vector* connections, GraphDef* segment_def, - string* scope_name); + const std::vector& subgraph_nodes, EngineInfo* engine_info); // Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff // 'builder' successfully build the engine. If the result is not ok, 'engine' // will be set to nullptr -// Once returned, 'builder' is not needed any more and can be safely detroyed. +// Once returned, 'builder' is not needed any more and can be safely destroyed. // -// - convert_successfully: indicates whether the converson to TensorRT network +// - convert_successfully: indicates whether the conversion to TensorRT network // is successful. This is different than successfully building the engine: // building can still fail afterwards. +// Note: When 'cluster' is not null, it contains the graph to be converted. +// We may perform additional optimizations to the graph before converting +// the graph. Status ConvertGraphDefToEngine( - const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size, - size_t max_workspace_size_bytes, - const std::vector& input_shapes, Logger* logger, - nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, + const GraphDef& gdef, OpKernelContext* ctx, TrtPrecisionMode precision_mode, + int max_batch_size, size_t max_workspace_size_bytes, + const std::vector& input_shapes, + nvinfer1::ILogger* logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool use_calibration, - bool* convert_successfully); + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles, absl::string_view engine_name, + bool use_explicit_precision, + tensorflow::grappler::Cluster* cluster = nullptr, + const string& device = ""); // Helper class for the segmenter to determine whether an output edge from the // TRT segment is valid. @@ -165,206 +173,6 @@ class OutputEdgeValidator { bool operator()(const Edge* out_edge) const; }; -string DebugString(const nvinfer1::DataType trt_dtype); -string DebugString(const nvinfer1::Dims& dims); -string DebugString(const nvinfer1::Permutation& permutation, int len); -string DebugString(const nvinfer1::ITensor& tensor); -int64_t TrtWeightDimsNumElements(const nvinfer1::Dims& dims); -int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims); - -// Class to convert TF compile-time constants (e.g. Const nodes) to TRT weight. -class TRT_ShapedWeights { - public: - explicit TRT_ShapedWeights( - nvinfer1::DataType type = nvinfer1::DataType::kFLOAT); - - // Copy from another weights. - // - // NOTE: this does not copy the underlying buffer but only increase its - // reference count. - TRT_ShapedWeights(const TRT_ShapedWeights& rhs); - - nvinfer1::Weights GetTrtWeights() const; - - const Tensor& GetTensor() const { return tensor_; } - - // Returns the raw pointer to the underlying buffer which holds the weights - // value. 
- void* GetValues() const { - return const_cast(tensor_.tensor_data().data()); - } - - int64_t count() const; - - size_t size_bytes() const; - - string DebugString() const; - - template - absl::Span GetSpan() const { - return absl::Span(tensor_.flat().data(), count()); - } - - template - std::vector ToVector() const { - auto span = GetSpan(); - return std::vector(span.data(), span.data() + span.size()); - } - - nvinfer1::DataType TrtDType() const { return type_; } - - // TODO(aaroey): make these private. - nvinfer1::Dims shape_; // Note: shape.type[] is not used. - - private: - // This constructor is only used by TrtWeightStore, which creates the - // underlying buffer. - TRT_ShapedWeights(nvinfer1::DataType type, nvinfer1::Dims dims, - Tensor tensor); - - nvinfer1::DataType type_; - - // All weights should be stored inside TrtWeightStore to make sure lifetime of - // all the underlying tensors are available until the engine is built. For - // this reason, tensor_ should never be reassigned to a different value that - // is not already present in the TrtWeightStore. - Tensor tensor_; - - friend class TrtWeightStore; -}; - -// Container for TRT_ShapedWeights. We need this container because, TRT doesn't -// manage the lifetime of the weights buffer, it only keeps a pointer to it and -// requires that the data referenced by the pointer be available until the -// building of engine is complete. For more information see -// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html -// -// TODO(laigd): consider adding garbage collection to the unused weights. -class TrtWeightStore { - public: - // Get a TRT_ShapedWeights with 'type' and 'dims'. - TRT_ShapedWeights GetTempWeights(nvinfer1::DataType trt_type, - const nvinfer1::Dims& dims); - - // Get a TRT_ShapedWeights with the same data type and dimensions as - // 'weights'. - TRT_ShapedWeights GetTempWeights(const TRT_ShapedWeights& weights) { - return GetTempWeights(weights.TrtDType(), weights.shape_); - } - - private: - // The backend storage of the TRT_ShapedWeights. - std::vector store_; -}; - -// Represents a TRT-style input to a TF node, it can be either a -// nvinfer1::ITensor, or TRT_ShapedWeights which is compile-time constant. -// -// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument. -class TRT_TensorOrWeights { - public: - TRT_TensorOrWeights() {} - TRT_TensorOrWeights(ITensorProxyPtr); - TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size); - - // Constructor that makes it an ITensor, doesn't take ownership of 'tensor'. - // This is used by Converter when building the TRT network, where the ITensor - // is owned by the TRT network being built. See comment for 'trt_tensor_' - // in trt_proxy_tensor.h. - explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1); - - // Constructor that makes it an ITensor by creating one using provided data - // type and shape, and takes ownership of the created ITensor. This is used by - // TrtNodeValidator to encapsulate the type and shape information for - // validation of graph nodes, and the created ITensor is fake and temporary, - // and should not be used to build any TRT network. See comment for - // 'simple_tensor_' in trt_proxy_tensor.h. - explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, - const nvinfer1::Dims& trt_dims, int batch_size); - - // Constructor that makes it a TRT_TensorOrWeights. 
- explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights); - - TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs); - - void operator=(const TRT_TensorOrWeights& rhs); - - bool is_tensor() const { return initialized_ && is_tensor_; } - bool is_weights() const { return initialized_ && !is_tensor_; } - - ITensorProxyPtr tensor() const; - - TRT_ShapedWeights& weights() { - CHECK(is_weights()); - return weights_; - } - - const TRT_ShapedWeights& weights() const { - CHECK(is_weights()); - return weights_; - } - - nvinfer1::Dims GetTrtDims() const; - - int batch_size() const { return batch_size_; } - - string DebugString() const; - - private: - - void set_batch_size(int batch_size) { batch_size_ = batch_size; } - - // First dimension of the TF tensor (NOT tensor_) that is represented by - // tensor_ is treated as the "batch dimension" by TRT, and tensor_'s - // dimensions (obtained via tensor_->getDimensions()) do not contain the batch - // dimension. For example, when a TF tensor with shape (A,B,C) is represented - // in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A. - // - // This requires that all tensors in the subgraph that is converted to a TRT - // engine have the same batch size are represented by the first dimension of - // their shape, and Converter will verify this during conversion. The drawback - // is that currently it cannot convert a graph that doesn't have the batch - // size represented in the shapes or the batch sizes are different. See - // b/118387490 for more details. - ITensorProxyPtr tensor_proxy_ptr_ = nullptr; - int batch_size_ = -1; - - TRT_ShapedWeights weights_; - bool initialized_ = false; - bool is_tensor_ = false; - - friend class Converter; -}; - -class Converter; - -// Parameters for each op converter. -struct OpConverterParams { - // Constructor used for validation only. - OpConverterParams(const NodeDef& node_def, - const std::vector& inputs, - std::vector* outputs, - TrtWeightStore* weight_store, - TrtPrecisionMode precision_mode, bool use_calibration); - - // Constructor used for conversion. - OpConverterParams(Converter* converter, const NodeDef& node_def, - const std::vector& inputs, - std::vector* outputs, - TrtWeightStore* weight_store); - - Converter* converter = nullptr; - const NodeDef& node_def; - const std::vector& inputs; - std::vector* outputs; - const bool validation_only; - TrtWeightStore* weight_store; - const TrtPrecisionMode precision_mode; - const bool use_calibration; -}; - -using OpConverter = std::function; - // Class to verify if specific TF node is supported by TRT. class TrtNodeValidator { public: @@ -372,33 +180,39 @@ class TrtNodeValidator { // checked by IsTensorRTCandidate() later. It is used to get the shape and // data type information of a tensor for validation purpose. TrtNodeValidator(const grappler::GraphProperties& graph_properties, - TrtPrecisionMode precision_mode, bool use_calibration); + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision); // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added // to TRT subgraph and later converted into TRT engine. Status IsTensorRTCandidate(const Node* node); - private: static const std::set* quantize_ops; - void RegisterOpValidators(); + // Returns validator by op type. If no validator is registered for + // specific op, it means no validation is needed and ValidateNode() will + // return OK. 
+ ::stream_executor::port::StatusOr GetValidator( + const std::string& op); + private: // Convert a Const node to a TRT_TensorOrWeights. Status ConvertConstToWeights(const NodeDef& const_node_def, const std::vector& inputs, TRT_TensorOrWeights* output); + // Convert a VariableV2 node to a TRT_TensorOrWeights. + Status ConvertVariableToWeights( + const NodeDef& const_node_def, + const std::vector& inputs, + TRT_TensorOrWeights* output); + // Convert the output tensor at 'output_port' of 'node_def' to a // TRT_TensorOrWeights which will be later used as an input to other nodes and // passed to ValidateNode() below. Status ConvertToTensorOrWeights(const NodeDef& node_def, int output_port, TRT_TensorOrWeights* tensor_or_weights); - // Stores all the validators by op type. If no validator is registered for - // specific op, it means no validation is needed and ValidateNode() will - // return OK. - std::unordered_map op_validators_; - // Store the weights added during validation. Some validations (e.g. // validation for Const node) may produce weights. TrtWeightStore weight_store_; @@ -412,6 +226,10 @@ class TrtNodeValidator { const bool use_calibration_; + const bool use_implicit_batch_; + + const bool use_explicit_precision_; + friend class ValidatorTest; friend class OpConverterTest; }; @@ -432,8 +250,11 @@ class Converter { nvinfer1::DataType trt_dtype; }; - Converter(nvinfer1::INetworkDefinition* trt_network, - TrtPrecisionMode precision_mode, bool use_calibration); + static ::stream_executor::port::StatusOr> Create( + TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision = false, + OpKernelContext* ctx = nullptr); ////////////////////////////////////////////////////////////////////////////// // Methods used by the TRT engine builder to build a TRT network from a TF @@ -447,34 +268,44 @@ class Converter { Status AddInputTensor(const string& name, nvinfer1::DataType dtype, const nvinfer1::Dims& dims, int batch_size); + // Store the ResourceHandle as a TRT_TensorOrWeights object. This can be + // later used as input to other nodes. + Status AddInputResource(const string& name, const ResourceHandle& resource); + // Mark the tensors with names specified by source_tensor_name as output of // the TRT network, and set their names in the TRT network as dest_node_name. Status RenameAndMarkOutputTensors( const std::vector& output_tensors); + // Build a TRT engine using the created network. + Status BuildCudaEngine(TrtUniquePtrType* engine, + int max_batch_size, size_t max_workspace_size_bytes, + nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtShapeOptimizationProfile* profiles); + ////////////////////////////////////////////////////////////////////////////// // Methods used by op converters to convert individual TF node and add layers // to the TRT network. // Op converters (e.g. ConvertReshape) need to access the TRT network in order // to add TRT layers. - nvinfer1::INetworkDefinition* network() { return trt_network_; } + nvinfer1::INetworkDefinition* network() { return trt_network_.get(); } // What precision are we targeting? TrtPrecisionMode precision_mode() const { return precision_mode_; } + // Variable converters need the context to read variable values. + OpKernelContext* context() { return ctx_; } + // Calibration will be or was previously performed on this network? 
bool use_calibration() const { return use_calibration_; } - // This should be called on the inputs and outputs of any layer we create - // where we know that the quantization range does not change during that - // operation. (e.g. Reshape, Transpose, Identity, MaxPool). - void MarkQuantizationRangesAsInferrable(ITensorProxyPtr* input, - ITensorProxyPtr* output); + // Whether implicit batch mode is enabled + bool use_implicit_batch() const { return use_implicit_batch_; } // This function should be called when we know the quantization range of a - // tensor, either from a quantize/dequantize node or when the output is a - // fixed range (e.g. SoftMax, Relu6, Sigmoid). + // tensor from a quantize/dequantize node. void ProvideQuantizationRange(ITensorProxyPtr* tensor, float min_range, float max_range); @@ -487,28 +318,113 @@ class Converter { // Transpose 'input_tensor' with given permutation 'order_with_batch_dim' to // 'output_tensor'. The permutation 'order_with_batch_dim' contains the batch - // dimension which should always be 0. + // dimension which should always be 0. If this is for adding a transpose layer + // to support the conversion of 'node_def', callers need to provide a + // non-empty 'sub_op_name' appended to the name of 'node_def' to avoid layer + // name conflicts. Status TransposeTensor(ITensorProxyPtr input_tensor, const std::vector& order_with_batch_dim, - ITensorProxyPtr* output_tensor); + ITensorProxyPtr* output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name = ""); - // Converts 'input' into 'tensor' with shape specified by 'dims' (which - // doesn't contain the batch dimension). + // Reshapes a dynamic shape tensor by removing or adding dimensions of size 1, + // and/or permuting the dimensions. The new shape is derived from the shape of + // the input tensor according to the slices and size_for_added_dims arguments. // - // If validation_only is true, it doesn't do the conversion but only do some - // minimum validation for the eligibility of the conversion, and *tensor will - // be set to nullptr. - Status PrepareTensorForShape(const TRT_TensorOrWeights& input, - const nvinfer1::Dims& dims, - const bool validation_only, - ITensorProxyPtr* tensor); + // If there would be at most one unknown dimension, we could set the new shape + // using IShuffleLayer::setReshapeDimensions, which treats -1 as a special + // value (the same way as TF). In general, we can have more than one unknown + // dimensions, and we have to manipulate the shape tensors during runtime to + // define the new shape. This helper function defines the necessary shape + // inference layers and calls reshape using the calculated new shape. + // + // Example: + // + // Assume that we want to reshape a tensor from shape {A,B,C,D} to {C,D,A,B} + // (no transpose, just change the shape). In dynamic shape mode, the A,B,C,D + // values are not necessarily known at conversion time, they can be all -1. We + // can only define the new shape at runtime, when the actual shape is already + // known. To define the new shape: + // - We use an IShapeLayer to retrieve a shape tensor with the {A,B,C,D} + // values. + // - Create two slices {C,D} and {A,B} of the shape tensor. + // - Concatenate these slices {C,D,A,B}, + // - Set the {C,D,A,B} shape tensor as an input shape tensor for + // IShuffleLayer. + // + // This can be achieved by calling DynamicReshape(input, {{2,4},{0,2}}, + // params). 
+ // + // Before each slice we can insert new dims if the corresponding + // size_for_added_dims element is not negative. The size_for_added_dims array + // can have more than slices.size() elements, in order to insert a dimension + // after the last slice. For example, to add two leading 1 dimensions, and + // three trailing 1 dimensions, call DynamicReshape(input, {{0,nbDims}}, + // {2, 3}). + // + // Parameters: + // input - input tensor + // slices - [start, end) pairs of slices + // params - conversion parameters + // output - reshaped tensor + // size_for_added_dims - size of dimension inserted right before slice[i]. We + // only insert a new dim if size_for_added_dims[i] >= 0. + Status DynamicReshape(ITensorProxyPtr input, + std::vector> slices, + const OpConverterParams* params, + ITensorProxyPtr* output, + std::vector size_for_added_dims = {}, + absl::optional op_instance = absl::nullopt); + + // Inserts a singleton dimension at axis for a dynamic shape tensor. + Status DynamicExpandDims(ITensorProxyPtr input, const nvinfer1::Dims& dims, + int axis, const OpConverterParams* params, + ITensorProxyPtr* output, + absl::optional op_instance = absl::nullopt); + + // Helper function to add a squeeze op to the network. + // + // The input_dims argument stores the TRT dimensions of the input tensor, + // where the dimensions to be squeezed are replaced by 0. + Status SqueezeTensor(ITensorProxyPtr input, std::vector* input_dims, + const OpConverterParams* params, ITensorProxyPtr* output, + absl::optional op_instance = absl::nullopt); // Creates an IConstantLayer using 'weights' whose dimensions are specified by // 'dims', and returns the output ITensor. ITensorProxyPtr CreateConstantLayer(const TRT_ShapedWeights& weights, - const nvinfer1::Dims& dims); + const nvinfer1::Dims& dims); + + // Gets the min and max value in a TRT_ShapedWeights + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const; + + // Constructs a name and passed it to the TensorRT layer to support xprof. + void SetLayerName( + nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name = "", + absl::optional sub_op_instance = absl::nullopt, + absl::optional origin_node_name = absl::nullopt); + + void SetLayerName(nvinfer1::ILayer* layer, absl::string_view main_op_name, + absl::string_view sub_op_name, + absl::optional sub_op_instance = absl::nullopt); + + std::unordered_map& TensorsMap() { + return trt_tensors_; + } + + bool UseExplicitPrecision() const { return use_explicit_precision_; } private: + Converter(TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx); + + Status Init(nvinfer1::ILogger* trt_logger); + // Verify the provided batch_size is consistent with batch_size_ and update it // if necessary. Status MaybeUpdateBatchSize(int batch_size); @@ -523,26 +439,21 @@ class Converter { Status GetInputs(const NodeDef& node_def, std::vector* inputs) const; - void RegisterOpConverters(); - - void PropagateQuantizationRanges(); - - // Gets the min and max value in a TRT_ShapedWeights - Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, - float* out_max) const; - - // Registered op converters by op type. - std::unordered_map op_registry_; - // Tensors/weights added during construction of trt_network_. std::unordered_map trt_tensors_; - // The TRT networking being built. 
- nvinfer1::INetworkDefinition* trt_network_; + // The TRT builder used to create the network and build the engine. Not owned. + TrtUniquePtrType trt_builder_; + + // The TRT network being built. + TrtUniquePtrType trt_network_; // Store the weights added during construction of trt_network_. TrtWeightStore weight_store_; + // Store the context. + OpKernelContext* ctx_; + // During conversion, this table is populated with quantization ranges per // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT // quantization ranges. Since TRT only supports symmetric ranges, we will @@ -552,49 +463,132 @@ class Converter { std::unordered_map quantization_ranges_proxy_; std::unordered_map quantization_ranges_; - // Edges where quantization ranges can be inferred (copied) across ops - from - // first tensor to second tensor. PropagateQuantizationRanges() will propagate - // known ranges from quantization_ranges_ across these edges, adding the new - // ranges to quantization_ranges_ so that they can be applied in - // MaybeApplyQuantizationRanges(). - std::vector> - quantization_infer_proxy_; - std::vector> - quantization_infer_; - const TrtPrecisionMode precision_mode_; const bool use_calibration_; + // If this is false, all dimensions including the batch dimension are + // set explicitely. + const bool use_implicit_batch_; + // Batch size of inputs to trt_network_ added by AddInputTensor(). During // network construction it will update this, use it to verify the batch // size of all inputs are compatible, and make sure individual TF node is // acceptable by TRT. int batch_size_ = -1; + // Assign a ID to each constant layer we create, so that we can assign a + // unique name to the layer. + int next_constant_layer_id_ = 0; + + // The name of the TRTEngineOp node. + absl::string_view engine_name_; + + // Indicates whether to use explicit precision in TensorRT (Q/DQ support). + bool use_explicit_precision_; + friend class ConverterTest; friend class OpConverterTest; }; +// Converts a TensorFlow tensor to TRT shaped weights. +Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, + TRT_ShapedWeights* weights); + +// Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims' +// (which doesn't contain the batch dimension). +// +// If validation_only is true, it doesn't do the conversion but only do some +// minimum validation for the eligibility of the conversion, and *tensor will +// be set to nullptr. +// If validation_only is false converter must not be nullptr. +Status PrepareTensorForShape( + Converter* converter, const TRT_TensorOrWeights& input, + const DimsAdapter& dims, const bool validation_only, + ITensorProxyPtr* tensor, const NodeDef& node_def, + absl::optional op_instance = absl::nullopt, + absl::optional origin_node_name = absl::nullopt); + // Return OK if the broadcast scheme is supported and compute the shapes after // broadcasting. check_feasibility can be set to false in cases where dimensions // do not need to match exactly (as in the case of BatchMatMulV2). 
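// Illustrative example: in explicit batch mode, operands with TRT shapes
// [2, 3, 4] and [4] are right-aligned and padded with ones, producing
// operand_l_new_dims = [2, 3, 4] and operand_r_new_dims = [1, 1, 4], which an
// IElementWiseLayer can then broadcast together.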
Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, const bool check_feasibility, + const bool use_implicit_batch, nvinfer1::Dims* operand_l_new_dims, nvinfer1::Dims* operand_r_new_dims); -// Map of all supported UnaryOperations -const std::unordered_map* UnaryOperationMap(); -// Map of all supported ActivationTypes -const std::unordered_map* ActivationTypeMap(); +template +using OperationMap = std::unordered_map; + +// Map from Tensorflow operation names to TensorRT unary operations. +using UnaryOperationMapType = OperationMap; +const UnaryOperationMapType* UnaryOperationMap(); + +// Map from Tensorflow boolean operation names to TensorRT unary operations. +const UnaryOperationMapType* UnaryBooleanOperationMap(); + +// Map of all supported ActivationTypes. +using ActivationTypeMapType = OperationMap; +const ActivationTypeMapType* ActivationTypeMap(); + +// Map from Tensorflow binary operation names to TensorRT binary operations +// types. +using BinaryOperationMapType = OperationMap; +const BinaryOperationMapType* BinaryOperationMap(); + +// Map from Tensorflow boolean binary operation names to TensorRT binary +// operations types. +const BinaryOperationMapType* BinaryBooleanOperationMap(); + +template +absl::InlinedVector GetOperationNames(const T& set) { + absl::InlinedVector result; + absl::c_transform(set, std::back_inserter(result), + [](const auto x) { return x.first; }); + return result; +} + +// Adds a matrix multiplication operation to the TensorRT graph. The "params" +// pointer is only used to access the TRT network builder. The inputs and +// parameters for the op are fully specified by input_[a|b] and transpose_[a|b]. +::stream_executor::port::StatusOr ConvertMatMulImpl( + const OpConverterParams* params, TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, bool transpose_b); + +Status ApplyBroadcast(std::unique_ptr& operand, + const DimsAdapter& broadcasted_dims, + const OpConverterParams* params, + absl::optional op_instance); + +std::string convert_range_error_msg(float start, float limit, float delta); +std::string convert_range_expected_msg(const NodeDef& node_def); +std::string bool_weight_error_msg(const NodeDef& node_def); +std::string unexpected_type_error_msg(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, + const NodeDef& node_def, int idx = 0); +std::string then_else_dtypes_error_msg(nvinfer1::DataType type_then, + nvinfer1::DataType type_else, + const NodeDef& node_def); +std::string input_shapes_error_msg(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + const NodeDef& node, + bool then_vs_else = false); +std::string batch_size_error(absl::string_view name, absl::string_view comment); + +inline bool find_name(const string& name, const std::vector names) { + return std::find(names.begin(), names.end(), name) != names.end(); +} + +Status check_type(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, const NodeDef& node_def, + int idx = 0); } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 8a0c963e0c8..0733c840d57 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ 
b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -15,238 +15,148 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include +#include +#include +#include #include +#include +#include #include #include +#include "absl/time/civil_time.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + #include #include +#include "Eigen/Core" // from @eigen_archive +#include "absl/algorithm/container.h" +#include "absl/base/call_once.h" +#include "absl/container/inlined_vector.h" #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/nn_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_factory.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT +#include "tensorflow/core/framework/resource_var.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/kernels/variable_ops.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status_matchers.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { -namespace convert { - -using absl::StrCat; -using ::testing::ElementsAre; -using ::testing::ElementsAreArray; -using ::testing::NanSensitiveFloatNear; -// TODO(laigd): put this into some test utils file. 
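The hand-rolled ExpectStatus helper deleted just below is superseded by the status matchers from tensorflow/core/platform/status_matchers.h together with ::testing::HasSubstr. A minimal sketch of the replacement pattern used throughout this file, where SomeStatusReturningCall() is a hypothetical stand-in for any call under test:

// Failure with a specific canonical code and a message fragment:
EXPECT_THAT(SomeStatusReturningCall(),
            StatusIs(absl::StatusCode::kInvalidArgument,
                     HasSubstr("expected message fragment")));
// Plain success check:
EXPECT_THAT(SomeStatusReturningCall(), IsOk());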
-void ExpectStatus(Status status, error::Code code = error::OK, - const char* substr = nullptr) { - EXPECT_EQ(code, status.code()) - << status << " vs expected error code \"" << error::Code_Name(code) - << "\" and message \"" << substr << "\""; - if (substr) { - EXPECT_THAT(status.error_message(), ::testing::HasSubstr(substr)) << status; - } -} - -nvinfer1::Dims GetTestDims(const std::vector& d) { - nvinfer1::Dims dims; - dims.nbDims = d.size(); - for (int i = 0; i < d.size(); ++i) { - dims.d[i] = d[i]; - } - return dims; -} - -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); - } -} +// TensorRT modes for testing. We define the following three modes: +// 1. Implicit batch mode: The tensors have static (known) input shape and the +// the batch dimension (first dim) is removed from the TRT tensor shape. In +// a loose notation: trt_shape = tf_shape[1:]. +// 2. Explicit batch mode: static (known) input shape, but the batch dimension +// is part of the trt tensor shape. (trt_shape = tf_shape) +// 3. Dynamic shape mode allows unknown input shapes, and requires explicit +// batch size definition (trt_shape = tf_shape). +// +// Note that the Converter only distinguishes between two modes: +// - use_implicit_batch == true, this corresponds to kImplicitBatch, +// - use_implicit_batch == false which includes both kExplicitBatch and +// kDynamicShape. +// +// For the converter, the distinction between explicit batch or dynamic shape +// mode follows from the input tensors of the network: dynamic shape input +// implies dynamic shape mode, while static shape input tensors imply explicit +// batch mode. We want to test all these modes, therefore we define the +// TrtTestMode with the following three options. 
+enum class TrtTestMode { + kImplicitBatch = 0, + kExplicitBatch = 1, + kDynamicShape = 2 +}; -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; +string DebugString(const TrtTestMode mode) { + switch (mode) { + case TrtTestMode::kImplicitBatch: + return "kImplicitBatch"; + case TrtTestMode::kExplicitBatch: + return "kExplicitBatch"; + case TrtTestMode::kDynamicShape: + return "kDynamicShape"; default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } -} - -NodeDef MakeNodeDef(const string& name, const string& op, - const std::vector& inputs, - const std::map attrs = {}) { - NodeDef node_def; - node_def.set_name(name); - node_def.set_op(op); - for (const string& input : inputs) { - node_def.add_input(input); - } - for (const auto& attr : attrs) { - (*node_def.mutable_attr())[attr.first] = attr.second; - } - return node_def; -} - -template -NodeDef MakeConstNodeDef(const string& name, const std::vector& vals, - const TensorShape& shape) { - Scope s = Scope::NewRootScope(); - Tensor t = test::AsTensor(vals, shape); - auto const_op = ops::Const(s.WithOpName(name), t); - return const_op.node()->def(); -} - -template -NodeDef MakeConstNodeDef(const string& name, const std::vector& vals) { - TensorShape shape; - const std::vector shape_dims = {static_cast(vals.size())}; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(shape_dims, &shape)); - return MakeConstNodeDef(name, vals, shape); -} - -bool TrtDimsEquals(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { - if (lhs.nbDims != rhs.nbDims) return false; - for (int i = 0; i < lhs.nbDims; ++i) { - if (lhs.d[i] != rhs.d[i]) return false; - // We don't check the types in the tests. + return "Invalid TrtTestMode"; } - return true; -} - -bool TrtDimsEqualsArray(const std::vector& lhs, - const nvinfer1::Dims& rhs) { - return TrtDimsEquals(GetTestDims(lhs), rhs); } -// TODO(laigd): define a parameterized matcher that can compare against the -// vector. -void ExpectTrtDimsEqualsArray(const std::vector& lhs, - const nvinfer1::Dims& rhs) { - EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs)) - << "expected: " << DebugString(GetTestDims(lhs)) << "\n" - << " actual: " << DebugString(rhs); -} - -template -void ExpectArrayNear(const std::vector& lhs, absl::Span rhs) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_FLOAT_EQ(lhs[i], rhs[i]); - } -} +namespace convert { -// Eigen::half cannot implicitly convert to float which is required for -// EXPECT_FLOAT_EQ. -template <> -void ExpectArrayNear(const std::vector& lhs, - absl::Span rhs) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]), - Eigen::half_impl::half_to_float(rhs[i])); - } -} +using absl::StrCat; +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; +using ::testing::HasSubstr; +using ::testing::Matcher; +using ::testing::PrintToString; -template -void ExpectArrayAlmostEqual(const std::vector& lhs, absl::Span rhs, - T tolerance) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_NEAR(lhs[i], rhs[i], tolerance); - } -} +using ::tensorflow::testing::IsOk; +using ::tensorflow::testing::StatusIs; -// Eigen::half cannot implicitly convert to float which is required for -// EXPECT_NEAR. 
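The three modes described above drive most of the rewritten tests. A rough sketch, not the actual fixture code, of how a test body can sweep all of them using the ValidTrtModes array defined a few lines further down and the Reset(precision, mode) overload that OpConverterTest gains later in this file:

for (const TrtTestMode mode : ValidTrtModes) {
  SCOPED_TRACE(DebugString(mode));       // label any failure with the mode
  Reset(TrtPrecisionMode::FP32, mode);   // rebuild the converter for this mode
  // ... add test tensors/weights and run the op converter under test ...
}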
-template <> -void ExpectArrayAlmostEqual(const std::vector& lhs, - absl::Span rhs, - Eigen::half tolerance) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_NEAR(Eigen::half_impl::half_to_float(lhs[i]), - Eigen::half_impl::half_to_float(rhs[i]), - Eigen::half_impl::half_to_float(tolerance)); - } -} +constexpr std::array ValidTrtModes = { + TrtTestMode::kImplicitBatch, TrtTestMode::kExplicitBatch, + TrtTestMode::kDynamicShape}; bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs, const TRT_ShapedWeights& rhs) { - return TrtDimsEquals(lhs.shape_, rhs.shape_) && - lhs.TrtDType() == rhs.TrtDType() && lhs.GetValues() == rhs.GetValues(); + return lhs.Shape() == rhs.Shape() && lhs.TrtDType() == rhs.TrtDType() && + lhs.GetPointer() == rhs.GetPointer(); } template void ValidateWeights(const TRT_ShapedWeights& weights, const std::vector& expected_dims, const std::vector& expected_value) { - ExpectTrtDimsEqualsArray(expected_dims, weights.shape_); + EXPECT_EQ(weights.Shape(), DimsAdapter(expected_dims)); ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString(); - const T* actual_values = static_cast(weights.GetValues()); + const T* actual_values = weights.GetPointer(); for (int i = 0; i < expected_value.size(); ++i) { EXPECT_EQ(expected_value[i], actual_values[i]); } } -template -std::vector InitTestVector(int size, CType start_value = CType(0)) { - std::vector res; - res.reserve(size); - for (int i = 0; i < size; ++i) { - res.push_back(start_value + CType(i)); - } - return res; -} - -template -struct StaticCaster { - OutCType operator()(InCType in) const { return static_cast(in); } -}; - -template -std::vector CastTestVector(const std::vector& vals) { - std::vector res(vals.size()); - std::transform(vals.begin(), vals.end(), res.begin(), - StaticCaster()); - return res; -} - TEST(TRT_ShapedWeights_Test, Basic) { // Test constructor with no arguments. { @@ -258,7 +168,7 @@ TEST(TRT_ShapedWeights_Test, Basic) { EXPECT_EQ(nullptr, trt_weights.values); EXPECT_EQ(0, trt_weights.count); - EXPECT_EQ(nullptr, ptr->GetValues()); + EXPECT_EQ(nullptr, ptr->GetPointer()); EXPECT_EQ(0, ptr->count()); EXPECT_EQ(0, ptr->size_bytes()); } @@ -273,7 +183,7 @@ TEST(TRT_ShapedWeights_Test, Basic) { EXPECT_EQ(nullptr, trt_weights.values); EXPECT_EQ(0, trt_weights.count); - EXPECT_EQ(nullptr, ptr->GetValues()); + EXPECT_EQ(nullptr, ptr->GetPointer()); EXPECT_EQ(0, ptr->count()); EXPECT_EQ(0, ptr->size_bytes()); } @@ -282,7 +192,8 @@ TEST(TRT_ShapedWeights_Test, Basic) { { TrtWeightStore store; TRT_ShapedWeights weights = - store.GetTempWeights(nvinfer1::DataType::kFLOAT, GetTestDims({2, 5})); + store.GetTempWeights(nvinfer1::DataType::kFLOAT, CreateDims({2, 5})) + .value(); TRT_ShapedWeights copy(weights); for (auto ptr : {&weights, ©}) { nvinfer1::Weights trt_weights = ptr->GetTrtWeights(); @@ -290,12 +201,12 @@ TEST(TRT_ShapedWeights_Test, Basic) { EXPECT_NE(nullptr, trt_weights.values); EXPECT_EQ(10, trt_weights.count); - EXPECT_EQ(trt_weights.values, ptr->GetValues()); + EXPECT_EQ(trt_weights.values, ptr->GetPointer()); EXPECT_EQ(10, ptr->count()); EXPECT_EQ(40, ptr->size_bytes()); } // Test that it doesn't copy the underlying buffer. 
- EXPECT_EQ(weights.GetValues(), copy.GetValues()); + EXPECT_EQ(weights.GetPointer(), copy.GetPointer()); } } @@ -336,7 +247,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(1, ptr->batch_size()); } EXPECT_EQ(itensor->simple_tensor(), ptr->tensor()->simple_tensor()); - ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); + EXPECT_THAT(ptr->GetTrtDims(), DimsAreArray({1})); } } } @@ -355,7 +266,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(false, ptr->is_weights()); EXPECT_EQ(1, ptr->batch_size()); EXPECT_NE(nullptr, ptr->tensor()->simple_tensor()); - ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); + EXPECT_THAT(ptr->GetTrtDims(), DimsAreArray({1})); } } // Test constructor with TRT_ShapedWeights argument. @@ -369,18 +280,15 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(false, ptr->is_tensor()); EXPECT_EQ(true, ptr->is_weights()); EXPECT_TRUE(TrtShapedWeightsEquals(weights, ptr->weights())); - ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims()); + std::vector empty_dims; + EXPECT_THAT(ptr->GetTrtDims(), DimsAreArray(empty_dims)); } } } class ValidatorTest : public ::testing::Test { public: - std::unordered_map& op_validators( - TrtNodeValidator* validator) { - return validator->op_validators_; - } - + ValidatorTest() {} Status ConvertToTensorOrWeights(const Scope& scope, const Node* node, int output_port, TRT_TensorOrWeights* tensor_or_weights) { @@ -390,26 +298,14 @@ class ValidatorTest : public ::testing::Test { TF_EXPECT_OK(graph_properties.InferStatically(true)); TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32, - /*use_calibration=*/false); + /*use_calibration=*/false, + /*use_implicit_batch=*/true, + /*use_explicit_precision=*/false); return validator.ConvertToTensorOrWeights(node->def(), output_port, tensor_or_weights); } - - const std::set* GetQuantizeOps(TrtNodeValidator* validator) { - return validator->quantize_ops; - } }; -TEST_F(ValidatorTest, QuantizeOpsAreRegistered) { - grappler::GrapplerItem item; - grappler::GraphProperties graph_properties(item); - TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32, - /*use_calibration=*/false); - for (const string& quantize_op : *GetQuantizeOps(&validator)) { - QCHECK(op_validators(&validator).count(quantize_op)); - } -} - TEST_F(ValidatorTest, ConvertToTensorOrWeights) { // Convert Const. { @@ -417,13 +313,14 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) { auto node = ops::Const(s.WithOpName("my_const"), {1.0f, 2.0f}, TensorShape({2})); TRT_TensorOrWeights output; - ExpectStatus(ConvertToTensorOrWeights(s, node.op().node(), - /*output_port=*/0, &output)); + EXPECT_THAT(ConvertToTensorOrWeights(s, node.op().node(), + /*output_port=*/0, &output), + IsOk()); ValidateWeights(output.weights(), {2}, {1.0, 2.0}); } // Helper method to run ConvertToTensorOrWeights() with predefined parameters. - auto convert_to_tensor_or_weights = [this](const std::vector& dims, + auto convert_to_tensor_or_weights = [this](const std::vector& dims, TRT_TensorOrWeights* output) { Scope s = Scope::NewRootScope(); const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims}); @@ -435,30 +332,33 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) { // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1. 
{ TRT_TensorOrWeights output; - ExpectStatus( + EXPECT_THAT( convert_to_tensor_or_weights( - std::vector(nvinfer1::Dims::MAX_DIMS + 2, 1), &output), - error::OUT_OF_RANGE, "Input tensor rank is greater than 9"); + std::vector(nvinfer1::Dims::MAX_DIMS + 2, 1), &output), + StatusIs(absl::StatusCode::kOutOfRange, + HasSubstr("Input tensor rank is greater than 9"))); } // Convert non-Const with #dims < 1. { TRT_TensorOrWeights output; - ExpectStatus( - convert_to_tensor_or_weights({}, &output), error::INVALID_ARGUMENT, - "Scalar input tensor is not supported since the first dimension " - "is treated as batch dimension by TRT"); + EXPECT_THAT(convert_to_tensor_or_weights({}, &output), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Scalar input tensor is not supported since " + "the first dimension " + "is treated as batch dimension by TRT"))); } - // Convert non-Const. We test the case where the non-batch dimemsion is + // Convert non-Const. We test the case where the non-batch dimension is // unknown as well, to make sure the validator allows that. for (const int32 non_batch_dim : {-1, 2}) { const int32 batch_size = 12; TRT_TensorOrWeights output; - ExpectStatus( - convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output)); + EXPECT_THAT( + convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output), + IsOk()); ASSERT_TRUE(output.is_tensor()); EXPECT_EQ(batch_size, output.batch_size()); EXPECT_NE(nullptr, output.tensor()->simple_tensor()); - ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims()); + EXPECT_THAT(output.GetTrtDims(), DimsAreArray({non_batch_dim})); } } @@ -474,31 +374,39 @@ TEST_F(ValidatorTest, IsTensorRTCandidate_Basics) { grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32, - /*use_calibration=*/false); + /*use_calibration=*/false, + /*use_implicit_batch=*/true, + /*use_explicit_precision=*/false); + // Override the Add converter. bool start_conversion = false; bool should_fail = false; - auto op_converter = [&start_conversion, - &should_fail](OpConverterParams* params) -> Status { + auto op_converter = [&start_conversion, &should_fail]( + const OpConverterParams* params) -> Status { if (should_fail) return errors::InvalidArgument(""); if (!params->validation_only) start_conversion = true; return Status::OK(); }; // Validator not registered. - ASSERT_EQ(1, op_validators(&validator).erase("Add")); - ExpectStatus(validator.IsTensorRTCandidate(add_node), error::UNIMPLEMENTED, - "Op type Add is not supported."); - - // Register validator. - op_validators(&validator)["Add"] = op_converter; + auto original_op_converter = GetOpConverterRegistry()->LookUp("Add"); + ASSERT_TRUE(original_op_converter.ok()); + GetOpConverterRegistry()->Clear("Add"); + EXPECT_THAT(validator.IsTensorRTCandidate(add_node), + StatusIs(absl::StatusCode::kUnimplemented, + HasSubstr("Op type Add is not supported."))); + GetOpConverterRegistry()->Register("Add", kDefaultConverterPriority + 1, + op_converter); TF_EXPECT_OK(validator.IsTensorRTCandidate(add_node)); EXPECT_EQ(false, start_conversion); // Let the converter return error. 
should_fail = true; - ExpectStatus(validator.IsTensorRTCandidate(add_node), - error::INVALID_ARGUMENT); + EXPECT_THAT(validator.IsTensorRTCandidate(add_node), + StatusIs(absl::StatusCode::kInvalidArgument)); + GetOpConverterRegistry()->Clear("Add"); + GetOpConverterRegistry()->Register("Add", kDefaultConverterPriority, + *original_op_converter); } TEST(TrtNodeValidator, IsTensorRTCandidate) { @@ -527,7 +435,7 @@ TEST(TrtNodeValidator, IsTensorRTCandidate) { feed, const_1, matmul_attrs); // Unsupported op. - auto unsupported_op = ops::Erf(s.WithOpName("sin"), feed); + auto unsupported_op = ops::Erfc(s.WithOpName("sin"), feed); // Incompatible input. auto incompatible_feed = ops::Placeholder(s.WithOpName("feed"), DT_DOUBLE); @@ -553,25 +461,32 @@ TEST(TrtNodeValidator, IsTensorRTCandidate) { for (const TrtPrecisionMode precision_mode : {TrtPrecisionMode::FP32, TrtPrecisionMode::INT8}) { TrtNodeValidator validator(graph_properties, precision_mode, - /*use_calibration=*/false); + /*use_calibration=*/false, + /*use_implicit_batch=*/true, + /*use_explicit_precision=*/false); TF_EXPECT_OK(validator.IsTensorRTCandidate(matmul.operation.node())); - ExpectStatus( + EXPECT_THAT( validator.IsTensorRTCandidate(incompatible_matmul.operation.node()), - error::INVALID_ARGUMENT, - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions."); - ExpectStatus(validator.IsTensorRTCandidate(unsupported_op.operation.node()), - error::UNIMPLEMENTED, "Op type Erf is not supported"); - ExpectStatus(validator.IsTensorRTCandidate( - matmul_with_incompatible_input.operation.node()), - error::INTERNAL, - "Failed to convert input feed_1 to a TRT_TensorOrWeights"); + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("MatMul with 2D tensors requires explicit batch " + "mode, or that tensor A " + "is not transposed and B is a constant tensor."))); + EXPECT_THAT(validator.IsTensorRTCandidate(unsupported_op.operation.node()), + StatusIs(absl::StatusCode::kUnimplemented, + HasSubstr("Op type Erfc is not supported"))); + EXPECT_THAT(validator.IsTensorRTCandidate( + matmul_with_incompatible_input.operation.node()), + StatusIs(absl::StatusCode::kInternal, + HasSubstr("Failed to convert at least one input to a " + "TRT_TensorOrWeights:"))); if (precision_mode == TrtPrecisionMode::INT8) { TF_EXPECT_OK(validator.IsTensorRTCandidate(quantize.operation.node())); } else { - ExpectStatus(validator.IsTensorRTCandidate(quantize.operation.node()), - error::UNIMPLEMENTED, - "Op type FakeQuantWithMinMaxArgs is not supported"); + EXPECT_THAT( + validator.IsTensorRTCandidate(quantize.operation.node()), + StatusIs( + absl::StatusCode::kUnimplemented, + HasSubstr("Op type FakeQuantWithMinMaxArgs is not supported"))); } } } @@ -581,24 +496,21 @@ class ConverterTest : public ::testing::Test { ConverterTest() { Reset(); } void Reset() { - builder_.reset(nvinfer1::createInferBuilder(logger_)); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - const uint32_t flags = 0U; // Implicit Batch Mode - network_.reset(builder_->createNetworkV2(flags)); -#else - network_.reset(builder_->createNetwork()); -#endif // TRT >= 6 - converter_.reset(new Converter(network_.get(), TrtPrecisionMode::FP32, - /*use_calibration=*/false)); + GetOpConverterRegistry()->Clear("MyOp"); + GetOpConverterRegistry()->Clear("DummyOp"); + converter_ = + std::move(Converter::Create(TrtPrecisionMode::FP32, + /*use_calibration=*/false, &logger_, + /*use_implicit_batch=*/true, + /*engine_name=*/"TRTEngineOp_000_000", + /*use_explicit_precision=*/false) + 
.value()); weight_store_ = &converter_->weight_store_; } - void AddOpConverter(const string& op_name, OpConverter op_converter) { - converter_->op_registry_[op_name] = op_converter; - } - + // TODO(cbate): These should be removed or changed to public per black-box + // testing principle. // Below we expose private methods of Converter for testing. - Status MaybeUpdateBatchSize(int batch_size) { return converter_->MaybeUpdateBatchSize(batch_size); } @@ -621,10 +533,6 @@ class ConverterTest : public ::testing::Test { return converter_->GetWeightRange(weights, out_min, out_max); } - void PropagateQuantizationRanges() { - converter_->PropagateQuantizationRanges(); - } - int batch_size() const { return converter_->batch_size_; } std::unordered_map& quantization_ranges_proxy() { @@ -637,13 +545,6 @@ class ConverterTest : public ::testing::Test { private: Logger& logger_ = *Logger::GetLogger(); - // These members are ordered in a way such that the destruction order is: - // converter_ -> network_ -> builder_ - TrtUniquePtrType builder_; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config_; -#endif - TrtUniquePtrType network_; protected: std::unique_ptr converter_; @@ -652,7 +553,8 @@ class ConverterTest : public ::testing::Test { TEST_F(ConverterTest, ConvertNode) { ITensorProxyPtr output_tensors[2]; - auto op_converter = [&output_tensors](OpConverterParams* params) -> Status { + auto op_converter = + [&output_tensors](const OpConverterParams* params) -> Status { nvinfer1::Dims dims = params->inputs[0].tensor()->getDimensions(); for (int i = 0; i < 2; ++i) { dims.d[0] += 1; @@ -662,26 +564,33 @@ TEST_F(ConverterTest, ConvertNode) { return Status::OK(); }; NodeDef node_def = MakeNodeDef("my_op", "MyOp", {"my_input"}); - TF_EXPECT_OK(converter_->AddInputTensor( - "my_input", nvinfer1::DataType::kFLOAT, GetTestDims({123}), 1)); + + TF_ASSERT_OK(converter_->AddInputTensor( + "my_input", nvinfer1::DataType::kFLOAT, CreateDims({123}), 1)); // Converter not registered. - ExpectStatus(converter_->ConvertNode(node_def), error::UNIMPLEMENTED, - "No converter registered for op: MyOp"); + EXPECT_THAT(converter_->ConvertNode(node_def), + StatusIs(absl::StatusCode::kNotFound, + HasSubstr("No converter for op MyOp"))); // Register the converter and retry. 
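ConvertNode now resolves converters through a process-wide registry instead of the per-Converter op_registry_ map that the deleted AddOpConverter helper used to populate. A rough sketch of the registry surface exercised by these tests, where "MyOp" and my_stub_converter are illustrative only:

auto* registry = GetOpConverterRegistry();
// Attach a converter to an op name at a given priority.
registry->Register("MyOp", kDefaultConverterPriority, my_stub_converter);
// Look up the currently registered converter; the result is StatusOr-like.
auto active = registry->LookUp("MyOp");
ASSERT_TRUE(active.ok());
// Remove every registration for the op, as the fixtures' Reset() does.
registry->Clear("MyOp");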
- AddOpConverter("MyOp", op_converter); - TF_EXPECT_OK(converter_->ConvertNode(node_def)); + GetOpConverterRegistry()->Register("MyOp", kDefaultConverterPriority, + op_converter); + TF_ASSERT_OK(converter_->ConvertNode(node_def)); TRT_TensorOrWeights actual_output_1; TF_EXPECT_OK(GetTensorOrWeights("my_op", &actual_output_1)); - EXPECT_EQ(output_tensors[0]->simple_tensor(), actual_output_1.tensor()->simple_tensor()); + EXPECT_EQ(output_tensors[0]->simple_tensor(), + actual_output_1.tensor()->simple_tensor()); EXPECT_EQ(124, actual_output_1.tensor()->getDimensions().d[0]); TRT_TensorOrWeights actual_output_2; TF_EXPECT_OK(GetTensorOrWeights("my_op:1", &actual_output_2)); - EXPECT_EQ(output_tensors[1]->simple_tensor(), actual_output_2.tensor()->simple_tensor()); + EXPECT_EQ(output_tensors[1]->simple_tensor(), + actual_output_2.tensor()->simple_tensor()); EXPECT_EQ(125, actual_output_2.tensor()->getDimensions().d[0]); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, AddAndGetInputs) { @@ -693,24 +602,26 @@ TEST_F(ConverterTest, AddAndGetInputs) { node_def.add_input("weird_input:2:3:4:0"); TF_EXPECT_OK(converter_->AddInputTensor("input", nvinfer1::DataType::kFLOAT, - GetTestDims({1}), 1)); + CreateDims({1}), 1)); TF_EXPECT_OK(converter_->AddInputTensor("input:1", nvinfer1::DataType::kINT32, - GetTestDims({2, 3}), 1)); + CreateDims({2, 3}), 1)); TF_EXPECT_OK(converter_->AddInputTensor( - "weird_input:2:3:4", nvinfer1::DataType::kHALF, GetTestDims({5, 3}), 1)); + "weird_input:2:3:4", nvinfer1::DataType::kHALF, CreateDims({5, 3}), 1)); std::vector inputs; TF_EXPECT_OK(GetInputs(node_def, &inputs)); EXPECT_EQ(4, inputs.size()); - EXPECT_EQ(inputs[0].tensor()->simple_tensor(), inputs[1].tensor()->simple_tensor()); + EXPECT_EQ(inputs[0].tensor()->trt_tensor(), inputs[1].tensor()->trt_tensor()); EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType()); - ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()); - ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()); - ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()); + EXPECT_THAT(inputs[0].tensor()->getDimensions(), DimsAreArray({1})); + EXPECT_THAT(inputs[2].tensor()->getDimensions(), DimsAreArray({2, 3})); + EXPECT_THAT(inputs[3].tensor()->getDimensions(), DimsAreArray({5, 3})); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, RenameAndMarkOutputTensors) { @@ -720,7 +631,8 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { // Register a custom converter which shuffles the input. We use it to build a // TRT network whose output will be later marked. std::vector output_tensors; - auto op_converter = [&output_tensors](OpConverterParams* params) -> Status { + auto op_converter = + [&output_tensors](const OpConverterParams* params) -> Status { nvinfer1::Permutation perm; perm.order[0] = 1; perm.order[1] = 0; @@ -737,81 +649,96 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { params->outputs->emplace_back(output_weights); return Status::OK(); }; - AddOpConverter("MyOp", op_converter); + GetOpConverterRegistry()->Register("MyOp", kDefaultConverterPriority, + op_converter); // Run the conversion. 
NodeDef node_def = MakeNodeDef("my_op", "MyOp", {"my_input"}); TF_EXPECT_OK(converter_->AddInputTensor( - "my_input", nvinfer1::DataType::kFLOAT, GetTestDims({1, 2}), 1)); + "my_input", nvinfer1::DataType::kFLOAT, CreateDims({1, 2}), 1)); TF_EXPECT_OK(converter_->ConvertNode(node_def)); // Mark a weight as output, should fail. - ExpectStatus( + EXPECT_THAT( converter_->RenameAndMarkOutputTensors({{"my_op:2", "my_output"}}), - error::INVALID_ARGUMENT, "Output my_op:2 is weights not tensor"); + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Output my_op:2 is weights not tensor"))); // Mark tensors as output, should pass. TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors( {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}})); EXPECT_EQ(2, output_tensors.size()); for (auto output_tensor : output_tensors) { - ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions()); + EXPECT_THAT(output_tensor->getDimensions(), DimsAreArray({2, 1})); } EXPECT_EQ("my_output", string(output_tensors[0]->getName())); EXPECT_EQ("my_output_1", string(output_tensors[1]->getName())); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, TransposeTensor) { ITensorProxyPtr input_tensor = converter_->network()->addInput( - "", nvinfer1::DataType::kFLOAT, GetTestDims({2, 3, 5})); + "", nvinfer1::DataType::kFLOAT, CreateDims({2, 3, 5})); ITensorProxyPtr output_tensor = nullptr; - + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); // Rank doesn't match. - ExpectStatus( - converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor), - error::INVALID_ARGUMENT, - "Rank of perm for transpose does not match with that of the input"); + EXPECT_THAT(converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor, + dummy_node_def, "sub1"), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Rank of perm for transpose does not match " + "with that of the input"))); // Transpose at batch dimension. - ExpectStatus( - converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor), - error::UNIMPLEMENTED, "Transpose at batch dimension is not supported."); + EXPECT_THAT( + converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor, + dummy_node_def, "sub2"), + StatusIs(absl::StatusCode::kUnimplemented, + HasSubstr("Transpose at batch dimension is not supported."))); // OK. 
- TF_EXPECT_OK( - converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor)); - ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()); + TF_EXPECT_OK(converter_->TransposeTensor( + input_tensor, {0, 3, 1, 2}, &output_tensor, dummy_node_def, "sub3")); + EXPECT_THAT(output_tensor->getDimensions(), DimsAreArray({5, 2, 3})); + EXPECT_THAT( + converter_->network(), + LayerNamesAreArray({"TRTEngineOp_000_000/dummy_op-sub3:SHUFFLE"})); } void TestPrepareTensorForShape( const std::vector& input_dims, const std::vector& reshape_dims, const std::vector& expected_tensor_dims, bool input_is_tensor, Converter* converter, TrtWeightStore* weight_store, - error::Code expected_code = error::OK, + absl::StatusCode expected_code = absl::StatusCode::kOk, const char* expected_error_msg_substr = nullptr) { TRT_TensorOrWeights input; if (input_is_tensor) { input = TRT_TensorOrWeights(converter->network()->addInput( - "", nvinfer1::DataType::kFLOAT, GetTestDims(input_dims))); + "", nvinfer1::DataType::kFLOAT, CreateDims(input_dims))); } else { - input = TRT_TensorOrWeights(weight_store->GetTempWeights( - nvinfer1::DataType::kFLOAT, GetTestDims(input_dims))); + input = TRT_TensorOrWeights( + weight_store + ->GetTempWeights(nvinfer1::DataType::kFLOAT, CreateDims(input_dims)) + .value()); } ITensorProxyPtr output_tensor = nullptr; + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); for (bool validation_only : {false, true}) { - const Status status = converter->PrepareTensorForShape( - input, GetTestDims(reshape_dims), validation_only, &output_tensor); - if (expected_code == error::OK) { + const Status status = + PrepareTensorForShape(converter, input, DimsAdapter(reshape_dims), + validation_only, &output_tensor, dummy_node_def); + if (expected_code == absl::StatusCode::kOk) { TF_EXPECT_OK(status); if (validation_only) { EXPECT_EQ(nullptr, *output_tensor); } else { - ExpectTrtDimsEqualsArray(expected_tensor_dims, - output_tensor->getDimensions()); + EXPECT_THAT(output_tensor->getDimensions(), + DimsAreArray(expected_tensor_dims)); } } else { - ExpectStatus(status, expected_code, expected_error_msg_substr); + EXPECT_THAT(status, StatusIs(expected_code, + HasSubstr(expected_error_msg_substr))); } } } @@ -822,7 +749,8 @@ TEST_F(ConverterTest, PrepareTensorForShape) { Reset(); TestPrepareTensorForShape({2, 3, 5}, {2, 3, 6}, {}, input_is_tensor, converter_.get(), weight_store_, - error::INVALID_ARGUMENT, "Incompatible shapes"); + absl::StatusCode::kInvalidArgument, + "Incompatible shapes"); // Regular shape. 
Reset(); @@ -853,8 +781,10 @@ TEST_F(ConverterTest, PrepareTensorForShape) { Reset(); TestPrepareTensorForShape({2, 3, 5}, {-1, 2}, {15, 2}, /*input_is_tensor=*/false, converter_.get(), - weight_store_, error::INVALID_ARGUMENT, + weight_store_, absl::StatusCode::kInvalidArgument, "Shape is not fully defined"); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, MaybeUpdateBatchSize) { @@ -872,8 +802,11 @@ TEST_F(ConverterTest, MaybeUpdateBatchSize) { TF_EXPECT_OK(MaybeUpdateBatchSize(-1)); EXPECT_EQ(123, batch_size()); - ExpectStatus(MaybeUpdateBatchSize(124), error::INVALID_ARGUMENT, - "Provided batch size does not match converter batch size"); + EXPECT_THAT( + MaybeUpdateBatchSize(124), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr( + "Provided batch size does not match converter batch size"))); } TEST_F(ConverterTest, AddAndGetTensorOrWeights) { @@ -890,17 +823,19 @@ TEST_F(ConverterTest, AddAndGetTensorOrWeights) { EXPECT_EQ(123, added_tensor.batch_size()); // Add the same tensor again. - ExpectStatus(AddTensorOrWeights("my_tensor", tensor), error::ALREADY_EXISTS, - "tensor/weights my_tensor already exist"); + EXPECT_THAT(AddTensorOrWeights("my_tensor", tensor), + StatusIs(absl::StatusCode::kAlreadyExists, + HasSubstr("tensor/weights my_tensor already exist"))); } template void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) { - TRT_ShapedWeights weights = weight_store->GetTempWeights( - TfDataTypeToTrt(DataTypeToEnum::v()), GetTestDims({2, 3})); + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(DataTypeToEnum::v(), &trt_type)); + TRT_ShapedWeights weights = + weight_store->GetTempWeights(trt_type, CreateDims({2, 3})).value(); const std::vector values = {T(3), T(1), T(2), T(6), T(5), T(4)}; - memcpy(weights.GetValues(), values.data(), weights.size_bytes()); - + absl::c_copy(values, weights.GetPointer()); float out_min = 0.0f; float out_max = 0.0f; TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max)); @@ -916,7 +851,7 @@ TEST_F(ConverterTest, GetWeightRange) { TEST_F(ConverterTest, ProvideQuantizationRange) { ITensorProxyPtr simple_tensor; - // Assymetric range + // Asymmetric range converter_->ProvideQuantizationRange(&simple_tensor, 0.0f, 6.0f); EXPECT_EQ(6.0f, quantization_ranges_proxy()[&simple_tensor]); converter_->ProvideQuantizationRange(&simple_tensor, 1.0f, 6.0f); @@ -928,62 +863,27 @@ TEST_F(ConverterTest, ProvideQuantizationRange) { // Symmetric range converter_->ProvideQuantizationRange(&simple_tensor, -6.123f, 6.123f); EXPECT_EQ(6.123f, quantization_ranges_proxy()[&simple_tensor]); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { - // input -> infer1 -> infer2 -> infer3 - ITensorProxyPtr input, infer_1, infer_2, infer_3; + ITensorProxyPtr input; ITensorProxyPtr not_infer; - Converter int8_converter(/*trt_network=*/nullptr, TrtPrecisionMode::INT8, - /*use_calibration=*/true); - int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f); - int8_converter.ProvideQuantizationRange(¬_infer, -100.0f, 100.0f); - int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1); - int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2); - int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3); - - // Input range should be inferred along the chain and applied to tensors. 
- int8_converter.MaybeApplyQuantizationRanges(); -#if IS_TRT_VERSION_GE(8, 0, 0, 0) + Logger& logger = *Logger::GetLogger(); + auto int8_converter = Converter::Create(TrtPrecisionMode::INT8, + /*use_calibration=*/true, &logger, + /*use_implicit_batch=*/true, + /*engine_name=*/"") + .value(); + int8_converter->ProvideQuantizationRange(&input, -5.0f, 5.0f); + int8_converter->ProvideQuantizationRange(¬_infer, -100.0f, 100.0f); + + int8_converter->MaybeApplyQuantizationRanges(); EXPECT_EQ(input->getDynamicRangeMax(), 5.0f); - EXPECT_EQ(infer_1->getDynamicRangeMax(), 5.0f); - EXPECT_EQ(infer_2->getDynamicRangeMax(), 5.0f); - EXPECT_EQ(infer_3->getDynamicRangeMax(), 5.0f); EXPECT_EQ(not_infer->getDynamicRangeMax(), 100.0f); - EXPECT_EQ(input->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(infer_1->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(infer_2->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(infer_3->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(not_infer->getDynamicRangeMin(), -100.0f); -#elif IS_TRT_VERSION_GE(5, 0, 0, 0) - EXPECT_EQ(input->getDynamicRange(), 5.0f); - EXPECT_EQ(infer_1->getDynamicRange(), 5.0f); - EXPECT_EQ(infer_2->getDynamicRange(), 5.0f); - EXPECT_EQ(infer_3->getDynamicRange(), 5.0f); - EXPECT_EQ(not_infer->getDynamicRange(), 100.0f); -#endif -} -TEST_F(ConverterTest, PropagateQuantizationRanges) { - // infer0 <-> infer1 <-> infer2 <-> infer3 - // | - // infer4 <-> infer5 - ITensorProxyPtr infer[6]; - ITensorProxyPtr not_infer; - converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f); - converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]); - converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]); - converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]); - converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]); - converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]); - - // Input range should be inferred along the chain. 
- PropagateQuantizationRanges(); - auto ranges = quantization_ranges_proxy(); - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(5.0f, ranges[&infer[i]]); - } - EXPECT_EQ(ranges.count(¬_infer), 0); + EXPECT_THAT(int8_converter->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, GetTrtBroadcastShape) { @@ -995,18 +895,19 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { const bool operand_2_is_tensor, const std::vector& expected_operand_1_shape, const std::vector& expected_operand_2_shape, - error::Code expected_code = error::OK, - const char* expected_error_msg_substr = nullptr, + absl::StatusCode expected_code = + absl::StatusCode::kOk, + const char* expected_error_msg_substr = "", const int operand_1_batch_size = -1, const int operand_2_batch_size = -1) { auto create_tensor_or_weights = [](const std::vector& shape, bool is_tensor, int batch_size = -1) { if (is_tensor) { - return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT, - GetTestDims(shape), batch_size}; + return TRT_TensorOrWeights(nvinfer1::DataType::kFLOAT, + CreateDims(shape), batch_size); } TRT_ShapedWeights weights; - weights.shape_ = GetTestDims(shape); + weights.Shape() = CreateDims(shape); return TRT_TensorOrWeights(weights); }; @@ -1017,28 +918,31 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { operand_2_shape, operand_2_is_tensor, operand_2_batch_size); // operand_1 broadcast operand_2 - ExpectStatus( + EXPECT_THAT( GetTrtBroadcastShape(operand_1, operand_2, /*check_feasibility=*/true, - &operand_1_new_dims, &operand_2_new_dims), - expected_code, expected_error_msg_substr); - if (expected_code == error::OK) { - ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); - ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + /*use_implicit_batch=*/true, &operand_1_new_dims, + &operand_2_new_dims), + StatusIs(expected_code, HasSubstr(expected_error_msg_substr))); + if (expected_code == absl::StatusCode::kOk) { + EXPECT_THAT(operand_1_new_dims, DimsAreArray(expected_operand_1_shape)); + EXPECT_THAT(operand_2_new_dims, DimsAreArray(expected_operand_2_shape)); } // operand_2 broadcast operand_1 - ExpectStatus( + EXPECT_THAT( GetTrtBroadcastShape(operand_2, operand_1, /*check_feasibility=*/true, - &operand_2_new_dims, &operand_1_new_dims), - expected_code, expected_error_msg_substr); - if (expected_code == error::OK) { - ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); - ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + /*use_implicit_batch=*/true, &operand_2_new_dims, + &operand_1_new_dims), + StatusIs(expected_code, HasSubstr(expected_error_msg_substr))); + if (expected_code == absl::StatusCode::kOk) { + EXPECT_THAT(operand_1_new_dims, DimsAreArray(expected_operand_1_shape)); + EXPECT_THAT(operand_2_new_dims, DimsAreArray(expected_operand_2_shape)); } }; // Both inputs are weights. symmetric_test( - {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT, + {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, + absl::StatusCode::kInvalidArgument, "Broadcasting requires at least one of the operands be tensors"); // One tensor and one weights. 
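Because use_implicit_batch is true in these cases, a tensor operand's TRT dims exclude the batch dimension while a weights operand keeps all of its dims, so the two inputs may legitimately differ in rank by one. A small sketch of the underlying call for the first mixed tensor/weights case in the next hunk (the operand objects here are hypothetical):

nvinfer1::Dims lhs_new_dims, rhs_new_dims;
// tensor_operand has TRT dims {1, 3, 4}; weights_operand has shape {1, 2, 1, 4}.
TF_EXPECT_OK(GetTrtBroadcastShape(tensor_operand, weights_operand,
                                  /*check_feasibility=*/true,
                                  /*use_implicit_batch=*/true,
                                  &lhs_new_dims, &rhs_new_dims));
// Expected result, as asserted by symmetric_test below: {1, 3, 4} and {2, 1, 4}.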
@@ -1054,51 +958,58 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4}, {2, 1, 4}); symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + absl::StatusCode::kInvalidArgument, + "Infeasible broadcast scheme"); symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, "Infeasible broadcast scheme", + absl::StatusCode::kInvalidArgument, + "Infeasible broadcast scheme", /*operand_1_batch_size=*/2); symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 4 vs broadcast #dims 5)"); symmetric_test({3}, {1, 1, 3}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 2 vs broadcast #dims 3)", /*operand_1_batch_size=*/2); // Both inputs are tensors. symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 3 vs broadcast #dims 4)"); symmetric_test({1, 3}, {3}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 2 vs broadcast #dims 3)"); symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4}, {2, 1, 4}); symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 4 vs broadcast #dims 5)"); symmetric_test({2, 3}, {7, 5}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + absl::StatusCode::kInvalidArgument, + "Infeasible broadcast scheme"); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, CreateConstantLayer) { for (auto dtype : {nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT32}) { TRT_ShapedWeights weights = - weight_store_->GetTempWeights(dtype, GetTestDims({2, 3, 5})); + weight_store_->GetTempWeights(dtype, CreateDims({2, 3, 5})).value(); ITensorProxyPtr tensor = - converter_->CreateConstantLayer(weights, GetTestDims({3, 10})); + converter_->CreateConstantLayer(weights, CreateDims({3, 10})); ASSERT_NE(nullptr, tensor->trt_tensor()); EXPECT_EQ(dtype, tensor->getType()) << "Expected " << DebugString(dtype) << " vs. actual " << DebugString(tensor->getType()); - ExpectTrtDimsEqualsArray({3, 10}, tensor->getDimensions()); + EXPECT_THAT(tensor->getDimensions(), DimsAreArray({3, 10})); } + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } class ConvertGraphDefToEngineTest : public ::testing::Test { @@ -1125,10 +1036,12 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { } // TODO(laigd): execute the engine and get outputs. 
return ConvertGraphDefToEngine( - gdef, TrtPrecisionMode::FP32, /*max_batch_size=*/1, + gdef, /*ctx=*/nullptr, TrtPrecisionMode::FP32, /*max_batch_size=*/1, /*max_workspace_size_bytes=*/64 << 20, input_shapes, &logger_, /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine_, - /*use_calibration=*/false, /*convert_successfully=*/nullptr); + /*use_calibration=*/false, /*use_implicit_batch=*/true, + /*convert_successfully=*/nullptr, /*profiles=*/nullptr, + "TRTEngineOp_000_000", /*use_explicit_precision=*/false); } protected: @@ -1154,258 +1067,409 @@ TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) { TF_EXPECT_OK(RunConvertGraphDefToEngine(&s)); } -// Input/output data format for OpConverterTest::BuildAndRun(). -struct InputOutputData { - void* Buffer() const { - return const_cast(tensor.tensor_data().data()); - } - - size_t TotalBytes() const { return tensor.TotalBytes(); } - - string name; - Tensor tensor; -}; - -template -Tensor ConstructTensor(int data_size, const T& value = T()) { - std::vector values(data_size, value); - return test::AsTensor(values); +// Returns a vector of shapes from a vector of input tensors. This can be used +// to create optimization profiles. +Status GetShapeFromDataVec(DataVec input_data, + std::vector* shape_vec) { + shape_vec->reserve(input_data.size()); + std::transform(input_data.begin(), input_data.end(), + std::back_inserter(*shape_vec), + [](InputOutputData x) { return x.tensor.shape(); }); + return Status::OK(); } -using DataVec = std::vector; - template inline absl::Span GetSpanForData(const InputOutputData& data) { const auto& tensor_map = data.tensor.flat(); return absl::Span(tensor_map.data(), tensor_map.size()); } +std::vector GetDataAsFloat(InputOutputData& data) { + const auto dType = data.tensor.dtype(); + if (dType == DT_FLOAT) { + auto span = GetSpanForData(data); + return std::vector(span.begin(), span.end()); + } + if (dType == DT_HALF) { + return CastVector(GetSpanForData(data)); + } + if (dType == DT_INT32) { + return CastVector(GetSpanForData(data)); + } +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + if (dType == DT_BOOL) { + return CastVector(GetSpanForData(data)); + } +#endif + LOG(FATAL) << "DataType not supported for testing " << DataTypeString(dType); + return {}; +} + // Class to test various op converters, using both a TrtNodeValidator and // Converter. class OpConverterTest : public ::testing::Test { public: - OpConverterTest() : scope_(Scope::NewRootScope()) { + OpConverterTest() + : tensor_buffer_allocator_(new GpuManagedAllocator()), + scope_(Scope::NewRootScope()) { QCHECK_EQ(0, cudaStreamCreate(&stream_)); Reset(); } - ~OpConverterTest() noexcept override { QCHECK_EQ(0, cudaStreamDestroy(stream_)); } + ~OpConverterTest() noexcept override { + QCHECK_EQ(0, cudaStreamDestroy(stream_)); + } Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output) { return converter_->GetTensorOrWeights(name, output); } - void Reset() { + void Reset(TrtPrecisionMode precision_mode_to_test = TrtPrecisionMode::FP32, + TrtTestMode trt_mode = TrtTestMode::kImplicitBatch, + OpKernelContext* ctx = nullptr) { + // Destroy existing TRT objects in a proper order. converter_.reset(nullptr); - - // Reset the INetworkDefinition. 
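// The Converter created below owns the TensorRT builder and network (see the
// new trt_builder_ / trt_network_ members in convert_nodes.h above), so the
// fixture only needs to release converter_ and engine_ here; the explicit
// builder_/network_ resets removed in this hunk are no longer required.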
engine_.reset(nullptr); - network_.reset(nullptr); - builder_.reset(nvinfer1::createInferBuilder(logger_)); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - builder_config_.reset(builder_->createBuilderConfig()); - builder_config_->setMaxWorkspaceSize(1 << 26); - if (precision_mode_to_test_ == TrtPrecisionMode::FP16) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (precision_mode_to_test_ == TrtPrecisionMode::INT8) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - builder_config_->setInt8Calibrator(nullptr); - } - const uint32_t flags = 0U; // Implicit Batch Mode - network_.reset(builder_->createNetworkV2(flags)); -#else - network_.reset(builder_->createNetwork()); - builder_->setMaxWorkspaceSize(1 << 26); -#endif // TRT >= 6 - // Reset the converter. - converter_.reset(new Converter(network_.get(), precision_mode_to_test_, - /*use_calibration=*/false)); + // Re-create them in proper order. + converter_ = + std::move(Converter::Create(precision_mode_to_test, + /*use_calibration=*/false, &logger_, + /*use_implicit_batch=*/trt_mode == + TrtTestMode::kImplicitBatch, + /*engine_name=*/"", + /*use_explicit_precision=*/false, ctx) + .value()); // Reset other related artifacts. scope_ = Scope::NewRootScope(); } + // Constructs a flat tensor with 'vals' in Unified Memory. + template + Tensor AsTensor(gtl::ArraySlice vals) { // non-absl ok + Tensor ret(tensor_buffer_allocator_.get(), DataTypeToEnum::value, + {static_cast(vals.size())}); + std::copy_n(vals.data(), vals.size(), ret.flat().data()); + return ret; + } + + // Constructs a tensor of "shape" with values "vals" in Unified Memory. + template + Tensor AsTensor(gtl::ArraySlice vals, // non-absl ok + const TensorShape& shape) { + Tensor ret(tensor_buffer_allocator_.get(), DataTypeToEnum::value, + {static_cast(vals.size())}); + CHECK(ret.CopyFrom(AsTensor(vals), shape)); + return ret; + } + + template + void transformTensor(const std::vector& vals, Tensor& ret) { + std::transform(vals.begin(), vals.end(), ret.flat().data(), + [](const T in_val) -> S { return static_cast(in_val); }); + } + + template + void transformWeights(const std::vector& vals, + TRT_ShapedWeights& weights) { + std::transform(vals.begin(), vals.end(), weights.GetPointer(), + [](const T in_val) -> S { return static_cast(in_val); }); + } + + // Constructs a tensor with given values (vals). The tensor type is defined by + // the tf_type argument, its shape is given by input_dims. The tensor is + // constructed using the allocator of OpConverterTest in Unified Memory. 
+ template + Tensor AsTensor(const std::vector& vals, + const std::vector& input_dims, DataType tf_type) { + Tensor ret(tensor_buffer_allocator_.get(), tf_type, + {static_cast(vals.size())}); + if (tf_type == DT_FLOAT) { + transformTensor(vals, ret); + } else if (tf_type == DT_HALF) { + transformTensor(vals, ret); + } else if (tf_type == DT_INT32) { + transformTensor(vals, ret); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + } else if (tf_type == DT_BOOL) { + transformTensor(vals, ret); +#endif + } else { + LOG(FATAL) << "Cannot create tensor with type " + << DataTypeString(tf_type); + } + TensorShape shape; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(input_dims, &shape)); + CHECK(ret.CopyFrom(ret, shape)); + return ret; + } + + template + Tensor AsTensor(const std::vector& vals, + const std::vector& input_dims, DataType tf_type) { + const auto& conv_vals = CastVector(vals); + return AsTensor(conv_vals, input_dims, tf_type); + } + + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value = T()) { + std::vector values(data_size, value); + return AsTensor(values); + } + + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value, DataType tf_type) { + std::vector values(data_size, value); + return AsTensor(values, {data_size}, tf_type); + } + void CheckDataTypeMatches(const DataVec& datas) { + if (VLOG_IS_ON(2)) { + int nbBindings = engine_->getNbBindings(); + VLOG(2) << "Number of engine bindings: " << nbBindings; + for (int i = 0; i < nbBindings; i++) { + VLOG(2) << "Binding " << i << " name: " << engine_->getBindingName(i); + } + } for (const auto& data : datas) { + VLOG(2) << "Checking if data type matches for tensor " << data.name; const int input_index = engine_->getBindingIndex(data.name.c_str()); ASSERT_NE(-1, input_index); const nvinfer1::DataType trt_dtype = engine_->getBindingDataType(input_index); - const DataType tf_dtype = TrtDataTypeToTf(trt_dtype); - ASSERT_EQ(data.tensor.dtype(), tf_dtype) + DataType tf_type; + TF_ASSERT_OK(TrtTypeToTfType(trt_dtype, &tf_type)); + ASSERT_EQ(data.tensor.dtype(), tf_type) << DataTypeString(data.tensor.dtype()) << " vs. " - << DataTypeString(tf_dtype); + << DataTypeString(tf_type); } } - // TODO(laigd): test fp16 and int8 support for more converters. - void BuildAndRun(const DataVec& input_data, DataVec* output_data, - TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32, - const int batch_size = 1) { + Status BuildAndRun(const DataVec& input_data, DataVec* output_data, + const int batch_size = 1) { // Mark the output tensor as TRT engine output. 
std::vector output_info; for (const auto& data : *output_data) { - output_info.push_back( - {data.name, data.name, TfDataTypeToTrt(data.tensor.dtype())}); - } - TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info)); - - ASSERT_EQ(nullptr, engine_.get()); - builder_->setMaxBatchSize(batch_size); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - if (precision_mode == TrtPrecisionMode::FP16) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (precision_mode == TrtPrecisionMode::INT8) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - builder_config_->setInt8Calibrator(nullptr); + nvinfer1::DataType trt_type; + TF_RETURN_IF_ERROR(TfTypeToTrtType(data.tensor.dtype(), &trt_type)); + output_info.push_back({data.name, data.name, trt_type}); } - engine_.reset( - builder_->buildEngineWithConfig(*converter_->network(), *builder_config_)); -#else + TF_RETURN_IF_ERROR(converter_->RenameAndMarkOutputTensors(output_info)); + // Build the TRT engine. - if (precision_mode == TrtPrecisionMode::FP16) { - builder_->setFp16Mode(true); - } else if (precision_mode == TrtPrecisionMode::INT8) { - // Setting FP16 mode as well allows TRT to also consider FP16 kernels and - // use them in situations where they are faster than INT8 or where INT8 is - // not supported for a given layer. - builder_->setFp16Mode(true); - builder_->setInt8Mode(true); + if (engine_.get() != nullptr) { + return errors::Internal("Engine already exists"); } - engine_.reset(builder_->buildCudaEngine(*converter_->network())); -#endif + TrtShapeOptimizationProfile profiles; + if (!converter_->use_implicit_batch()) { + std::vector input_mask(input_data.size()); + for (int i = 0; i < input_data.size(); i++) { + input_mask[i] = (input_data[i].tensor.dtype() != DataType::DT_RESOURCE); + } + profiles.SetInputMask(input_mask); + profiles.SetShapeTensorMask(converter_->network()); + TF_RETURN_IF_ERROR(profiles.CollectShapeValues(input_data)); + // Create a single optimization profile for explicit batch mode + std::vector input_shapes; + TF_RETURN_IF_ERROR(GetShapeFromDataVec(input_data, &input_shapes)); + profiles.AddShape(input_shapes); + std::vector input_partial_shapes; + TF_RETURN_IF_ERROR( + GetNetworkInputShapes(converter_->network(), &input_partial_shapes)); + profiles.InitProfiles(input_partial_shapes, ProfileStrategy::kRange); + } + TF_RETURN_IF_ERROR( + converter_->BuildCudaEngine(&engine_, + /*max_batch_size=*/batch_size, + /*max_workspace_size_bytes=*/1 << 26, + /*allocator=*/nullptr, + /*calibrator=*/nullptr, + /*profiles=*/&profiles)); CHECK_NOTNULL(engine_.get()); CheckDataTypeMatches(input_data); CheckDataTypeMatches(*output_data); - // Execute the TRT engine. 
const int num_bindings = input_data.size() + output_data->size(); std::vector buffers(num_bindings); - for (const auto& data : input_data) { - const int input_index = engine_->getBindingIndex(data.name.c_str()); - ASSERT_NE(-1, input_index); - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], data.TotalBytes())); - ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], data.Buffer(), - data.TotalBytes(), cudaMemcpyHostToDevice, - stream_)); + if (engine_->getNbBindings() != num_bindings) { + return errors::Internal("Number of bindings do not match"); } - struct SizeAndIndex { - SizeAndIndex(int in_size, int in_index) - : size(in_size), index(in_index) {} - int size; - int index; - }; - std::vector output_infos; - for (const auto& data : *output_data) { - const int output_index = engine_->getBindingIndex(data.name.c_str()); - ASSERT_NE(-1, output_index); - output_infos.emplace_back(data.TotalBytes(), output_index); - ASSERT_EQ(0, cudaMalloc(&buffers[output_index], data.TotalBytes())); - } - - ASSERT_EQ(engine_->getNbBindings(), num_bindings); + // Since we have only 1 optimization profile (which is enabled by default) + // it is fine to create execution context directly, instead of calling + // profiles.CreateExecutionContexts() TrtUniquePtrType execution_context( engine_->createExecutionContext()); - execution_context->enqueue(batch_size, buffers.data(), stream_, nullptr); - for (int i = 0; i < output_infos.size(); ++i) { - const auto& output_info = output_infos[i]; - ASSERT_EQ(0, cudaMemcpyAsync(output_data->at(i).Buffer(), - buffers[output_info.index], output_info.size, - cudaMemcpyDeviceToHost, stream_)); - } + // Prepare input bindings. + TF_RETURN_IF_ERROR( + SetTrtEngineInputs(engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, + profiles, nullptr, &input_data)); + // Prepare output bindings. + TF_RETURN_IF_ERROR(SetTrtEngineOutputs( + engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, nullptr, output_data)); + // Execute the TRT engine. + TF_RETURN_IF_ERROR(TrtEnqueue(execution_context.get(), buffers, stream_, + converter_->use_implicit_batch(), + batch_size)); cudaStreamSynchronize(stream_); + return Status::OK(); + } + + // Adds ITensor for both validation and conversion, assuming explicit batch + // dimension is included in dims (ie for an NCHW tensor dims = {N, C, H, W}). + void AddTestTensorWithTFDims( + const string& name, const std::vector& dims, + nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT, + Status add_input_status = Status::OK()) { + DataType tf_type; + TF_ASSERT_OK(TrtTypeToTfType(trt_type, &tf_type)); + ops::Placeholder::Attrs attrs; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); + + auto input = ops::Placeholder(scope_.WithOpName(name), tf_type, attrs); + node_inputs_[name] = input.output; + + // Add a real ITensor for conversion conditionally. - for (int i = 0; i < num_bindings; ++i) { - ASSERT_EQ(0, cudaFree(buffers[i])); + auto dims_adap = + DimsAdapter::Create(attrs.shape_, converter_->use_implicit_batch()); + if (converter_->use_implicit_batch() && !dims_adap.ok()) { + ASSERT_EQ(add_input_status, dims_adap.status()); + return; + } else { + TF_EXPECT_OK(dims_adap.status()); + } + if (!converter_->use_implicit_batch() || dims_adap->IsStatic()) { + int batch_size = dims.size() > 0 ? 
dims[0] : 0; + Status status = converter_->AddInputTensor( + name, trt_type, dims_adap->AsTrtDims(), batch_size); + ASSERT_EQ(add_input_status, status); } } - bool HasStaticShape(const nvinfer1::Dims& dims) const { - if (dims.nbDims < 0) return false; - for (int i = 0; i < dims.nbDims; ++i) { - if (dims.d[i] < 0) return false; - } - return true; + Status AddTensorOrWeights(const string& name, TRT_TensorOrWeights input) { + return converter_->AddTensorOrWeights(name, input); } - // Add ITensor for both validation and conversion. + // Adds ITensor for both validation and conversion. The difference compared to + // AddTestTensorWithTFDims is in the meaning of the dims parameter. To define + // a tensor with NCHW shape, here we set dims = {C,H,W} and batch_size = N. + // TODO(tfeher) remove this function once all test are updated to use the + // other version of AddTestTensor (defined by + // ParameterizedOpConverterTestBase). void AddTestTensor( const string& name, const std::vector& dims, int batch_size = 1, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { - DataType tf_dtype = TrtDataTypeToTf(trt_dtype); - ops::Placeholder::Attrs attrs; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); - attrs.shape_.InsertDim(0, batch_size); - auto input = ops::Placeholder(scope_.WithOpName(name), tf_dtype, attrs); - node_inputs_[name] = input.output; - - // Add a real ITensor for conversion conditionally. - const nvinfer1::Dims trt_dims = GetTestDims(dims); - if (HasStaticShape(trt_dims)) { - TF_EXPECT_OK( - converter_->AddInputTensor(name, trt_dtype, trt_dims, batch_size)); + DimsAdapter adap(dims); + std::vector dims_vec; + TF_CHECK_OK(adap.Prepend(batch_size).Vector(&dims_vec)); + AddTestTensorWithTFDims(name, dims_vec, trt_dtype); + if (adap.IsStatic()) { ASSERT_EQ(batch_size, converter_->batch_size_); } } - // Add weights for both validation and conversion. - template + // Adds weights for both validation and conversion. The type of the weight is + // determined by tf_type. The initial value vector (values) can have any + // type (T) that can be statically casted to tf_type. + template void AddTestWeights(const string& name, const std::vector& dims, - const std::vector& values) { + const std::vector& values_inp, DataType tf_type, + bool fix_values = true) { + const DimsAdapter dims_adap(dims); + const int64_t num_elements = dims_adap.Volume(); + + std::vector values(values_inp); + if (num_elements != values.size()) { + if (fix_values) { + AdjustVectorByDims(values, num_elements, name, "AddTestWeights"); + } else { + FAIL() << "Unable to create test weights: " + << (num_elements > values.size() ? "not enough" : "to many") + << " values specified: " << values.size() << " vs. " + << num_elements << " defined by dims"; + } + } // Add weights for validation. - TensorShape shape; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &shape)); - Tensor t = test::AsTensor(values, shape); + Tensor t = AsTensor(values, dims, tf_type); node_inputs_[name] = ops::Const(scope_.WithOpName(name), t); // Add weights for conversion. 
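+    // The TF dtype is first mapped to the corresponding TRT type, then the
+    // values are copied into the converter's weight store via transformWeights.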
- const nvinfer1::DataType dtype = TfDataTypeToTrt(DataTypeToEnum::v()); - const nvinfer1::Dims trt_dims = GetTestDims(dims); - const int64_t num_elements = TrtWeightDimsNumElements(trt_dims); + nvinfer1::DataType dtype; + TF_ASSERT_OK(TfTypeToTrtType(tf_type, &dtype)); QCHECK_EQ(num_elements, values.size()) << num_elements << " vs " << values.size(); TRT_ShapedWeights weights(dtype); if (num_elements) { - weights = converter_->weight_store_.GetTempWeights(dtype, trt_dims); - QCHECK_EQ(weights.size_bytes(), sizeof(T) * values.size()) - << weights.size_bytes() << " vs " << sizeof(T) * values.size(); - memcpy(weights.GetValues(), values.data(), weights.size_bytes()); + weights = + converter_->weight_store_.GetTempWeights(dtype, dims_adap.AsTrtDims()) + .value(); + + if (tf_type == DT_FLOAT) { + transformWeights(values, weights); + } else if (tf_type == DT_HALF) { + transformWeights(values, weights); + } else if (tf_type == DT_INT32) { + transformWeights(values, weights); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + } else if (tf_type == DT_BOOL) { + transformWeights(values, weights); +#endif + } else { + LOG(FATAL) << "Cannot create tensor with type " + << DataTypeString(tf_type); + } } TF_EXPECT_OK( converter_->AddTensorOrWeights(name, TRT_TensorOrWeights{weights})); } + // Adds test weight without specifying tf_type arg. In this case the initial + // value type (T) will determine the type of the weights. + template + void AddTestWeights(const string& name, const std::vector& dims, + const std::vector& value, bool fix_values = true) { + AddTestWeights(name, dims, value, DataTypeToEnum::value, fix_values); + } + // Test validation in validation-only mode. - void RunValidation(const Node* node, error::Code expected_code = error::OK, - const char* expected_msg_substr = nullptr) { + Status RunValidation(const Node* node) { grappler::GrapplerItem item; TF_EXPECT_OK(scope_.ToGraphDef(&item.graph)); grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - TrtNodeValidator validator(graph_properties, precision_mode_to_test_, - /*use_calibration=*/false); - ExpectStatus(validator.IsTensorRTCandidate(node), expected_code, - expected_msg_substr); - } - - void RunConversion(const Node* node, error::Code expected_code = error::OK, - const char* expected_msg_substr = nullptr) { - ExpectStatus(converter_->ConvertNode(node->def()), expected_code, - expected_msg_substr); + TrtNodeValidator validator( + graph_properties, converter_->precision_mode(), + /*use_calibration=*/false, + /*use_implicit_batch=*/converter_->use_implicit_batch(), + /*use_explicit_precision=*/false); + return validator.IsTensorRTCandidate(node); + } + + void RunConversion(const Node* node, + absl::StatusCode expected_code = absl::StatusCode::kOk, + absl::string_view expected_msg_substr = "") { + EXPECT_THAT(converter_->ConvertNode(node->def()), + StatusIs(expected_code, HasSubstr(expected_msg_substr))); + if (expected_code == absl::StatusCode::kOk) { + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); + } } // Helper method to run both validation and conversion, when the expected // output are same. 
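+  // The expected status code / message substring is checked against the
+  // validation result and, if validation succeeds, against the conversion
+  // result as well.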
- void RunValidationAndConversion(const NodeDef& node_def, - error::Code expected_code = error::OK, - const char* expected_msg_substr = nullptr, - bool should_run_conversion = true) { + void RunValidationAndConversion( + const NodeDef& node_def, + absl::StatusCode expected_code = absl::StatusCode::kOk, + absl::string_view expected_msg_substr = "", + bool should_run_conversion = true) { // Add the node to the graph. // TODO(laigd): we should accept a function that adds the node using // `scope_`, so individual test case can reuse the scope object and we don't @@ -1422,13 +1486,51 @@ class OpConverterTest : public ::testing::Test { graph->AddEdge(input.node(), input.index(), node, i); } - RunValidation(node, expected_code, expected_msg_substr); - if (should_run_conversion) { + status = RunValidation(node); + if (should_run_conversion && status.ok()) { RunConversion(node, expected_code, expected_msg_substr); + } else { + EXPECT_THAT(status, + StatusIs(expected_code, HasSubstr(expected_msg_substr))); + } + } + + // Helper method to run both validation and conversion, and check the output + // shapes. + void RunValidationAndConversion( + const NodeDef& node_def, const Status& status, + const std::string& output_name, + const std::vector>& exp_out_dims) { + RunValidationAndConversion(node_def, + static_cast(status.code()), + status.message(), true); + + if (status.ok()) { + // TODO(tfeher): Enable this check in explicit_batch_mode. + // In dynamic shape mode the output dims cannot be tested here. In that + // case we need to wait for the concrate input shapes to be defined (by + // setBindingDimensions before enqueue) before we can check the output + // dims. + if (converter_->use_implicit_batch()) { + for (int i = 0; i < exp_out_dims.size(); i++) { + TRT_TensorOrWeights output; + string name = i == 0 ? output_name : StrCat(output_name, ":", i); + TF_EXPECT_OK(GetTensorOrWeights(name.c_str(), &output)); + ASSERT_TRUE(output.is_tensor()); + if (!exp_out_dims[i].empty()) { + // Removing batch dim. + auto out_dims = std::vector(exp_out_dims[i].begin() + 1, + exp_out_dims[i].end()); + VLOG(2) << "Testing output shape for tensor " << name; + EXPECT_THAT(output.tensor()->getDimensions(), + DimsAreArray(out_dims)); + } + } + } } } - // Expose quantization_ranges_proxy for tests + // Expose quantization_ranges_ for tests std::unordered_map& quantization_ranges_proxy() { return converter_->quantization_ranges_proxy_; } @@ -1438,77 +1540,869 @@ class OpConverterTest : public ::testing::Test { return converter_->quantization_ranges_; } - void PropagateQuantizationRanges() { - converter_->PropagateQuantizationRanges(); + protected: + template + void AdjustVectorByDims(std::vector& values, size_t num_elements, + const string& name, const char* callingFunc) { + const auto old_size = values.size(); + if (num_elements > old_size) { + // Expending vector with 0's. + const std::vector zeros(num_elements - old_size, 0); + values.reserve(num_elements); + values.insert(values.end(), zeros.begin(), zeros.end()); + VLOG(2) << "In function " << callingFunc << " the vector '" << name + << "' was extended by " << num_elements - old_size << " zeros"; + } else { + // Removing unnecessary elements. + values.resize(num_elements); + VLOG(2) << "Only first " << num_elements << " out of " << old_size + << " elements of the vector '" << name + << "' will be used in function" << callingFunc; + } } + + public: std::unique_ptr converter_; protected: - // TODO(laigd): parameterize the test and make the precision mode a parameter. 
- TrtPrecisionMode precision_mode_to_test_ = TrtPrecisionMode::FP32; + Logger& logger_ = *Logger::GetLogger(); private: - Logger& logger_ = *Logger::GetLogger(); - TrtUniquePtrType builder_; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config_; -#endif - TrtUniquePtrType network_; TrtUniquePtrType engine_; cudaStream_t stream_; - // Used to create placeholders with shape and data type information. The - // created placeholders will be used as inputs to the node to be verified, - // thus we need the shape and data type information to get a non-empty - // GraphProperties. + std::unique_ptr tensor_buffer_allocator_; + + public: + // The scope that contains the graph being converted. Because + // tensor_buffer_allocator_ provides the storage for tensor contents that are + // represented as attributes for graph nodes within scope_, + // tensor_buffer_allocator_ needs to be available when destructing scope_. + // Therefore, scope_ comes after tensor_buffer_allocator_ in the class member + // field list. Scope scope_; + + protected: std::unordered_map node_inputs_; }; -template -void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { - out->Clear(); - if (tensor.NumElements() == 0) return; - - // TensorProto does not need to have all the elements present and can truncate - // trailing elements with the same value for compressed representation. Such - // elements are derived based on the tensor shape. - const auto flat = tensor.flat(); - int64 last_index = 0; - for (int64 i = 0; i < tensor.NumElements(); ++i) { - if (flat(i) != flat(last_index)) { - last_index = i; - } +// Extends the OpConverterTest for variable converters which require a properly +// setup context. +class VariableOpConverterTest : public OpConverterTest { + public: + void Reset(TrtPrecisionMode precision_mode_to_test = TrtPrecisionMode::FP32, + TrtTestMode trt_mode = TrtTestMode::kImplicitBatch) { + OpConverterTest::Reset(precision_mode_to_test, trt_mode, context_.get()); } - int num_out_elements = last_index + 1; - out->Reserve(num_out_elements); - out->AddNAlreadyReserved(num_out_elements); - const T* src = flat.data(); - T* dst = out->mutable_data(); - std::copy(src, src + num_out_elements, dst); -} + void CreateContext(const NodeDef& node_def, OpKernel** kernel, + OpKernelContext** context) { + std::unique_ptr device_( + DeviceFactory::NewDevice("GPU", {}, "/job:a/replica:0/task:0")); + Device* device_ptr = device_.get(); -template -void TestConvertConst(OpConverterTest* test) { - NodeDef node_def; - node_def.set_name("my_const"); - node_def.set_op("Const"); + device_mgr_ = std::make_unique(std::move(device_)); - auto reset_and_test = [&node_def, test]( - const Tensor& tensor, const bool as_tensor_content, - const std::vector& expected_dims, - const std::vector& expected_value) { - test->Reset(); + managed_allocator_ = std::make_unique(); + Allocator* allocator = managed_allocator_.get(); + step_container_ = + std::make_unique(0, [](const string&) {}); + slice_reader_cache_wrapper_ = + std::make_unique(); - TensorProto* tensor_attr = - (*node_def.mutable_attr())["value"].mutable_tensor(); - tensor_attr->Clear(); + flib_def_ = std::make_unique( + OpRegistry::Global(), FunctionDefLibrary()); - if (as_tensor_content) { - tensor.AsProtoTensorContent(tensor_attr); - } else { - tensor.shape().AsProto(tensor_attr->mutable_tensor_shape()); - tensor_attr->set_dtype(tensor.dtype()); + thread_pool_ = + std::make_unique(Env::Default(), "default", + /*num_threads=*/1); + pflr_ = std::make_unique( + 
device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + thread_pool_.get()); + + FunctionLibraryRuntime* flib = pflr_->GetFLR(device_ptr->name()); + ResourceMgr* resource_mgr = device_ptr->resource_manager(); + + TF_CHECK_OK(NodeProperties::CreateFromNodeDef( + node_def, OpRegistry::Global(), &props_)); + + OpKernel* kernel_ptr = nullptr; + TF_CHECK_OK(CreateOpKernel(DEVICE_GPU, device_ptr, allocator, flib, + resource_mgr, props_, TF_GRAPH_DEF_VERSION, + &kernel_ptr)); + op_kernel_ = std::unique_ptr(kernel_ptr); + + auto* dev_info = device_ptr->tensorflow_accelerator_device_info(); + CHECK_NOTNULL(dev_info); + DeviceContext* device_context = dev_info->default_context; + + // Note: this setup is not exhaustive. + params_.device = device_ptr; + params_.op_kernel = op_kernel_.get(); + params_.resource_manager = resource_mgr; + params_.frame_iter = FrameAndIter(0, 0); + params_.inputs = inputs_; + params_.step_container = step_container_.get(); + params_.function_library = flib; + params_.slice_reader_cache = slice_reader_cache_wrapper_.get(); + params_.op_device_context = device_context; + + context_ = std::make_unique(¶ms_); + + // Outputs. + *kernel = op_kernel_.get(); + *context = context_.get(); + } + + // Adds resource for resource variable op converters. + void AddTestResource(const string& name, const ResourceHandle& resource) { + // Add resource for validation. + node_inputs_[name] = + ops::Placeholder(scope_.WithOpName("my_handle"), DT_RESOURCE); + + // Add resource for conversion. + TF_EXPECT_OK(AddTensorOrWeights(name, TRT_TensorOrWeights{resource})); + } + + private: + // The following pointers manage the kernel context. + std::unique_ptr device_mgr_; + std::unique_ptr managed_allocator_; + std::unique_ptr step_container_; + std::unique_ptr + slice_reader_cache_wrapper_; + std::unique_ptr flib_def_; + std::unique_ptr thread_pool_; + std::unique_ptr pflr_; + OpKernelContext::Params params_; + std::unique_ptr op_kernel_; + std::unique_ptr context_; + std::shared_ptr props_; + absl::InlinedVector inputs_; +}; + +// General test parameters to be used with ops that take a single input tensor. +struct TestParamBase { + // Concrete input dimensions for the test (including the batch dim) + std::vector input_dims; + + // Dimensions to define an input with PartialTensorShape. This can be used to + // define networks with dynamic input shape. It can be left empty, in that + // case AddTestTensor sets partial shapes that are appropriate to TrtTestMode. + std::vector partial_input_dims; + + // Concrete (static) output dimensions, including batch size as first dim + std::vector expected_output_dims; + + // Parameter vector, has converter specific meaning. + std::vector param; + + // Expected status of conversion (with concrete error message) + Status status; + + // Expected status of BuildAndRun + Status runtime_status; +}; + +std::ostream& operator<<(std::ostream& os, const TestParamBase& p) { + os << "input_dims" << PrintToString(p.input_dims); + if (!p.partial_input_dims.empty()) { + os << ", partial_input_dims" << PrintToString(p.partial_input_dims); + } + if (!p.expected_output_dims.empty()) { + os << ", exp_out_dims" << PrintToString(p.expected_output_dims); + } + if (!p.param.empty()) { + os << ", param" << PrintToString(p.param); + } + os << ", " << p.status; + return os; +} + +// Printing vector with the numbers of type T which defines tensor or shape. 
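+// For example (illustrative), get_debug_string_for_vector({1, 2, 3},
+// "Using partial_shape for", "input") yields
+// "Using partial_shape for 'input': Dims(nbDims=3, d=1,2,3)".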
+template +const std::string get_debug_string_for_vector(const std::vector& vector, + absl::string_view pComment, + absl::string_view name, + absl::string_view type = "") { + const std::string t1 = absl::StrCat(pComment, " '", name, "': Dims(nbDims="); + const std::string t2 = absl::StrJoin(vector, ","); + const std::string t3 = type != "" ? absl::StrCat(") of type ", type) : ")"; + std::stringstream stream; + stream << t1 << vector.size() << ", d=" << t2 << t3; + return stream.str(); +} + +// Parameterized version of OpConverterTest. We have the following parameters: +// 1. TrtTestMode: implicit batch, explicit batch, dynamic shape modes +// 2. DataType of the input TF tensors: DT_FLOAT, DT_HALF, DT_INT32 +// 3. TrtPrecisionMode argument for the Converter: FP32, FP16, INT8 +// We will introduce subclasses that will be instantiated using different +// combinations of the DataType and TrtPrecisionMode parameters. +class ParameterizedOpConverterTestBase + : public OpConverterTest, + public ::testing::WithParamInterface< + std::tuple> { + public: + ParameterizedOpConverterTestBase() + : trt_mode_(std::get<0>(GetParam())), + tf_type_(std::get<1>(GetParam())), + converter_precision_(std::get<2>(GetParam())) { + LOG(INFO) << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"; + LOG(INFO) << "tf_type_: " << DebugString(tf_type_); + LOG(INFO) << "trt_mode_: " << DebugString(trt_mode_); + LOG(INFO) << "converter_precision_: " << DebugString(converter_precision_); + LOG(INFO) << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"; + } + + void Reset() { + OpConverterTest::Reset(converter_precision_, trt_mode_); + input_data_.clear(); + } + + void Reset(TrtPrecisionMode precision) { + OpConverterTest::Reset(precision, trt_mode_); + input_data_.clear(); + } + + // Getters of protected attributes + DataType get_tf_type() { return tf_type_; } + TrtTestMode get_trt_mode() { return trt_mode_; } + TrtPrecisionMode get_converter_precision() { return converter_precision_; } + + // Adds an input ITensor for TRT network. Also creates the corresponding TF + // tensor, and stores it in the list of inputs (input_data_). + // + // The TF tensor is always created with concrete static input shape given by + // dims. The ITensor can have static or dynamic shape based on the trt_mode + // attribute. The ITensor shape is set automatically according to the trt_mode + // parameter, unless the user overrides it with an explicit + // partial_input_shape_dims argument. + // + // Parameters: + // - name of the input node + // - dims actual dimensions of the tensor that we will use during the test + // (including explicit batch dim) + // - values initial values for the TF tensor + // - dtype data type of the tensor + // - partial_input_shape dimensions which can include unknown shapes. This can + // be empty, in that case the partial_input_shape will be set automatically + // depending on the trt_mode argument. (This argument also includes explicit + // batch dim). + // - add_input_status adding ITensor to the network can fail in implicit batch + // mode if the batch size is inconsistent. Using the add_input_status arg we + // can test such errors. 
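+  // Example (illustrative): AddTestTensor("input", {1, 1, 2, 3}, DT_FLOAT,
+  // {1, 2, 3, 4, 5, 6}) creates a [1,1,2,3] TF tensor for the test run and an
+  // ITensor whose shape is static or fully unknown depending on trt_mode_.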
+ // + template + void AddTestTensor(const string& name, const std::vector& dims, + DataType tf_type, const std::vector& values_inp, + const std::vector& partial_input_shape_dims = {}, + Status add_input_status = Status::OK(), + bool fix_values = true) { + std::vector values(values_inp); + VLOG(2) << "**** AddTestTensor for " << name + << " ***** dims empty() = " << dims.empty() + << " tf_type = " << DebugString(tf_type); + if (!dims.empty()) { + const auto num_elements = std::accumulate( + std::begin(dims), std::end(dims), 1, std::multiplies()); + if (!values.empty() && num_elements != values.size()) { + if (fix_values) { + AdjustVectorByDims(values, num_elements, name, "AddTestTensor"); + } else { + // Note: for conversion only tests, it is valid to have empty values, + // otherwise the number of elements should match. + LOG(WARNING) << "Expected Test Tensor Shape: " << DebugString(dims) + << ", Received Input Tensor: " << DebugString(values); + } + } + } + + std::vector partial_shape; + if (!partial_input_shape_dims.empty()) { + partial_shape = partial_input_shape_dims; + } else { + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // In dynamic shape mode we make all dims unknown. + partial_shape = std::vector(dims.size(), -1); + } else { + // Use static (known) input shapes. + partial_shape = dims; + } + if (VLOG_IS_ON(2)) { + VLOG(2) << get_debug_string_for_vector(partial_shape, + "Using partial_shape for", name); + } + } + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(tf_type, &trt_type)); + AddTestTensorWithTFDims(name, partial_shape, trt_type, add_input_status); + if (!values.empty()) { + if (VLOG_IS_ON(2)) { + VLOG(2) << get_debug_string_for_vector(values, "Adding test tensor for", + name, DataTypeString(tf_type)); + } + InputOutputData data{name, AsTensor(values, dims, tf_type)}; + VLOG(2) << "Added tensor: " << data.name << " with dtype " + << DataTypeString(data.tensor.dtype()); + input_data_.push_back(data); + } + } + + // Adds test tensor (same as above) but with the default tf_type defined by + // the test params. + template + void AddTestTensor(const string& name, const std::vector& dims, + const std::vector& values = {}, + const std::vector& partial_input_shape_dims = {}) { + AddTestTensor(name, dims, tf_type_, values, partial_input_shape_dims); + } + + // Builds and runs the converted network. Checks output tensor shape. Tests + // output values using a matcher. The network can have multiple input and + // output tensors. The inputs are defined by the input_data_ member variable. + void BuildAndRun(const string& name, + const std::vector>& expected_output_dims, + const Status& expected_runtime_status, + const std::vector>>& matcher, + const std::vector& out_tf_types = {}) { + TensorShape shape; + const int n_output = expected_output_dims.size(); + ASSERT_EQ(n_output, matcher.size()); + DataVec output_data; + for (int i = 0; i < n_output; i++) { + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + string out_name = (i == 0) ? name : StrCat(name, ":", i); + DataType out_tf_type = + out_tf_types.size() > i ? out_tf_types[i] : tf_type_; + InputOutputData data{ + out_name, ConstructTensor(shape.num_elements(), 0, out_tf_type)}; + output_data.push_back(data); + } + const int batch_size = + input_data_.empty() || + TensorShapeUtils::IsScalar(input_data_[0].tensor.shape()) + ? 
1 + : input_data_[0].tensor.shape().dim_size(0); + Status stat = + OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); + ASSERT_EQ(expected_runtime_status.ok(), stat.ok()) + << "expected status: " << expected_runtime_status + << ", actual status: " << stat; + if (expected_runtime_status.ok() && stat.ok()) { + for (int i = 0; i < n_output; i++) { + // Check the shape of the actual output tensors + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + EXPECT_TRUE(output_data[i].tensor.shape() == shape) + << "Expected shape: " << shape.DebugString() << ", actual shape: " + << output_data[i].tensor.shape().DebugString(); + EXPECT_THAT(GetDataAsFloat(output_data[i]), matcher[i]); + } + } + } + + // Runs validation and conversion. If conversion is successfull then builds + // the TRT network, executes it and checks the output. Handles multiple output + // tensors. + void TestOpConverterMultiOut( + const NodeDef& node_def, + const std::vector>& expected_output_dims, + const Status& expected_conversion_status, + const Status& expected_runtime_status, + const std::vector>>& matcher, + const std::vector& out_tf_type = {}) { + const auto& name = node_def.name(); + RunValidationAndConversion(node_def, expected_conversion_status, name, + expected_output_dims); + if (expected_conversion_status.ok()) { + BuildAndRun(name, expected_output_dims, expected_runtime_status, matcher, + out_tf_type); + } + } + + // Runs validation and conversion. If conversion is successfull then builds + // the TRT network, executes it and checks the output. + void TestOpConverter(const NodeDef& node_def, + const std::vector& expected_output_dims, + const Status& expected_conversion_status, + const Status& expected_runtime_status, + const Matcher>& matcher, + const std::vector& out_tf_types = {}) { + TestOpConverterMultiOut( + node_def, std::vector>({expected_output_dims}), + expected_conversion_status, expected_runtime_status, + std::vector>>({matcher}), out_tf_types); + } + + protected: + const TrtTestMode trt_mode_; + const DataType tf_type_; + const TrtPrecisionMode converter_precision_; + DataVec input_data_; +}; + +template +class OpConverter_UnaryTest : public ParameterizedOpConverterTestBase { + public: + template + void RunTests( + const string& testName, const OperationMap& map, + std::map, T (*)(T)>>& op_map, + const std::vector input_values, const std::string input_name = "input", + float max_abs_error = 0.0001, bool nan_sensitive = true) { + // Prepare test parameters. + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; + + // Get list of ops to test. + std::vector ops_to_test; + for (auto& pair : map) { + ops_to_test.push_back(pair.first); + } + + for (const string& op_name : ops_to_test) { + SCOPED_TRACE(op_name); + if (!op_map.count(op_name)) { + FAIL() << testName << " op test map does not contain op " << op_name; + } + + const DataType tf_type = get_tf_type(); + const NodeDef& node = op_map[op_name].first(tf_type); + runExpectedToFailTest(node, input_name, input_values, op_name); + + Status conv_status = Status::OK(); + if (trt_mode_ == TrtTestMode::kImplicitBatch && + (op_name == "Sign" || op_name == "Round" || + op_name == "LogicalNot")) { + const auto& err = + convert_not_supported_implicit(op_name, node.name(), "Unary"); + conv_status = errors::Unimplemented(err); + } + + Reset(); + const DataType input_tf_type = op_name == "Cast" ? 
DT_HALF : tf_type; + const DataType output_tf_type = op_name == "Cast" ? DT_FLOAT : tf_type; + + AddTestTensor("input", p.input_dims, input_tf_type, input_values); + + std::vector output; + std::transform(input_values.begin(), input_values.end(), + std::back_inserter(output), op_map[op_name].second); + + TestOpConverter(node, p.expected_output_dims, conv_status, Status::OK(), + ArrayFloatNear(output, max_abs_error, nan_sensitive), + {output_tf_type}); + } + } + void runExpectedToFailTest(const NodeDef& node_def, + const std::string& input_name, + const std::vector& input_values, + const std::string& op_name) { + // Input is weights, should fail. + Reset(); + std::string error = + "The input \"" + input_name + "\" for " + op_name + " must be a tensor"; + AddTestWeights("input", {1, 2, 3}, input_values, get_tf_type()); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + error); + + // Input has 0 dimensions, should fail. + Reset(); + std::vector dims{}; + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + dims = {1}; + } + error = "At least 1 dimension is required for UNARY operation '" + op_name + + "'"; + AddTestTensor("input", dims); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + error); + } +}; + +template +class OpConverter_BinaryTest : public ParameterizedOpConverterTestBase { + public: + template + void RunTests( + const OperationMap& map, + std::map, std::vector>>& + op_test_info, + const std::vector>& data) { + const std::vector bool_types{DT_BOOL}, default_types{}; + std::vector logical_ops{"Greater", "Less", "Equal"}; + std::vector combined_ops{"GreaterEqual", "LessEqual"}; + const DataType tf_type = get_tf_type(); + AttrValue dtype; + dtype.set_type(tf_type); + std::map nodes; + for (const auto op_name : combined_ops) { + nodes[op_name] = MakeNodeDef("my_binary", op_name, {"input1", "input2"}, + {{"T", dtype}}); + } + + for (auto& iter : map) { + const string& op_name = iter.first; + if (!op_test_info.count(op_name)) { + FAIL() << "Binary op test map does not contain op " << op_name; + } + const auto comb_op = find_name(op_name, combined_ops); + const auto& node_def = + comb_op ? nodes[op_name] : op_test_info[op_name].first(tf_type); + + for (const bool operand_1_is_tensor : {true, false}) { + for (const bool operand_2_is_tensor : {true, false}) { + SCOPED_TRACE(StrCat(op_name, "_", operand_1_is_tensor ? "T" : "W", + operand_2_is_tensor ? 
"T" : "W")); + Reset(); + if (!operand_1_is_tensor && !operand_2_is_tensor) { + // In that case the only test which should be launched is in + // runExpectedToFailTest + runExpectedToFailTest(op_name, node_def); + continue; + } + + const bool logical_op = comb_op || find_name(op_name, logical_ops); + auto conv_status = Status::OK(); + if (tf_type == DT_BOOL || logical_op) { + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + conv_status = + errors::Unimplemented(convert_not_supported_implicit( + op_name, node_def.name(), "Binary")); + } else if (!logical_op && + (!operand_1_is_tensor || !operand_2_is_tensor)) { + conv_status = errors::InvalidArgument( + "Both inputs of '", op_name, "' are expected to be tensors"); + } + } + + if (operand_1_is_tensor) { + AddTestTensor("input1", {2, 1, 2}, data[0]); + } else { + AddTestWeights("input1", {1, 2}, data[1], tf_type); + } + if (operand_2_is_tensor) { + AddTestTensor("input2", {2, 2, 1}, data[2]); + } else { + AddTestWeights("input2", {2, 1}, data[3], tf_type); + } + + TestOpConverter(node_def, {2, 2, 2}, conv_status, Status::OK(), + ElementsAreArray(op_test_info[op_name].second), + logical_op ? bool_types : default_types); + } + } + } + } + + void runExpectedToFailTest(const std::string& op_name, const NodeDef& node) { + AddTestWeights("input1", {1}, {1}, tf_type_); + AddTestWeights("input2", {1}, {1}, tf_type_); + const string error = + "Constant folding is falled back to TensorFlow, " + "binary op '" + + op_name + "' received both input as constant"; + RunValidationAndConversion(node, absl::StatusCode::kUnimplemented, error); + } +}; + +// Op converter test in FP32 mode. While for debugging purposes it might make +// sense to run over all possible combinations, normally a subset of them +// would be sufficient: +// - All valid options to TrtTestMode (implicit, explicit, dynamic shape) +// - DataType: is the TF data type of the input tensors. This usually only +// influences the data type added by Converter::AddInputTensor. We test the +// valid combinations of input data types in AddAndGetInputs, therefore +// for most of the OpConverterTest its is sufficient to test for DT_FLOAT. +// - TrtPrecisionMode: valid options are FP32, FP16 and INT8. This influences +// how TRT handles the precision inside the TRT network, but should not matter +// for the TF -> TRT conversion. Therefore it should be sufficient to test +// for FP32. +typedef ParameterizedOpConverterTestBase OpConverter_FP32_Test; +// Base class for tests that need to be tested for both FP32 and FP16. 
+typedef ParameterizedOpConverterTestBase OpConverter_FP32_FP16_Test; +// Base class for Binary tests that need to be tested +typedef OpConverter_BinaryTest OpConverter_FP32_FP16_BinaryTest; +typedef OpConverter_BinaryTest OpConverter_BOOL_BinaryTest; +// Base class for tests that need to be tested for FP32, FP16, and INT32 +typedef ParameterizedOpConverterTestBase OpConverter_FP32_FP16_INT32_Test; +// Base class for tests that need to be tested for INT32 +typedef ParameterizedOpConverterTestBase OpConverter_INT32_Test; +// Base class for Unary tests that need to be tested +typedef OpConverter_UnaryTest OpConverter_FP32_UnaryTest; +typedef OpConverter_UnaryTest OpConverter_BOOL_Test; + +// Instantiate parameter combinations to OpConverter__Test +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_FP16_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_FP16_INT32_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF, DT_INT32), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_INT32_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_INT32), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_UnaryTest, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_BOOL_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_BOOL), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_FP16_BinaryTest, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_BOOL_BinaryTest, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_BOOL), + ::testing::Values(TrtPrecisionMode::FP32))); + +template +void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { + out->Clear(); + if (tensor.NumElements() == 0) return; + + // TensorProto does not need to have all the elements present and can truncate + // trailing elements with the same value for compressed representation. Such + // elements are derived based on the tensor shape. 
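+  // For example, a float tensor of shape {2, 3} holding {2, 2, 1, 1, 1, 1}
+  // only needs float_val = {2, 2, 1}; the trailing repeated 1s are implied by
+  // the shape.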
+ const auto flat = tensor.flat(); + int64 last_index = 0; + for (int64 i = 0; i < tensor.NumElements(); ++i) { + if (flat(i) != flat(last_index)) { + last_index = i; + } + } + + int num_out_elements = last_index + 1; + out->Reserve(num_out_elements); + out->AddNAlreadyReserved(num_out_elements); + const T* src = flat.data(); + T* dst = out->mutable_data(); + std::copy(src, src + num_out_elements, dst); +} + +template +void TestConvertVariableV2(VariableOpConverterTest* test) { + struct TestParam { + string container; + string shared_name; + std::vector dims; + float epsilon; + Status conversion_status; + }; + + std::vector test_param = { + {"", "var0", {}, 0.001, Status::OK()}, + {"", "var0", {64}, 0.001, Status::OK()}, + {"", "var0", {8, 16}, 0.001, Status::OK()}, + {"box", "var", {8, 16}, 0.001, Status::OK()}}; + for (auto p : test_param) { + // Create node definition. + NodeDef node_def; + std::vector dims_64(p.dims.begin(), p.dims.end()); + TensorShape shape = TensorShape(absl::Span(dims_64)); + TF_CHECK_OK(NodeDefBuilder("my_var", "VariableV2") + .Attr("dtype", dtype) + .Attr("shape", shape) + .Attr("container", p.container) + .Attr("shared_name", p.shared_name) + .Finalize(&node_def)); + + OpKernel* kernel; + OpKernelContext* context; + test->CreateContext(node_def, &kernel, &context); + + test->Reset(TrtPrecisionMode::FP32, TrtTestMode::kDynamicShape); + + // Set the value of the variable according to p.dims. + int var_size = std::accumulate(p.dims.begin(), p.dims.end(), 1, + std::multiplies()); + std::vector expected_value; + expected_value.reserve(var_size); + for (int i = 0; i < var_size; i++) { + expected_value.push_back((CType)i); + } + + // To set the variable, we get the tensor by executing the VariableV2 op + // rather than creating the resource directly in the manager, because: + // 1) LegacyVar defined in `variable_ops.cc` is not accessible. + // 2) Tensor::set_shape is private, VariableOp is a friend class. + kernel->Compute(context); + Tensor* tensor_ptr = context->mutable_output(0); + CHECK_NOTNULL(tensor_ptr); + // We allocate the tensor in the temporary memory. Note that creating a + // tensor in this scope and sharing the underlying storage by copy would + // lead to double destruction. + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + OP_REQUIRES_OK(context, + context->allocate_temp(dtype, shape, tensor_ptr, attr)); + // The tensor is allocated on GPU. We copy the values from the CPU. 
+ auto tensor_flat = tensor_ptr->flat(); + CHECK_NOTNULL(tensor_flat.data()); + auto ret = cudaMemcpy(tensor_flat.data(), expected_value.data(), + expected_value.size() * sizeof(CType), + cudaMemcpyHostToDevice); + CHECK_EQ(ret, 0); + + test->RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_var", &output)); + EXPECT_THAT(output.weights(), + ShapedWeightsHasDimsAndValues(p.dims, expected_value)); + } +} + +TEST_F(VariableOpConverterTest, ConvertVariableV2) { + TestConvertVariableV2(this); + TestConvertVariableV2(this); +} + +template +void TestConvertReadVariableOp(VariableOpConverterTest* test) { + struct TestParam { + string container; + string name; + std::vector dims; + float epsilon; + Status conversion_status; + }; + + std::vector test_param = { + {"", "var0", {}, 0.001, Status::OK()}, + {"", "var0", {64}, 0.001, Status::OK()}, + {"", "var0", {8, 16}, 0.001, Status::OK()}, + {"box", "var", {8, 16}, 0.001, Status::OK()}}; + for (auto p : test_param) { + // Create node definition. + NodeDefBuilder::NodeOut rvo_input = + NodeDefBuilder::NodeOut("my_handle", 0, DT_RESOURCE); + NodeDef node_def; + std::vector dims_64(p.dims.begin(), p.dims.end()); + TensorShape shape = + TensorShape(gtl::ArraySlice(dims_64)); // non-absl ok + TF_CHECK_OK(NodeDefBuilder("my_var", "ReadVariableOp") + .Attr("dtype", dtype) + .Attr("_shape", shape) + .Input(rvo_input) + .Finalize(&node_def)); + + OpKernel* kernel; + OpKernelContext* context; + test->CreateContext(node_def, &kernel, &context); + + test->Reset(TrtPrecisionMode::FP32, TrtTestMode::kDynamicShape); + + // Set the value of the variable according to p.dims. + int var_size = std::accumulate(p.dims.begin(), p.dims.end(), 1, + std::multiplies()); + std::vector expected_value; + expected_value.reserve(var_size); + for (int i = 0; i < var_size; i++) { + // Set expected_value[i] = (cast)i. + expected_value.push_back((CType)i); + } + + // Create a resource handle. + DtypeAndPartialTensorShape dtype_and_shape; + dtype_and_shape.dtype = dtype; + TF_CHECK_OK(PartialTensorShape::BuildPartialTensorShape( + gtl::ArraySlice(dims_64), // non-absl ok + &dtype_and_shape.shape)); + ResourceHandle handle = MakeResourceHandle( + context, p.container, p.name, + std::vector{dtype_and_shape}); + + // Create input resource with the handle. + test->AddTestResource("my_handle", handle); + + // Create a resource with this handle. + Var* resource = new Var(dtype); + TF_EXPECT_OK(CreateResource(context, handle, resource)); + + // Setup the tensor of the variable. + // We allocate the tensor in the temporary memory. Note that creating a + // tensor in this scope and sharing the underlying storage by copy would + // lead to double destruction. + AllocatorAttributes attr_value; + attr_value.set_gpu_compatible(true); + attr_value.set_nic_compatible(true); + TF_EXPECT_OK( + context->allocate_temp(dtype, shape, resource->tensor(), attr_value)); + // The tensor is allocated on GPU. We copy the values from the CPU. 
+ auto tensor_flat = resource->tensor()->flat(); + CHECK(tensor_flat.data()); + auto ret = cudaMemcpy(tensor_flat.data(), expected_value.data(), + expected_value.size() * sizeof(CType), + cudaMemcpyHostToDevice); + CHECK_EQ(ret, 0); + + test->RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_var", &output)); + EXPECT_THAT(output.weights(), + ShapedWeightsHasDimsAndValues(p.dims, expected_value)); + } +} + +TEST_F(VariableOpConverterTest, ConvertReadVariableOp) { + TestConvertReadVariableOp(this); + TestConvertReadVariableOp(this); +} + +template +void TestConvertConst(OpConverterTest* test) { + NodeDef node_def; + node_def.set_name("my_const"); + node_def.set_op("Const"); + + auto reset_and_test = [&node_def, test]( + const Tensor& tensor, const bool as_tensor_content, + const std::vector& expected_dims, + const std::vector& expected_value) { + test->Reset(); + + TensorProto* tensor_attr = + (*node_def.mutable_attr())["value"].mutable_tensor(); + tensor_attr->Clear(); + + if (as_tensor_content) { + tensor.AsProtoTensorContent(tensor_attr); + } else { + tensor.shape().AsProto(tensor_attr->mutable_tensor_shape()); + tensor_attr->set_dtype(tensor.dtype()); if (tensor.dtype() == DT_FLOAT) { CopyTensorElements(tensor, tensor_attr->mutable_float_val()); @@ -1521,7 +2415,8 @@ void TestConvertConst(OpConverterTest* test) { test->RunValidationAndConversion(node_def); TRT_TensorOrWeights output; TF_EXPECT_OK(test->GetTensorOrWeights("my_const", &output)); - ValidateWeights(output.weights(), expected_dims, expected_value); + EXPECT_THAT(output.weights(), ShapedWeightsHasDimsAndValues( + expected_dims, expected_value)); }; auto& attr = *node_def.mutable_attr(); @@ -1534,17 +2429,20 @@ void TestConvertConst(OpConverterTest* test) { } { Tensor t = test::AsScalar(12); - reset_and_test(t, false, {1}, {12}); - reset_and_test(t, true, {1}, {12}); + std::vector expected_dims{1}; + // Scalars are represented as rank 0 tensors. + expected_dims.clear(); + reset_and_test(t, false, expected_dims, {12}); + reset_and_test(t, true, expected_dims, {12}); } { - Tensor t = test::AsTensor({1, 2}); + Tensor t = test->AsTensor({1, 2}); reset_and_test(t, false, {2}, {1, 2}); reset_and_test(t, true, {2}, {1, 2}); } { Tensor t = - test::AsTensor({1, 2, 3, 4, 5, 6}, TensorShape({2, 3})); + test->AsTensor({1, 2, 3, 4, 5, 6}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6}); reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6}); } @@ -1552,7 +2450,7 @@ void TestConvertConst(OpConverterTest* test) { // Set all tensor elements to the same value. Such tensors are encoded // using a single element list in tensor proto. Tensor t = - test::AsTensor({1, 1, 1, 1, 1, 1}, TensorShape({2, 3})); + test->AsTensor({1, 1, 1, 1, 1, 1}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {1, 1, 1, 1, 1, 1}); reset_and_test(t, true, {2, 3}, {1, 1, 1, 1, 1, 1}); } @@ -1560,7 +2458,7 @@ void TestConvertConst(OpConverterTest* test) { // Set trailing tensor elements to the same value. Such tensors are // encoded by truncating all equal elements except the first one. 
Tensor t = - test::AsTensor({2, 2, 1, 1, 1, 1}, TensorShape({2, 3})); + test->AsTensor({2, 2, 1, 1, 1, 1}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {2, 2, 1, 1, 1, 1}); reset_and_test(t, true, {2, 3}, {2, 2, 1, 1, 1, 1}); } @@ -1570,15 +2468,15 @@ TEST_F(OpConverterTest, ConvertConst) { { Reset(); NodeDef node_def = MakeConstNodeDef("my_const", {}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Unsupported data type double"); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Unsupported tensorflow data type double"); } { Reset(); Tensor tensor = - test::AsTensor({1, std::numeric_limits::max(), 1, 1, 1, - std::numeric_limits::lowest()}, - TensorShape({2, 3})); + AsTensor({1, std::numeric_limits::max(), 1, 1, 1, + std::numeric_limits::lowest()}, + TensorShape({2, 3})); NodeDef node_def; node_def.set_name("my_const"); node_def.set_op("Const"); @@ -1587,7 +2485,7 @@ TEST_F(OpConverterTest, ConvertConst) { (*node_def.mutable_attr())["value"].mutable_tensor(); tensor_attr->Clear(); tensor.AsProtoTensorContent(tensor_attr); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, "outside the range of int32"); } @@ -1602,244 +2500,684 @@ TEST_F(OpConverterTest, ConvertConst) { TestConvertConst(this); } -TEST_F(OpConverterTest, ConvertTranspose) { +template +NodeDef CreateFusedBatchNormOp(DataType tf_type, std::string data_format, + bool is_training, float epsilon) { + Scope s = Scope::NewRootScope(); + auto x = ops::Placeholder(s.WithOpName("x"), tf_type); + auto scale = ops::Placeholder(s.WithOpName("scale"), tf_type); + auto offset = ops::Placeholder(s.WithOpName("offset"), tf_type); + auto mean = ops::Placeholder(s.WithOpName("mean"), tf_type); + auto variance = ops::Placeholder(s.WithOpName("variance"), tf_type); + typename T::Attrs attrs; + attrs.data_format_ = data_format; + attrs.is_training_ = is_training; + if (epsilon > 0) { + attrs.epsilon_ = epsilon; + } else { + EXPECT_GE(epsilon, 0); + } + return T(s.WithOpName("my_batchnorm"), x, scale, offset, mean, variance, + attrs) + .operation.node() + ->def(); +} + +TEST_P(OpConverter_FP32_Test, ConvertFusedBatchNorm) { + using OpFunc = std::function; + std::vector get_node_def_vec{ + CreateFusedBatchNormOp, + CreateFusedBatchNormOp, + CreateFusedBatchNormOp}; + + struct TestParam { + std::string data_format; + int tensor_input_idx; // Index of an input that will be provided as tensor. 
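+    // (inputs other than x and this one are added as weights)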
+ bool is_training; + float epsilon; + Status conversion_status; + bool keep_channel_unknown; + }; + + struct NodeInput { + std::string name; + std::vector dims; + std::vector val; + }; + std::vector node_input_nchw{ + {"x", {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}, + {"scale", {3}, {7, 8, 9}}, + {"offset", {3}, {10, 20, 30}}, + {"mean", {3}, {1, 2, 3}}, + {"variance", {3}, {4, 5, 6}}}; + + std::vector node_input_nhwc{ + {"x", {2, 2, 1, 3}, {1, 3, 5, 2, 4, 6, 7, 9, 11, 8, 10, 12}}, + {"scale", {3}, {7, 8, 9}}, + {"offset", {3}, {10, 20, 30}}, + {"mean", {3}, {1, 2, 3}}, + {"variance", {3}, {4, 5, 6}}}; + + std::vector expected_output_nchw{ + 10.0, 13.495633, 23.574135, 27.148273, 37.342354, 41.013527, + 30.9738, 34.469433, 45.018955, 48.59309, 59.369415, 63.04059}; + + std::vector expected_output_nhwc{ + 10.0, 23.574135, 37.342354, 13.495633, 27.148273, 41.013527, + 30.9738, 45.018955, 59.369415, 34.469433, 48.59309, 63.04059}; + + for (auto get_node_def : get_node_def_vec) { + NodeDef tmp_node_def = get_node_def(tf_type_, "NCHW", true, 0); + std::string op_name = tmp_node_def.op(); + std::vector test_param{ + {"NCHW", 0, true, 0, + errors::Unimplemented( + StrCat(op_name, " only supports is_training=false"))}, + {"NCHW", 1, false, 0, + errors::Unimplemented(StrCat("The input \"scale\" for ", op_name, + " must be a constant"))}, + {"NCHW", 2, false, 0, + errors::Unimplemented(StrCat("The input \"offset\" for ", op_name, + " must be a constant"))}, + {"NCHW", 3, false, 0, + errors::Unimplemented(StrCat("The input \"mean\" for ", op_name, + " must be a constant"))}, + {"NCHW", 4, false, 0, + errors::Unimplemented(StrCat("The input \"variance\" for ", op_name, + " must be a constant"))}, + {"NCHW", 0, false, 0.01}, + {"NHWC", 0, false, 0.01}}; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + test_param.push_back( + {"NCHW", 0, false, 0.01, + errors::InvalidArgument("Channel dimension must be static"), true}); + test_param.push_back( + {"NHWC", 0, false, 0.01, + errors::InvalidArgument("Channel dimension must be static"), true}); + } + for (auto p : test_param) { + Reset(); + NodeDef node_def = + get_node_def(tf_type_, p.data_format, p.is_training, p.epsilon); + std::vector node_input = + p.data_format == "NCHW" ? node_input_nchw : node_input_nhwc; + std::vector expected_output = + p.data_format == "NCHW" ? expected_output_nchw : expected_output_nhwc; + for (int i = 0; i < node_input.size(); i++) { + if (i == 0 || i == p.tensor_input_idx) { + // The first input (x) is always added as a tensor, and it has shape + // NCHW/NHWC. The other inputs are per channel values (1D, size C). + // + // In implicit batch mode, it is not possible to add any of the 1D + // inputs as a tensor: the first dim is always treated as batch dim in + // implicit batch mode, and that has to agree for all tensors. We have + // two input tensors with shapes NCHW and C and in general N != C. + // The converter already picked up N from the fist input, and reports + // an error when we try to add any other tensors with not matching + // first dim. + // + // This restriction does not apply in explicit batch mode: the tensors + // can have different first dim. The converter still expects that only + // the first arg is a tensor. TODO(tfeher) Check if one can relax this + // restriction. + Status expected_status = + (i != 0 && trt_mode_ == TrtTestMode::kImplicitBatch) + ? 
errors::InvalidArgument( + batch_size_error(node_input[i].name, + "Provided batch size does not match " + "converter batch size: 3 vs 2")) + : Status::OK(); + std::vector partial_input_shape; + if (i == 0 && trt_mode_ == TrtTestMode::kDynamicShape && + !p.keep_channel_unknown) { + // keep channel dim static (known) + partial_input_shape.resize(4, -1); + int channel_dim = (p.data_format == "NCHW" ? 1 : 3); + partial_input_shape[channel_dim] = node_input[i].dims[channel_dim]; + } + AddTestTensor(node_input[i].name, node_input[i].dims, tf_type_, + node_input[i].val, partial_input_shape, + expected_status); + + } else { + AddTestWeights(node_input[i].name, node_input[i].dims, + node_input[i].val, tf_type_); + } + } + TestOpConverter(node_def, node_input[0].dims, p.conversion_status, + Status::OK(), ArrayFloatNear(expected_output)); + } + } +} + +TEST_P(OpConverter_FP32_Test, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto transpose = ops::Transpose(s.WithOpName("my_transpose"), input, weights); const NodeDef& node_def = transpose.operation.node()->def(); - { - // Permutation is a tensor, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("weights", {3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"perm\" for Transpose must be a constant, at my_transpose"); - } - { - // Transpose at batch dimension, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {4}, {1, 0, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Transpose at batch dimension is not supported"); - } - { - // Permutation rank doesn't match, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {3}, {0, 1, 2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Rank of perm for transpose does not match with that of the input."); + std::vector test_params = { + // For the first test we leave param empty. This signals to use a + // input as weight which will be invalid + TestParamBase{{3, 1, 2, 1}, + {}, + {}, + {}, + Status(absl::StatusCode::kUnimplemented, + "The input \"perm\" for Transpose must be a " + "constant")}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {0, 1, 2}, + Status(absl::StatusCode::kInvalidArgument, + "Rank of perm for transpose does not match with " + "that of the input.")}, + // Transpose batch dim + TestParamBase{ + {1, 1, 2, 3}, + {}, + {3, 2, 1, 1}, + {3, 2, 1, 0}, + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? 
Status(absl::StatusCode::kUnimplemented, + "Transpose at batch dimension is not supported") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 3, 1, 2}, {0, 3, 1, 2}}, + }; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Dynamic shape tests where some shapes are known + test_params.push_back(TestParamBase{ + {1, 1, 2, 3}, {-1, 1, 2, -1}, {1, 3, 1, 2}, {0, 3, 1, 2}}); + } + std::vector expected_values{1, 4, 2, 5, 3, 6}; + for (auto p : test_params) { + SCOPED_TRACE(p); + Reset(); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + if (p.param.empty()) { + AddTestTensor("weights", {3}); + } else { + AddTestWeights("weights", {static_cast(p.param.size())}, + p.param); + } + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(expected_values)); } - { - // Ok. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {4}, {0, 3, 1, 2}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()); +} - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_transpose", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 4, 2, 5, 3, 6)); +TEST_P(OpConverter_FP32_Test, ConvertTile) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); + auto tile = ops::Tile(s.WithOpName("my_tile"), input, weights); + const NodeDef& node_def = tile.operation.node()->def(); + + struct TileParam { + std::vector input_dims; + std::vector multiplier; + std::vector tensor; + // Concrete (static) output dimensions, including batch size as first dim. + std::vector expected_output_dims; + std::vector expected_results; + int test_ID; + // Expected status of conversion (with concrete error message). + Status status; + }; + + std::vector test_params = { + // Tests to be rejected by ConvertTile::Validate() for any trt_mode_. + TileParam{{1, 2, 3}, // input_dims + {1, -2, 1}, // multiplier + {}, // tensor + {}, // expected_output_dims + {}, // expected_results + 1, // test_ID + Status(absl::StatusCode::kInvalidArgument, + "All replications of the Tile operation in " + "'my_tile' should be positive, got (1, -2, 1).")}, + TileParam{{1, 2, 3}, // input_dims + {1, 2, 1, 3}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {}, // expected_output_dims + {}, // expected_results + 2, // test_ID + Status(absl::StatusCode::kInvalidArgument, + "The length of the replication vector (4) of the " + "Tile operation in 'my_tile' is expected to be equal " + "to the rank of the input vector (3).")}, + // Tests passed ConvertTile::Validate() for at least some trt_mode_. 
+ TileParam{{1, 2}, // input_dims + {1, 3}, // multiplier + {2, 3}, // tensor + {1, 6}, // expected_output_dims + {2, 3, 2, 3, 2, 3}}, // out values + TileParam{{1, 2, 3}, // input_dims + {1, 2, 1}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {1, 4, 3}, // output dims + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5}}, // expected_results + TileParam{{1, 2, 3}, // input_dims + {1, 1, 2}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {1, 2, 6}, // expected_output_dims + {0, 1, 2, 0, 1, 2, 3, 4, 5, 3, 4, 5}}, // expected_results + TileParam{{1, 2, 3}, // input_dims + {1, 2, 2}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {1, 4, 6}, // expected_output_dims + {0, 1, 2, 0, 1, 2, 3, 4, 5, 3, 4, 5, + 0, 1, 2, 0, 1, 2, 3, 4, 5, 3, 4, 5}}, // expected_results + // Tests with non trivial batch size multiplier. + TileParam{{1, 2}, // input_dims + {2, 3}, // multiplier + {2, 3}, // tensor + {2, 6}, // expected_output_dims + {2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}}, // out values + TileParam{{1, 2, 3}, // input_dims + {2, 2, 1}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {2, 4, 3}, // output dims + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5}}, // expected_results + }; + + for (bool multiplier_is_tensor : {true, false}) { + for (bool input_is_tensor : {true, false}) { + for (auto p : test_params) { + std::vector num_mults = {static_cast(p.multiplier.size())}; + std::vector partial_input_dims = {}; + if (multiplier_is_tensor) { + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + p.status = + Status(absl::StatusCode::kInvalidArgument, + "Conversion for Tile is not implemented for multipliers " + "passed as a tensor in implicit batch mode"); + num_mults = {1, static_cast(p.multiplier.size())}; + } else { + if (p.test_ID == 1) { + // Skip this test because in that situation it is impossible + // to do a valid check for negative multipliers. + continue; + } + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + partial_input_dims = num_mults; + p.status = Status::OK(); + } + + if (p.test_ID == 2) { + p.status = Status(absl::StatusCode::kInvalidArgument, + "When replications are defined as a tensor, " + "the number of its elements (4) must be equal " + "to the rank of the input tensor (3)."); + } + } + } else { + if (trt_mode_ == TrtTestMode::kImplicitBatch && p.multiplier[0] > 1) { + p.status = + Status(absl::StatusCode::kUnimplemented, + "The Tile operation along " + "the batch dimension in 'my_tile' is not implemented."); + } + } + + Reset(); + if (input_is_tensor) { + AddTestTensor("input", p.input_dims, p.tensor); + } else { + AddTestWeights("input", p.input_dims, p.tensor, tf_type_); + } + + if (multiplier_is_tensor) { + AddTestTensor("weights", num_mults, DT_INT32, p.multiplier, + partial_input_dims); + } else { + AddTestWeights("weights", num_mults, p.multiplier); + } + + TestOpConverter(node_def, p.expected_output_dims, p.status, + Status::OK(), ElementsAreArray(p.expected_results)); + } + } } } -TEST_F(OpConverterTest, ConvertReshape) { +TEST_P(OpConverter_FP32_Test, ConvertReshape) { // Get the NodeDef for Reshape. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto reshape = ops::Reshape(s.WithOpName("my_reshape"), input, weights); const NodeDef& node_def = reshape.operation.node()->def(); - { - // Shape is a tensor, should fail. 
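+  // A non-constant shape input is only accepted in explicit batch / dynamic
+  // shape mode and requires TRT >= 7.1.3 (see the checks below).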
+ if (trt_mode_ == TrtTestMode::kImplicitBatch) { + // Shape is a tensor, should fail in implicit batch mode. Reset(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 2, 1}); AddTestTensor("weights", {3}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"shape\" for Reshape must be a constant, at my_reshape"); - } - { - // Reshape to scalar, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {0}, {}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Reshape to shape=[] is not supported, at my_reshape"); - } - { - // Reshape tensor with zero rank to empty tensor, should fail. - Reset(); - AddTestTensor("input", {}); - AddTestWeights("weights", {1, 0, 1}, {}); + node_def, absl::StatusCode::kInvalidArgument, + "The input \"shape\" for Reshape must be a constant in implicit batch " + "mode"); + } else if (!IS_TRT_VERSION_GE(7, 1, 3, 0)) { + // Shape is a tensor, should fail before TRT 7.1.3 even in explicit batch / + // dynamic shape mode. + Reset(); + AddTestTensor("input", {3, 2, 1}); + AddTestTensor("weights", {3}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Reshape to shape=[] is not supported, at my_reshape"); - } + node_def, absl::StatusCode::kInvalidArgument, + "Non constant shape input tensor for Reshape requires minimum TRT " + "7.1.3"); + } + + Status reshape_from_scalar_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Internal( + "Failed to convert at least one input to a TRT_TensorOrWeights:" + " Scalar input tensor is not supported since the first " + "dimension is treated as batch dimension by TRT") + : Status::OK(); + Status add_scalar_tensor_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::InvalidArgument( + "removing first dim requires explicit batch dimension") + : Status::OK(); + Status reshape_to_scalar_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented("Reshape to shape=[] is not supported") + : Status::OK(); + Status reshape_batch_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented("Reshape on batch dimension is not supported") + : Status::OK(); struct TestParams { - int batch_size; std::vector tensor_dims; std::vector shape; + std::vector expected_shape; + Status conversion_status; + Status runtime_status; + std::vector shape_prof; // needed concrete values if shape == -1. + Status add_test_tensor_status; }; - // Reshape at batch dimension, should fail. - const int kReshapeBatchDimsCases = 5; - TestParams params[kReshapeBatchDimsCases] = { - TestParams{1, {1, 2, 3}, {3, 1, 1, 2}}, - TestParams{1, {1, 2, -1}, {-1, 1, 1, 2}}, - TestParams{1, {1, 2, 3}, {-1, 1, 1, 2}}, - TestParams{-1, {1, 2, 3}, {1, 1, 1, 2}}, - TestParams{-1, {-1, 2, 3}, {1, 1, 1, 6}}, // TODO(laigd): it should pass. + std::vector params = { + // Reshape scalar to tensor, should fail in implicit batch mode. + TestParams{{}, + {1, 1}, + {}, + reshape_from_scalar_status, + {}, + {}, + add_scalar_tensor_status}, + // Reshape tensor to scalar, should fail in implicit batch mode. + // - In explicit batch mode if shape is set as weight it works. + // - In explicit batch mode && using shape as tensor input it should + // fail. In that case we set the expected conversion status in the + // test loop. + TestParams{{1, 1}, {}, {}, reshape_to_scalar_status}, + // Reshape at batch dimension, should fail in implicit batch mode. 
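+      // (In implicit batch mode TensorRT fixes the batch dimension when the
+      // engine is built, so a reshape such as {1, 1, 2, 3} -> {3, 1, 1, 2},
+      // which changes the batch size, cannot be expressed and is expected to
+      // be rejected with reshape_batch_status.)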
+ TestParams{{1, 1, 2, 3}, {3, 1, 1, 2}, {}, reshape_batch_status}, + TestParams{{2, 1, 2, 3}, {-1, 1, 4}, {3, 1, 4}, reshape_batch_status}, + // Tests that should succeed in every trt_mode. + TestParams{{1, 1, 2, 3}, {-1, 1, 3, 2}, {1, 1, 3, 2}}, + TestParams{{1, 1, 2, 3}, {1, 1, -1}, {1, 1, 6}}, + TestParams{{1, 1, 2, 3}, {1, 1, 3, 2}}, + TestParams{{2, 1, 2, 3}, {2, 1, 3, 2}}, + TestParams{{1, 1, 1}, {1}}, + TestParams{{1}, {1, 1}}, + TestParams{{2, 1, 1}, {2}}, + TestParams{{2}, {2, 1}}, }; - for (int i = 0; i < kReshapeBatchDimsCases; ++i) { - Reset(); - const std::vector& dims = params[i].tensor_dims; - AddTestTensor("input", dims, params[i].batch_size); - AddTestWeights("weights", {4}, params[i].shape); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Reshape on batch dimension is not supported, at my_reshape", - /*should_run_conversion=*/(dims[0] > 0 && dims[1] > 0 && dims[2] > 0)); - } - - // Reshape on non batch dimensions, ok. - const int kReshapeOKCases = 8; - TestParams ok_params[kReshapeOKCases] = { - TestParams{-1, {1, 2, 3}, {-1, 1, 3, 2}}, - TestParams{1, {1, 2, 3}, {-1, 1, 3, 2}}, - TestParams{1, {1, 2, 3}, {1, 1, 3, 2}}, - TestParams{2, {1, 2, 3}, {2, 1, 3, 2}}, - TestParams{1, {1, 1}, {1}}, - TestParams{1, {}, {1, 1}}, - TestParams{2, {1, 1}, {2}}, - TestParams{2, {}, {2, 1}}, + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + // Reshape tensor with zero rank using an empty shape tensor, should fail in + // implicit batch mode. In explicit batch mode this is an identity operation + // and does not add a reshape layer therefore we do not test it. + params.push_back(TestParams{{}, + {}, + {}, + reshape_from_scalar_status, + {}, + {}, + add_scalar_tensor_status}); + } + // Testing the methods for representing the reshape shape for IShuffleLayer: + // as a weight (true) or as a tensor (false). + std::vector shape_input_options(1, true); + + if (trt_mode_ != TrtTestMode::kImplicitBatch && + IS_TRT_VERSION_GE(7, 1, 3, 0)) { + shape_input_options.push_back(false); + } + + for (auto p : params) { + for (auto shape_as_weight : shape_input_options) { + std::ostringstream oss; + oss << "shape " << PrintToString(p.shape); + SCOPED_TRACE(StrCat(oss.str(), shape_as_weight ? " weight" : " tensor")); + if (!shape_as_weight && p.shape.empty()) { + p.conversion_status = errors::Unimplemented( + "Reshape with dynamic input requires 1D input tensor"); + } + Reset(); + const int n_elements = + std::accumulate(p.tensor_dims.begin(), p.tensor_dims.end(), 1, + std::multiplies()); + std::vector input_vec(n_elements); + std::iota(input_vec.begin(), input_vec.end(), 1); + AddTestTensor("input", p.tensor_dims, tf_type_, input_vec, {}, + p.add_test_tensor_status); + if (shape_as_weight) { + AddTestWeights("weights", {static_cast(p.shape.size())}, + p.shape); + } else { + std::vector dims; + std::vector values{p.shape}; + if (!p.shape.empty()) { + dims.push_back(p.shape.size()); + } else { + // If the shape is empty we use a dummy value to ensure that + // AddTestTensor creates the corresponding entry in InputOutputData. + values.push_back(1); + } + AddTestTensor("weights", dims, DT_INT32, values, dims); + } + std::vector expected_shape = + p.expected_shape.empty() ? p.shape : p.expected_shape; + VLOG(2) << "Calling TestOpConverter"; + TestOpConverter(node_def, expected_shape, p.conversion_status, + p.runtime_status, ElementsAreArray(input_vec)); + } + } +} + +TEST_P(OpConverter_FP32_Test, ConvertShape) { + // Get the NodeDef for Shape op. 
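+  // (Overview: Shape should convert to a 1D DT_INT32 tensor holding the input
+  // dimensions. In explicit batch mode a fully known input shape is folded
+  // into a constant layer, which is why some cases below do not feed any
+  // input data at all.)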
+ Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto shape = ops::Shape(s.WithOpName("my_shape"), input); + const NodeDef& node_def = shape.operation.node()->def(); + + Status conversion_status = + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? errors::Unimplemented( + "Shape is only supported for explicit batch mode.") + : Status::OK(); + std::vector test_params = { +// TODO(b/166274212): Enable the test parameter for TensorRT 7.1.3. +#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + TestParamBase{{1, 2, 3}, {}, {3}, {}, conversion_status}, +#endif + // Add input as weight (we use non empty param ({1}) to trigger this). + TestParamBase{{1, 2, 3}, {}, {3}, {1}, conversion_status}, }; - for (int i = 0; i < kReshapeOKCases; ++i) { - const int batch_size = std::max(1, ok_params[i].batch_size); - const auto& shape = ok_params[i].shape; - Reset(); - AddTestTensor("input", ok_params[i].tensor_dims, batch_size); - AddTestWeights("weights", {static_cast(shape.size())}, shape); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output)); - ASSERT_TRUE(output.is_tensor()); - const std::vector expected_output_dims(shape.begin() + 1, shape.end()); - const nvinfer1::Dims actual_output_dims = output.tensor()->getDimensions(); - ExpectTrtDimsEqualsArray(expected_output_dims, actual_output_dims); - - std::vector input_vec(TrtTensorDimsNumElements(actual_output_dims) * - batch_size); - std::iota(input_vec.begin(), input_vec.end(), 1); - const DataVec input_data{{"input", test::AsTensor(input_vec)}}; - DataVec output_data{ - {"my_reshape", ConstructTensor(input_vec.size())}}; - BuildAndRun(input_data, &output_data, TrtPrecisionMode::FP32, batch_size); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(input_vec)); + auto input_is_weight = [](const TestParamBase p) { return !p.param.empty(); }; + for (auto p : test_params) { + SCOPED_TRACE(p); + Reset(); + // The number of elements of the input tensor. We leave it 0 in case we do + // not need to add an input tensor. This happens in explicit batch mode: the + // shape is known at conversion time and therefore the shape is added to the + // network as a constant layer. In this case the single node network that + // we use for the unit test have no actual input tensor when it is converted + // to a TensorRT network. + int n_elements = 0; + if (input_is_weight(p) || trt_mode_ != TrtTestMode::kExplicitBatch) { + // Calculate the number of elements for adding input data. + n_elements = std::accumulate(p.input_dims.begin(), p.input_dims.end(), 1, + std::multiplies()); + } + std::vector input_val(n_elements, 1); + if (!input_is_weight(p)) { + AddTestTensor("input", p.input_dims, input_val); + } else { + AddTestWeights("input", p.input_dims, input_val, tf_type_); + } + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(p.input_dims), + {DT_INT32}); } } -// Helper function for testing MatMul and BatchMatMul -// get_matmul corresponds to the function used to generate the node. It should -// accept (DataType, transpose_a, transpose_b) as parameters. +struct MatMulTestParams { + std::vector shape_a; + std::vector values_a; + bool transpose_a; + std::vector shape_b; + std::vector values_b; + bool transpose_b; + std::vector expected_shape; + std::vector expected_output; +}; + +// Helper function for testing MatMul and BatchMatMul. get_matmul is a function +// used to generate the node. 
It accepts (DataType, transpose_a, transpose_b) as +// parameters. void TestMatMulHelper( - OpConverterTest* test, + ParameterizedOpConverterTestBase* test, const std::function& get_matmul, - const std::string& op_name) { - // HACK: This needs to be done in a better way. - const bool is_batch_matmul = op_name == "BatchMatMul"; + const std::vector& params) { { // Unsupported data type. test->Reset(); NodeDef node_def = get_matmul(DT_INT32, false, false); - test->AddTestTensor("input", {2}, /*batch_size=*/1, - nvinfer1::DataType::kINT32); + test->AddTestTensor("input", {1, 2}, DT_INT32, {}); test->AddTestWeights("weights", {2, 1}, {3, 5}); + const std::vector allowed_types{DT_FLOAT, DT_HALF}; test->RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - StrCat("Data type int32 is not supported for ", op_name, - ", must be one of [float, half], at my_matmul") - .c_str()); - } - // OK. - for (bool transpose_a : {false, true}) { - for (bool transpose_b : {false, true}) { - test->Reset(); - NodeDef node_def = get_matmul(DT_FLOAT, transpose_a, transpose_b); - test->AddTestTensor("input", {2}, /*batch_size=*/1); - test->AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - if (is_batch_matmul) { - test->RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not support batched constants."); - continue; - } else if (transpose_a) { - test->RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions"); - continue; - } - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); - - const DataVec input_data{{"input", test::AsTensor({0, 1})}}; - DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); - if (transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); - } else { - EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(2, 3)); + node_def, absl::StatusCode::kUnimplemented, + convert_not_supported_dtype_msg(allowed_types, DT_INT32, node_def)); + } + + // FC conversion depends on whether the last dim of A is known or not. In + // Dynamic shape mode, we will check whether A is handled correctly if it has + // a partially known input shape (last dim known). + std::vector a_test_partial_shape_values{false}; + if (test->get_trt_mode() == TrtTestMode::kDynamicShape) { + a_test_partial_shape_values.push_back(true); + } + + for (auto p : params) { + for (bool a_is_tensor : {true, false}) { + for (bool b_is_tensor : {true, false}) { + for (bool a_partial_shape : a_test_partial_shape_values) { + if (a_partial_shape && !a_is_tensor) { + // Only tensors can have partial shape. + continue; + } + if (!a_is_tensor && !b_is_tensor) { + // Skip test when both args are weights. We do not convert this + // since const folding eliminates this case. + continue; + } + SCOPED_TRACE(StrCat("A", p.transpose_a ? ".T" : "", " is ", + a_is_tensor ? "tensor" : "weight", ", B", + p.transpose_b ? ".T" : "", " is ", + b_is_tensor ? 
"tensor " : "weight, rank A ", + p.shape_a.size(), ", rank B ", p.shape_b.size())); + test->Reset(); + + NodeDef node_def = + get_matmul(test->get_tf_type(), p.transpose_a, p.transpose_b); + const bool is_batch_matmul = node_def.op() == "BatchMatMul"; + + if (a_is_tensor) { + if (a_partial_shape) { + // Prepare a partial shape for A where only the last dim is known. + std::vector partial_shape(p.shape_a.size(), -1); + int k = p.shape_a.size() - 1; + partial_shape.at(k) = p.shape_a.at(k); + test->AddTestTensor("input", p.shape_a, test->get_tf_type(), + p.values_a, partial_shape); + } else { + test->AddTestTensor("input", p.shape_a, p.values_a); + } + } else { + test->AddTestWeights("input", p.shape_a, p.values_a, + test->get_tf_type()); + } + if (b_is_tensor) { + if (a_is_tensor && p.shape_a[0] != p.shape_b[0] && + test->get_trt_mode() == TrtTestMode::kImplicitBatch) { + VLOG(2) << "Skipping test with inpcompatible batch dimensions"; + continue; + } + test->AddTestTensor("weights", p.shape_b, p.values_b); + } else { + test->AddTestWeights("weights", p.shape_b, p.values_b, + test->get_tf_type()); + } + + Status conversion_status = Status::OK(); + if (test->get_trt_mode() == TrtTestMode::kImplicitBatch) { + // Implicit batch mode has several restriction. We change conversion + // status accordingly. + if (is_batch_matmul) { + if (a_is_tensor && p.shape_a.size() < p.shape_b.size()) { + conversion_status = errors::InvalidArgument( + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims ", + p.shape_a.size(), " vs broadcast #dims ", p.shape_b.size(), + ")"); + } + if (b_is_tensor && p.shape_b.size() < p.shape_a.size()) { + conversion_status = errors::InvalidArgument( + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims ", + p.shape_b.size(), " vs broadcast #dims ", p.shape_a.size(), + ")"); + } + if ((!a_is_tensor || !b_is_tensor) && p.shape_a[0] != 1) { + conversion_status = errors::Unimplemented( + "TensorRT does not support batched constants in implicit " + "batch mode."); + } + } else if ((a_is_tensor && p.shape_a.size() <= 2 && + (p.transpose_a || b_is_tensor)) || + (b_is_tensor && p.shape_b.size() <= 2)) { + conversion_status = errors::InvalidArgument( + "MatMul with 2D tensors requires explicit batch mode, or that" + " tensor A is not transposed and B is a constant tensor."); + } + } + + test->TestOpConverter(node_def, p.expected_shape, conversion_status, + Status::OK(), + ElementsAreArray(p.expected_output)); + if (!conversion_status.ok()) { + VLOG(2) << "Converted with status " << conversion_status; + } + VLOG(2) << "== Finished test iteration =="; + } } } } - // OK, 3D inputs - for (bool transpose_b : {false, true}) { - test->Reset(); - NodeDef node_def = get_matmul(DT_FLOAT, /*transpose_a=*/false, transpose_b); - test->AddTestTensor("input", {2}, /*batch_size=*/1); - test->AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - if (is_batch_matmul) { - test->RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not support batched constants."); - continue; - } - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1})}}; - DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); - if (transpose_b) { - 
EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); - } else { - EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(2, 3)); - } - } } template @@ -1854,7 +3192,39 @@ void CheckAddedLayers(OpConverterTest* test, bool expect_found) { EXPECT_EQ(expect_found, layer_found); } -TEST_F(OpConverterTest, ConvertMatMul) { +std::vector GetMatMulTestParams() { + std::vector params{ + // clang-format off + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, false, // A (shape, val, T?) + {2, 2}, {0, 1, 2, 3}, false, // B (shape, val, T?) + {2, 2}, {2, 3, 6, 11}}, // result (shape, val) + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, false, + {2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {1, 3, 3, 13}}, + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {0, 1, 2, 3}, false, + {2, 2}, {4, 6, 6, 10}}, + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {2, 6, 3, 11}}, + MatMulTestParams{{2, 3}, {0, 1, 2, 3, 4, 5}, false, + {2, 3}, {1, 2, 3, 4, 5, 6}, true, + {2, 2}, {8, 17, 26, 62}}, + MatMulTestParams{{2, 3}, {0, 1, 2, 3, 4, 5}, true, + {2, 3}, {1, 2, 3, 4, 5, 6}, false, + {3, 3}, {12, 15, 18, 17, 22, 27, 22, 29, 36}}, + MatMulTestParams{{3, 2}, {0, 1, 2, 3, 4, 5}, false, + {2, 3}, {1, 2, 3, 4, 5, 6}, false, + {3, 3}, {4, 5, 6, 14, 19, 24, 24, 33, 42}}, + MatMulTestParams{{3, 2}, {0, 1, 2, 3, 4, 5}, true, + {2, 3}, {1, 2, 3, 4, 5, 6}, true, + {2, 2}, {16, 34, 22, 49}}, + // clang-format on + }; + return params; +} + +TEST_P(OpConverter_FP32_Test, ConvertMatMul) { // Get the NodeDef for MatMul. auto get_matmul_nodedef = [](DataType dtype, bool transpose_a, bool transpose_b) -> NodeDef { @@ -1868,68 +3238,10 @@ TEST_F(OpConverterTest, ConvertMatMul) { return matmul.operation.node()->def(); }; - // Additional test cases specific to MatMul - { - // Can only transpose A if it is 2D in TRT - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, true, false); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions."); - } - { - // B must always have 2 non-batch dimensions - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestTensor("weights", {2}, /*batch_size=*/1); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Second input must either be a constant, or contain at least 2 " - "non-batch dimensions."); - } - { - // We can never transpose weights that are not 2D. - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, true, false); - AddTestWeights("input", {1, 1, 2}, {0, 1}); - AddTestTensor("weights", {2, 2}, /*batch_size=*/1); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Cannot currently transpose constant input if it is not 2 dimensional"); - } - { - // Make sure that INT8 mode uses IFullyConnectedLayer when possible. - precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {2, 1, 1}); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion(node_def); - CheckAddedLayers(this, false); - CheckAddedLayers(this, true); - precision_mode_to_test_ = TrtPrecisionMode::FP32; - } - { - // Make sure that INT8 mode doesn't try to use IFullyConnectedLayer when not - // compatible. In this case we can't use FC because weights is a tensor. 
- precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {2, 1, 1}); - AddTestTensor("weights", {2, 2}); - RunValidationAndConversion(node_def); - CheckAddedLayers(this, true); - CheckAddedLayers(this, false); - precision_mode_to_test_ = TrtPrecisionMode::FP32; - } - TestMatMulHelper(this, get_matmul_nodedef, "MatMul"); + TestMatMulHelper(this, get_matmul_nodedef, GetMatMulTestParams()); } -TEST_F(OpConverterTest, ConvertBatchMatMul) { +TEST_P(OpConverter_FP32_Test, ConvertBatchMatMul) { // Get the NodeDef for BatchMatMul. auto get_batch_matmul_nodedef = [](DataType dtype, bool transpose_a, bool transpose_b) -> NodeDef { @@ -1943,304 +3255,229 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { return matmul.operation.node()->def(); }; - { - // Can't broadcast two tensor inputs of different rank. - Reset(); - NodeDef node_def = get_batch_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {1, 2, 2}, /*batch_size=*/2); - AddTestTensor("weights", {2}, /*batch_size=*/2); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Inputs must have the same rank if they are both tensors."); - } - { - // Make sure that INT8 mode doesn't try to use IFullyConnectedLayer when not - // compatible. In this case we can't use FC because transpose_a is true. - precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); - NodeDef node_def = get_batch_matmul_nodedef(DT_FLOAT, true, false); - AddTestTensor("input", {1, 2, 2}); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion(node_def); - CheckAddedLayers(this, true); - CheckAddedLayers(this, false); - precision_mode_to_test_ = TrtPrecisionMode::FP32; - } + // We derive test data from the MatMul test params by adding extra leading + // dimensions. + std::vector params_2d = GetMatMulTestParams(); + std::vector params; + params.reserve(params_2d.size() * 3 + 1); - for (bool transpose_a : {false, true}) { - for (bool transpose_b : {false, true}) { - Reset(); - NodeDef node_def = - get_batch_matmul_nodedef(DT_FLOAT, transpose_a, transpose_b); - AddTestTensor("input", {2, 2}, /*batch_size=*/1); - AddTestWeights("weights", {1, 2, 2}, {1, 2, 3, 4}); - - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1, 2, 3})}}; - DataVec output_data{{"my_matmul", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); - if (!transpose_a && !transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(3, 4, 11, 16)); - } else if (transpose_a && transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(4, 8, 7, 15)); - } else if (transpose_a) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(6, 8, 10, 14)); - } else if (transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(2, 4, 8, 18)); - } - } - } + auto insert_ones = [](std::vector v, int n) { + std::vector ones(n, 1); + ones.insert(ones.end(), v.begin(), v.end()); + return ones; + }; - TestMatMulHelper(this, get_batch_matmul_nodedef, "BatchMatMul"); + // Add a leading 1 dimension to A, B and result. 
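+  // For example, the 2D case A = {2, 2}, B = {2, 2} -> result {2, 2} becomes
+  // A = {1, 2, 2}, B = {1, 2, 2} -> result {1, 2, 2}; the values themselves
+  // are unchanged, only a batch of size 1 is prepended.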
+ std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [](MatMulTestParams p) { + p.shape_a.insert(p.shape_a.begin(), 1); + p.shape_b.insert(p.shape_b.begin(), 1); + p.expected_shape.insert(p.expected_shape.begin(), 1); + return p; + }); + + // Test with N > 1: weights cannot be batched in implicit batch mode. + // clang-format off + params.push_back( + MatMulTestParams{{2, 2, 2}, {0, 1, 2, 3, 0, 1, 2, 3}, false, // A + {2, 2, 2}, {0, 1, 2, 3, 0, 1, 2, 3}, false, // B + {2, 2, 2}, {2, 3, 6, 11, 2, 3, 6, 11}} // result + ); + + params.push_back( + MatMulTestParams{{2, 2, 3}, {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5}, + false, + {2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, true, + {2, 2, 2}, {8, 17, 26, 62, 8, 17, 26, 62}}); + // clang-format on + + // Add two leading 1 dimensions to A, B and result. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a = insert_ones(p.shape_a, 2); + p.shape_b = insert_ones(p.shape_b, 2); + p.expected_shape = insert_ones(p.expected_shape, 2); + return p; + }); + + // Test broadcast: add two leading 1 dimensions to A, but not to B. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a = insert_ones(p.shape_a, 2); + p.expected_shape = insert_ones(p.expected_shape, 2); + return p; + }); + + // Test broadcast: add a leading 1 dimension to A and two leading 1s to B. + // Broadcasting A need a dynamic brodacast which will be incompatible with + // FC layer. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a = insert_ones(p.shape_a, 1); + p.shape_b = insert_ones(p.shape_b, 2); + p.expected_shape = insert_ones(p.expected_shape, 2); + return p; + }); + + // Test with N > 1: since weights cannot be batched in implicit batch mode. + // We tests with batch size 2. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a.insert(p.shape_a.begin(), 2); + p.values_a.reserve(p.values_a.size() * 2); + p.values_a.insert(p.values_a.end(), p.values_a.begin(), + p.values_a.end()); + + p.shape_b.insert(p.shape_b.begin(), 2); + p.values_b.reserve(p.values_b.size() * 2); + p.values_b.insert(p.values_b.end(), p.values_b.begin(), + p.values_b.end()); + + p.expected_shape.insert(p.expected_shape.begin(), 2); + p.expected_output.reserve(p.expected_output.size() * 2); + p.expected_output.insert(p.expected_output.end(), + p.expected_output.begin(), + p.expected_output.end()); + return p; + }); + + // 4D tensor where the second "batch dim" is not 1 + params.push_back(MatMulTestParams{ + {1, 2, 4, 5}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}, + false, // A + {1, 2, 3, 5}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}, + true, // B + {1, 2, 4, 3}, + {40, 90, 140, 115, 290, 465, 190, 490, + 790, 265, 690, 1115, 1990, 2540, 3090, 2440, + 3115, 3790, 2890, 3690, 4490, 3340, 4265, 5190}}); // result + + TestMatMulHelper(this, get_batch_matmul_nodedef, params); } -template -void TestConvertBiasAdd(OpConverterTest* test) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertBiasAdd) { + // Note that kINT32 is not supported by IScaleLayer, so we don't test + // DT_INT32 type here. 
DT_FLOAT and DT_HALF are tested. // Get the NodeDef for BiasAdd. - auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef { + auto get_biasadd_nodedef = [](const string& data_format, + DataType tf_type) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto weights = ops::Placeholder(s.WithOpName("weights"), tf_type); const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format); auto biasadd = ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs); return biasadd.operation.node()->def(); }; - typedef typename EnumToDataType::Type CType; for (const string& data_format : {"NHWC", "NCHW"}) { for (const int trt_input_rank : {1, 2, 3, 4}) { - test->Reset(); - NodeDef node_def = get_biasadd_nodedef(data_format); + Reset(); + NodeDef node_def = get_biasadd_nodedef(data_format, tf_type_); // Add input, dims_array will be like {2, 1, ..., 1, 3} - std::vector dims_array(trt_input_rank, 1); + std::vector dims_array(trt_input_rank + 1, 1); if (trt_input_rank == 1) { - dims_array[0] = (data_format == "NHWC" ? 3 : 2); + dims_array[1] = (data_format == "NHWC" ? 3 : 2); } else { - dims_array[0] = 2; - dims_array[trt_input_rank - 1] = 3; + dims_array[1] = 2; + dims_array[trt_input_rank] = 3; } - test->AddTestTensor("input", dims_array, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); + const int64_t num_input = DimsAdapter(dims_array).Volume(); + ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2), + num_input); + std::vector input_data(num_input, 0); + + AddTestTensor("input", dims_array, input_data); - // Add bias weights. const int channel_size = (data_format == "NHWC" ? 3 : 2); - std::vector bias(channel_size); + std::vector bias(channel_size); for (int i = 0; i < channel_size; ++i) { - bias[i] = CType(i + 1); // bias will be {1, 2, 3, ...} + bias[i] = i + 1; // bias will be {1, 2, 3, ...} } - test->AddTestWeights("weights", {channel_size}, bias); - - // Run the conversion. - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions()); + AddTestWeights("weights", {channel_size}, bias, tf_type_); // Build and run the engine. - const int num_input = TrtTensorDimsNumElements(GetTestDims(dims_array)); - ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 
3 : 2), - num_input); + std::vector output_data; - const DataVec input_data{ - {"input", ConstructTensor(num_input, CType(0))}}; - DataVec output_data{{"my_biasadd", ConstructTensor(num_input)}}; - test->BuildAndRun(input_data, &output_data); if (trt_input_rank == 1) { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3))); + output_data = {1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2))); + output_data = {1, 2}; } } else { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3), CType(1), - CType(2), CType(3))); + output_data = {1, 2, 3, 1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(1), CType(1), CType(2), - CType(2), CType(2))); + output_data = {1, 1, 1, 2, 2, 2}; } } + TestOpConverter(node_def, dims_array, Status::OK(), Status::OK(), + ElementsAreArray(output_data)); } } } -TEST_F(OpConverterTest, ConvertBiasAdd) { - // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test - // DT_INT32 type here. - TestConvertBiasAdd(this); - TestConvertBiasAdd(this); -} - template -NodeDef GetBinaryOpNodeDef(const string& input_name_l, - const string& input_name_r, DataType dtype) { +NodeDef GetBinaryOpNodeDef(DataType dtype) { Scope s = Scope::NewRootScope(); - auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype); - auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype); + auto input_l = ops::Placeholder(s.WithOpName("input1"), dtype); + auto input_r = ops::Placeholder(s.WithOpName("input2"), dtype); auto op = OpType(s.WithOpName("my_binary"), input_l, input_r); return op.operation.node()->def(); } - -template -void TestBinaryOp(OpConverterTest* test, bool operand_1_is_tensor, - bool operand_2_is_tensor) { - typedef typename EnumToDataType::Type CType; - test->Reset(); - const NodeDef node_def = - GetBinaryOpNodeDef("input1", "input2", dtype); - if (operand_1_is_tensor) { - test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/2, - TfDataTypeToTrt(dtype)); - } else { - test->AddTestWeights("input1", /*dims=*/{1, 2}, - /*values=*/std::vector{CType(3), CType(6)}); - } - if (operand_2_is_tensor) { - test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/2, - TfDataTypeToTrt(dtype)); - } else { - test->AddTestWeights("input2", /*dims=*/{2, 1}, - /*values=*/std::vector{CType(2), CType(3)}); - } - test->RunValidationAndConversion(node_def); - - DataVec input_data; - if (operand_1_is_tensor) { - input_data.push_back( - {"input1", - test::AsTensor({CType(3), CType(6), CType(3), CType(6)})}); - } - if (operand_2_is_tensor) { - input_data.push_back( - {"input2", - test::AsTensor({CType(2), CType(3), CType(2), CType(3)})}); - } - DataVec output_data{{"my_binary", ConstructTensor(8)}}; - // Check output dims. - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); - // After broadcasting first input becomes {3, 6, 3, 6} and second input - // becomes {2, 3, 2, 3}. - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/2); - if (node_def.op() == "Add") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({5, 8, 6, 9, 5, 8, 6, 9}))); - } else if (node_def.op() == "Sub") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({1, 4, 0, 3, 1, 4, 0, 3}))); - } else if (node_def.op() == "Mul") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray( - CastTestVector({6, 12, 9, 18, 6, 12, 9, 18}))); - } else if (node_def.op() == "Div") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector( - {1.5, 3, 1, 2, 1.5, 3, 1, 2}))); - } else if (node_def.op() == "RealDiv") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector( - {1.5, 3, 1, 2, 1.5, 3, 1, 2}))); - } else if (node_def.op() == "FloorDiv") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray( - CastTestVector({1, 3, 1, 2, 1, 3, 1, 2}))); - } else if (node_def.op() == "Minimum") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({2, 2, 3, 3, 2, 2, 3, 3}))); - } else if (node_def.op() == "Maximum") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({3, 6, 3, 6, 3, 6, 3, 6}))); - } else if (node_def.op() == "Pow") { - ExpectArrayNear( - CastTestVector({9, 36, 27, 216, 9, 36, 27, 216}), - GetSpanForData(output_data[0])); - } else { - ASSERT_TRUE(false); - } + +TEST_P(OpConverter_FP32_FP16_BinaryTest, ConvertBinary) { + using OpFunc = std::function; + std::map>> op_test_info; +#define ADD_OP(name, op, v1, v2, v3, v4, v5, v6, v7, v8) \ + op_test_info[name] = \ + std::make_pair(GetBinaryOpNodeDef, \ + std::vector(v1, v2, v3, v4, v5, v6, v7, v8)) + ADD_OP("Add", ops::Add, {5, 8, 6, 9, 5, 8, 6, 9}); + ADD_OP("AddV2", ops::AddV2, {5, 8, 6, 9, 5, 8, 6, 9}); + ADD_OP("Sub", ops::Sub, {1, 4, 0, 3, 1, 4, 0, 3}); + ADD_OP("Mul", ops::Mul, {6, 12, 9, 18, 6, 12, 9, 18}); + ADD_OP("Div", ops::Div, {1.5, 3, 1, 2, 1.5, 3, 1, 2}); + ADD_OP("RealDiv", ops::RealDiv, {1.5, 3, 1, 2, 1.5, 3, 1, 2}); + ADD_OP("FloorDiv", ops::FloorDiv, {1, 3, 1, 2, 1, 3, 1, 2}); + ADD_OP("Minimum", ops::Minimum, {2, 2, 3, 3, 2, 2, 3, 3}); + ADD_OP("Maximum", ops::Maximum, {3, 6, 3, 6, 3, 6, 3, 6}); + ADD_OP("Pow", ops::Pow, {9, 36, 27, 216, 9, 36, 27, 216}); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + ADD_OP("Greater", ops::Greater, {1, 1, 0, 1, 1, 1, 0, 1}); + ADD_OP("Less", ops::Less, {0, 0, 0, 0, 0, 0, 0, 0}); + ADD_OP("Equal", ops::Equal, {0, 0, 1, 0, 0, 0, 1, 0}); + ADD_OP("GreaterEqual", ops::Less, {1, 1, 1, 1, 1, 1, 1, 1}); + ADD_OP("LessEqual", ops::Greater, {0, 0, 1, 0, 0, 0, 1, 0}); +#endif +#undef ADD_OP + std::vector> data = { + {3, 6, 3, 6}, {3, 6}, {2, 3, 2, 3}, {2, 3}}; + RunTests(*BinaryOperationMap(), op_test_info, data); } -TEST_F(OpConverterTest, ConvertBinary) { - AttrValue dtype; - dtype.set_type(DT_FLOAT); - { - // Both inputs are weights. - Reset(); - NodeDef node_def = - MakeNodeDef("my_add", "Add", {"weights1", "weights2"}, {{"T", dtype}}); - AddTestWeights("weights1", {1}, {1}); - AddTestWeights("weights2", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Constant folding is falled back to TensorFlow, binary op received " - "both input as constant at: my_add"); - } - - // Test combinations of tensor vs weight inputs (except when both inputs are - // weights). 
- for (const bool operand_1_is_tensor : {true, false}) { - for (const bool operand_2_is_tensor : {true, false}) { - if (!operand_1_is_tensor && !operand_2_is_tensor) continue; - // FP32 tests - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - // FP16 tests - // TODO(tmorris): Use templates to avoid duplication. - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - } - } +TEST_P(OpConverter_BOOL_BinaryTest, ConvertBooleanBinary) { + using OpFunc = std::function; + std::map>> op_test_info; +#define ADD_OP(name, op, v1, v2, v3, v4, v5, v6, v7, v8) \ + op_test_info[name] = \ + std::make_pair(GetBinaryOpNodeDef, \ + std::vector(v1, v2, v3, v4, v5, v6, v7, v8)) + ADD_OP("LogicalOr", ops::LogicalOr, {1, 1, 0, 1, 1, 1, 0, 1}); + ADD_OP("LogicalAnd", ops::LogicalAnd, {0, 1, 0, 0, 0, 1, 0, 0}); +#undef ADD_OP +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + std::vector> data = { + {0, 1, 0, 1}, {0, 1}, {1, 0, 1, 0}, {1, 0}}; + RunTests(*BinaryBooleanOperationMap(), op_test_info, data); +#endif } NodeDef GetAddNNodeDef(const std::vector& input_names, DataType dtype) { @@ -2253,94 +3490,136 @@ NodeDef GetAddNNodeDef(const std::vector& input_names, DataType dtype) { return op.operation.node()->def(); } -template -void TestAddN(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - { - // All inputs are tensors. - test->Reset(); - DataVec input_data; - for (const auto name : {"inp1", "inp2", "inp3"}) { - test->AddTestTensor(name, /*dims=*/{1, 2}, /*batch_size=*/2, - TfDataTypeToTrt(dtype)); - input_data.push_back({name, test::AsTensor({CType(1), CType(2), - CType(3), CType(4)})}); - } - const NodeDef node_def = GetAddNNodeDef({"inp1", "inp2", "inp3"}, dtype); - test->RunValidationAndConversion(node_def); +struct AddNTestParams { + std::vector input_values; + std::vector input_names; + std::vector dimensions; + std::vector expected_output; + Status status; +}; - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_addn", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); +void TestAddN(ParameterizedOpConverterTestBase* test, AddNTestParams& p) { + // All inputs are tensors. + test->Reset(); + const NodeDef node_def = GetAddNNodeDef(p.input_names, test->get_tf_type()); - DataVec output_data{{"my_addn", ConstructTensor(4)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/2); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({3, 6, 9, 12}))); + if (p.input_values.size() % p.input_names.size() != 0) { + LOG(ERROR) << "The number of input values: `" << p.input_values.size() + << "` is not a multiple of the number of inputs: `" + << p.input_names.size() << "`"; + ASSERT_TRUE(false); } - { - // Input contains tensors and weights. - test->Reset(); - DataVec input_data; - for (const auto name : {"inp1", "inp2"}) { - test->AddTestTensor(name, /*dims=*/{1, 2}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - input_data.push_back({name, test::AsTensor({CType(1), CType(2)})}); - } - test->AddTestWeights("inp3", /*dims=*/{1, 1, 2}, - /*values=*/std::vector{CType(3), CType(4)}); - const NodeDef node_def = GetAddNNodeDef({"inp1", "inp2", "inp3"}, dtype); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_addn", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); - - DataVec output_data{{"my_addn", ConstructTensor(2)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({5, 8}))); - } + DataVec input_data; + int input_offset = 0; + const int window_size = p.input_values.size() / p.input_names.size(); + for (const string& name : p.input_names) { + std::vector::const_iterator start_pos = + p.input_values.begin() + input_offset; + std::vector::const_iterator end_pos = start_pos + window_size; + std::vector sub_input_val(start_pos, end_pos); + input_offset += window_size; + + test->AddTestTensor(name, p.dimensions, test->get_tf_type(), sub_input_val); + } + + test->TestOpConverter(node_def, p.dimensions, + /*expected_conversion_status=*/p.status, + /*expected_runtime_status=*/p.status, + /*matcher=*/ElementsAreArray(p.expected_output), + /*out_tf_types=*/{test->get_tf_type()}); } -TEST_F(OpConverterTest, ConvertAddN) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertAddN) { { // Weights with batch dim that is not 1. 
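+    // (This exercises the validation path: in implicit batch mode the leading
+    // dimension of a constant AddN operand is treated as its batch size, and
+    // only a batch of 1 can be broadcast against the tensor input, hence the
+    // expected error below.)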
Reset(); - const NodeDef node_def = GetAddNNodeDef({"tensor", "weights"}, DT_FLOAT); - AddTestTensor("tensor", /*dims=*/{1, 2}, /*batch_size=*/2); + const NodeDef node_def = GetAddNNodeDef({"tensor", "weights"}, tf_type_); + AddTestTensor("tensor", /*dims=*/{1, 2}); AddTestWeights("weights", {2, 1, 2}, {0, 1, 2, 3}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, + node_def, absl::StatusCode::kInvalidArgument, "Weights input to AddN is required to have batch dimension 1."); } - TestAddN(this); - TestAddN(this); + + const std::vector common_input = CreateVectorIota(6); + + std::vector params = { + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{1, 1, 2, 1, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 1, 3, 1, 1}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{1, 2, 1, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 1, 3, 1}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{1, 2, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 1, 3}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_value=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{2, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 3}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{2}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{3}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3", "inp4", "inp5", "inp6"}, + /*dimensions=*/{1}, + /*expected_output=*/{15}, + /*status=*/Status::OK()}, + }; + + for (auto p : params) { + TestAddN(this, p); + } } -TEST_F(OpConverterTest, ConvertQuantize) { - precision_mode_to_test_ = TrtPrecisionMode::INT8; +TEST_P(OpConverter_FP32_Test, ConvertQDQDynamicRangeMode) { { // FakeQuantWithMinMaxArgs attributes are empty, should fail. - Reset(); + Reset(TrtPrecisionMode::INT8); NodeDef node_def = MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"}); AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Min or max attribute not found for FakeQuantWithMinMaxArgs " - "at my_quantize"); + RunValidationAndConversion(node_def, absl::StatusCode::kNotFound, + "No attr named 'min'"); } { // FakeQuantWithMinMaxArgs ranges set via attributes, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); @@ -2358,7 +3637,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // FakeQuantWithMinMaxVars ranges set via inputs, ok. 
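+    // (In this dynamic-range INT8 path the quantize/dequantize ops act as
+    // identity at inference time; conversion only records the [min, max]
+    // range taken from the weight inputs, so the cases here mainly exercise
+    // validation and conversion rather than numerical output.)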
- Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2379,7 +3658,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV2 ranges set via inputs, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2400,7 +3679,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV2 Range inputs are tensors, should fail. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2411,14 +3690,13 @@ TEST_F(OpConverterTest, ConvertQuantize) { AddTestTensor("input", {1, 2, 3}); AddTestTensor("weights_min", {1}); AddTestTensor("weights_max", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input_min\" for QuantizeAndDequantizeV2 must be a constant" - ", at my_quantize"); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"input_min\" for " + "QuantizeAndDequantizeV2 must be a constant"); } { // QuantizeAndDequantizeV3 ranges set via inputs, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2441,80 +3719,409 @@ TEST_F(OpConverterTest, ConvertQuantize) { } } -template -void TestConvertSquare(OpConverterTest* test) { - test->Reset(); - typedef typename EnumToDataType::Type CType; +TEST_P(OpConverter_FP32_FP16_Test, ConvertSquare) { + { + // Input is weights, should fail. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto square = ops::Square(s.WithOpName("my_square"), input); + NodeDef node_def = square.operation.node()->def(); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type_); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"x\" for Square must be a tensor"); + } + + Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); - test->AddTestTensor("input", {1, 20}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions()); - const int num_inputs = 20; - std::vector inputs(num_inputs); - std::vector expected_outputs(num_inputs); + std::vector inputs(num_inputs); + std::vector expected_outputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { - const CType value = CType(i - 9); + const float value = (i - 9); inputs[i] = value; expected_outputs[i] = value * value; } - const DataVec input_data{{"input", test::AsTensor(inputs)}}; - // Engine outputs are converted to FP16 automatically if we set FP16 mode in - // the builder. 
- DataVec output_data{{"my_square", ConstructTensor(num_inputs)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - ExpectArrayNear(expected_outputs, GetSpanForData(output_data[0])); + AddTestTensor("input", {1, 1, 20}, tf_type_, inputs); + + TestOpConverter(node_def, {1, 1, 20}, Status::OK(), Status::OK(), + ArrayFloatNear(expected_outputs, 0)); } -TEST_F(OpConverterTest, ConvertSquare) { - { - // Input is weights, should fail. +// A function that builds the next lexicographically greater configuration +// for the current one. The configuration is described as a (0,1)-vector +// config, where config[i] is 0 or 1 when the i-th parameter is passed as +// a weight or tensor, respectively. The function returns TRUE if such +// a configuration is built, or FALSE otherwise. +bool nextTensorWeightConfiguration(std::vector& config) { + for (int i = config.size(); i-- > 0;) { + if ((config[i] = 1 - config[i])) return true; + } + return false; +} + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertFill) { + Scope s = Scope::NewRootScope(); + auto dims = ops::Placeholder(s.WithOpName("dims"), DT_INT32); + auto value = ops::Placeholder(s.WithOpName("value"), tf_type_); + auto fill = ops::Fill(s.WithOpName("my_fill"), dims, value); + const NodeDef& node_def = fill.operation.node()->def(); + + if (trt_mode_ == TrtTestMode::kImplicitBatch) { Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto square = ops::Square(s.WithOpName("my_square"), input); - NodeDef node_def = square.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}); + // random data + AddTestWeights("dims", {2}, {2, 2}, DT_INT32); + AddTestWeights("value", {1}, {42}, tf_type_); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"x\" for Square must be a tensor, at my_square"); + node_def, absl::StatusCode::kUnimplemented, + convert_not_supported_implicit(node_def.op(), node_def.name())); + return; + } + + std::vector> output_dims_params = { + {8}, {8, 2, 4}, {32, 32, 3200}}; + std::vector> value_dims_params = {{}, {1}}; + + float val = 42.0; + Status status = Status::OK(); + for (bool dims_is_tensor : {true, false}) { + for (bool value_is_tensor : {true, false}) { + for (auto output_dims : output_dims_params) { + for (auto value_dims : value_dims_params) { + Reset(); + std::vector dims_dims = { + static_cast(output_dims.size())}; + if (dims_is_tensor) { + AddTestTensor("dims", dims_dims, DT_INT32, output_dims, dims_dims); + } else { + AddTestWeights("dims", dims_dims, output_dims, DT_INT32); + } + if (value_is_tensor) { + AddTestTensor("value", value_dims, tf_type_, + {static_cast(val)}); + } else { + AddTestWeights("value", value_dims, {static_cast(val)}, + tf_type_); + } + size_t nb_el = 1; + for (auto d : output_dims) { + nb_el *= d; + } + std::vector expected_output(nb_el, val); + TestOpConverter(node_def, output_dims, status, status, + ElementsAreArray(expected_output)); + } + } + } } +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertRange) { + auto get_casted_value = [this](const float value, const DataType dtype) { + return dtype == DT_INT32 ? 
static_cast(value) : value; + }; + + auto set_parameters = [this](const std::array& name, + const std::array, 3>& value, + const std::array& type, + const std::vector& config, + int shape_idx = -1) { + Reset(); + for (int i = 0; i < 3; i++) { + if (config[i]) { + std::vector partial_shape_dims = {}; + // The correct partial shape will be provided + // (a) for all parameters, when shape_idx > 3 + // (b) for all parameters, except shape_idx, when shape_idx >= 0 + // (c) for none of the shape_idx < 0 + if (shape_idx > 3 || (shape_idx >= 0 && shape_idx != i)) { + partial_shape_dims = {1}; + } + AddTestTensor(name[i], {1}, type[i], value[i], partial_shape_dims); + } else { + AddTestWeights(name[i], {1}, value[i], type[i]); + } + } + }; + + const float start = 1.0; + const float limit = 43.0; + const float delta = 2.0; + + const std::array param_name = {"start", "limit", "delta"}; + std::array, 3> param_value; + param_value[0] = {start}; + param_value[1] = {limit}; + param_value[2] = {delta}; + const auto start_type = tf_type_; + std::array param_type = {tf_type_, tf_type_, tf_type_}; + + Scope s = Scope::NewRootScope(); + const auto range = + ops::Range(s.WithOpName("my_range"), + ops::Placeholder(s.WithOpName(param_name[0]), param_type[0]), + ops::Placeholder(s.WithOpName(param_name[1]), param_type[1]), + ops::Placeholder(s.WithOpName(param_name[2]), param_type[2])); + + const NodeDef& ndef = range.operation.node()->def(); + const std::vector param_types{DT_FLOAT, DT_HALF, DT_INT32}; + + // ConverterRange is not implemented for Implicite batch mode. + std::vector config(3, 0); + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + const auto& err = convert_not_supported_implicit(ndef.op(), ndef.name()); + do { + set_parameters(param_name, param_value, param_type, config); + RunValidationAndConversion(ndef, absl::StatusCode::kUnimplemented, err); + } while (nextTensorWeightConfiguration(config)); + + return; + } + + const auto& expect_msg = convert_range_expected_msg(ndef); + bool all_weights = true; + do { + for (auto limit_type : param_types) { + param_type[1] = limit_type; + for (auto delta_type : param_types) { + param_type[2] = delta_type; + + const auto all_integers = start_type == DT_INT32 && + limit_type == DT_INT32 && + delta_type == DT_INT32; + + if (all_weights || (all_integers && !config[2])) { + // Reject invalid parameters if delta = 0 and it's passed as a weight. + param_value[2] = {0}; + set_parameters(param_name, param_value, param_type, config); + RunValidationAndConversion( + ndef, absl::StatusCode::kInvalidArgument, + "The delta parameter of Range operation cannot be equal to 0"); + + if (!all_weights && !config[2]) { + param_value[2] = {-1}; + set_parameters(param_name, param_value, param_type, config); + const string err = StrCat( + "The delta parameter of Range operation " + "cannot be negative, when one of (start, limit) is passed as " + "a tensor, but got ", + param_value[2][0]); + RunValidationAndConversion(ndef, absl::StatusCode::kInvalidArgument, + err); + } + } + + if (all_weights) { + // Reject invalid parameters preventing the limit from + // being reached for fixed values of start and delta. 
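+            // (Concretely, the loop below builds start = 1, limit = 43 with
+            // delta = -2, and start = 43, limit = 1 with delta = 2; in both
+            // cases the sequence can never reach the limit, so conversion is
+            // expected to fail with convert_range_error_msg.)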
+ for (int j = 0; j <= 1; j++) { + param_value[j] = {get_casted_value(start, tf_type_)}; + param_value[1 - j] = {get_casted_value(limit, limit_type)}; + param_value[2] = {(2 * j - 1) * + get_casted_value(delta, delta_type)}; + set_parameters(param_name, param_value, param_type, config); + const auto error = convert_range_error_msg( + param_value[0][0], param_value[1][0], param_value[2][0]); + RunValidationAndConversion(ndef, absl::StatusCode::kInvalidArgument, + error); + } + } + + param_value[0] = {start}; + param_value[2] = {delta}; + if (all_integers) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Wrong dimension for the parameter passed as a tensor. + for (int j = 0; j < 3; j++) { + if (!config[j]) continue; + + const string err = + StrCat("Dimension for '", param_name[j], + "' of Range operator should be equal to 1"); + set_parameters(param_name, param_value, param_type, config, j); + RunValidationAndConversion( + ndef, absl::StatusCode::kInvalidArgument, err); + } + } + } else { + if (!all_weights) { + // The following test should fail, when + // (a) at least one parameter is passed as a tensor; + // (b) at least one parameter is not of type DT_INT32. + set_parameters(param_name, param_value, param_type, config); + RunValidationAndConversion(ndef, absl::StatusCode::kUnimplemented, + expect_msg); + } + } + } + } + // All other configs will be set so that at least one parameter + // will be passed as a tensor + all_weights = false; + } while (nextTensorWeightConfiguration(config)); + + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(DT_BOOL, &trt_type)); + const std::string error_msg = + "Unsupported data type " + DebugString(trt_type) + " used for '"; + do { + for (auto limit_type : param_types) { + param_type[1] = limit_type; + for (auto delta_type : param_types) { + param_type[2] = delta_type; + + for (int i = 0; i < 3; i++) { + if (!config[i]) { + const auto saved_type = param_type[i]; + param_type[i] = DT_BOOL; + set_parameters(param_name, param_value, param_type, config); + param_type[i] = saved_type; + RunValidationAndConversion(ndef, absl::StatusCode::kInvalidArgument, + error_msg + param_name[i] + "'"); + } + } + } + } + } while (nextTensorWeightConfiguration(config)); + + // The tests that pass all checks in ConvertRange::Validate(). + const Status status = Status::OK(); + const std::vector int_type{DT_INT32}; + int partial_shape_idx = -1; + all_weights = true; + do { + // For now when at least one of (start, limit, delta) is passed as a tensor + // (a) all these parameters should be of DT_INT32 type; + // (b) only positive delta could be used. + const auto& types = all_weights ? param_types : int_type; + const auto jEnd = all_weights ? 1 : 0; + for (auto limit_type : types) { + param_type[1] = limit_type; + for (auto delta_type : types) { + param_type[2] = delta_type; + // Loop for positive and negative deltas. + for (int j = 0; j <= jEnd; j++) { + // Define the expected result which should match the usage + // of DT_INT32 for one of (start, limit, delta). + const int mult = (1 - 2 * j); + param_value[j] = {get_casted_value(start, tf_type_)}; + param_value[1 - j] = {get_casted_value(limit, limit_type)}; + param_value[2] = {mult * get_casted_value(delta, delta_type)}; + + // Create expected output. 
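+            // (The number of elements follows the tf.range definition,
+            // num_values = ceil((limit - start) / delta); e.g. start = 1,
+            // limit = 43, delta = 2 gives the 21 values 1, 3, ..., 41, which
+            // is what the loop below reproduces element by element.)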
+ std::vector expected_output; + const float limit_curr = param_value[1][0]; + const float delta_curr = param_value[2][0]; + float value = param_value[0][0]; + int num_values = 0; + while (mult * (limit_curr - value) > 0) { + num_values++; + expected_output.push_back(value); + value += delta_curr; + } + + set_parameters(param_name, param_value, param_type, config, + partial_shape_idx); + const std::vector output_dims = {num_values}; + TestOpConverter(ndef, output_dims, status, status, + ElementsAreArray(expected_output)); + } + } + } + + if (all_weights) { + if (start_type != DT_INT32) break; + if (trt_mode_ == TrtTestMode::kDynamicShape) partial_shape_idx = 3; + + // All other configs will be set so that at least one parameter + // will be passed as a tensor + all_weights = false; + } + } while (nextTensorWeightConfiguration(config)); +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertLikeOps) { + auto get_node = [&](int value) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + if (value == 0) { + auto zeros_like = ops::ZerosLike(s.WithOpName("Zeros"), input); + return zeros_like.operation.node()->def(); + } + auto ones_like = ops::OnesLike(s.WithOpName("Ones"), input); + return ones_like.operation.node()->def(); + }; + + for (int value : {0, 1}) { + Reset(); + const NodeDef& node_def = get_node(value); + + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + std::vector input_data(8, 42.0f); + AddTestTensor("input", {8}, tf_type_, input_data); + const auto& err = convert_not_supported_implicit(node_def.name() + "Like", + node_def.name()); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + err); + continue; + } - // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't - // test DT_INT32 type here. - TestConvertSquare(this); - TestConvertSquare(this); + std::vector> output_dims_params = { + {8}, {8, 2, 4}, {32, 32, 3200}}; + + float val = 42.0; + Status status = Status::OK(); + for (bool input_is_tensor : {true, false}) { + for (auto output_dims : output_dims_params) { + Reset(); + size_t nb_el = 1; + for (auto d : output_dims) { + nb_el *= d; + } + std::vector input_data(nb_el, val); + if (input_is_tensor) { + AddTestTensor("input", output_dims, tf_type_, input_data); + } else { + AddTestWeights("input", output_dims, input_data, tf_type_); + } + std::vector expected_output(nb_el, value); + TestOpConverter(node_def, output_dims, status, status, + ElementsAreArray(expected_output)); + } + } + } } -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -// TODO: @mconley @jdekhtiar - Reactivate when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -TEST_F(OpConverterTest, ConvertCombinedNMS) { +#endif // IS_TRT_VERSION_GE(8, 2, 0, 0) + +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + +TEST_P(OpConverter_FP32_Test, ConvertCombinedNMS) { // Get the NodeDef for CombinedNMS. 
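+ // CombinedNonMaxSuppression returns four outputs: nmsed_boxes,
+ // nmsed_scores, nmsed_classes and valid_detections. The cases below also
+ // exercise the clip_boxes attribute (box coordinates clamped to [0, 1], see
+ // the "clip boxes" case) and pad_per_class (output size capped at
+ // max_output_size_per_class * num_classes, see "per class size and pad").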
- auto get_nms_nodedef = []() -> NodeDef { + auto get_nms_nodedef = [](DataType tf_type, bool clip_boxes = true, + bool pad_per_class = false) -> NodeDef { Scope s = Scope::NewRootScope(); - auto boxes_tensor = ops::Placeholder(s.WithOpName("boxes"), DT_FLOAT); - auto scores_tensor = ops::Placeholder(s.WithOpName("scores"), DT_FLOAT); + auto boxes_tensor = ops::Placeholder(s.WithOpName("boxes"), tf_type); + auto scores_tensor = ops::Placeholder(s.WithOpName("scores"), tf_type); auto max_output_size_per_class = ops::Placeholder(s.WithOpName("max_output_size_per_class"), DT_INT32); auto max_total_size = ops::Placeholder(s.WithOpName("max_total_size"), DT_INT32); auto iou_threshold = - ops::Placeholder(s.WithOpName("iou_threshold"), DT_FLOAT); + ops::Placeholder(s.WithOpName("iou_threshold"), tf_type); auto score_threshold = - ops::Placeholder(s.WithOpName("score_threshold"), DT_FLOAT); - auto nms_attrs = ops::CombinedNonMaxSuppression::Attrs().PadPerClass(false); + ops::Placeholder(s.WithOpName("score_threshold"), tf_type); + auto nms_attrs = ops::CombinedNonMaxSuppression::Attrs() + .PadPerClass(pad_per_class) + .ClipBoxes(clip_boxes); auto nms_op = ops::CombinedNonMaxSuppression( s.WithOpName("my_nms"), boxes_tensor, scores_tensor, @@ -2524,212 +4131,376 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { }; struct TestParams { + const std::string description; const std::vector boxes_tensor_dims; const std::vector scores_tensor_dims; + const std::vector boxes_values; + const std::vector scores_values; const int32 max_output_size_per_class; const int32 max_total_size; const float iou_threshold; const float score_threshold; - const std::vector expected_nmsed_boxes_dims; - const std::vector expected_nmsed_scores_dims; - const std::vector expected_nmsed_classes_dims; + const bool pad_per_class; + const bool clip_boxes; + const std::vector> expected_output_dims; + const std::vector exp_boxes; + const std::vector exp_scores; + const std::vector exp_classes; + const std::vector exp_num_detections; + Status conversion_status; + Status runtime_status; }; - // Ok. - const int kCombinedNMSOKCases = 1; - TestParams ok_params[kCombinedNMSOKCases] = { +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + Status conv_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? 
errors::Unimplemented(convert_not_supported_implicit( + "CombinedNonMaxSuppression", "my_nms")) + : Status::OK(); + + std::vector params = { + TestParams{"Test 1: clip boxes", + {1, 1, 3, 4}, // boxes dims + {1, 1, 3}, // scores dims + // boxes values: + {0, 0, 0.3, 1.4, 0, 0, 0.3, 1.4, 0, 0, 0.3, 1.4}, + {0.4, 0.7, 0.3}, // scores values + 3, // max_output_size_per_class + 2, // max_total_size + 0.1, // IOU threshold + 0, // score_threshold + false, // pad_per_class + true, // clip_boxes + {{1, 2, 4}, // expected_nmsed_boxes_dims + {1, 2}, // expected_nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 0.3, 1.0, 0, 0, 0.3, 1.0}, + {0.7, 0.4}, // exp_scores + {1, 0}, // exp_classes + {2}, // exp_num_detections + conv_status}, + TestParams{ + "Test 2: iou threshold", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 1, 5, 11, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.7, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 3, 2, 1}, // exp_scores + {0, 0, 0, 0}, // exp_classes + {4}, // exp_num_detections + conv_status}, + TestParams{ + "Test 3: score threshold", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 1, 5, 11, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 2, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0}, + {5, 3, 0, 0}, // exp_scores + {0, 0, 0, 0}, // exp_classes + {2}, // exp_num_detections + conv_status}, + TestParams{ + "Test 4: per class size and pad", + {1, 5, 1, 4}, // boxes dims + {1, 5, 2}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 1, 5, 11, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + // scores values: + {5, 0, 0, 4, 3, 0, 2, 0, 1, 0}, + 1, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + true, // pad_per_class + false, // clip_boxes + {{1, 2, 4}, // expected nmsed_boxes_dims + {1, 2}, // expected nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 0, 1, 5, 11}, + {5, 4}, // exp_scores + {0, 1}, // exp_classes + {2}, // exp_num_detections + conv_status}, + TestParams{ + "Test 5: different box coordinate order", + {1, 5, 1, 4}, // boxes dims + {1, 5, 2}, // scores dims + // boxes values: + {5, 10, 0, 0, 5, 11, 0, 1, 12, 4, 8, 0, 10, 6, 6, 2, 11, 12, 8, 9}, + // scores values: + {5, 0, 0, 4, 3, 0, 2, 0, 1, 0}, + 1, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + true, // pad_per_class + false, // clip_boxes + {{1, 2, 4}, // expected nmsed_boxes_dims + {1, 2}, // expected nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // 
expected_valid_detections_dims + // exp_boxes_values: + {5, 10, 0, 0, 5, 11, 0, 1}, + {5, 4}, // exp_scores + {0, 1}, // exp_classes + {2}, // exp_num_detections + conv_status}, + }; +#else // IS_TRT_VERSION_GE(7, 1, 3, 0) + Status conv_status = + trt_mode_ == TrtTestMode::kDynamicShape + ? errors::Unimplemented( + "TensorRT BatchedNMS Plugin requires input with static shape") + : Status::OK(); + + std::vector params = { // TODO(aaroey): there is a bug in TRT's CombinedNonMaxSuppression // implementation that, the extra output classes that are outside of the // range specified by valid_detections[i] are not zeros but -1s. - TestParams{{1, 1, 4}, {1, 3}, 3, 2, .5f, 0, {2, 4}, {2}, {2}}}; + TestParams{ + "Test 1: Original test", + {1, 1, 3, 4}, // boxes dims + {1, 1, 3}, // scores dims + {0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4}, // boxes values + {0.4, 0.7, 0.3}, // scores values + 3, // max_output_size_per_class + 2, // max_total_size + .5f, // IOU threshold + 0, // score_threshold + false, // pad_per_class + true, // clip_boxes + {{1, 2, 4}, // expected_nmsed_boxes_dims + {1, 2}, // expected_nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + {0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4}, // exp_boxes_values + {0.7, 0.4}, // exp_scores + {1, 0}, // exp_classes + {2}, // exp_num_detections + conv_status}, + // Test with clip_boxes = False + TestParams{ + "Test 2: clip_boxes", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 4, 5, 14, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 8, 9, 11, 12, 0, 0, 0, 0}, + {5, 3, 1, 0}, // exp_scores + {0, 0, 0, -1}, // exp_classes + {3}, // exp_num_detections + conv_status}, + // Test with clip_boxes = False, and nonzero score threshold + TestParams{ + "Test 3: score threshold", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 4, 5, 14, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 2, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0}, + {5, 3, 0, 0}, // exp_scores + {0, 0, -1, -1}, // exp_classes + {2}, // exp_num_detections + conv_status}, + // Test where the boxes are defined as with max value first for the box + // coordinates. This test fails before TRT 7.1.3. 
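+ // The expected results match the normally ordered boxes in the case above,
+ // so the converter/plugin is expected to normalize the coordinate order
+ // before computing IOU.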
+ TestParams{ + "Test 4: max coord first", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {5, 10, 0, 0, 5, 14, 0, 4, 12, 4, 8, 0, 10, 6, 6, 2, 11, 12, 8, 9}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {5, 10, 0, 0, 12, 4, 8, 0, 11, 12, 8, 9, 0, 0, 0, 0}, + {5, 3, 1, 0}, // exp_scores + {0, 0, 0, -1}, // exp_classes + {3}, // exp_num_detections + conv_status}, + TestParams{"Test 5: TopK error", + {1, 5000, 1, 4}, // boxes dims + {1, 5000, 1}, // scores dims + {}, // boxes values: + {}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {}, // expected_valid_detections_dims + {}, // exp_boxes_values + {}, // exp_scores + {}, // exp_classes + {}, // exp_num_detections + conv_status.ok() + ? errors::InvalidArgument( + "TRT NMS plugin allow top_k<=4096, where top_k = " + "max(num_boxes, max_total_size). You can override " + "this by setting TF_TRT_ALLOW_NMS_TOPK_OVERRIDE=1 " + "environment variable, but this can result in a " + "loss of accuracy.") + : conv_status}, + }; +#endif - for (int i = 0; i < kCombinedNMSOKCases; ++i) { + for (auto p : params) { Reset(); - - AddTestTensor("boxes", ok_params[i].boxes_tensor_dims); - AddTestTensor("scores", ok_params[i].scores_tensor_dims); + SCOPED_TRACE(p.description); + AddTestTensor("boxes", p.boxes_tensor_dims, p.boxes_values); + AddTestTensor("scores", p.scores_tensor_dims, p.scores_values); AddTestWeights("max_output_size_per_class", {1}, - {ok_params[i].max_output_size_per_class}); - AddTestWeights("max_total_size", {1}, {ok_params[i].max_total_size}); - AddTestWeights("iou_threshold", {1}, {ok_params[i].iou_threshold}); - AddTestWeights("score_threshold", {1}, - {ok_params[i].score_threshold}); - - RunValidationAndConversion(get_nms_nodedef()); - - TRT_TensorOrWeights nmsed_boxes; - TRT_TensorOrWeights nmsed_scores; - TRT_TensorOrWeights nmsed_classes; - TRT_TensorOrWeights valid_detections; - - TF_EXPECT_OK(GetTensorOrWeights("my_nms", &nmsed_boxes)); - TF_EXPECT_OK(GetTensorOrWeights("my_nms:1", &nmsed_scores)); - TF_EXPECT_OK(GetTensorOrWeights("my_nms:2", &nmsed_classes)); - TF_EXPECT_OK(GetTensorOrWeights("my_nms:3", &valid_detections)); - - ASSERT_TRUE(nmsed_boxes.is_tensor()); - ASSERT_TRUE(nmsed_scores.is_tensor()); - ASSERT_TRUE(nmsed_classes.is_tensor()); - ASSERT_TRUE(valid_detections.is_tensor()); - - ExpectTrtDimsEqualsArray(ok_params[i].expected_nmsed_boxes_dims, - nmsed_boxes.tensor()->getDimensions()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_nmsed_scores_dims, - nmsed_scores.tensor()->getDimensions()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_nmsed_classes_dims, - nmsed_classes.tensor()->getDimensions()); - ExpectTrtDimsEqualsArray({}, valid_detections.tensor()->getDimensions()); - - DataVec output_data{ - {"my_nms", ConstructTensor(8)}, - {"my_nms:1", ConstructTensor(2)}, - {"my_nms:2", ConstructTensor(2)}, - {"my_nms:3", ConstructTensor(1)}, - }; - const DataVec input_data{ - {"boxes", test::AsTensor({0, 0, 0.3, 0.4})}, - {"scores", test::AsTensor({0.4, 0.7, 0.3})}}; - BuildAndRun(input_data, &output_data); - 
EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4)); - EXPECT_THAT(GetSpanForData(output_data[1]), ElementsAre(0.7, 0.4)); - EXPECT_THAT(GetSpanForData(output_data[2]), ElementsAre(1, 0)); - EXPECT_THAT(GetSpanForData(output_data[3]), ElementsAre(2)); + {p.max_output_size_per_class}); + AddTestWeights("max_total_size", {1}, {p.max_total_size}); + AddTestWeights("iou_threshold", {1}, {p.iou_threshold}, tf_type_); + AddTestWeights("score_threshold", {1}, {p.score_threshold}, + tf_type_); + + auto node_def = get_nms_nodedef(tf_type_, p.clip_boxes, p.pad_per_class); + + TestOpConverterMultiOut(node_def, p.expected_output_dims, + p.conversion_status, p.runtime_status, + { + ElementsAreArray(p.exp_boxes), + ElementsAreArray(p.exp_scores), + ElementsAreArray(p.exp_classes), + ElementsAreArray(p.exp_num_detections), + }, + {tf_type_, tf_type_, tf_type_, DT_INT32}); } } +#endif -#endif // TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -#endif // CombinedNonMaxSuppression +template +NodeDef CreateUnaryOp(DataType tf_type) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + return T(s.WithOpName("my_unary"), input).operation.node()->def(); +} -TEST_F(OpConverterTest, ConvertActivation) { - { - // Input is weights, should fail. - Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto relu = ops::Relu(s.WithOpName("my_act"), input); - const NodeDef& node_def = relu.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Relu must be a tensor, at my_act"); - } +constexpr float kLeakyReluAlpha = 0.2f; +template <> +NodeDef CreateUnaryOp(DataType tf_type) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + return ops::internal::LeakyRelu( + s.WithOpName("my_unary"), input, + ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)) + .operation.node() + ->def(); +} - constexpr float kLeakyReluAlpha = 0.2f; +TEST_P(OpConverter_FP32_UnaryTest, ConvertActivation) { constexpr float kSeluAlpha = 1.7580993408473768599402175208123f; constexpr float kSeluScale = 1.0507009873554804934193349852946f; + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; + +#define ADD_OP(name, op, compute) \ + op_map[name] = std::make_pair(CreateUnaryOp, compute) + ADD_OP("LeakyRelu", ops::internal::LeakyRelu, + [](float x) { return (x > 0.0f) ? x : x * kLeakyReluAlpha; }); + ADD_OP("Relu", ops::Relu, [](float x) { return (x > 0.0f) ? x : 0.0f; }); + ADD_OP("Relu6", ops::Relu6, + [](float x) { return std::min(std::max(x, 0.0f), 6.0f); }); + ADD_OP("Sigmoid", ops::Sigmoid, + [](float x) { return 1.0f / (1.0f + std::exp(-x)); }); + ADD_OP("Tanh", ops::Tanh, static_cast(std::tanh)); + ADD_OP("Elu", ops::Elu, + [](float x) { return (x > 0.0f) ? x : std::exp(x) - 1; }); + ADD_OP("Selu", ops::Selu, [](float x) { + return (x > 0.0f) ? kSeluScale * x + : kSeluScale * kSeluAlpha * (std::exp(x) - 1); + }); + ADD_OP("Softsign", ops::Softsign, + [](float x) { return x / (std::abs(x) + 1); }); + ADD_OP("Softplus", ops::Softplus, + [](float x) { return std::log(std::exp(x) + 1); }); +#undef ADD_OP + + // std::exp in Softplus will overflow for input > 88 + const std::vector input = {-100, -2, -1, 0, 1, 88}; + const bool nan_sensitive = false; - // Get nodedef for activation layer. 
- auto get_act_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "LeakyRelu") { - auto act = ops::internal::LeakyRelu( - s.WithOpName("my_act"), input, - ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)); - return act.operation.node()->def(); - } else if (op_name == "Relu") { - auto act = ops::Relu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Relu6") { - auto act = ops::Relu6(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Sigmoid") { - auto act = ops::Sigmoid(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Tanh") { - auto act = ops::Tanh(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Elu") { - auto act = ops::Elu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Selu") { - auto act = ops::Selu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softsign") { - auto act = ops::Softsign(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softplus") { - auto act = ops::Softplus(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for activation layer. - auto get_act_output = [](string op_name, float input) -> float { - if (op_name == "LeakyRelu") { - return (input > 0.0f) ? input : input * kLeakyReluAlpha; - } else if (op_name == "Relu") { - return (input > 0.0f) ? input : 0.0f; - } else if (op_name == "Relu6") { - return std::min(std::max(input, 0.0f), 6.0f); - } else if (op_name == "Sigmoid") { - return 1.0f / (1.0f + std::exp(-input)); - } else if (op_name == "Tanh") { - return std::tanh(input); - } else if (op_name == "Elu") { - return (input > 0.0f) ? input : std::exp(input) - 1; - } else if (op_name == "Selu") { - return (input > 0.0f) ? kSeluScale * input - : kSeluScale * kSeluAlpha * (std::exp(input) - 1); - } else if (op_name == "Softsign") { - return input / (std::abs(input) + 1); - } else if (op_name == "Softplus") { - return std::log(std::exp(input) + 1); - } - EXPECT_TRUE(false); - return 0; - }; - - // Get list of ops to test. - std::vector ops_to_test; - // Add all ops supported by ConvertUnary. - auto* map = ActivationTypeMap(); - ops_to_test.reserve(map->size()); - for (auto& pair : *map) { - ops_to_test.push_back(pair.first); - } - // Add other activation ops to test. - ops_to_test.push_back("Relu6"); - ops_to_test.push_back("LeakyRelu"); - // Ok. - for (const string& op_name : ops_to_test) { - Reset(); - NodeDef node_def = get_act_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_act", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - // Certain activations should set quantization range automatically. 
- auto ranges = quantization_ranges(); - if (op_name == "Relu6") { - EXPECT_EQ(ranges[output.tensor()->trt_tensor()], 6.0f); - } else if (op_name == "Sigmoid" || op_name == "Tanh" || - op_name == "Softsign") { - EXPECT_EQ(ranges[output.tensor()->trt_tensor()], 1.0f); - } - - // std::exp in Softplus will overflow for input > 88 - const std::vector input = {-100, -2, -1, 0, 1, 88}; - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_act", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); i++) { - const float expected_output = get_act_output(op_name, input[i]); - EXPECT_NEAR(GetSpanForData(output_data[0])[i], expected_output, - 1e-4); - } - } +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + // NVBug # 3322482 - Known bug with TRT 8.0 on specific GPU architectures + const float max_abs_error = 1e-4; +#else + const float max_abs_error = 0.; +#endif + RunTests("Activation", *ActivationTypeMap(), op_map, input, "input", + max_abs_error, nan_sensitive); } -TEST_F(OpConverterTest, ConvertExpandDims) { +TEST_P(OpConverter_FP32_Test, ConvertExpandDims) { // Get the NodeDef for ExpandDims. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto expanddims = ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights); @@ -2739,227 +4510,241 @@ TEST_F(OpConverterTest, ConvertExpandDims) { Reset(); AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestWeights("weights", {1}, {1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"input\" for ExpandDims must be a " - "tensor, at my_expanddims"); + "tensor"); } { // Axis is a tensor, should fail. Reset(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 2, 1}); AddTestTensor("weights", {3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"axis\" for ExpandDims must be a " - "constant, at my_expanddims"); - } - { - // Add dim at batch dimension, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {1}, {0}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the batch dimension, at " - "my_expanddims"); - } - { - // Add dim at batch dimension via negative axis, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - // Input is rank 4 (batch dim included) - AddTestWeights("weights", {1}, {-5}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the batch dimension, at " - "my_expanddims"); - } - { - // Axis > rank(input), should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - // Input is rank 4 (batch dim included) - AddTestWeights("weights", {1}, {5}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of 5 is out of bounds, must be in range [-5, 5), at " - "my_expanddims"); - } - { - // Axis < -rank(input)-1, should fail. 
- Reset(); - AddTestTensor("input", {1, 2, 3}); - // Input is rank 4 (batch dim included) - AddTestWeights("weights", {1}, {-6}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of -6 is out of bounds, must be in range [-5, 5), at " - "my_expanddims"); + "constant"); } - - struct TestParams { - std::vector input_dims; - int axis; - std::vector expected_output_dims; + std::vector test_params = { + TestParamBase{{1, 1, 2, 3}, + {}, + {1, 1, 1, 2, 3}, + {0}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status(absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, + {}, + {1, 1, 1, 2, 3}, + {-5}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status(absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {5}, + Status(absl::StatusCode::kInvalidArgument, + "Axis value of 5 is out of bounds, must be in range" + " [-5, 5)")}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {-6}, + Status(absl::StatusCode::kInvalidArgument, + "Axis value of -6 is out of bounds, must be in range" + " [-5, 5)")}, + TestParamBase{{1, 2, 3}, {}, {1, 1, 2, 3}, {1}}, + TestParamBase{{1, 2, 3}, {}, {1, 1, 2, 3}, {-3}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 3, 1}, {3}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 3, 1}, {-1}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 1, 3}, {2}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 1, 3}, {-2}}, + TestParamBase{{1, 6}, {}, {1, 1, 6}, {1}}, + TestParamBase{{1, 6}, {}, {1, 6, 1}, {-1}}, }; - - // Ok. - const int kExpandDimsOKCases = 8; - TestParams ok_params[kExpandDimsOKCases] = { - TestParams{{2, 3}, 1, {1, 2, 3}}, TestParams{{2, 3}, -3, {1, 2, 3}}, - TestParams{{2, 3}, 3, {2, 3, 1}}, TestParams{{2, 3}, -1, {2, 3, 1}}, - TestParams{{2, 3}, 2, {2, 1, 3}}, TestParams{{2, 3}, -2, {2, 1, 3}}, - TestParams{{6}, 1, {1, 6}}, TestParams{{6}, -1, {6, 1}}, - }; - for (int i = 0; i < kExpandDimsOKCases; ++i) { + for (auto p : test_params) { Reset(); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", {1}, {ok_params[i].axis}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_expanddims", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_expanddims", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 2, 3, 4, 5, 6)); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}); + AddTestWeights("weights", {1}, {p.param[0]}); + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray({1, 2, 3, 4, 5, 6})); } } -TEST_F(OpConverterTest, ConvertSqueeze) { - { - // No attrs, should fail. - Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input); - const NodeDef& node_def = squeeze.operation.node()->def(); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Squeeze is only implemented for explicit dims, at my_squeeze"); - } - - // Get the NodeDef for Squeeze. 
- auto get_squeeze_nodedef = [](std::vector axis) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - ops::Squeeze::Attrs squeeze_attrs; - squeeze_attrs.axis_ = gtl::ArraySlice(axis); // non-absl ok - auto squeeze = - ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs); - return squeeze.operation.node()->def(); - }; +TEST_P(OpConverter_FP32_FP16_Test, ConvertSoftmax) { + // Get the NodeDef for SoftMax. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("logits"), tf_type_); + auto softmax = ops::Softmax(s.WithOpName("my_softmax"), input); + const NodeDef& node_def = softmax.operation.node()->def(); - { - // Input is weights, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({0}); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Squeeze must be a tensor, at my_squeeze"); - } - { - // Squeeze batch dim, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({0}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_squeeze"); - } - { - // Squeeze batch dim via negative axis, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({-4}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_squeeze"); - } - { - // Squeeze >= rank(input), should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({4}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in range [-4, 4), at " - "my_squeeze"); - } - { - // Squeeze < -rank(input), should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({-5}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in range [-4, 4), at " - "my_squeeze"); - } - { - // Squeeze an axis with size != 1, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({2}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 2 with size 2 cannot be squeezed because it must be size 1, " - "at my_squeeze"); + struct TestParams { + std::vector input_dims; + std::vector expected_values; + }; + std::vector test_params = { + TestParams{/*input_dims=*/{2, 3}, + /*expected_values=*/{0.09003057, 0.24472848, 0.66524094, + 0.09003057, 0.24472848, 0.66524094}}, + TestParams{/*input_dims=*/{6, 1}, + /*expected_values=*/{1, 1, 1, 1, 1, 1}}, // works w/ std input + TestParams{/*input_dims=*/{1, 6}, // this works w/ arange(1,7) input + /*expected_values=*/{0.00426978, 0.01160646, 0.03154963, + 0.08576079, 0.23312202, 0.6336913}}}; + std::vector input_values{1, 2, 3, 4, 5, 6}; + for (auto p : test_params) { + Reset(); + AddTestTensor("logits", p.input_dims, input_values); + TestOpConverter(node_def, p.input_dims, Status::OK(), Status::OK(), + ArrayFloatNear(p.expected_values, 1e-3)); } +} + +TEST_P(OpConverter_FP32_FP16_Test, ConvertLogSoftmax) { + // Get the NodeDef for LogSoftMax. 
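+ // log_softmax(x_i) = x_i - log(sum_j exp(x_j)) along the last axis. For an
+ // input row {1, 2, 3} this gives {1, 2, 3} - log(e^1 + e^2 + e^3) ~=
+ // {-2.408, -1.408, -0.408}, and for a trailing axis of size 1 the result is
+ // 0, which matches the expected_values below.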
+ Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("logits"), tf_type_); + auto logsoftmax = ops::LogSoftmax(s.WithOpName("my_logsoftmax"), input); + const NodeDef& node_def = logsoftmax.operation.node()->def(); struct TestParams { std::vector input_dims; - std::vector axis; - std::vector expected_output_dims; + std::vector expected_values; }; - // Ok. - const int kSqueezeOKCases = 10; - TestParams ok_params[kSqueezeOKCases] = { - TestParams{{1, 2, 3}, {1}, {2, 3}}, - TestParams{{1, 2, 3}, {-3}, {2, 3}}, - TestParams{{2, 3, 1}, {3}, {2, 3}}, - TestParams{{2, 3, 1}, {-1}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {1, 3, 5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {3, 1, 5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {-1, -3, -5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {1, -3, 5}, {2, 3}}, - TestParams{{1, 6}, {1}, {6}}, - TestParams{{6, 1}, {2}, {6}}, - }; - for (int i = 0; i < kSqueezeOKCases; ++i) { - Reset(); - NodeDef node_def = get_squeeze_nodedef(ok_params[i].axis); - AddTestTensor("input", ok_params[i].input_dims); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_squeeze", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + std::vector test_params = { + TestParams{/*input_dims=*/{2, 3}, + /*expected_values=*/{-2.4076061, -1.407606, -0.40760604, + -2.4076061, -1.407606, -0.40760604}}, + TestParams{/*input_dims=*/{1, 6}, + /*expected_values=*/{-5.4561934, -4.4561934, -3.4561934, + -2.4561934, -1.4561933, -0.45619333}}, + TestParams{/*input_dims=*/{6, 1}, + /*expected_values=*/{0, 0, 0, 0, 0, 0}}}; + std::vector input_values{1, 2, 3, 4, 5, 6}; + for (auto p : test_params) { + Reset(); + AddTestTensor("logits", p.input_dims, input_values); + TestOpConverter(node_def, p.input_dims, Status::OK(), Status::OK(), + ArrayFloatNear(p.expected_values, 1e-3)); + } +} - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_squeeze", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 2, 3, 4, 5, 6)); +TEST_P(OpConverter_FP32_Test, ConvertSqueeze) { + const bool use_implicit_batch = (trt_mode_ == TrtTestMode::kImplicitBatch); + // Get the NodeDef for Squeeze. + auto get_squeeze_nodedef = [](std::vector axes, + DataType tf_type) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + if (!axes.empty()) { + ops::Squeeze::Attrs squeeze_attrs; + squeeze_attrs.axis_ = gtl::ArraySlice(axes); // non-absl ok + auto squeeze = + ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs); + return squeeze.operation.node()->def(); + } else { + auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input); + return squeeze.operation.node()->def(); + } + }; + std::vector test_params = { + TestParamBase{ + {1, 2, 1, 3}, // input dims + {}, // input partial dims + {2, 3}, // expected output dims + {}, // axis + trt_mode_ == TrtTestMode::kExplicitBatch + ? Status::OK() + : Status{absl::StatusCode::kUnimplemented, + "Squeeze is not implemented for empty squeeze_dims"}}, + TestParamBase{{1, 2, 1, 3}, + {}, + {2, 1, 3}, + {0}, + use_implicit_batch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension"} + : Status::OK()}, + TestParamBase{{1, 2, 1, 3}, + {}, + {2, 1, 3}, + {-4}, + use_implicit_batch + ? 
Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension"} + : Status::OK()}, + TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {4}, + Status{absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in range [-4, 4)"}}, + TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {-5}, + Status{ + absl::StatusCode::kInvalidArgument, + "Axis value of -5 is out of bounds, must be in range [-4, 4)"}}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 2, 3}, {1}}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 2, 3}, {-3}}, + TestParamBase{{1, 2, 3, 1}, {}, {1, 2, 3}, {3}}, + TestParamBase{{1, 2, 3, 1}, {}, {1, 2, 3}, {-1}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {1, 3, 5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {3, 1, 5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {-1, -3, -5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {1, -3, 5}}, + TestParamBase{{1, 1, 6}, {}, {1, 6}, {1}}, + TestParamBase{{1, 6, 1}, {}, {1, 6}, {2}}, + }; + auto squeeze_non_singleton = TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {2}, + Status{absl::StatusCode::kInvalidArgument, + "Dimension 2 with size 2 cannot be squeezed because it must be " + "size 1"}}; + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // In this test we try to squeeze axis=2 which has size > 1. In dynamic + // shape mode the converter sees only -1, so it cannot catch this error. + squeeze_non_singleton.status = Status::OK(); // conversion status + squeeze_non_singleton.runtime_status = + errors::InvalidArgument("Negative number of dimensions -1"); + // Dynamic shape tests with partially known input shape + test_params.push_back(TestParamBase{{2, 1, 3}, {2, -1, 3}, {2, 3}, {1}}); + test_params.push_back(TestParamBase{{2, 1, 3}, {2, 1, -1}, {2, 3}, {1}}); + } + test_params.push_back(squeeze_non_singleton); + + for (TestParamBase p : test_params) { + SCOPED_TRACE(p); + Reset(); + NodeDef node_def = get_squeeze_nodedef(p.param, tf_type_); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray({1, 2, 3, 4, 5, 6})); } } -TEST_F(OpConverterTest, ConvertStridedSlice) { +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertStridedSlice) { // Get nodedef for StridedSlice layer. auto get_strided_slice_nodedef = - [](int64 begin_mask = 0, int64 end_mask = 0, int64 ellipsis_mask = 0, - int64 new_axis_mask = 0, int64 shrink_axis_mask = 0) -> NodeDef { + [](DataType tf_type, int64 begin_mask = 0, int64 end_mask = 0, + int64 ellipsis_mask = 0, int64 new_axis_mask = 0, + int64 shrink_axis_mask = 0) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32); auto end = ops::Placeholder(s.WithOpName("end"), DT_INT32); auto strides = ops::Placeholder(s.WithOpName("strides"), DT_INT32); @@ -2977,105 +4762,26 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { { // Input is weights, should fail. 
Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); + NodeDef node_def = get_strided_slice_nodedef(tf_type_); + AddTestWeights("input", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestWeights("begin", {4}, {0, 0, 0, 0}); AddTestWeights("end", {4}, {1, 1, 2, 3}); AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"input\" for StridedSlice must be a " - "tensor, at my_strided_slice"); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for StridedSlice must " + "be a tensor"); } { // Begin, end, strides are tensors, should fail. Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); + NodeDef node_def = get_strided_slice_nodedef(tf_type_); + AddTestTensor("input", {4, 1, 1, 1}); AddTestTensor("begin", {4}); AddTestTensor("end", {4}); AddTestTensor("strides", {4}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"begin\" for StridedSlice must be a constant, at " - "my_strided_slice"); - } - { - // Modify batch dim, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {0, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_strided_slice"); - } - { - // Dynamic batch size without end_mask, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {1, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_strided_slice"); - } - { - // Dynamic batch size but using end_mask, ok. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(/*begin_mask=*/0, - /*end_mask=*/1); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {0, 1, 2, 2}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion(node_def); - } -// TRT 5.1+ supports strides (disabled until 5.1.3.1 due to bugs) -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - { - // Negative strides, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {1, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, -1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Negative or zero stride values are not " - "supported for StridedSlice, at " - "my_strided_slice"); - } -#else - { - // Stride is not 1, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {1, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 2, 1, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Strides other than 1 are not supported with " - "this version of TRT, at my_strided_slice"); - } -#endif - { - // Size of sliced dim is negative, should fail. 
- Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 2, 0}); - AddTestWeights("end", {4}, {1, 1, 0, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "\"size\" cannot be negative or zero for " - "StridedSlice, at my_strided_slice"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"begin\" for StridedSlice must be a constant"); } struct TestParams { @@ -3090,6 +4796,9 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { int shrink_axis_mask; std::vector expected_output_dims; std::vector expected_output; + Status conversion_status; + Status runtime_status; + std::vector partial_input_dims; }; auto get_mask = [](const std::vector& mask) { @@ -3101,634 +4810,956 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { }; // Same input is used for all tests. - const std::vector ok_input = {1, 2, 3, 4, 5, 6}; - -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - const int kStridedSliceOKCases = 31; -#else - const int kStridedSliceOKCases = 27; -#endif - // Ok. - TestParams ok_params[kStridedSliceOKCases] = { - // 2D Crop. - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 0, 1, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1, 1}, - /*end=*/{0, 0, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{5, 6}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1, 1}, - /*end=*/{0, 1, 2, 3}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{5, 6}, - }, - // 2D Crop, with transpose. 
- TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 1, 2, 1}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 1, 1, 0}, - /*end=*/{0, 2, 3, 1}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{5, 6}, - }, - TestParams{ - /*input_dims=*/{2, 1, 3}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 1, 1, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{2, 1, 3}, - /*begin=*/{0, 1, 0, 1}, - /*end=*/{0, 2, 1, 3}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{5, 6}, - }, - // 2D Crop, with reshape. - TestParams{ - /*input_dims=*/{2, 3}, - /*begin=*/{0, 0, 0}, - /*end=*/{0, 1, 2}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{2, 3}, - /*begin=*/{0, 1, 1}, - /*end=*/{0, 0, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2}, - /*expected_output=*/{5, 6}, - }, - // 1D Crop. - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 0, 0, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 2}, - /*expected_output=*/{1, 2, 4, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1, 0}, - /*end=*/{0, 0, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 3}, - /*expected_output=*/{4, 5, 6}, - }, - // 1D Crop, with transpose. - TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 1, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 3, 1}, - /*expected_output=*/{1, 2, 3}, - }, - TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 1, 0, 0}, - /*end=*/{0, 0, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 3, 1}, - /*expected_output=*/{4, 5, 6}, - }, - // 1D Crop, with reshape. 
- TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 3}, - /*strides=*/{1, 1}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{1, 2, 3}, - }, - TestParams{ - /*input_dims=*/{1, 6}, - /*begin=*/{0, 0, 2}, - /*end=*/{0, 0, 5}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 3}, - /*expected_output=*/{3, 4, 5}, - }, - TestParams{ - /*input_dims=*/{6, 1}, - /*begin=*/{0, 2, 0}, - /*end=*/{0, 5, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3, 1}, - /*expected_output=*/{3, 4, 5}, - }, - // Negative axis. - TestParams{ - /*input_dims=*/{6, 1}, - /*begin=*/{0, -6, 0}, - /*end=*/{0, -3, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3, 1}, - /*expected_output=*/{1, 2, 3}, - }, - TestParams{ - /*input_dims=*/{6, 1}, - /*begin=*/{0, 0, 0}, - /*end=*/{0, -1, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{5, 1}, - /*expected_output=*/{1, 2, 3, 4, 5}, - }, - // Clamp out of bounds begin and end. - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, -9999, -9}, - /*end=*/{0, 1, 1000, 4}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/{1, 2, 3, 4, 5, 6}, - }, -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - // Strides - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 5}, - /*strides=*/{1, 2}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{1, 3, 5}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 6}, - /*strides=*/{1, 2}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{1, 3, 5}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 1}, - /*end=*/{0, 6}, - /*strides=*/{1, 2}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{2, 4, 6}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 2}, - /*end=*/{0, 6}, - /*strides=*/{1, 3}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{2}, - /*expected_output=*/{3, 6}, - }, -#endif - // ellipsis_mask - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 1}, - /*end=*/{0, 2}, - /*strides=*/{1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - 
/*ellipsis_mask=*/get_mask({1, 0, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1}, - /*end=*/{0, 0, 2}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({1, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{0, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{1, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 0, 1}, - /*end=*/{0, 1, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - // shrink_axis_mask - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{0, 0, 0, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({1, 1, 1, 0}), - /*end_mask=*/get_mask({1, 1, 1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/get_mask({0, 0, 0, 1}), - /*expected_output_dims=*/{1, 2}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{0, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({1, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/get_mask({0, 1, 0, 1}), - /*expected_output_dims=*/{2}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 1}, - /*strides=*/{1, 1}, - /*begin_mask=*/get_mask({1, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/get_mask({0, 1}), - /*expected_output_dims=*/{}, - /*expected_output=*/{1}, - }, + const std::vector ok_input = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + Status modified_batch_dim_status = + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? errors::Unimplemented( + "TensorRT does not allow modifications to " + "the batch dimension") + : Status::OK(); + std::vector params = { + // Modify batch dim, should fail in implicit batch mode. + TestParams{/*input_dims=*/{2, 1, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{1, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + /*conversion_status=*/modified_batch_dim_status, + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{}}, + // Unknown batch size without end_mask. 
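+ // partial_input_dims = {-1, 1, 1, 3} leaves the batch size unknown while
+ // begin/end still pin the batch dimension, so implicit batch mode rejects
+ // the conversion (modified_batch_dim_status).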
+ TestParams{ + /*input_dims=*/{2, 1, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{1, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + modified_batch_dim_status, + Status::OK(), + /*partial_input_dims=*/{-1, 1, 1, 3}, + }, + // Test Case 2: Unknown batch size with end_mask. + TestParams{ + /*input_dims=*/{2, 1, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({1, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{2, 1, 1, 2}, + /*expected_output=*/{1, 2, 4, 5}, + Status::OK(), + Status::OK(), + /*partial_input_dims=*/{-1, 1, 1, 3}, + }, + // Invalid parameters: end[2] < begin[2] + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 2, 0}, + /*end=*/{1, 1, 0, 3}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/0, + /*end_mask=*/0, + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + errors::InvalidArgument("\"size\" cannot be negative for " + "StridedSlice"), + Status::OK(), + /*partial_input_dims=*/{}}, + // Slice on the last two dimensions. All dimensions are static. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + }, + // Slice on the last two dimensions. The slice is fully + // specified for the dynamic dimensions. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + Status::OK(), + Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}, + }, + // End mask is provided on all dimensions. This should override the fact + // that the end value is 0. For dynamic shape, it tests + // that we can infer tensor size when "end mask" is provided. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 1}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{5, 6}, + Status::OK(), + Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}, + }, + // End mask is provided for the batch dimension to overwrite the end value + // 0 for that dimension. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 1}, + /*end=*/{0, 1, 2, 3}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{5, 6}, + }, + // Test slice on two dimensions with negative stride, without end_mask set + // on crop dimensions. 
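+ // With strides {1, 1, -1, -1}, begin {0, 0, 1, 2} and exclusive end
+ // {0, 0, 0, 0}, dim 2 yields index {1} and dim 3 yields {2, 1}, so the
+ // expected output is {6, 5}.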
+ TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 2}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, -1, -1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{6, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}}, + // Test slice on two dimensions with negative stride, with end_mask set on + // crop dimensions. In dynamic shape mode, this tests the runtime size + // computation. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 1}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, -1, -1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{5, 4, 2, 1}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}}, + // Test slice on two dimensions with negative stride, with begin_mask set + // on the crop dimensions. In dynamic shape mode, this tests the runtime + // size computation. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, -1, -1}, + /*begin_mask=*/get_mask({0, 0, 1, 1}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{6, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}}, + // Test the reversal of all non-batch dimensions by providing the begin + // masks, end masks, and -1 as strides. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, -1, -1, -1}, + /*begin_mask=*/get_mask({1, 1, 1, 1}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{6, 5, 4, 3, 2, 1}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, -1, -1, -1}}, + // Slice on dimensions 1 and 2. + TestParams{ + /*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 2, 1}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{1, 2}, + }, + // Slice on dimensions 1 and 2. + TestParams{ + /*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 1, 1, 0}, + /*end=*/{0, 2, 3, 1}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{5, 6}, + }, + // Slice on dimensions 1 and 3. + TestParams{ + /*input_dims=*/{1, 2, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + }, + // Slice on dimensions 1 and 3 with non-zero slice start. 
+ TestParams{ + /*input_dims=*/{1, 2, 1, 3}, + /*begin=*/{0, 1, 0, 1}, + /*end=*/{0, 2, 1, 3}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{5, 6}, + }, + // Slice on 3D tensor. + TestParams{ + /*input_dims=*/{1, 2, 3}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 1, 2}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2}, + /*expected_output=*/{1, 2}, + }, + // Slice on 3D tensor using end_mask. For dynamic shape, all + // dimensions are dynamic. + TestParams{/*input_dims=*/{1, 2, 3}, + /*begin=*/{0, 1, 1}, + /*end=*/{0, 0, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2}, + /*expected_output=*/{5, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1}}, + // Slice on 3D tensor using end_mask. For dynamic shape, all + // dimensions are dynamic. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 0, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{1, 2, 4, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 3}, + /*expected_output=*/{4, 5, 6}, + }, + // 1D simple slice. + TestParams{/*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 3, 1}, + /*expected_output=*/{1, 2, 3}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + TestParams{ + /*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 1, 0, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 3, 1}, + /*expected_output=*/{4, 5, 6}, + }, + // Simple 1D slice on 2D input. 
+ TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0}, + /*end=*/{0, 3}, + /*strides=*/{1, 1}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{1, 2, 3}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{ + /*input_dims=*/{1, 1, 6}, + /*begin=*/{0, 0, 2}, + /*end=*/{0, 0, 5}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 3}, + /*expected_output=*/{3, 4, 5}, + }, + TestParams{ + /*input_dims=*/{1, 6, 1}, + /*begin=*/{0, 2, 0}, + /*end=*/{0, 5, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3, 1}, + /*expected_output=*/{3, 4, 5}, + }, + // Negative axis. + TestParams{ + /*input_dims=*/{1, 6, 1}, + /*begin=*/{0, -6, 0}, + /*end=*/{0, -3, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3, 1}, + /*expected_output=*/{1, 2, 3}, + }, + TestParams{ + /*input_dims=*/{1, 6, 1}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, -1, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 5, 1}, + /*expected_output=*/{1, 2, 3, 4, 5}, + }, + // Clamp out of bounds begin and end. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, -9999, -9}, + /*end=*/{0, 1, 1000, 4}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + }, + // Stride values >= 2. 
+ TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0}, + /*end=*/{0, 5}, + /*strides=*/{1, 2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{1, 3, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0}, + /*end=*/{0, 6}, + /*strides=*/{1, 2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{1, 3, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 1}, + /*end=*/{0, 6}, + /*strides=*/{1, 2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{2, 4, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 2}, + /*end=*/{0, 6}, + /*strides=*/{1, 3}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 2}, + /*expected_output=*/{3, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + // Stride values <= -2. + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 5}, + /*end=*/{0, 0}, + /*strides=*/{1, -2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{6, 4, 2}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 5}, + /*end=*/{0, 0}, + /*strides=*/{1, -2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{6, 4, 2}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 5}, + /*end=*/{0, 1}, + /*strides=*/{1, -3}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 2}, + /*expected_output=*/{6, 3}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + // Ellipsis_mask causes leading dimensions to be ignored. Begin, end, + // stride, and mask values of size 2 should be interpreted as applying to + // the last 2 dimensions, while the ellipsis applies to the first 2 (for a + // 4D input tensor). 
+ TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 1}, + /*end=*/{0, 2}, + /*strides=*/{1, 1}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({0, 0}), + /*ellipsis_mask=*/get_mask({1, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Ellipsis_mask on single inner dimension. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1}, + /*end=*/{0, 0, 2}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({1, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + }, + // Ellipsis_mask on single leading dimension. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 1}, + /*end=*/{0, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Ellipsis_mask on single inner dimension overrides that dimensions' + // begin/end values. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 1, 0, 1}, + /*end=*/{1, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Ellipsis mask on single leading dimension should throw out extra + // leading values of begin/end vectors so that only the last N-1 values of + // each remain. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0, 1}, + /*end=*/{0, 1, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Shrink-axis mask set for the final dimension of final size 1 should + // remove that dimension from the final shape. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 1}, + /*end=*/{0, 0, 0, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({1, 1, 1, 0}), + /*end_mask=*/get_mask({1, 1, 1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/get_mask({0, 0, 0, 1}), + /*expected_output_dims=*/{1, 1, 2}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, 2, -1}}, + // Shrink-axis mask set for multiple dimensions that have a final size of + // 1 should remove those dimensions from the final shape. 
+ TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 1}, + /*end=*/{0, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({1, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/get_mask({0, 1, 0, 1}), + /*expected_output_dims=*/{1, 2}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, 2, -1}}, + // Shrink-axis mask set for multiple sequential dimensions of final size 1 + // should + // remove those dimensions from the final shape. + TestParams{/*input_dims=*/{6, 1, 1}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({1, 1, 1}), + /*end_mask=*/get_mask({1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/get_mask({0, 1, 1}), + /*expected_output_dims=*/{6}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1}}, + // The new_axis_mask parameter is not supported. + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/ + get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/get_mask({1, 0, 0}), + /*shrink_axis_mask=*/get_mask({0, 0, 0}), + /*expected_output_dims=*/{1, 1, 6}, + /*expected_output=*/{1, 1, 6}, + /*conversion_status=*/ + errors::Unimplemented( + "new_axis_mask is not supported for StridedSlice"), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 6}}, + // Test all axes dynamic inputs with shrink_axis_mask + TestParams{/*input_dims=*/{1, 3, 2}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 3}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/1, + /*expected_output_dims=*/{3, 2}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*conversion_status=*/modified_batch_dim_status, Status::OK(), + /*partial_input_dims=*/{-1, -1, -1}}, + // Test dynamic input with shrink_axis_mask along axis=0 + TestParams{/*input_dims=*/{2, 3, 2}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 3}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/1, + /*expected_output_dims=*/{3, 2}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*conversion_status=*/modified_batch_dim_status, Status::OK(), + /*partial_input_dims=*/{-1, -1, 2}}, + // Test dynamic input sizes with multiple axes shrinking + TestParams{/*input_dims=*/{2, 3, 2}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 3}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/3, + /*expected_output_dims=*/{2}, + /*expected_output=*/{1, 2}, + /*conversion_status=*/modified_batch_dim_status, Status::OK(), + /*partial_input_dims=*/{-1, -1, 2}}, }; - for (int i = 0; i < kStridedSliceOKCases; i++) { + int i = 0; + for (auto p : params) { Reset(); NodeDef node_def = get_strided_slice_nodedef( - ok_params[i].begin_mask, ok_params[i].end_mask, - ok_params[i].ellipsis_mask, ok_params[i].new_axis_mask, - ok_params[i].shrink_axis_mask); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("begin", - {static_cast(ok_params[i].begin.size())}, - ok_params[i].begin); 
- AddTestWeights("end", {static_cast(ok_params[i].end.size())}, - ok_params[i].end); - AddTestWeights("strides", - {static_cast(ok_params[i].strides.size())}, - ok_params[i].strides); - RunValidationAndConversion(node_def); + tf_type_, p.begin_mask, p.end_mask, p.ellipsis_mask, p.new_axis_mask, + p.shrink_axis_mask); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_strided_slice", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + VLOG(2) << "Preparing test case " << i++ << " with dims " + << DebugString(p.input_dims); + + switch (trt_mode_) { + case TrtTestMode::kImplicitBatch: { + AddTestTensor("input", p.input_dims, ok_input); + break; + } + case TrtTestMode::kExplicitBatch: { + AddTestTensor("input", p.input_dims, ok_input); + break; + } + case TrtTestMode::kDynamicShape: { + if (p.partial_input_dims.size() > 0) { + AddTestTensor("input", p.input_dims, tf_type_, ok_input, + p.partial_input_dims); + } else { + AddTestTensor("input", p.input_dims, tf_type_, ok_input, + p.input_dims); + } + break; + } + } - const DataVec input_data{{"input", test::AsTensor(ok_input)}}; - DataVec output_data{ - {"my_strided_slice", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + VLOG(2) << "Adding weights begin: " << DebugString(p.begin) + << ", end: " << DebugString(p.end) + << ", strides: " << DebugString(p.strides); + AddTestWeights("begin", {static_cast(p.begin.size())}, p.begin); + AddTestWeights("end", {static_cast(p.end.size())}, p.end); + AddTestWeights("strides", {static_cast(p.strides.size())}, + p.strides); + + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); } } -TEST_F(OpConverterTest, ConvertSlice) { +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertSlice) { // Get nodedef for Slice layer. - auto get_slice_nodedef = []() -> NodeDef { + auto get_slice_nodedef = [](DataType tf_type) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32); auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32); auto slice = ops::Slice(s.WithOpName("my_slice"), input, begin, size); return slice.operation.node()->def(); }; - { - // Begin is below bounds, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, -1, 0}); - AddTestWeights("size", {4}, {1, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" for dimension 2 in Slice is out of range, at my_slice"); - } - { - // Begin is above bounds, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 3, 0}); - AddTestWeights("size", {4}, {1, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" for dimension 2 in Slice is out of range, at my_slice"); - } - { - // Size is below bounds, should fail. 
- Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {1, 1, 2, -2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" + \"size\" for dimension 3 in Slice is out of range, at " - "my_slice"); - } - { - // Size is above bounds, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {1, 1, 3, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" + \"size\" for dimension 2 in Slice is out of range, at " - "my_slice"); - } - { - // Modify batch dim, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {0, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_slice"); - } - { - // Dynamic batch size with size[0] not -1, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {1, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_slice"); - } - { - // Dynamic batch size but using size[0] of -1, ok. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {-1, 1, 2, 2}); - RunValidationAndConversion(node_def); - } - struct TestParams { std::vector input_dims; + std::vector + partial_input_dims; // Symbolic shape in dynamic shape mode. std::vector begin; std::vector size; std::vector expected_output_dims; std::vector expected_output; + Status conversion_status; + Status runtime_status; }; - // Ok. - const int kSliceOKCases = 5; - TestParams ok_params[kSliceOKCases] = { - TestParams{{1, 2, 3}, + std::vector params = { + // Slice start points must always be >= 0. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, -1, 0}, + /*size=*/{1, 1, 2, 3}, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + /*conversion_status=*/ + errors::InvalidArgument("\"begin\" in Slice " + "is out of range")}, + // In implicit batch mode, slicing the batch dimension is not allowed. + TestParams{/*input_dims=*/{2, 1, 1, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, 1, 1, 3}, + /*expected_output_dims=*/{1, 1, 1, 3}, + /*expected_output=*/{1, 2, 3}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "TensorRT does not allow modifications to the batch " + "dimension in implicit batch mode") + : Status::OK()}, + // Dynamic batch size but using size[0] of -1, ok. 
+ TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + {0, 0, 0, 0}, + {-1, 1, 2, 2}, + {1, 1, 2, 2}, + {1, 2, 4, 5}, + Status::OK()}, + TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, {0, 0, 0, 0}, {-1, -1, -1, -1}, - {1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + Status::OK()}, + TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + {0, 0, 0, 0}, + {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}}, + TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, -1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{1, 2, 4, 5}, + Status::OK()}, + TestParams{/*input_dims=*/{1, 6}, + /*partial_input_dims=*/{-1, -1}, + /*being=*/{0, 1}, + /*size=*/{1, 5}, + /*expected_output_dims=*/{1, 5}, + /*expected_output=*/{2, 3, 4, 5, 6}}, + TestParams{/*input_dims=*/{1, 6}, + /*partial_input_dims=*/{-1, -1}, + /*begin=*/{0, 1}, + /*size=*/{-1, 3}, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{2, 3, 4}, Status::OK()}, + // In dynamic shape mode we do not know the input shape during + // conversion, therfore we cannot check out of bound access. TestParams{ - {1, 2, 3}, {0, 0, 0, 0}, {1, 1, 2, 3}, {1, 2, 3}, {1, 2, 3, 4, 5, 6}}, + {1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 3, 0}, + /*end=*/{1, 1, 2, 3}, + {}, + {}, + trt_mode_ == TrtTestMode::kDynamicShape + ? Status::OK() + : errors::InvalidArgument("\"begin\" + \"size\" for dimension " + "2 in Slice is out of range"), + errors::Internal("Internal: Failed to build TensorRT engine")}, + // The slice operation should expect that the "size[i]" values are not + // less than -1. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, 1, 2, -2}, + {}, + {}, + errors::InvalidArgument("\"size\" in Slice is out of range")}, TestParams{ - {1, 2, 3}, {0, 0, 0, 0}, {1, -1, 2, 2}, {1, 2, 2}, {1, 2, 4, 5}}, - TestParams{{6}, {0, 1}, {1, 5}, {5}, {2, 3, 4, 5, 6}}, - TestParams{{6}, {0, 1}, {-1, 3}, {3}, {2, 3, 4}}, + /*input_dims=*/{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, 1, 3, 2}, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kDynamicShape + ? 
Status::OK() + : errors::InvalidArgument("\"begin\" + \"size\" for dimension " + "2 in Slice is out of range"), + errors::Internal("Internal: Failed to build TensorRT engine")}, }; - for (int i = 0; i < kSliceOKCases; i++) { + logger_.unsuppressAllLoggerMsgs(); + int i = 0; + for (auto p : params) { Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("begin", - {static_cast(ok_params[i].begin.size())}, - ok_params[i].begin); - AddTestWeights("size", {static_cast(ok_params[i].size.size())}, - ok_params[i].size); - RunValidationAndConversion(node_def); + NodeDef node_def = get_slice_nodedef(tf_type_); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_slice", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + VLOG(2) << "Preparing test case " << i++ << " with dims " + << DebugString(p.input_dims); - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_slice", ConstructTensor( - ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + // The input tensor always has size 6. + std::vector input_vals = {1, 2, 3, 4, 5, 6}; + + switch (trt_mode_) { + case TrtTestMode::kImplicitBatch: { + AddTestTensor("input", p.input_dims, input_vals); + break; + } + case TrtTestMode::kExplicitBatch: { + AddTestTensor("input", p.input_dims, input_vals); + break; + } + case TrtTestMode::kDynamicShape: { + if (p.partial_input_dims.size() > 0) { + AddTestTensor("input", p.input_dims, tf_type_, input_vals, + p.partial_input_dims); + + } else { + AddTestTensor("input", p.input_dims, tf_type_, input_vals, + p.input_dims); + } + break; + } + } + + AddTestWeights("begin", {static_cast(p.begin.size())}, p.begin); + AddTestWeights("size", {static_cast(p.size.size())}, p.size); + + const bool flag = + trt_mode_ == TrtTestMode::kDynamicShape && (i == 9 || i == 11); + if (flag) logger_.suppressLoggerMsgs(nvinfer1::ILogger::Severity::kERROR); + + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); + if (flag) logger_.unsuppressLoggerMsgs(nvinfer1::ILogger::Severity::kERROR); } } -TEST_F(OpConverterTest, ConvertConv2D) { +TEST_P(OpConverter_FP32_Test, ConvertConv2D) { // Get nodedef for Conv2D layer. 
+ DataType tf_type = tf_type_; auto get_conv2d_nodedef = - [](std::vector strides = {1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCHW", std::vector dilations = {1, 1, 1, 1}, - bool is_conv2d_backprop_input = false) -> NodeDef { + [tf_type](std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); - if (is_conv2d_backprop_input) { - auto input_sizes = - ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); - ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs() - .DataFormat(data_format) - .Dilations(dilations); - auto conv2d = - ops::Conv2DBackpropInput(s.WithOpName("my_conv2d"), input_sizes, - filter, input, strides, padding, attrs); - return conv2d.operation.node()->def(); - } else { - ops::Conv2D::Attrs attrs = - ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); - auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, - strides, padding, attrs); - return conv2d.operation.node()->def(); - } + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); + ops::Conv2D::Attrs attrs = + ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); + auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, + padding, attrs); + return conv2d.operation.node()->def(); }; { @@ -3738,96 +5769,93 @@ TEST_F(OpConverterTest, ConvertConv2D) { AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Conv2D must be a tensor, at my_conv2d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for Conv2D must be a tensor"); } { // Filter is tensor, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 1, 2, 1}); AddTestTensor("weights", {3, 3, 1, 1}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"filter\" for Conv2D must be a constant, at my_conv2d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"filter\" for Conv2D must be a constant"); } { // Filter is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Conv2D expects kernel of dimension 4, at my_conv2d"); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Conv2D expects kernel of dimension 4"); } { // Dilations is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution dilations field must specify 4 dimensions, at my_conv2d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution dilations field must specify 4 dimensions"); } { // Dilation value is not 1 for channel, should fail. 
Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv2d"); + "dimensions"); } { // Dilation value is not 1 for channel (NHWC), should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2}); - AddTestTensor("input", {2, 3, 1}); + AddTestTensor("input", {1, 2, 3, 1}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv2d"); - } - { - // Dilation + Conv2DBackpropInput, should fail. - Reset(); - NodeDef node_def = - get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 2, 1}, true); - AddTestTensor("input", {2, 3, 1}); - AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Dilation with Conv2DBackpropInput " - "(conv2d_transpose) is not supported, " - "at my_conv2d"); + "dimensions"); } { // Strides is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution strides field must specify 4 dimensions, at my_conv2d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution strides field must specify 4 dimensions"); } { // Stride value is not 1 for channel, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Stride must be 1 for batch and channel dimensions, at my_conv2d"); + node_def, absl::StatusCode::kUnimplemented, + "Stride must be 1 for batch and channel dimensions"); + } + if (trt_mode_ == TrtTestMode::kDynamicShape) { + Reset(); + NodeDef node_def = get_conv2d_nodedef(); + // Channel dim unknown, should fail. + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(tf_type_, &trt_type)); + AddTestTensorWithTFDims("input", {-1, -1, -1, -1}, trt_type); + AddTestWeights("weights", {1, 2, 1, 1}, {-1, 1}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Channel dimension must be static"); } struct TestParams { @@ -3839,15 +5867,14 @@ TEST_F(OpConverterTest, ConvertConv2D) { string padding; string data_format; std::vector dilations; - bool is_conv2d_backprop_input; std::vector expected_output_dims; std::vector expected_output; }; // Ok. 
- std::vector ok_params{ + std::vector ok_params = { // Basic - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3855,11 +5882,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, // SAME padding (Asymmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3867,11 +5893,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 1, -2, 0, 1, -4}}, // SAME padding (Symmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 3, 1, 1}, /*filter=*/{-1, 0, 1}, @@ -3879,11 +5904,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, // NHWC - TestParams{/*input_dims=*/{2, 3, 1}, + TestParams{/*input_dims=*/{1, 2, 3, 1}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3891,11 +5915,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{2, 2, 1}, + /*expected_output_dims=*/{1, 2, 2, 1}, /*expected_output=*/{1, 1, 0, 1}}, // Dilated - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3903,11 +5926,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 2}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 1}, + /*expected_output_dims=*/{1, 1, 2, 1}, /*expected_output=*/{2, 1}}, // Strided - TestParams{/*input_dims=*/{1, 2, 4}, + TestParams{/*input_dims=*/{1, 1, 2, 4}, /*input=*/{0, 1, 2, 2, 3, 4, 4, 7}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3915,11 +5937,74 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, + }; + + for (int i = 0; i < ok_params.size(); i++) { + Reset(); + NodeDef node_def = + get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, + ok_params[i].data_format, ok_params[i].dilations); + std::vector partial_input_shape; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // The channel dim cannot have unknown size, fix that. + partial_input_shape.resize(ok_params[i].input_dims.size(), -1); + int channel_id = (ok_params[i].data_format == "NCHW") ? 
1 : 3; + partial_input_shape[channel_id] = ok_params[i].input_dims[channel_id]; + } + + AddTestTensor("input", ok_params[i].input_dims, tf_type_, + ok_params[i].input, partial_input_shape); + AddTestWeights("weights", ok_params[i].filter_dims, + ok_params[i].filter); + + TestOpConverter(node_def, ok_params[i].expected_output_dims, Status::OK(), + Status::OK(), + ElementsAreArray(ok_params[i].expected_output)); + } +} + +TEST_P(OpConverter_FP32_Test, ConvertConv2DBackpropInput) { + // Get nodedef for Conv2D layer. + auto get_conv2d_backprop_input_nodedef = + [](DataType tf_type, std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); + auto input_sizes = ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); + ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs() + .DataFormat(data_format) + .Dilations(dilations); + auto conv2d = ops::Conv2DBackpropInput( + s.WithOpName("my_conv2d_backprop_input"), input_sizes, filter, input, + strides, padding, attrs); + return conv2d.operation.node()->def(); + }; + + struct TestParams { + std::vector input_dims; + std::vector input; + std::vector filter_dims; + std::vector filter; + std::vector strides; + string padding; + string data_format; + std::vector dilations; + std::vector expected_output_dims; + std::vector expected_output; + Status conversion_status; + // For dynamic shape mode, we must use the partial_input_dims for + // creating the test tensor if any of the input_dims are -1. + std::vector partial_input_dims; + }; + + // Ok. + std::vector params = { // Transpose Strided - TestParams{/*input_dims=*/{1, 2, 2}, + TestParams{/*input_dims=*/{1, 1, 2, 2}, /*input=*/{0, 1, 2, 3}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3927,11 +6012,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, - /*expected_output_dims=*/{1, 2, 4}, + /*expected_output_dims=*/{1, 1, 2, 4}, /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}}, // Transpose Strided NHWC - TestParams{/*input_dims=*/{2, 2, 1}, + TestParams{/*input_dims=*/{1, 2, 2, 1}, /*input=*/{0, 1, 2, 3}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3939,11 +6023,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, - /*expected_output_dims=*/{2, 4, 1}, + /*expected_output_dims=*/{1, 2, 4, 1}, /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}}, // Transpose Strided NHWC with VALID padding - TestParams{/*input_dims=*/{3, 1, 1}, + TestParams{/*input_dims=*/{1, 3, 1, 1}, /*input=*/{0, 1, 2}, /*filter_dims=*/{2, 1, 1, 1}, /*filter=*/{-1, 1}, @@ -3951,438 +6034,965 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, - /*expected_output_dims=*/{7, 1, 1}, + /*expected_output_dims=*/{1, 7, 1, 1}, /*expected_output=*/{0, 0, -1, 1, -2, 2, 0}}, - + TestParams{/*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"EXPLICIT", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 4}, + 
/*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}, + errors::Unimplemented("EXPLICIT padding type not " + "implemented, only VALID and SAME are" + " supported")}, + // Dilation + Conv2DBackpropInput, should fail. + TestParams{/*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 2}, + {1, 1, 2, 2}, + {}, + errors::Unimplemented("Dilation with Conv2DBackpropInput " + "(conv2d_transpose) is not supported")}, }; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + params.push_back( + TestParams{/*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 4}, + /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}, + errors::InvalidArgument("Channel dimension must be static"), + /*partial input dims=*/{1, -1, 2, 2}}); + // Test dynamic batch dimension. + params.push_back( + TestParams{/*input_dims=*/{2, 1, 2, 2}, + /*input=*/ + // clang-format off + {0, 1, 2, 3, + 3, 2, 1, 0}, + // clang-format on + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{2, 1, 2, 4}, + /*expected_output=*/ + // clang-format off + { 0, 0, -1, 1, -2, 2, -3, 3, + -3, 3, -2, 2, -1, 1, 0, 0}, + // clang-format on + /*conversion_status=*/Status::OK(), + /*partial input dims=*/{-1, 1, 2, 2}}); + + // Test dynamic height and width. + params.push_back(TestParams{ + /*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 4}, + /*expected_output=*/ + {0, 0, -1, 1, -2, 2, -3, 3}, + /*conversion_status=*/ + errors::Unimplemented( + "Conv2dBackpropInput does not support input with unknown spatial " + "shape"), + /*partial input dims=*/{1, 1, -1, -1}}); + } + for (auto p : params) { + for (int input_sizes_length : {2, 4}) { + Reset(); + NodeDef node_def = get_conv2d_backprop_input_nodedef( + tf_type_, p.strides, p.padding, p.data_format, p.dilations); - for (int i = 0; i < ok_params.size(); i++) { - Reset(); - NodeDef node_def = get_conv2d_nodedef( - ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, - ok_params[i].dilations, ok_params[i].is_conv2d_backprop_input); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", ok_params[i].filter_dims, - ok_params[i].filter); + switch (trt_mode_) { + case TrtTestMode::kImplicitBatch: { + AddTestTensor("input", p.input_dims, p.input); + break; + } + case TrtTestMode::kExplicitBatch: { + AddTestTensor("input", p.input_dims, p.input); + break; + } + case TrtTestMode::kDynamicShape: { + AddTestTensor("input", p.input_dims, tf_type_, p.input, + p.partial_input_dims.size() > 0 ? p.partial_input_dims + : p.input_dims); + break; + } + default: { ASSERT_TRUE(false) << "unknown test mode"; } + } + AddTestWeights("weights", p.filter_dims, p.filter, tf_type_); - if (ok_params[i].is_conv2d_backprop_input) { - std::vector tf_input_sizes = ok_params[i].expected_output_dims; - tf_input_sizes.insert(tf_input_sizes.begin(), 1); // Add batch dimension. 
- QCHECK_EQ(4, tf_input_sizes.size()); - AddTestWeights("input_sizes", {4}, tf_input_sizes); + if (input_sizes_length == 4) { + AddTestWeights("input_sizes", {4}, p.expected_output_dims); + } else { + std::vector tf_input_sizes(2); + // Remove the channel and batch dimensions. + if (p.data_format == "NHWC") { + std::copy(p.expected_output_dims.begin() + 1, + p.expected_output_dims.end() - 1, tf_input_sizes.begin()); + } else { + std::copy(p.expected_output_dims.begin() + 2, + p.expected_output_dims.end(), tf_input_sizes.begin()); + } + QCHECK_EQ(2, tf_input_sizes.size()); + AddTestWeights("input_sizes", {2}, tf_input_sizes); + } + + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + Status::OK(), ElementsAreArray(p.expected_output)); } - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + } +} - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv2d", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); +// Get the NodeDef for Pack. +NodeDef GetConv3DNodeDef(std::vector strides = {1, 1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCDHW", + std::vector dilations = {1, 1, 1, 1, 1}, + bool is_conv3d_backprop_input = false) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); + + if (is_conv3d_backprop_input) { + auto input_sizes = ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); + ops::Conv3DBackpropInputV2::Attrs attrs = + ops::Conv3DBackpropInputV2::Attrs() + .DataFormat(data_format) + .Dilations(dilations); + auto conv3d = + ops::Conv3DBackpropInputV2(s.WithOpName("my_conv3d"), input_sizes, + filter, input, strides, padding, attrs); + return conv3d.operation.node()->def(); + } else { + ops::Conv3D::Attrs attrs = + ops::Conv3D::Attrs().DataFormat(data_format).Dilations(dilations); + auto conv3d = ops::Conv3D(s.WithOpName("my_conv3d"), input, filter, strides, + padding, attrs); + return conv3d.operation.node()->def(); } } -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -TEST_F(OpConverterTest, ConvertConv3D) { - // Get nodedef for Conv3D layer. 
- auto get_conv3d_nodedef = - [](std::vector strides = {1, 1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCDHW", - std::vector dilations = {1, 1, 1, 1, 1}, - bool is_conv3d_backprop_input = false) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); - - if (is_conv3d_backprop_input) { - auto input_sizes = - ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); - ops::Conv3DBackpropInputV2::Attrs attrs = - ops::Conv3DBackpropInputV2::Attrs() - .DataFormat(data_format) - .Dilations(dilations); - auto conv3d = - ops::Conv3DBackpropInputV2(s.WithOpName("my_conv3d"), input_sizes, - filter, input, strides, padding, attrs); - return conv3d.operation.node()->def(); - } else { - ops::Conv3D::Attrs attrs = - ops::Conv3D::Attrs().DataFormat(data_format).Dilations(dilations); - auto conv3d = ops::Conv3D(s.WithOpName("my_conv3d"), input, filter, - strides, padding, attrs); - return conv3d.operation.node()->def(); - } - }; +struct Conv3DTestParams { + std::vector input_dims; + std::vector input; + std::vector filter_dims; + std::vector filter; + std::vector strides; + string padding; + string data_format; + std::vector dilations; + bool is_conv3d_backprop; + std::vector expected_output_dims; + std::vector expected_output; + bool allow_dynamic_channel_dim; + Status validation_status; +}; + +void TestConv3D(ParameterizedOpConverterTestBase* test, Conv3DTestParams& p) { + test->Reset(); + NodeDef node_def = GetConv3DNodeDef(p.strides, p.padding, p.data_format, + p.dilations, p.is_conv3d_backprop); + + std::vector partial_input_shape; + if (!p.allow_dynamic_channel_dim && + test->get_trt_mode() == TrtTestMode::kDynamicShape) { + // The channel dim cannot have unknown size, fix that. + partial_input_shape.resize(p.input_dims.size(), -1); + int channel_id = (p.data_format == "NCDHW") ? 1 : 4; + partial_input_shape[channel_id] = p.input_dims[channel_id]; + } + + test->AddTestTensor("input", p.input_dims, test->get_tf_type(), p.input, + partial_input_shape); + test->AddTestWeights("weights", p.filter_dims, p.filter); + + if (p.is_conv3d_backprop) { + test->AddTestWeights("input_sizes", + {static_cast(p.expected_output.size())}, + p.expected_output); + } + test->TestOpConverter(node_def, p.expected_output_dims, + /*expected_conversion_status=*/p.validation_status, + /*expected_runtime_status=*/Status::OK(), + /*matcher=*/ElementsAreArray(p.expected_output), + /*out_tf_types=*/{test->get_tf_type()}); +} + +TEST_P(OpConverter_FP32_FP16_Test, ConvertConv3D) { { // Input is weights, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef(); + NodeDef node_def = GetConv3DNodeDef(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + AddTestWeights("input", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + AddTestWeights("weights", {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Conv3D must be a tensor, at my_conv3d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for Conv3D must be a tensor"); } { // Filter is tensor, should fail. 
Reset(); - NodeDef node_def = get_conv3d_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("weights", {3, 3, 1, 1, 3, 3, 1, 1}); + NodeDef node_def = GetConv3DNodeDef(); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); + AddTestTensor("weights", {1, 3, 3, 1}, tf_type_, + CreateVectorIota(9)); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"filter\" for Conv3D must be a constant, at my_conv3d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"filter\" for Conv3D must be a constant"); } { // Filter is not 5D, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + NodeDef node_def = GetConv3DNodeDef(); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Conv3D expects kernel of dimension 5, at my_conv3d"); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Conv3D expects kernel of dimension 5"); } { // Dilations is not 5D, should fail. Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1}); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights( "weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); // Dimensions, then values RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution dilations field must specify 5 dimensions, at my_conv3d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution dilations field must specify 5 dimensions"); } { // Dilation value is not 1 for channel, should fail. Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 2, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 2, 1, 1, 1}); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv3d"); + "dimensions"); } { // Dilation value is not 1 for channel (NDHWC), should fail. Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", {1, 1, 1, 1, 2}); - AddTestTensor("input", {2, 3, 1}); + GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NDHWC", {1, 1, 1, 1, 2}); + AddTestTensor("input", {1, 2, 3, 1}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv3d"); + "dimensions"); } { // Dilation + Conv3DBackpropInputV2, should fail. 
Reset(); - NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", - {1, 1, 2, 1, 1}, true); - AddTestTensor("input", {2, 3, 1}); + NodeDef node_def = GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NDHWC", + {1, 1, 2, 1, 1}, true); + AddTestTensor("input", {1, 2, 3, 1}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation with Conv3DBackpropInputV2 " - "(conv3d_transpose) is not supported, " - "at my_conv3d"); + "(conv3d_transpose) is not supported"); } { // Asymmetric+ Conv3DBackpropInputV2, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", - {1, 1, 1, 1, 1}, true); - AddTestTensor("input", {1, 2, 2, 2}); + NodeDef node_def = GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NDHWC", + {1, 1, 1, 1, 1}, true); + AddTestTensor("input", {1, 2, 2, 2}, tf_type_, CreateVectorIota(8)); AddTestWeights("weights", {1, 1, 2, 1, 1}, {1, 1}); AddTestWeights("input_sizes", {8}, {1, 2, 3, 4, 5, 6, 7, 8}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Asymmetric padding with Conv3DBackpropInputV2 " - "(conv3d_transpose) is not supported, at " - "my_conv3d"); + "(conv3d_transpose) is not supported"); } { // Strides is not 5D, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1, 1}, "SAME", "NCDHW", - {1, 1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 2, 2}); + NodeDef node_def = + GetConv3DNodeDef({1, 1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); + AddTestTensor("input", {1, 2, 2, 2}, tf_type_, CreateVectorIota(8)); AddTestWeights("weights", {1, 1, 2, 1, 1}, {1, 1}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution strides field must specify 5 dimensions, at my_conv3d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution strides field must specify 5 dimensions"); } { // Stride value is not 1 for channel, should fail. 
Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 2, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + GetConv3DNodeDef({1, 2, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Stride must be 1 for batch and channel dimensions, at my_conv3d"); + node_def, absl::StatusCode::kUnimplemented, + "Stride must be 1 for batch and channel dimensions"); + } + + // Start here + std::vector ok_params = { + // Basic - just 1x1 conv - input = output + {/*input_dims=*/{1, 1, 3, 3, 3}, // CDHW + /*input=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, 56, 36, 1, 1, 105, + 1, 16, -28, 1, 42, 9, 3, 1, 7, 1, 11, 61, 5}, + /*filter_dims=*/{1, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_output=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, + 56, 36, 1, 1, 105, 1, 16, -28, 1, + 42, 9, 3, 1, 7, 1, 11, 61, 5}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Basic - 2x1 filter + {/*input_dims=*/{1, 1, 3, 3, 3}, // CDHW + /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 3, 3}, + /*expected_output=*/ + {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // SAME padding (Asymmetric) + {/*input_dims=*/{1, 1, 2, 3, 2}, // CDHW + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 3, 2}, + // Diff in first 2 depths is const 6. 
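+       // With filter {-1, 1} along depth: out[d0] = -in[d0] + in[d1] = 6 at every position; out[d1] = -in[d1] + 0 (zero pad), giving -6..-11.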
+ /*expected_output=*/{6, 6, 6, 6, 6, 6, -6, -7, -8, -9, -10, -11}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // SAME padding (Symmetric) + {/*input_dims=*/{1, 1, 2, 3, 2}, // CDHW + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{3, 1, 1, 1, 1}, // DRSCK + /*filter=*/{-1, 0, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 3, 2}, + // Swaps front two depths, negates + /*expected_output=*/{6, 7, 8, 9, 10, 11, 0, -1, -2, -3, -4, -5}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK() + + }, + // NDHWC (multi-channel) + {/*input_dims=*/{1, 2, 3, 2, 2}, // DHWC + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{2, 1, 1, 2, 1}, // DRSCK + /*filter=*/{-1, 1, 1, -1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 3, 2, 1}, + /*expected_output=*/{0, 0, 0, 0, 0, 0}, // Filters oppose each-other + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Dilated + {/*input_dims=*/{1, 1, 3, 3, 3}, // CDHW + /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, -10, -10, -10, -10, -10, + -10, -10, -10, -10, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 2, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 1, 3, 3}, + // Only front depth is valid, skips neg values + /*expected_output=*/{8, 8, 8, 8, 8, 8, 8, 8, 8}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Strided + {/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 2, 2}, + // Should only pick up the corners + /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Transpose Strided + {/*input_dims=*/{1, 1, 2, 2, 2}, // CDHW + /*input=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/true, + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_output=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, // Cube expands and + 0, 0, 0, 0, 0, 0, 0, 0, 0, // fills center + 5, 0, 6, 0, 0, 0, 7, 0, 8}, // with zeroes + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + }; + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + ok_params.reserve(ok_params.size() + 2); + const std::vector common_input = CreateVectorIota(3 * 3 * 3); + // NCDHW - Dynamic Channel - Should fail in kDynamicShape + ok_params.push_back(Conv3DTestParams{ + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + 
/*expected_output_dims=*/{}, // ignore, will fail anyway + /*expected_output=*/{}, // ignore, will fail anyway + /*allow_dynamic_channel_dim=*/true, + /*validation_status=*/ + Status{absl::StatusCode::kInvalidArgument, + "Channel dimension must be static"}}); + // NDHWC - Dynamic Channel - Should fail in kDynamicShape + ok_params.push_back(Conv3DTestParams{ + /*input_dims=*/{1, 3, 3, 3, 1}, + /*input=*/common_input, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 2, 2, 2, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{}, // ignore, will fail anyway + /*expected_output=*/{}, // ignore, will fail anyway + /*allow_dynamic_channel_dim=*/true, + /*validation_status=*/ + Status{absl::StatusCode::kInvalidArgument, + "Channel dimension must be static"}}); + } + + for (auto p : ok_params) { + TestConv3D(this, p); + } +} + +template +NodeDef CreatePoolOp(DataType tf_type, std::vector ksize, + std::vector strides, string padding, + string data_format) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + typename T::Attrs attrs; + attrs.data_format_ = data_format; + return T(s.WithOpName("my_pool"), input, ksize, strides, padding, attrs) + .operation.node() + ->def(); +} +TEST_P(OpConverter_FP32_Test, ConvertPool) { + // Get nodedef for MaxPool and AvgPool layers (2D or 3D). + auto get_pool_nodedef = + [](DataType tf_type, int nDim, std::vector ksize = {}, + std::vector strides = {}, string padding = "SAME", + string data_format = "", const bool is_max_pooling = true) -> NodeDef { + if (ksize.empty()) { + ksize = nDim == 2 ? std::vector{1, 1, 1, 1} + : std::vector{1, 1, 1, 1, 1}; + } + if (strides.empty()) { + strides = nDim == 2 ? std::vector{1, 1, 1, 1} + : std::vector{1, 1, 1, 1, 1}; + } + if (data_format == "") { + data_format = nDim == 2 ? "NCHW" : "NCDHW"; + } + if (is_max_pooling) { + if (nDim == 3) { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } else { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } + } else { + if (nDim == 3) { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } else { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } + } + }; + + std::vector test_nDims{2, 3}; + + for (int nDim : test_nDims) { + // Input is weights, should fail. + Reset(); + NodeDef node_def = get_pool_nodedef(tf_type_, nDim); + + AddTestWeights("input", {1, 1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + StrCat("The input \"input\" for ", node_def.op(), " must be a tensor")); } + struct TestParams { std::vector input_dims; std::vector input; - std::vector filter_dims; - std::vector filter; + std::vector ksize; std::vector strides; string padding; string data_format; - std::vector dilations; - bool is_conv3d_backprop_input; std::vector expected_output_dims; - std::vector expected_output; + // The expected outputs for the following operations: MaxPool2D, AvgPool2D, + // MaxPool3D, AvgPool3D + std::vector> expected_outputs; + Status status; + std::set skip_dims; }; - // Start here - const int kConv3DOKCases = 8; - TestParams ok_params[kConv3DOKCases] = { - // Basic - just 1x1 conv - input = output + // We use common_input as the input to test both 2D and 3D pooling operations, + // to simplify TestParams. For 2D operations, only the first 1/3 of the values + // are used. 
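+  // (i.e. the first nine values -4..88; their max is 88 and mean 130/9 = 14.44, while all 27 values have max 105 and mean 459/27 = 17 - these reappear as the 3x3 pooling expectations below)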
+ const std::vector common_input{-4, 2, 15, 3, 6, -3, 22, 1, 88, + 56, 36, 1, 1, 105, 1, 16, -28, 1, + 42, 9, 3, 1, 7, 1, 11, 61, 5}; + // The output of 2D ops for the case where the op is equivalent to the + // identity op. + const std::vector common_2d_output{-4, 2, 15, 3, 6, -3, 22, 1, 88}; + std::vector test_params = { + // Validation failure - kernel size too large for TRT TestParams{ - /*input_dims=*/{1, 3, 3, 3}, // CDHW - /*input=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, 56, 36, 1, 1, 105, - 1, 16, -28, 1, 42, 9, 3, 1, 7, 1, 11, 61, 5}, - /*filter_dims=*/{1, 1, 1, 1, 1}, // DRSCK - /*filter=*/{1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1000, 1000, 1000}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, - 56, 36, 1, 1, 105, 1, 16, -28, 1, - 42, 9, 3, 1, 7, 1, 11, 61, 5}}, - // Basic - 2x1 filter - TestParams{/*input_dims=*/{1, 3, 3, 3}, // CDHW - /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6}, - /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK - /*filter=*/{1, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3, 3}, - /*expected_output=*/ - {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7}}, - // SAME padding (Asymmetric) + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds")}, + // Validation failure for 3D ops - negative kernel depth TestParams{ - /*input_dims=*/{1, 2, 3, 2}, // CDHW - /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK - /*filter=*/{-1, 1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, -1, 1, 1}, /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"SAME", + /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3, 2}, - /*expected_output=*/ - {6, 6, 6, 6, 6, 6, -6, -7, -8, -9, -10, - -11} // Diff in first 2 depths is const 6 - }, - // SAME padding (Symmetric) + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds"), + /*skip_dims=*/{2}}, + // Validation failure - negative kernel height TestParams{ - /*input_dims=*/{1, 2, 3, 2}, // CDHW - /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - /*filter_dims=*/{3, 1, 1, 1, 1}, // DRSCK - /*filter=*/{-1, 0, 1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, -1, 1}, /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"SAME", + /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3, 2}, - /*expected_output=*/ - {6, 7, 8, 9, 10, 11, 0, -1, -2, -3, -4, - -5} // Swaps front two depths, negates - }, - - // NDHWC (multi-channel) + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + 
Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds")}, + // Validation failure - negative kernel width TestParams{ - /*input_dims=*/{2, 3, 2, 2}, // DHWC - /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - /*filter_dims=*/{2, 1, 1, 2, 1}, // DRSCK - /*filter=*/{-1, 1, 1, -1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, -1}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", - /*data_format=*/"NDHWC", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 3, 2, 1}, - /*expected_output=*/{0, 0, 0, 0, 0, 0} // Each filter opposes the - // other - }, - - // Dilated + /*data_format=*/"NCDHW", + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds")}, + // Basic - just 1x1 max pooling - input = output TestParams{ - /*input_dims=*/{1, 3, 3, 3}, // CDHW - /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, -10, -10, -10, -10, -10, - -10, -10, -10, -10, 7, 7, 7, 7, 7, 7, 7, 7, 7}, - /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK - /*filter=*/{1, 1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, 1}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 2, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 1, 3, 3}, - /*expected_output=*/{8, 8, 8, 8, 8, 8, 8, 8, 8} // Only front depth - // is valid, skips - // neg values - }, - // Strided + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}}, + // Basic - just 1x1 max pooling - input = output, SAME padding TestParams{ - /*input_dims=*/{1, 3, 3, 3}, - /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, - /*filter_dims=*/{1, 1, 1, 1, 1}, - /*filter=*/{1}, - /*strides=*/{1, 1, 2, 2, 2}, - /*padding=*/"VALID", + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 2, 2}, - /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8} // Should only pick up - // the corners - }, - // Transpose Strided - TestParams{/*input_dims=*/{1, 2, 2, 2}, // CDHW - /*input=*/{1, 2, 3, 4, 5, 6, 7, 8}, - /*filter_dims=*/{1, 1, 1, 1, 1}, - /*filter=*/{1}, + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}}, + // 3x3 pooling NCDHW + TestParams{/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 3, 3, 3}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*expected_output_dims=*/{1, 1, 1, 1, 1}, + /*expected_outputs=*/{{88}, {14.444445}, {105}, {17}}}, + // 3x3 pooling, NDHWC + TestParams{/*input_dims=*/{1, 3, 3, 3, 1}, + /*input=*/common_input, + /*ksize=*/{1, 3, 3, 3, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*expected_output_dims=*/{1, 1, 1, 1, 1}, + /*expected_outputs=*/{{88}, {14.444445}, {105}, {17}}}, + // Strided + TestParams{/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 
0, 0, 0, 7, 0, 8}, + /*ksize=*/{1, 1, 1, 1, 1}, /*strides=*/{1, 1, 2, 2, 2}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/true, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/ - {1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}}, // Cube - // expands and - // fills - // center with - // zeroes - + /*expected_output_dims=*/{1, 1, 2, 2, 2}, + /*expected_outputs=*/ + {{1, 2, 3, 4}, // Should only pick up the corners + {1, 2, 3, 4}, + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}}}, }; - for (int i = 0; i < kConv3DOKCases; i++) { - Reset(); - NodeDef node_def = get_conv3d_nodedef( - ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, - ok_params[i].dilations, ok_params[i].is_conv3d_backprop_input); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", ok_params[i].filter_dims, - ok_params[i].filter); - if (ok_params[i].is_conv3d_backprop_input) { - AddTestWeights( - "input_sizes", - {static_cast(ok_params[i].expected_output.size())}, - ok_params[i].expected_output); + for (auto p : test_params) { + int test_counter = 0; + for (int nDim : test_nDims) { + if (p.skip_dims.find(nDim) != p.skip_dims.end()) { + continue; + } + auto input = p.input; + auto input_dims = p.input_dims; + auto ksize = p.ksize; + auto strides = p.strides; + auto expected_output_dims = p.expected_output_dims; + std::string data_format = p.data_format; + if (nDim == 2) { + input.resize(9); + data_format = p.data_format == "NDHWC" ? "NHWC" : "NCHW"; + // Remove one of the spatial dimensions + input_dims.erase(input_dims.begin() + 2); + ksize.erase(ksize.begin() + 2); + strides.erase(strides.begin() + 2); + expected_output_dims.erase(expected_output_dims.begin() + 2); + } + for (bool is_max_pooling : {true, false}) { + Reset(); + NodeDef node = get_pool_nodedef(tf_type_, nDim, ksize, strides, + p.padding, data_format, is_max_pooling); + AddTestTensor("input", input_dims, input); + TestOpConverter(node, expected_output_dims, p.status, Status::OK(), + ElementsAreArray(p.expected_outputs.at(test_counter))); + test_counter++; + } } - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv3d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + } +} - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv3d", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); +TEST_P(OpConverter_FP32_FP16_Test, ConvertTopK) { + // Get the NodeDef for TopKV2. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); + auto topk = ops::TopK(s.WithOpName("my_topk"), input, weights); + const NodeDef& node_def = topk.operation.node()->def(); + { + // K is a tensor, should fail. + Reset(); + AddTestTensor("input", {1, 1, 2, 3}); + AddTestTensor("weights", {1}, DT_INT32, {}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"k\" for TopKV2 must be a constant"); + } + { + // Ok. 
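+    // k=2 on rows {-9, 3, 5, 1, 6} and {-5, 7, 1, 0, -1}: top values {6, 5} and {7, 1} at indices {4, 2} and {1, 2}.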
+ Reset(); + AddTestTensor("input", {1, 1, 2, 5}, {-9, 3, 5, 1, 6, -5, 7, 1, 0, -1}); + AddTestWeights("weights", {1}, {2}); + std::vector> expected_output_dims{{1, 1, 2, 2}, + {1, 1, 2, 2}}; + TestOpConverterMultiOut(node_def, expected_output_dims, Status::OK(), + Status::OK(), + {ElementsAre(6, 5, 7, 1), ElementsAre(4, 2, 1, 2)}, + {tf_type_, DT_INT32}); } } -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) -TEST_F(OpConverterTest, ConvertTopK) { - // TODO(tmorris): This test isn't setting the input dtype properly. TopK with - // int32 is unsupported by TRT. - for (const auto dtype : {DT_FLOAT}) { - // Get the NodeDef for TopKV2. - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); - auto topk = ops::TopK(s.WithOpName("my_topk"), input, weights); - const NodeDef& node_def = topk.operation.node()->def(); - { - // K is a tensor, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - AddTestTensor("weights", {2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"k\" for TopKV2 must be a constant, at my_topk"); - } - { - // Ok. - Reset(); - AddTestTensor("input", {1, 2, 5}); - AddTestWeights("weights", {1}, {2}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights outputs[2]; - TF_EXPECT_OK(GetTensorOrWeights("my_topk", &outputs[0])); - TF_EXPECT_OK(GetTensorOrWeights("my_topk:1", &outputs[1])); - for (auto& output : outputs) { - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 2}, output.tensor()->getDimensions()); - } +struct DataFormatVecPermuteTestParams { + string dst_format; + string src_format; + std::vector x_shape; + std::vector x; + bool x_is_tensor; + std::vector expected_output; + Status conversion_status; +}; + +NodeDef GetDataFormatVecPermuteNodeDef(string dst_format, string src_format, + std::vector& x_shape) { + Scope s = Scope::NewRootScope(); + PartialTensorShape tensor_shape; + auto x = ops::Placeholder(s.WithOpName("x"), DT_INT32); + const auto attrs = ops::DataFormatVecPermute::Attrs() + .DstFormat(dst_format) + .SrcFormat(src_format); + auto dfvp = ops::DataFormatVecPermute(s.WithOpName("my_dfvp"), x, attrs); + return dfvp.operation.node()->def(); +} + +TEST_P(OpConverter_INT32_Test, ConvertDataFormatVecPermute) { + const auto& error = convert_not_supported_implicit( + string("DataFormatVecPermute"), string("my_dfvp")); + const Status implicit_error = Status{absl::StatusCode::kUnimplemented, error}; + const auto conversion_status = + trt_mode_ == TrtTestMode::kImplicitBatch ? implicit_error : Status::OK(); + std::vector test_params = { + // 1D case with tensor. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4}, + /*x=*/{1, 2, 3, 4}, + /*x_is_tensor=*/true, + /*expected_output=*/{1, 4, 2, 3}, + /*conversion_status=*/conversion_status}, + // 1D case with weights. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4}, + /*x=*/{1, 2, 3, 4}, + /*x_is_tensor=*/false, + /*expected_output=*/{1, 4, 2, 3}, + /*conversion_status=*/conversion_status}, + // 2D case with tensor. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/true, + /*expected_output=*/{1, 2, 7, 8, 3, 4, 5, 6}, + /*conversion_status=*/conversion_status}, + // 2D case with weights. 
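+      // (for the {4, 2} inputs each row holds one of the N, H, W, C sizes; NHWC -> NCHW takes rows in order 0, 3, 1, 2, i.e. (1,2),(3,4),(5,6),(7,8) -> (1,2),(7,8),(3,4),(5,6))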
+ DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/false, + /*expected_output=*/{1, 2, 7, 8, 3, 4, 5, 6}, + /*conversion_status=*/conversion_status}, + // Format of size 5. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCDHW", + /*src_format=*/"NDHWC", + /*x_shape=*/{5, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + /*x_is_tensor=*/true, + /*expected_output=*/{1, 2, 9, 10, 3, 4, 5, 6, 7, 8}, + /*conversion_status=*/conversion_status}, + // Input of size 2: treat the elements as spatial dimensions. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCWH", + /*src_format=*/"NHWC", + /*x_shape=*/{2, 2}, + /*x=*/{1, 2, 3, 4}, + /*x_is_tensor=*/true, + /*expected_output=*/{3, 4, 1, 2}, + /*conversion_status=*/conversion_status}, + // Input of size 3: treat the elements as spatial dimensions. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCHWD", + /*src_format=*/"NDHWC", + /*x_shape=*/{3}, + /*x=*/{1, 2, 3}, + /*x_is_tensor=*/true, + /*expected_output=*/{2, 3, 1}, + /*conversion_status=*/conversion_status}, + // Invalid rank, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{2, 2, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "Input must be a vector or matrix, but got rank 3, at " + "my_dfvp"}}, + // Invalid size for 1D input, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{3}, + /*x=*/{1, 2, 3}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "1D input must be of size 2 or 4, but got size 3, at " + "my_dfvp"}}, + // Invalid first dim for 2D input, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCDHW", + /*src_format=*/"NDHWC", + /*x_shape=*/{4, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "First dimension of 2D input must be of size 3 or 5, " + "but got shape (4, 2), at my_dfvp"}}, + // Invalid second dim for 2D input, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4, 3}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? 
implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "Second dimension of 2D input must be of size 2, but " + "got shape (4, 3), at my_dfvp"}}, + }; - const DataVec input_data{ - {"input", test::AsTensor({-9, 3, 5, 1, 6, -5, 7, 1, 0, -1})}}; - DataVec output_data{{"my_topk", ConstructTensor(4)}, - {"my_topk:1", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(6, 5, 7, 1)); - EXPECT_THAT(GetSpanForData(output_data[1]), - ElementsAre(4, 2, 1, 2)); + for (auto p : test_params) { + Reset(); + const NodeDef node_def = + GetDataFormatVecPermuteNodeDef(p.dst_format, p.src_format, p.x_shape); + + if (p.x_is_tensor) { + AddTestTensor("x", p.x_shape, DT_INT32, p.x, p.x_shape); + } else { + AddTestWeights("x", p.x_shape, p.x, DT_INT32); } + + TestOpConverter(node_def, p.x_shape, p.conversion_status, Status::OK(), + ElementsAreArray(p.expected_output)); } } -template -void TestConvertGather(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - +NodeDef CreateGatherOp(DataType tf_type, int batch_dims) { // Get the NodeDef for GatherV2. Scope s = Scope::NewRootScope(); - auto params = ops::Placeholder(s.WithOpName("params"), dtype); + auto params = ops::Placeholder(s.WithOpName("params"), tf_type); auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32); auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); - auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis); + ops::GatherV2::Attrs op_attrs; + op_attrs.batch_dims_ = batch_dims; + auto gather = + ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis, op_attrs); const NodeDef& node_def = gather.operation.node()->def(); + return node_def; +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertGather) { + auto node_def = CreateGatherOp(tf_type_, /*batch_dims*/ 0); + + { + // Axis is a tensor, should fail. + Reset(); + AddTestTensor("params", {1, 1, 2, 3}, tf_type_, {}); + AddTestTensor("indices", {1, 2}, DT_INT32, {}); + AddTestTensor("axis", {1}, DT_INT32, {}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for GatherV2 must be a constant"); + } + { + // Axis is out of bounds, should fail. + Reset(); + AddTestTensor("params", {1, 1, 2, 3}); + AddTestTensor("indices", {1, 2}, DT_INT32, {}); + AddTestWeights("axis", {1}, {4}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in " + "range [-4, 4)"); + } struct TestParams { // TF shape of the input 'params' (including batch dimension). @@ -4391,407 +7001,536 @@ void TestConvertGather(OpConverterTest* test) { std::vector indices_shape; std::vector indices; int axis; + int batch_dims; // Expected TF shape of the output (including batch dimension). std::vector expected_output_shape; std::vector expected_output; bool params_is_tensor; + bool indices_is_tensor; + Status conversion_status; + Status runtime_status; + Status add_index_status; }; // Input is the same {1, 2, 3, 4, 5, 6} for all cases. - const int kGatherOKCases = 11; - const std::vector params_input = {CType(1), CType(2), CType(3), - CType(4), CType(5), CType(6)}; - TestParams ok_params[kGatherOKCases] = { + const std::vector params_input = {1, 2, 3, 4, 5, 6}; + + std::vector test_params = { + // Axis is batch dimension, should fail in implicit batch mode. 
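+      // (in implicit batch mode the batch dimension is not part of the TRT network, so it cannot be gathered over)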
+ TestParams{/*params_shape=*/{2, 1, 1, 3}, + /*indices_shape=*/{2}, + /*indices=*/{1, 0}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 1, 3}, + /*expected_output=*/{4, 5, 6, 1, 2, 3}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow " + "manipulation of the batch dimension"} + : Status::OK()}, + // Batch size of indices is not 1 when params and indices are tensors. + TestParams{/*params_shape=*/{2, 1, 3}, + /*indices_shape=*/{2, 1}, + /*indices=*/{2, 0}, + /*axis=*/2, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 2, 1}, + /*expected_output=*/{3, 1, 6, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "Params and indices must have a" + " batch size of 1 when params and indices are " + "both tensors or both" + " constants."} + : Status::OK()}, + // Batch size of indices is not 1 when params is tensor and indices are + // constant. + TestParams{/*params_shape=*/{2, 1, 3}, + /*indices_shape=*/{2, 1}, + /*indices=*/{2, 0}, + /*axis=*/2, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 2, 1}, + /*expected_output=*/{3, 1, 6, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/false, + /*conversion_status=*/Status::OK()}, + // Axis is not zero when params is a weight, should fail in implicit batch + // mode. + TestParams{/*params_shape=*/{2, 1, 3}, + /*indices_shape=*/{2}, + /*indices=*/{1, 2}, + /*axis=*/2, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 2}, + /*expected_output=*/{2, 3, 5, 6}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "The input axis must be zero when " + "params is a weight."} + : Status::OK()}, + // Params with only batch dimension. + TestParams{ + /*params_shape=*/{6}, + /*indices_shape=*/{2}, + /*indices=*/{1, 3}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{2}, + /*expected_output=*/{2, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow " + "manipulation of the batch dimension"} + : Status::OK(), + /*runtime_status=*/Status::OK(), + /*add_index_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kInvalidArgument, + batch_size_error("indices", + "Provided batch size does not match " + "converter batch size: 2 vs 6")} + : Status::OK()}, // Vector indices, and output rank is rank(params). TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1}, /*indices=*/{0}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1}, /*expected_output=*/{1, 4}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1}, /*indices=*/{1}, /*axis=*/2, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 1, 3}, /*expected_output=*/{4, 5, 6}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, - // Indices with rank>1, and output rank is rank(params)+rank(indices)-1. 
+ // Indices with rank>1, and output rank is rank(params) + rank(indices) - + // 1 TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 1}, /*indices=*/{0}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 1}, /*expected_output=*/{1, 4}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 1}, /*indices=*/{1}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 1}, /*expected_output=*/{2, 5}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 1}, /*indices=*/{2}, /*axis=*/-1, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 1}, /*expected_output=*/{3, 6}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 3}, /*indices=*/{2, 0, 1}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 3}, /*expected_output=*/{3, 1, 2, 6, 4, 5}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 3, 2}, /*indices_shape=*/{1, 2, 2}, /*indices=*/{0, 0, 1, 0}, /*axis=*/2, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 3, 1, 2, 2}, /*expected_output=*/{1, 1, 2, 1, 3, 3, 4, 3, 5, 5, 6, 5}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 2, 3}, /*indices_shape=*/{1}, /*indices=*/{0}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 2, 3}, /*expected_output=*/{1, 2, 3, 4, 5, 6}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{3, 2}, /*indices_shape=*/{1, 2}, /*indices=*/{0, 1}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 2, 2}, /*expected_output=*/{1, 2, 3, 4}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{2, 3}, /*indices_shape=*/{1, 1, 2}, /*indices=*/{0, 1}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, 3, 4, 5, 6}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{3, 2}, /*indices_shape=*/{2, 2}, /*indices=*/{0, 2, 1, 0}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{2, 2, 2}, /*expected_output=*/{1, 2, 5, 6, 3, 4, 1, 2}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, + }, + // Test cases in which indices constant + TestParams{ + /*params_shape=*/{1, 1, 2, 3}, + /*indices_shape=*/{1, 1}, + /*indices=*/{0}, + /*axis=*/3, + /*batch_dims=*/0, + /*expected_output_shape=*/{1, 1, 2, 1, 1}, + /*expected_output=*/{1, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/false, }, + // Test cases in which both input and indices constant + TestParams{/*params_shape=*/{1, 2, 3}, + /*indices_shape=*/{1}, + /*indices=*/{0}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{1, 2, 3}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/false, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? 
Status{absl::StatusCode::kUnimplemented, + "Params and indices must have a" + " batch size of 1 when params and indices are " + "both tensors or both" + " constants."} + : Status::OK()}, + TestParams{/*params_shape=*/{3, 2}, + /*indices_shape=*/{2, 2}, + /*indices=*/{0, 2, 1, 0}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 2, 2}, + /*expected_output=*/{1, 2, 5, 6, 3, 4, 1, 2}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/false, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "Params and indices must have a" + " batch size of 1 when params and indices are " + "both tensors or both" + " constants."} + : Status::OK()}, + TestParams{ + /*params_shape=*/{2, 3}, + /*indices_shape=*/{2, 2}, + /*indices=*/{0, 1, 1, 2}, + /*axis=*/1, + /*batch_dims=*/1, + /*expected_output_shape=*/{2, 2}, + /*expected_output=*/{1, 2, 5, 6}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/false, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "The input axis must be zero when params is a weight."} + : Status::OK()}, }; - // Ok. - for (int i = 0; i < kGatherOKCases; i++) { - test->Reset(); - const auto& params_shape = ok_params[i].params_shape; - if (ok_params[i].params_is_tensor) { - std::vector params_dims(params_shape.begin() + 1, - params_shape.end()); - test->AddTestTensor("params", params_dims, params_shape[0], - TfDataTypeToTrt(dtype)); + for (auto p : test_params) { + Reset(); + + auto node_def = CreateGatherOp(tf_type_, p.batch_dims); + + if (p.params_is_tensor) { + AddTestTensor("params", p.params_shape, params_input); } else { - test->AddTestWeights("params", params_shape, params_input); + AddTestWeights("params", p.params_shape, params_input, tf_type_); } - const auto& indices_shape = ok_params[i].indices_shape; - test->AddTestTensor( - "indices", - std::vector(indices_shape.begin() + 1, indices_shape.end()), - indices_shape[0], nvinfer1::DataType::kINT32); - test->AddTestWeights("axis", {1}, {ok_params[i].axis}); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_gather", &output)); - ASSERT_TRUE(output.is_tensor()); - - const auto& expected_output_shape = ok_params[i].expected_output_shape; - const auto& expected_output = ok_params[i].expected_output; - ASSERT_EQ(expected_output.size(), - TrtWeightDimsNumElements(GetTestDims(expected_output_shape))); - const std::vector expected_output_dims( - expected_output_shape.begin() + 1, expected_output_shape.end()); - ExpectTrtDimsEqualsArray(expected_output_dims, - output.tensor()->getDimensions()); - - // Create input in CType and convert expected output to CType. - std::vector converted_expected_output(expected_output.begin(), - expected_output.end()); - - DataVec input_data; - if (ok_params[i].params_is_tensor) { - input_data = {{"params", test::AsTensor(params_input)}, - {"indices", test::AsTensor(ok_params[i].indices)}}; + if (p.indices_is_tensor) { + AddTestTensor("indices", p.indices_shape, DT_INT32, p.indices, {}, + p.add_index_status); } else { - input_data = {{"indices", test::AsTensor(ok_params[i].indices)}}; + std::vector indices_shape(p.indices_shape); + AddTestWeights("indices", indices_shape, p.indices, DT_INT32); } - DataVec output_data{ - {"my_gather", ConstructTensor(expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/expected_output_shape[0]); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(converted_expected_output)); + + AddTestWeights("axis", {1}, {p.axis}); + TestOpConverter(node_def, p.expected_output_shape, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); } } -TEST_F(OpConverterTest, ConvertGather) { - // Get the NodeDef for GatherV2. +template +NodeDef CreateReduceOp(DataType tf_type, bool keep_dims) { Scope s = Scope::NewRootScope(); - auto params = ops::Placeholder(s.WithOpName("params"), DT_FLOAT); - auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); - auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis); - const NodeDef& node_def = gather.operation.node()->def(); - { - // Axis is a tensor, should fail. - Reset(); - AddTestTensor("params", {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestTensor("axis", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"axis\" for GatherV2 must be a constant, at my_gather"); - } - { - // Axis is out of bounds, should fail. - Reset(); - AddTestTensor("params", {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestWeights("axis", {1}, {4}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_gather"); - } - { - // Axis is batch dimension, should fail. - Reset(); - AddTestTensor("params", {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestWeights("axis", {1}, {0}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_gather"); - } - { - // Axis is not zero when params is a weight, should fail. - Reset(); - AddTestWeights("params", {1, 3}, {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input axis must be zero when params is a weight."); + typename OpType::Attrs op_attrs; + op_attrs.keep_dims_ = keep_dims; + auto op = OpType(s.WithOpName("my_reduce"), input, axis, op_attrs); + return op.operation.node()->def(); +} + +// Applies reduction op on sub-sequences of input +// output[i] = reduce(input[m * i : m * (i +1)]) +std::vector CalcReduce(string op_name, std::vector input, int m, + float (*op)(float, float), float init) { + std::vector output(input.size() / m); + for (int i = 0; i < output.size(); i++) { + auto begin = input.begin() + i * m; + auto end = input.begin() + (i + 1) * m; + output[i] = std::accumulate(begin, end, init, op); + if (op_name == "Mean") { + output[i] /= m; + } } + return output; +} +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertReduce) { { - // Batch size of indices is not 1 when params is a tensor. + // Input is weights, should fail. 
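+    // (the reduce converters require the data operand to be a tensor; only the axis operand may be a constant, as checked next)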
Reset(); - AddTestTensor("params", {1, 2, 3}, /*batch_size=*/2); - AddTestTensor("indices", {2}, /*batch_size=*/2); + const NodeDef node_def = CreateReduceOp(tf_type_, false); + AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Indices must have a batch size of 1 when params is a tensor."); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for Sum must be a tensor"); } - - Reset(); - TestConvertGather(this); - TestConvertGather(this); - TestConvertGather(this); -} - -TEST_F(OpConverterTest, ConvertUnary) { { - // Input is weights, should fail. + // Axis is weights, should fail. Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto neg = ops::Neg(s.WithOpName("my_unary"), input); - const NodeDef& node_def = neg.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"x\" for Neg must be a tensor, at my_unary"); + const NodeDef node_def = CreateReduceOp(tf_type_, false); + AddTestTensor("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); + AddTestTensor("axis", {1}, DT_INT32, {1}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for Sum must be a constant"); } - - // Get nodedef for unary layer. - auto get_unary_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "Abs") { - auto unary = ops::Abs(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acos") { - auto unary = ops::Acos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acosh") { - auto unary = ops::Acosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asin") { - auto unary = ops::Asin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asinh") { - auto unary = ops::Asinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atan") { - auto unary = ops::Atan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atanh") { - auto unary = ops::Atanh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Ceil") { - auto unary = ops::Ceil(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cos") { - auto unary = ops::Cos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cosh") { - auto unary = ops::Cosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Exp") { - auto unary = ops::Exp(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Floor") { - auto unary = ops::Floor(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Log") { - auto unary = ops::Log(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Neg") { - auto unary = ops::Neg(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Reciprocal") { - auto unary = 
ops::Reciprocal(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Rsqrt") { - auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sin") { - auto unary = ops::Sin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sinh") { - auto unary = ops::Sinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sqrt") { - auto unary = ops::Sqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Tan") { - auto unary = ops::Tan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); + using OpFunc = std::function; + using ValFunc = float (*)(float, float); + struct ReduceTestDescriptor { + string name; + OpFunc get_node; + ValFunc val_func; + float init_val; }; - // Get expected output for unary layer. - auto get_unary_output = [](string op_name, float input) -> float { - if (op_name == "Abs") { - return std::abs(input); - } else if (op_name == "Acos") { - return std::acos(input); - } else if (op_name == "Acosh") { - return std::acosh(input); - } else if (op_name == "Asin") { - return std::asin(input); - } else if (op_name == "Asinh") { - return std::asinh(input); - } else if (op_name == "Atan") { - return std::atan(input); - } else if (op_name == "Atanh") { - return std::atanh(input); - } else if (op_name == "Ceil") { - return std::ceil(input); - } else if (op_name == "Cos") { - return std::cos(input); - } else if (op_name == "Cosh") { - return std::cosh(input); - } else if (op_name == "Exp") { - return std::exp(input); - } else if (op_name == "Floor") { - return std::floor(input); - } else if (op_name == "Log") { - return std::log(input); - } else if (op_name == "Neg") { - return -input; - } else if (op_name == "Reciprocal") { - return 1.0 / input; - } else if (op_name == "Rsqrt") { - return 1.0 / std::sqrt(input); - } else if (op_name == "Sin") { - return std::sin(input); - } else if (op_name == "Sinh") { - return std::sinh(input); - } else if (op_name == "Sqrt") { - return std::sqrt(input); - } else if (op_name == "Tan") { - return std::tan(input); - } - EXPECT_TRUE(false); - return 0; + std::vector op_test_info{ + {"Sum", CreateReduceOp, [](float x, float y) { return x + y; }, + 0}, + {"Prod", CreateReduceOp, + [](float x, float y) { return x * y; }, 1}, + {"Mean", CreateReduceOp, + [](float x, float y) { return x + y; }, 0}, + {"Min", CreateReduceOp, + [](float x, float y) { return y < x ? y : x; }, 1000}, + {"Max", CreateReduceOp, + [](float x, float y) { return x < y ? 
y : x; }, -1000}}; + + std::vector input_values{1, 2, 3, 4, 5, 6}; + struct TestParams { + std::vector input_dims; + std::vector input_values; + // Helper array contains the same elements as input but permuted in a way + // that the reduction can be calculated over contiguous elements using + // CalcReduce + std::vector helper_array; + std::vector axis; + int stride; // product of input_dims along axis + Status conversion_status; + }; + std::vector params{ + // Out of range tests + TestParams{{2, 3, 1}, input_values, input_values, {3}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-4}, 3}, + // Ok tests + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {0}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {1}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {2}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {0, 1}, 6}, + // Ok tests with negative axis values + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {-3}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {-2}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-1}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {-3, 1}, 6}, }; - // Get list of ops to test. - std::vector ops_to_test; - // Add all ops supported by ConvertUnary. - auto* map = UnaryOperationMap(); - ops_to_test.reserve(map->size()); - for (auto& pair : *map) { - ops_to_test.push_back(pair.first); - } - // Add other unary ops to test. - ops_to_test.push_back("Rsqrt"); - // Ok. - for (const string& op_name : ops_to_test) { - Reset(); - NodeDef node_def = get_unary_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - const std::vector input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_unary", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); ++i) { - const float expected_output = get_unary_output(op_name, input[i]); - EXPECT_THAT(GetSpanForData(output_data[0])[i], - NanSensitiveFloatNear(expected_output, 0.0001)); + for (bool keep_dims : {false, true}) { + for (auto& op : op_test_info) { + VLOG(2) << "Processing " << op.name << " with keep_dims=" << keep_dims; + for (auto p : params) { + SCOPED_TRACE(StrCat(op.name, keep_dims ? " & keep_dims" : "")); + Reset(); + NodeDef node_def = op.get_node(tf_type_, keep_dims); + + AddTestTensor("input", p.input_dims, p.input_values); + AddTestWeights("axis", {static_cast(p.axis.size())}, + p.axis); + std::vector expected_output_dims(p.input_dims); + + // Set expected output dim and conversion error messages + for (int ax : p.axis) { + int rank = p.input_dims.size(); + if (ax >= rank || ax < -rank) { + p.conversion_status = + errors::InvalidArgument("Axis value of ", ax, + " is out of bounds, must be in " + "range [", + -rank, ", ", rank, ")"); + } else { + int ax_positive = ax >= 0 ? ax : ax + rank; + // Zero marks elements that we will remove later. + expected_output_dims[ax_positive] = keep_dims ? 
1 : 0; + if (trt_mode_ == TrtTestMode::kImplicitBatch && + (ax == 0 || ax == -rank)) { + p.conversion_status = errors::Unimplemented( + "TensorRT does not allow manipulation of the batch " + "dimension"); + } + } + } + expected_output_dims.erase(std::remove(expected_output_dims.begin(), + expected_output_dims.end(), 0), + expected_output_dims.end()); + VLOG(2) << "out dims " + << absl::StrCat("[", absl::StrJoin(expected_output_dims, ","), + "]"); + std::vector expected_values = CalcReduce( + op.name, p.helper_array, p.stride, op.val_func, op.init_val); + + if (tf_type_ == DT_INT32) { + // We need to floor the float values in the `expected_values` vector. + std::for_each(expected_values.begin(), expected_values.end(), + [](float& _n) { _n = std::floor(_n); }); + } + + TestOpConverter(node_def, expected_output_dims, p.conversion_status, + Status::OK(), ArrayFloatNear(expected_values)); + } } } } +NodeDef CreateCastOp(DataType tf_type) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + +TEST_P(OpConverter_FP32_UnaryTest, ConvertUnary) { + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("Abs", ops::Abs, std::abs); + ADD_OP("Acos", ops::Acos, std::acos); + ADD_OP("Acosh", ops::Acosh, std::acosh); + ADD_OP("Asin", ops::Asin, std::asin); + ADD_OP("Asinh", ops::Asinh, std::asinh); + ADD_OP("Atan", ops::Atan, std::atan); + ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); + ADD_OP("Ceil", ops::Ceil, std::ceil); + ADD_OP("Cos", ops::Cos, std::cos); + ADD_OP("Cosh", ops::Cosh, std::cosh); + ADD_OP("Exp", ops::Exp, std::exp); + ADD_OP("Erf", ops::Erf, std::erf); + ADD_OP("Floor", ops::Floor, std::floor); + ADD_OP("Log", ops::Log, std::log); + ADD_OP("Neg", ops::Neg, [](float x) { return -x; }); + ADD_OP("Reciprocal", ops::Reciprocal, [](float x) { return 1.0f / x; }); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + ADD_OP("Round", ops::Round, [](float x) { return (float)std::round(x); }); + ADD_OP("Sign", ops::Sign, + [](float x) { return x > 0 ? 1.0f : (x < 0 ? -1.0f : 0.0f); }); +#endif + ADD_OP("Rsqrt", ops::Rsqrt, [](float x) { return 1.0f / std::sqrt(x); }); + ADD_OP("Sin", ops::Sin, std::sin); + ADD_OP("Sinh", ops::Sinh, std::sinh); + ADD_OP("Sqrt", ops::Sqrt, std::sqrt); + ADD_OP("Tan", ops::Tan, std::tan); +#undef ADD_OP + + std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; + RunTests("Unary", *UnaryOperationMap(), op_map, input_values, "x"); +} + +TEST_P(OpConverter_BOOL_Test, ConvertBoolean) { + std::vector input_values{1, 0, 1, 0, 0, 1}; + using OpFunc = std::function; + + using ValFunc = int (*)(int); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("LogicalNot", ops::LogicalNot, [](int x) { return 1 - x; }); +#undef ADD_OP + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + // The test does not actually run for TRT versions less than 8.2 + RunTests("LogicalUnary", *UnaryBooleanOperationMap(), op_map, input_values, + "x"); +#endif +} + // Get the NodeDef for ConcatV2. // TODO(hinsu): Consider switching this to static function. 
auto get_concat_nodedef = [](DataType dtype, int num_inputs) -> NodeDef { Scope s = Scope::NewRootScope(); std::vector values; + values.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { const string input_name = StrCat("values_", i); values.push_back(ops::Placeholder(s.WithOpName(input_name), dtype)); @@ -4802,172 +7541,169 @@ auto get_concat_nodedef = [](DataType dtype, int num_inputs) -> NodeDef { return concat.operation.node()->def(); }; -template -void TestConvertConcat(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertConcat) { + { + // Axis is a tensor, should fail. + Reset(); + NodeDef node_def = get_concat_nodedef(tf_type_, 2); + AddTestTensor("values_0", {1, 1, 2, 3}); + AddTestTensor("values_1", {1, 1, 2, 3}); + AddTestTensor("axis", {1}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for ConcatV2 must be a constant"); + } + { + // Axis is out of bounds, should fail. + Reset(); + NodeDef node_def = get_concat_nodedef(tf_type_, 2); + AddTestTensor("values_0", {1, 1, 2, 3}); + AddTestTensor("values_1", {1, 1, 2, 3}); + AddTestWeights("axis", {1}, {4}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in " + "range [-4, 4)"); + } + { + // Inputs have inconsistent ranks, should fail. + Reset(); + NodeDef node_def = get_concat_nodedef(tf_type_, 2); + AddTestTensor("values_0", {1, 1, 2, 3}); + AddTestTensor("values_1", {1, 1, 6}); + AddTestWeights("axis", {1}, {1}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Received inputs with inconsistent rank"); + } struct TestParams { std::vector> input_shapes; - std::vector> input_values; + std::vector> input_values; + std::vector inputs_are_tensors; int axis; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; + Status conversion_status; + Status run_status; }; - const std::vector> common_input{ - InitTestVector(6), - InitTestVector(6, /*start_value=*/CType(6))}; - // TODO(hinsu): Use std::vector instead of an array to avoid use of explicit - // size. 
- const int kConcatOKCases = 4; - TestParams ok_params[kConcatOKCases] = { + const std::vector> common_input{CreateVectorIota(6), + CreateVectorIota(6, 6)}; + + std::vector params = { { - /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, /*axis=*/1, - /*expected_output_dims=*/{2, 2, 3}, - /*expected_output=*/InitTestVector(12), + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), }, { - /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, /*axis=*/2, - /*expected_output_dims=*/{1, 4, 3}, - /*expected_output=*/InitTestVector(12), + /*expected_output_dims=*/{1, 1, 4, 3}, + /*expected_output=*/CreateVectorIota(12), }, { - /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, /*axis=*/3, - /*expected_output_dims=*/{1, 2, 6}, + /*expected_output_dims=*/{1, 1, 2, 6}, /*expected_output=*/ - {CType(0), CType(1), CType(2), CType(6), CType(7), CType(8), CType(3), - CType(4), CType(5), CType(9), CType(10), CType(11)}, + {0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11}, }, { - /*input_shapes=*/{{1}, {2}, {3}, {1}, {1}, {2}}, + /*input_shapes=*/{{1, 1}, {1, 2}, {1, 3}, {1, 1}, {1, 1}, {1, 2}}, /*input_values=*/ - {{CType(1)}, - {CType(2), CType(3)}, - {CType(4), CType(5), CType(6)}, - {CType(7)}, - {CType(8)}, - {CType(9), CType(10)}}, + {{1}, {2, 3}, {4, 5, 6}, {7}, {8}, {9, 10}}, + /*inputs_are_tensors=*/{true, true, true, true, true, true}, /*axis=*/1, - /*expected_output_dims=*/{10}, + /*expected_output_dims=*/{1, 10}, /*expected_output=*/ - InitTestVector(10, /*start_value=*/CType(1)), + CreateVectorIota(10, /*start_value=*/1), }, - }; + { + // An input is a weight + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, false}, + /*axis=*/1, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "The input \"values_1\" for ConcatV2 must be a tensor") + : Status::OK(), + /*run_status=*/Status::OK(), + }, + { + // An input is a weight + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{false, false}, + /*axis=*/1, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "The input \"values_0\" for ConcatV2 must be a tensor") + : Status::OK(), + /*run_status=*/Status::OK(), + }, + { + // Axis is batch dimension, should fail in implicit batch mode. + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, + /*axis=*/0, + /*expected_output_dims=*/{2, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "TensorRT does not allow manipulation of the " + "batch dimension") + : Status::OK(), + }, + { + // Inconsistent input shape, runtime error in dynamic shape mode. 
+ /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 3, 2}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, + /*axis=*/1, + /*expected_output_dims=*/{2, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + trt_mode_ != TrtTestMode::kDynamicShape + ? errors::InvalidArgument( + "Received inputs with inconsistent shape") + : Status::OK(), + errors::InvalidArgument(""), + }}; + + for (auto p : params) { + Reset(); + const int num_inputs = p.input_shapes.size(); + EXPECT_EQ(num_inputs, p.input_values.size()); + + NodeDef node_def = get_concat_nodedef(tf_type_, num_inputs); - for (int i = 0; i < kConcatOKCases; ++i) { - test->Reset(); - const int num_inputs = ok_params[i].input_shapes.size(); - EXPECT_EQ(num_inputs, ok_params[i].input_values.size()); - NodeDef node_def = get_concat_nodedef(dtype, num_inputs); // Create inputs. for (int j = 0; j < num_inputs; ++j) { - test->AddTestTensor(StrCat("values_", j), ok_params[i].input_shapes[j], 1, - TfDataTypeToTrt(dtype)); - } - test->AddTestWeights("axis", {1}, {ok_params[i].axis}); - test->RunValidationAndConversion(node_def); + string name = StrCat("values_", j); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_concat", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - // Create input data for tensors. - DataVec input_data; - for (int j = 0; j < num_inputs; ++j) { - input_data.push_back( - {StrCat("values_", j), - test::AsTensor(ok_params[i].input_values[j])}); + if (!p.inputs_are_tensors[j]) { + AddTestWeights(name, p.input_shapes[j], p.input_values[j], tf_type_); + } else { + AddTestTensor(name, p.input_shapes[j], p.input_values[j]); + } } - DataVec output_data{ - {"my_concat", - ConstructTensor(ok_params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); - } -} + AddTestWeights("axis", {1}, {p.axis}); -TEST_F(OpConverterTest, ConvertConcat) { - { - // Axis is a tensor, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 2, 3}); - AddTestTensor("axis", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"axis\" for ConcatV2 must be a constant, at my_concat"); - } - { - // Axis is out of bounds, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 2, 3}); - AddTestWeights("axis", {1}, {4}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_concat"); - } - { - // Axis is batch dimension, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 2, 3}); - AddTestWeights("axis", {1}, {0}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_concat"); - } - { - // Inputs have inconsistent rank, should fail. 
- Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 6}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent rank, at my_concat"); - } - { - // An input is a weight, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestWeights("values_1", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"values_1\" for ConcatV2 must be a tensor, at my_concat"); - } - { - // Inputs have inconsistent non-axis shapes, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 3, 2}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent shape, at my_concat"); + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.run_status, ElementsAreArray(p.expected_output)); } - - TestConvertConcat(this); - TestConvertConcat(this); - // TODO(tmorris): Enable once TRT adds support. - // TestConvertConcat(this); } // Get the NodeDef for Split. @@ -4992,13 +7728,12 @@ void TestConvertSplit(OpConverterTest* test) { std::vector> expected_outputs; }; - const std::vector common_input = InitTestVector(6); - const int kSplitOKCases = 4; - TestParams ok_params[kSplitOKCases] = { + const std::vector common_input = CreateVectorIota(6); + std::vector ok_params = { // Identity (num_split = 1) {/*input_shape=*/{1, 2, 3}, /*value=*/common_input, /*axis=*/1, /*num_split=*/1, /*expected_output_dims=*/{1, 2, 3}, - /*expected_outputs=*/{InitTestVector(6)}}, + /*expected_outputs=*/{CreateVectorIota(6)}}, {/*input_shape=*/{1, 2, 3}, /*value=*/common_input, /*axis=*/3, @@ -5024,16 +7759,17 @@ void TestConvertSplit(OpConverterTest* test) { /*num_split=*/2, /*expected_output_dims=*/{1, 3}, /*expected_outputs=*/ - {InitTestVector(3), InitTestVector(3, CType(3))}}, + {CreateVectorIota(3), CreateVectorIota(3, CType(3))}}, }; - for (int i = 0; i < kSplitOKCases; ++i) { + for (int i = 0; i < ok_params.size(); ++i) { test->Reset(); NodeDef node_def = get_split_nodedef(dtype, ok_params[i].num_split); // Create inputs. test->AddTestWeights("axis", {1}, {ok_params[i].axis}); - test->AddTestTensor("value", ok_params[i].input_shape, 1, - TfDataTypeToTrt(dtype)); + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(dtype, &trt_type)); + test->AddTestTensor("value", ok_params[i].input_shape, 1, trt_type); // Convert. test->RunValidationAndConversion(node_def); @@ -5045,20 +7781,18 @@ void TestConvertSplit(OpConverterTest* test) { const string name = j == 0 ? StrCat("my_split") : StrCat("my_split:", j); TF_EXPECT_OK(test->GetTensorOrWeights(name, &outputs[j])); EXPECT_TRUE(outputs[j].is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - outputs[j].tensor()->getDimensions()); + EXPECT_THAT(outputs[j].tensor()->getDimensions(), + DimsAreArray(ok_params[i].expected_output_dims)); // Create buffer to store output. output_data.push_back( - {name, - ConstructTensor(ok_params[i].expected_outputs[j].size())}); + {name, test->ConstructTensor( + ok_params[i].expected_outputs[j].size())}); } // Verify output values are correct. 
const DataVec input_data{ - {"value", test::AsTensor(ok_params[i].value)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + {"value", test->AsTensor(ok_params[i].value)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); for (int j = 0; j < outputs.size(); ++j) { EXPECT_THAT(GetSpanForData(output_data[j]), ElementsAreArray(ok_params[i].expected_outputs[j])); @@ -5074,8 +7808,8 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestTensor("axis", {1}); AddTestTensor("value", {1, 2, 3}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"axis\" for Split must be a constant, at my_split"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for Split must be a constant"); } { // Axis is out of bounds, should fail. @@ -5083,9 +7817,9 @@ TEST_F(OpConverterTest, ConvertSplit) { NodeDef node_def = get_split_nodedef(DT_FLOAT, 1); AddTestWeights("axis", {1}, {4}); AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_split"); + "range [-4, 4)"); } { // Axis is out of bounds (negative), should fail. @@ -5093,9 +7827,9 @@ TEST_F(OpConverterTest, ConvertSplit) { NodeDef node_def = get_split_nodedef(DT_FLOAT, 1); AddTestWeights("axis", {1}, {-5}); AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, "Axis value of -5 is out of bounds, must be in " - "range [-4, 4), at my_split"); + "range [-4, 4)"); } { // Axis is batch dimension, should fail. @@ -5103,9 +7837,9 @@ TEST_F(OpConverterTest, ConvertSplit) { NodeDef node_def = get_split_nodedef(DT_FLOAT, 1); AddTestWeights("axis", {1}, {0}); AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "TensorRT does not allow manipulation of the " - "batch dimension, at my_split"); + "batch dimension"); } { // Value is a weight, should fail. @@ -5114,8 +7848,8 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestWeights("axis", {1}, {1}); AddTestWeights("value", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"value\" for Split must be a tensor, at my_split"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"value\" for Split must be a tensor"); } { // Dim is not evenly divisibly by num_split, should fail. @@ -5124,8 +7858,8 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestWeights("axis", {1}, {3}); AddTestTensor("value", {1, 2, 3}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 3 of size 3 is not evenly divisble by 2, at my_split"); + node_def, absl::StatusCode::kInvalidArgument, + "Dimension 3 of size 3 is not evenly divisible by 2"); } { // num_split > dim size, should fail. 
@@ -5134,15 +7868,13 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestWeights("axis", {1}, {3}); AddTestTensor("value", {1, 2, 3}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 3 of size 3 is not evenly divisble by 4, at my_split"); + node_def, absl::StatusCode::kInvalidArgument, + "Dimension 3 of size 3 is not evenly divisible by 4"); } TestConvertSplit(this); TestConvertSplit(this); -#if IS_TRT_VERSION_GE(5, 1, 3, 1) TestConvertSplit(this); -#endif } // Get the NodeDef for Unpack (Unstack in TF API). @@ -5155,164 +7887,174 @@ auto get_unpack_nodedef = [](DataType dtype, int num, int axis) -> NodeDef { return unstack.operation.node()->def(); }; -template -void TestConvertUnpack(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +struct UnpackTestParams { + std::vector input_shape; + std::vector input_value; + int axis; + int num; + std::vector expected_output_dims; + std::vector> expected_outputs; + Status run_status; +}; - struct TestParams { - std::vector input_shape; - std::vector value; - int axis; - int num; - std::vector expected_output_dims; - std::vector> expected_outputs; - }; +void TestConvertUnpack(ParameterizedOpConverterTestBase* test, + UnpackTestParams& p) { + test->Reset(); + NodeDef node_def = get_unpack_nodedef(test->get_tf_type(), p.num, p.axis); + // Create inputs. + test->AddTestTensor("value", p.input_shape, test->get_tf_type(), + p.input_value); + + std::vector>> matcher_vec; + std::vector datatype_vec; + std::vector> expected_output_dims; + + for (int j = 0; j < p.expected_outputs.size(); ++j) { + matcher_vec.push_back(ElementsAreArray(p.expected_outputs[j])); + datatype_vec.push_back(test->get_tf_type()); + expected_output_dims.push_back(p.expected_output_dims); + } + + test->TestOpConverterMultiOut(/*node_def=*/node_def, + /*expected_output_dims=*/expected_output_dims, + /*expected_conversion_status=*/p.run_status, + /*expected_runtime_status=*/p.run_status, + /*matcher=*/matcher_vec, + /*out_tf_type=*/datatype_vec); +} - const std::vector common_input = InitTestVector(6); - const int kUnpackOKCases = 4; - TestParams ok_params[kUnpackOKCases] = { - {/*input_shape=*/{1, 2, 3}, /*value=*/common_input, /*axis=*/1, - /*num=*/1, /*expected_output_dims=*/{2, 3}, - /*expected_outputs=*/{InitTestVector(6)}}, - {/*input_shape=*/{1, 2, 3}, - /*value=*/common_input, - /*axis=*/3, +// TODO: Reactivate when INT32 Segfault fixed +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertUnpack) { + // We need to skip error testing for Dynamic Shape mode, as it is impossible + // to convert Unpack in Dynamic Shape Mode. + if (trt_mode_ != TrtTestMode::kDynamicShape) { + { + // Value is weights, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/3, /*axis=*/3); + AddTestWeights("value", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"value\" for Unpack must be a tensor"); + } + { + // Axis is out of bounds, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/4); + AddTestTensor("value", {1, 1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in " + "range [-4, 4)"); + } + { + // Axis is out of bounds (negative), should fail. 
+ Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/-5); + AddTestTensor("value", {1, 1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of -5 is out of bounds, must be " + "in range [-4, 4)"); + } + { + if (trt_mode_ != TrtTestMode::kExplicitBatch) { + // Axis is batch dimension, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/0); + AddTestTensor("value", {1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of " + "the batch dimension"); + } + } + { + // Dim size does not match num, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/5, /*axis=*/2); + AddTestTensor("value", {1, 1, 6}); + RunValidationAndConversion( + node_def, absl::StatusCode::kInvalidArgument, + "Dimension 2 has size 6 which is not equal to num of 5"); + } + { + // Output would be TF scalar, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/0); + AddTestTensor( + "value", {}, tf_type_, {}, {}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::InvalidArgument( + "removing first dim requires explicit batch dimension") + : Status::OK()); + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + RunValidationAndConversion( + node_def, absl::StatusCode::kInternal, + "Failed to convert at least one input to a TRT_TensorOrWeights: " + "Scalar input tensor is not supported since the first dimension is " + "treated as batch dimension by TRT"); + } else { + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "Input \"value\" for Unpack must be rank 2 " + "or greater"); + } + } + } + + const std::vector common_input = CreateVectorIota(6); + + Status run_status = + trt_mode_ == TrtTestMode::kDynamicShape + ? 
errors::InvalidArgument( + "The argument `strided_slice_spec` is " + "`absl::nullopt` with `dynamic_input_size_indices` non empty.") + : Status::OK(); + + std::vector params = { + {/*input_shape=*/{1, 1, 2, 1, 3, 1}, + /*input_value=*/common_input, + /*axis=*/4, /*num=*/3, - /*expected_output_dims=*/{1, 2}, - /*expected_outputs=*/ - {{CType(0), CType(3)}, {CType(1), CType(4)}, {CType(2), CType(5)}}}, - {/*input_shape=*/{6, 1}, - /*value=*/common_input, + /*expected_output_dims=*/{1, 1, 2, 1, 1}, + /*expected_outputs=*/{{0, 3}, {1, 4}, {2, 5}}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 1, 2, 1, 3}, + /*input_value=*/common_input, + /*axis=*/4, + /*num=*/3, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_outputs=*/{{0, 3}, {1, 4}, {2, 5}}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 1, 2, 3}, + /*input_value=*/common_input, + /*axis=*/1, + /*num=*/1, + /*expected_output_dims=*/{1, 2, 3}, + /*expected_outputs=*/{CreateVectorIota(6)}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 6, 1}, + /*input_value=*/common_input, /*axis=*/-2, /*num=*/6, - /*expected_output_dims=*/{1}, - /*expected_outputs=*/ - {{CType(0)}, - {CType(1)}, - {CType(2)}, - {CType(3)}, - {CType(4)}, - {CType(5)}}}, - {/*input_shape=*/{6}, - /*value=*/common_input, + /*expected_output_dims=*/{1, 1}, + /*expected_outputs=*/{{0}, {1}, {2}, {3}, {4}, {5}}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 6}, + /*input_value=*/common_input, /*axis=*/1, /*num=*/6, - /*expected_output_dims=*/{}, - /*expected_outputs=*/ - {{CType(0)}, - {CType(1)}, - {CType(2)}, - {CType(3)}, - {CType(4)}, - {CType(5)}}}, + /*expected_output_dims=*/{1}, + /*expected_outputs=*/{{0}, {1}, {2}, {3}, {4}, {5}}, + /*run_status=*/run_status}, }; - - for (int i = 0; i < kUnpackOKCases; ++i) { - test->Reset(); - NodeDef node_def = - get_unpack_nodedef(dtype, ok_params[i].num, ok_params[i].axis); - // Create inputs. - test->AddTestTensor("value", ok_params[i].input_shape, 1, - TfDataTypeToTrt(dtype)); - // Convert. - test->RunValidationAndConversion(node_def); - - // Get output tensors and verify output dims. - EXPECT_EQ(ok_params[i].expected_outputs.size(), ok_params[i].num); - std::vector outputs(ok_params[i].num); - DataVec output_data; - for (int j = 0; j < outputs.size(); ++j) { - const string name = j == 0 ? "my_unpack" : StrCat("my_unpack:", j); - TF_EXPECT_OK(test->GetTensorOrWeights(name, &outputs[j])); - EXPECT_TRUE(outputs[j].is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - outputs[j].tensor()->getDimensions()); - // Create buffer to store output. - output_data.push_back( - {name, - ConstructTensor(ok_params[i].expected_outputs[j].size())}); - } - - // Verify output values are correct. - const DataVec input_data{ - {"value", test::AsTensor(ok_params[i].value)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - for (int j = 0; j < outputs.size(); ++j) { - EXPECT_THAT(GetSpanForData(output_data[j]), - ElementsAreArray(ok_params[i].expected_outputs[j])); - } - } -} - -TEST_F(OpConverterTest, ConvertUnpack) { - { - // Value is weights, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/3, /*axis=*/3); - AddTestWeights("value", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"value\" for Unpack must be a tensor, at my_unpack"); - } - { - // Axis is out of bounds, should fail. 
- Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/4); - AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_unpack"); - } - { - // Axis is out of bounds (negative), should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/-5); - AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in " - "range [-4, 4), at my_unpack"); - } - { - // Axis is batch dimension, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/0); - AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_unpack"); - } - { - // Dim size does not match num, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/5, /*axis=*/2); - AddTestTensor("value", {1, 6}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 2 has size 6 which is not equal to num of 5, at my_unpack"); - } - { - // Output would be TF scalar, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/0); - AddTestTensor("value", {}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Input \"value\" for Unpack must be rank 2 or greater, at my_unpack"); - } - - TestConvertUnpack(this); - TestConvertUnpack(this); -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - TestConvertUnpack(this); -#endif + for (auto p : params) { + TestConvertUnpack(this, p); + } } // Get the NodeDef for Pack. NodeDef GetPackNodeDef(DataType dtype, int num_inputs, int axis) { Scope s = Scope::NewRootScope(); std::vector values; + values.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { const string input_name = StrCat("values_", i); values.push_back(ops::Placeholder(s.WithOpName(input_name), dtype)); @@ -5324,154 +8066,165 @@ NodeDef GetPackNodeDef(DataType dtype, int num_inputs, int axis) { return pack.operation.node()->def(); } -template -void TestConvertPack(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertPack) { struct TestParams { std::vector> input_shapes; - std::vector> input_values; + std::vector> partial_input_shapes; + std::vector> input_values; int axis; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; + Status conversion_status; + Status runtime_status; + bool input_1_is_weight; }; - const std::vector> common_input{ - InitTestVector(6), - InitTestVector(6, /*start_value=*/CType(6))}; + const std::vector> common_input{ + CreateVectorIota(6), + CreateVectorIota(6, /*start_value=*/6)}; std::vector params = { + // Second input is weight, should fail in implicit batch mode + {/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "The input \"values_1\" for Pack must be a tensor"} + : Status::OK(), + /*runtime_status*/ Status::OK(), + /*weight_input*/ true}, + // Axis is out of bounds, should fail. 
+ { + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/-5, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + Status{absl::StatusCode::kInvalidArgument, + "Axis value of -5 is out of bounds, must be in" + " range [-4, 4)"}, + }, + // Axis is batch dimension, should fail in implicit batch mode. + {/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/-4, + /*expected_output_dims=*/{2, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the batch " + "dimension"} + : Status::OK()}, + // Inconsistent rank, should fail. + { + /*input_shapes=*/{{1, 2, 3}, {1, 6}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + Status{absl::StatusCode::kInvalidArgument, + "Received inputs with inconsistent rank"}, + }, { - /*input_shapes=*/{{2, 3}, {2, 3}}, + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, /*input_values=*/common_input, /*axis=*/1, - /*expected_output_dims=*/{2, 2, 3}, - /*expected_output=*/InitTestVector(12), + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), }, { - /*input_shapes=*/{{2, 3}, {2, 3}}, + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, /*input_values=*/common_input, /*axis=*/2, - /*expected_output_dims=*/{2, 2, 3}, + /*expected_output_dims=*/{1, 2, 2, 3}, /*expected_output=*/ - {CType(0), CType(1), CType(2), CType(6), CType(7), CType(8), CType(3), - CType(4), CType(5), CType(9), CType(10), CType(11)}, + {0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11}, }, { - /*input_shapes=*/{{2, 3}, {2, 3}}, + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, /*input_values=*/common_input, /*axis=*/3, - /*expected_output_dims=*/{2, 3, 2}, + /*expected_output_dims=*/{1, 2, 3, 2}, /*expected_output=*/ - {CType(0), CType(6), CType(1), CType(7), CType(2), CType(8), CType(3), - CType(9), CType(4), CType(10), CType(5), CType(11)}, + {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}, }, { - /*input_shapes=*/{{2, 3}}, - /*input_values=*/{InitTestVector(6)}, + /*input_shapes=*/{{1, 2, 3}}, + /*partial_input_shapes=*/{{}}, + /*input_values=*/{CreateVectorIota(6)}, /*axis=*/1, - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/InitTestVector(6), + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(6), }, { - /*input_shapes=*/{{2, 3}}, - /*input_values=*/{InitTestVector(6)}, + /*input_shapes=*/{{1, 2, 3}}, + /*partial_input_shapes=*/{{}}, + /*input_values=*/{CreateVectorIota(6)}, /*axis=*/2, - /*expected_output_dims=*/{2, 1, 3}, - /*expected_output=*/InitTestVector(6), + /*expected_output_dims=*/{1, 2, 1, 3}, + /*expected_output=*/CreateVectorIota(6), }, }; - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - const int num_inputs = params[i].input_shapes.size(); - EXPECT_EQ(num_inputs, params[i].input_values.size()); - - NodeDef node_def = GetPackNodeDef(dtype, num_inputs, params[i].axis); + // Inputs have inconsistent shapes, should fail. 
+ if (trt_mode_ != TrtTestMode::kDynamicShape) { + params.push_back( + TestParams{/*input_shapes=*/{{1, 2, 3}, {1, 3, 2}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{}, + /*expected_output=*/CreateVectorIota(12), + Status{absl::StatusCode::kInvalidArgument, + "Received inputs with inconsistent shape"}}); + } else { + // In dynamic shape mode we cannot catch inconsistent shapes at conversion + // time, only during runtime. But TensorRT does not raise a proper runtime + // error, instead it aborts the program with the following message: + // Assertion failed: t->start.d[i] + t->extent.d[i] <= r.dims.d[i] + // ../builder/cudnnBuilderGraph.cpp:862 + // Aborting... + // TODO(tfeher) Add dynamic shapes test once TRT handles shape error + // decently + } + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Test with mixed dynamic / static shape input tensors + params.push_back( + TestParams{/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{-1, -1, -1}, {1, 2, 3}}, + /*input_values=*/common_input, + /*axis=*/2, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/ + {0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11}}); + } + for (auto p : params) { + Reset(); + const int num_inputs = p.input_shapes.size(); + EXPECT_EQ(num_inputs, p.input_values.size()); + + NodeDef node_def = GetPackNodeDef(tf_type_, num_inputs, p.axis); // Create inputs. for (int j = 0; j < num_inputs; ++j) { - test->AddTestTensor(StrCat("values_", j), params[i].input_shapes[j], 1, - TfDataTypeToTrt(dtype)); - } - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_pack", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - // Create input data for tensors. - DataVec input_data; - for (int j = 0; j < num_inputs; ++j) { - input_data.push_back({StrCat("values_", j), - test::AsTensor(params[i].input_values[j])}); + if (j == 1 && p.input_1_is_weight) { + AddTestWeights(StrCat("values_", j), p.input_shapes[j], + p.input_values[j], tf_type_); + } else { + AddTestTensor(StrCat("values_", j), p.input_shapes[j], tf_type_, + p.input_values[j], p.partial_input_shapes[j]); + } } - DataVec output_data{ - {"my_pack", ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -TEST_F(OpConverterTest, ConvertPack) { - { - // An input is a weight, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/1); - AddTestTensor("values_0", {1, 2, 3}); - AddTestWeights("values_1", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"values_1\" for Pack must be a tensor, at my_pack"); - } - { - // Axis is out of bounds, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/-5); - AddTestTensor("values_0", {2, 3}); - AddTestTensor("values_1", {2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in " - "range [-4, 4), at my_pack"); - } - { - // Axis is batch dimension, should fail. 
- Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/-4); - AddTestTensor("values_0", {2, 3}); - AddTestTensor("values_1", {2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_pack"); - } - { - // Inputs have inconsistent rank, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/1); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 6}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent rank, at my_pack"); - } - { - // Inputs have inconsistent shapes, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/1); - AddTestTensor("values_0", {1, 2}); - AddTestTensor("values_1", {2, 2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent shape, at my_pack"); + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); } - - TestConvertPack(this); - TestConvertPack(this); - - // TODO(hinsu): Enable INT32 with TensorRT version 5.1.3 after testing. - // TestConvertPack(this); } // Get the NodeDef for ArgMin or ArgMax. @@ -5485,134 +8238,160 @@ NodeDef GetArgMinMaxNodeDef(DataType input_dtype, DataType output_dtype) { return arg.operation.node()->def(); } -template -void TestConvertArgMinMax(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +struct ArgMinMaxTestParams { + std::vector input_shape; + std::vector input_value; + int axis; + std::vector expected_output_dims; + std::vector expected_argmax_output; + std::vector expected_argmin_output; + Status status; +}; - struct TestParams { - std::vector input_shape; - std::vector input_value; - int axis; - std::vector expected_output_dims; - std::vector expected_argmax_output; - std::vector expected_argmin_output; - }; +template +void TestConvertArgMinMax(ParameterizedOpConverterTestBase* test, + DataType _tf_type, ArgMinMaxTestParams& p) { + test->Reset(); - const std::vector common_input = InitTestVector(6); - std::vector params = { + NodeDef node_def = GetArgMinMaxNodeDef(_tf_type, + /*output_dtype=*/DT_INT32); + + std::vector expected_out; + if (node_def.op() == "ArgMax") { + expected_out = p.expected_argmax_output; + } else if (node_def.op() == "ArgMin") { + expected_out = p.expected_argmin_output; + } else { + ASSERT_TRUE(false); + } + + test->AddTestTensor("input", p.input_shape, _tf_type, p.input_value); + test->AddTestWeights("dimension", {1}, {p.axis}, DT_INT32); + + test->TestOpConverter(node_def, p.expected_output_dims, + /*expected_conversion_status=*/p.status, + /*expected_runtime_status=*/Status::OK(), + /*matcher=*/ElementsAreArray(expected_out), {DT_INT32}); +} + +TEST_P(OpConverter_FP32_FP16_Test, ConvertArgMinMax) { + { + // Dimension is a tensor, should fail. + Reset(); + NodeDef node_def = + GetArgMinMaxNodeDef(tf_type_, + /*output_dtype=*/DT_INT32); + AddTestTensor("input", {1, 2, 3}); + AddTestTensor("dimension", {1}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"dimension\" for ArgMax must be a constant"); + } + { + // Output type is INT64, should fail. 
+ Reset(); + NodeDef node_def = + GetArgMinMaxNodeDef(tf_type_, + /*output_dtype=*/DT_INT64); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("dimension", {1}, {3}, DT_INT32); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "Output type int64 is not supported"); + } + + const std::vector common_input = CreateVectorIota(6); + std::vector params = { + {/*input_shape=*/{2, 3}, + /*input_value=*/common_input, + /*axis=*/0, + /*expected_output_dims=*/{3}, + /*expected_argmax_output=*/{1, 1, 1}, + /*expected_argmin_output=*/{0, 0, 0}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented("TensorRT does not allow manipulation of " + "the batch dimension") + : Status::OK()}, + { + /*input_shape=*/{1, 6}, + /*input_value=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{1}, + /*expected_argmax_output=*/{5}, + /*expected_argmin_output=*/{0}, + }, + { + /*input_shape=*/{1, 10}, + /*input_value=*/ + {-5.0f, 3.0f, 5.0f, 1.0f, 6.0f, -9.0f, 7.0f, 1.0f, 0.0f, -1.0f}, + /*axis=*/-1, + /*expected_output_dims=*/{1}, + /*expected_argmax_output=*/{6}, + /*expected_argmin_output=*/{5}, + }, { - /*input_shape=*/{2, 3}, + /*input_shape=*/{1, 2, 3}, /*input_value=*/common_input, /*axis=*/2, - /*expected_output_dims=*/{2}, + /*expected_output_dims=*/{1, 2}, /*expected_argmax_output=*/{2, 2}, /*expected_argmin_output=*/{0, 0}, }, { - /*input_shape=*/{2, 3}, + /*input_shape=*/{1, 2, 3}, /*input_value=*/common_input, /*axis=*/-2, - /*expected_output_dims=*/{3}, + /*expected_output_dims=*/{1, 3}, /*expected_argmax_output=*/{1, 1, 1}, /*expected_argmin_output=*/{0, 0, 0}, }, { - /*input_shape=*/{6}, + /*input_shape=*/{1, 2, 1, 3}, /*input_value=*/common_input, - /*axis=*/1, - /*expected_output_dims=*/{}, - /*expected_argmax_output=*/{5}, - /*expected_argmin_output=*/{0}, + /*axis=*/3, + /*expected_output_dims=*/{1, 2, 1}, + /*expected_argmax_output=*/{2, 2}, + /*expected_argmin_output=*/{0, 0}, }, { - /*input_shape=*/{10}, - /*input_value=*/ - {CType(-5), CType(3), CType(5), CType(1), CType(6), CType(-9), - CType(7), CType(1), CType(0), CType(-1)}, - /*axis=*/-1, - /*expected_output_dims=*/{}, - /*expected_argmax_output=*/{6}, - /*expected_argmin_output=*/{5}, + /*input_shape=*/{1, 2, 1, 3}, + /*input_value=*/common_input, + /*axis=*/-3, + /*expected_output_dims=*/{1, 1, 3}, + /*expected_argmax_output=*/{1, 1, 1}, + /*expected_argmin_output=*/{0, 0, 0}, + }, + {/*input_shape=*/{1, 2, 1, 1, 3}, + /*input_value=*/common_input, + /*axis=*/4, + /*expected_output_dims=*/{1, 2, 1, 1}, + /*expected_argmax_output=*/{2, 2}, + /*expected_argmin_output=*/{0, 0}, +#if !IS_TRT_VERSION_GE(7, 0, 0, 11) + errors::Unimplemented("op is not able to support tensors with 4+" + " dimensions (excluding batch size)") +#else + Status::OK() +#endif + }, + {/*input_shape=*/{1, 2, 1, 1, 3}, + /*input_value=*/common_input, + /*axis=*/-4, + /*expected_output_dims=*/{1, 1, 1, 3}, + /*expected_argmax_output=*/{1, 1, 1}, + /*expected_argmin_output=*/{0, 0, 0}, +#if !IS_TRT_VERSION_GE(7, 0, 0, 11) + errors::Unimplemented("op is not able to support tensors with 4+" + " dimensions (excluding batch size)") +#else + Status::OK() +#endif }, }; - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetArgMinMaxNodeDef(dtype, DT_INT32); - // Create inputs. 
- test->AddTestTensor("input", params[i].input_shape, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - test->AddTestWeights("dimension", {1}, {params[i].axis}); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_arg", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - // Create input data for tensors. - const DataVec input_data{ - {"input", test::AsTensor(params[i].input_value)}}; - DataVec output_data{ - {"my_arg", - ConstructTensor(params[i].expected_argmax_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - - if (node_def.op() == "ArgMax") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_argmax_output)); - } else if (node_def.op() == "ArgMin") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_argmin_output)); - } else { - ASSERT_TRUE(false); - } - } -} - -TEST_F(OpConverterTest, ConvertArgMinMax) { - { - // Dimension is a tensor, should fail. - Reset(); - NodeDef node_def = GetArgMinMaxNodeDef(DT_FLOAT, DT_INT32); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("dimension", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"dimension\" for ArgMax must be a constant, at my_arg"); - } - { - // Output type is INT64, should fail. - Reset(); - NodeDef node_def = GetArgMinMaxNodeDef(DT_FLOAT, DT_INT64); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("dimension", {1}, {3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Output type int64 is not supported, at my_arg"); + for (auto p : params) { + TestConvertArgMinMax(this, tf_type_, p); + TestConvertArgMinMax(this, tf_type_, p); } - { - // Axis is batch dimension, should fail - Reset(); - NodeDef node_def = GetArgMinMaxNodeDef(DT_FLOAT, DT_INT32); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("dimension", {1}, {0}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the batch dimension, at " - "my_arg"); - } - - TestConvertArgMinMax(this); - TestConvertArgMinMax(this); - TestConvertArgMinMax(this); - TestConvertArgMinMax(this); - // TRT does not support int32 for TopK layer which is used to implement ArgMin - // and ArgMax. - // TestConvertArgMinMax(this); - // TestConvertArgMinMax(this); } // Get the NodeDef for DepthToSpace or SpaceToSpace. 
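A note on the ArgMin/ArgMax coverage above: every ArgMinMaxTestParams entry is exercised twice by the closing loop, once as ArgMax and once as ArgMin (the loop's template arguments, elided in this rendering of the patch, are ops::ArgMax and ops::ArgMin). A minimal sketch of adding one more case inside the TEST_P body, with hypothetical values and using only the helpers shown in this hunk:

  ArgMinMaxTestParams extra{
      /*input_shape=*/{1, 3, 2},
      /*input_value=*/{4, 0, 1, 7, 2, 3},  // rows: {4,0}, {1,7}, {2,3}
      /*axis=*/-1,
      /*expected_output_dims=*/{1, 3},
      /*expected_argmax_output=*/{0, 1, 1},
      /*expected_argmin_output=*/{1, 0, 0},
      /*status=*/Status::OK()};
  TestConvertArgMinMax<ops::ArgMax>(this, tf_type_, extra);
  TestConvertArgMinMax<ops::ArgMin>(this, tf_type_, extra);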
@@ -5626,363 +8405,297 @@ NodeDef GetDepthSpaceShuffleNodeDef(DataType dtype, int block_size, return shuffle.operation.node()->def(); } -template struct DepthSpaceShuffleTestParams { std::vector input_dims; - std::vector input_value; + std::vector input_value; int block_size; string data_format; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; }; -template +template void TestConvertDepthSpaceShuffle( - OpConverterTest* test, - const std::vector>& params) { - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetDepthSpaceShuffleNodeDef( - dtype, params[i].block_size, params[i].data_format); - test->AddTestTensor("input", params[i].input_dims, 1, - TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_shuffle", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - - DataVec input_data{{"input", test::AsTensor(params[i].input_value)}}; - DataVec output_data{{"my_shuffle", ConstructTensor( - params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -template -void TestConvertDepthToSpace(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - const std::vector common_input = InitTestVector(16); - std::vector> params = { - { - /*input_shape=*/{4, 2, 2}, - /*input_value=*/common_input, - /*block_size=*/2, - /*data_format=*/"NCHW", - /*expected_output_dims=*/{1, 4, 4}, - /*expected_output=*/ - CastTestVector( - {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15}), - }, - { - /*input_shape=*/{2, 2, 4}, - /*input_value=*/common_input, - /*block_size=*/2, - /*data_format=*/"NHWC", - /*expected_output_dims=*/{4, 4, 1}, - /*expected_output=*/ - CastTestVector( - {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}), - }, - { - /*input_shape=*/{16, 1, 1}, - /*input_value=*/common_input, - /*block_size=*/4, - /*data_format=*/"NCHW", - /*expected_output_dims=*/{1, 4, 4}, - /*expected_output=*/InitTestVector(16), - }, - { - /*input_shape=*/{2, 2, 8}, - /*input_value=*/InitTestVector(32), - /*block_size=*/2, - /*data_format=*/"NHWC", - /*expected_output_dims=*/{4, 4, 2}, - /*expected_output=*/CastTestVector({0, 1, 2, 3, 8, - 9, 10, 11, 4, 5, - 6, 7, 12, 13, 14, - 15, 16, 17, 18, 19, - 24, 25, 26, 27, 20, - 21, 22, 23, 28, 29, - 30, 31}), - }, - }; - - TestConvertDepthSpaceShuffle(test, params); -} + ParameterizedOpConverterTestBase* test, + const std::vector& params) { + Status status = Status::OK(); -TEST_F(OpConverterTest, ConvertDepthToSpace) { { // Input is a weight, should fail. 
- Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestWeights("input", {4, 1, 1}, {1, 2, 3, 4}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"input\" for DepthToSpace must be a " - "tensor, at my_shuffle"); + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 2, "NCHW"); + test->AddTestWeights("input", {1, 4, 1, 1}, {1, 2, 3, 4}); + test->RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + StrCat("The input \"input\" for ", node_def.op(), " must be a tensor")); } { // Input rank != 4 - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestTensor("input", {16, 32}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "The input to DepthToSpace must be rank 4, at my_shuffle"); - } - { - // Channels not divisible by block_size, should fail. - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 3, "NCHW"); - AddTestTensor("input", {16, 32, 32}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Number of channels must be divisible by " - "block_size*block_size, at my_shuffle"); + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 2, "NCHW"); + test->AddTestTensor("input", {1, 16, 32}); + test->RunValidationAndConversion( + node_def, absl::StatusCode::kInvalidArgument, + StrCat("The input to ", node_def.op(), " must be rank 4")); } { // Unsupported format, should fail. - Reset(); + test->Reset(); NodeDef node_def = GetDepthSpaceShuffleNodeDef( - DT_FLOAT, 2, "NCHW_VECT_C"); - AddTestTensor("input", {16, 32, 32}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Data format NCHW_VECT_C is not supported, at my_shuffle"); + test->get_tf_type(), 2, "NCHW_VECT_C"); + test->AddTestTensor("input", {1, 16, 32, 32}); + test->RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "Data format NCHW_VECT_C is not supported"); + } + if (test->get_trt_mode() != TrtTestMode::kDynamicShape) { + // In dynamic shape mode, we cannot check input dimension values at + // conversion time therefore we cannot confirm block_size vs input dim + // consistency. We rely on the user to provide a valid TF graph. Otherwise + // TRT will fail with a runtime error. + if (std::is_same::value) { + // Channels not divisible by block_size, should fail. + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 3, "NCHW"); + test->AddTestTensor("input", {1, 16, 32, 32}); + test->RunValidationAndConversion(node_def, + absl::StatusCode::kInvalidArgument, + "Number of channels must be divisible by" + " block_size*block_size"); + } else { + { // Width not divisible by block_size, should fail. + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 3, "NCHW"); + test->AddTestTensor("input", {1, 16, 9, 32}); + test->RunValidationAndConversion(node_def, + absl::StatusCode::kInvalidArgument, + "Width and height must be divisible by" + " block_size"); + } + { + // Height not divisible by block_size, should fail. 
+ test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 3, "NCHW"); + test->AddTestTensor("input", {1, 16, 32, 9}); + test->RunValidationAndConversion(node_def, + absl::StatusCode::kInvalidArgument, + "Width and height must be divisible by" + " block_size"); + } + } } - TestConvertDepthToSpace(this); - TestConvertDepthToSpace(this); - TestConvertDepthToSpace(this); + for (auto p : params) { + test->Reset(); + const NodeDef node = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), p.block_size, p.data_format); + test->AddTestTensor("input", p.input_dims, p.input_value); + test->TestOpConverter(node, p.expected_output_dims, status, Status::OK(), + ElementsAreArray(p.expected_output)); + } } -template -void TestConvertSpaceToDepth(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - const std::vector common_input = InitTestVector(16); - std::vector> params = { +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertDepthToSpace) { + const std::vector common_input = CreateVectorIota(16); + std::vector params = { { - /*input_shape=*/{1, 4, 4}, + /*input_shape=*/{1, 4, 2, 2}, /*input_value=*/common_input, /*block_size=*/2, /*data_format=*/"NCHW", - /*expected_output_dims=*/{4, 2, 2}, + /*expected_output_dims=*/{1, 1, 4, 4}, /*expected_output=*/ - CastTestVector( - {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15}), + {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15}, }, { - /*input_shape=*/{4, 4, 1}, + /*input_shape=*/{1, 2, 2, 4}, /*input_value=*/common_input, /*block_size=*/2, /*data_format=*/"NHWC", - /*expected_output_dims=*/{2, 2, 4}, + /*expected_output_dims=*/{1, 4, 4, 1}, /*expected_output=*/ - CastTestVector( - {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}), + {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}, }, { - /*input_shape=*/{1, 4, 4}, + /*input_shape=*/{1, 16, 1, 1}, /*input_value=*/common_input, /*block_size=*/4, /*data_format=*/"NCHW", - /*expected_output_dims=*/{16, 1, 1}, - /*expected_output=*/InitTestVector(16), + /*expected_output_dims=*/{1, 1, 4, 4}, + /*expected_output=*/CreateVectorIota(16), }, { - /*input_shape=*/{4, 4, 2}, - /*input_value=*/InitTestVector(32), + /*input_shape=*/{1, 2, 2, 8}, + /*input_value=*/CreateVectorIota(32), /*block_size=*/2, /*data_format=*/"NHWC", - /*expected_output_dims=*/{2, 2, 8}, - /*expected_output=*/CastTestVector({0, 1, 2, 3, 8, - 9, 10, 11, 4, 5, - 6, 7, 12, 13, 14, - 15, 16, 17, 18, 19, - 24, 25, 26, 27, 20, - 21, 22, 23, 28, 29, - 30, 31}), - }, - }; - - TestConvertDepthSpaceShuffle(test, params); -} - -TEST_F(OpConverterTest, ConvertSpaceToDepth) { - { - // Input is a weight, should fail. - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestWeights("input", {4, 1, 1}, {1, 2, 3, 4}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"input\" for SpaceToDepth must be a " - "tensor, at my_shuffle"); - } - { - // Input rank != 4 - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestTensor("input", {16, 32}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "The input to SpaceToDepth must be rank 4, at my_shuffle"); - } - { - // Width not divisble by block_size, should fail. 
- Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 3, "NCHW"); - AddTestTensor("input", {16, 9, 32}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Width and height must be divisible by " - "block_size, at my_shuffle"); - } - { - // Height not divisble by block_size, should fail. - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 3, "NCHW"); - AddTestTensor("input", {16, 32, 9}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Width and height must be divisible by " - "block_size, at my_shuffle"); - } - { - // Unsupported format, should fail. - Reset(); - NodeDef node_def = GetDepthSpaceShuffleNodeDef( - DT_FLOAT, 2, "NCHW_VECT_C"); - AddTestTensor("input", {16, 32, 32}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Data format NCHW_VECT_C is not supported, at my_shuffle"); - } - - TestConvertSpaceToDepth(this); - TestConvertSpaceToDepth(this); - TestConvertSpaceToDepth(this); -} - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) -// Get the NodeDef for ClipByValue. -NodeDef GetClipByValueNodeDef(DataType dtype) { - Scope s = Scope::NewRootScope(); - auto t = ops::Placeholder(s.WithOpName("t"), dtype); - auto clip_value_min = ops::Placeholder(s.WithOpName("clip_value_min"), dtype); - auto clip_value_max = ops::Placeholder(s.WithOpName("clip_value_max"), dtype); - auto clip = ops::ClipByValue(s.WithOpName("my_clip"), t, clip_value_min, - clip_value_max); - return clip.operation.node()->def(); -} - -template -void TestConvertClipByValue(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - struct TestParams { - std::vector dims; - std::vector input_value; - CType clip_value_min; - CType clip_value_max; - std::vector expected_output; - }; + /*expected_output_dims=*/{1, 4, 4, 2}, + /*expected_output=*/{0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, + 7, 12, 13, 14, 15, 16, 17, 18, 19, 24, 25, + 26, 27, 20, 21, 22, 23, 28, 29, 30, 31}, + }}; - const std::vector common_input = InitTestVector(6); - std::vector params = { + TestConvertDepthSpaceShuffle(this, params); +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertSpaceToDepth) { + const std::vector common_input = CreateVectorIota(16); + std::vector params = { + { + /*input_shape=*/{1, 1, 4, 4}, + /*input_value=*/common_input, + /*block_size=*/2, + /*data_format=*/"NCHW", + /*expected_output_dims=*/{1, 4, 2, 2}, + /*expected_output=*/ + {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15}, + }, { - /*dims=*/{1, 2, 3}, + /*input_shape=*/{1, 4, 4, 1}, /*input_value=*/common_input, - /*clip_value_min=*/CType(2), - /*clip_value_max=*/CType(5), + /*block_size=*/2, + /*data_format=*/"NHWC", + /*expected_output_dims=*/{1, 2, 2, 4}, /*expected_output=*/ - {CType(2), CType(2), CType(2), CType(3), CType(4), CType(5)}, + {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}, }, { - /*dims=*/{2, 1, 3}, + /*input_shape=*/{1, 1, 4, 4}, /*input_value=*/common_input, - /*clip_value_min=*/CType(-1), - /*clip_value_max=*/CType(8), - /*expected_output=*/common_input, + /*block_size=*/4, + /*data_format=*/"NCHW", + /*expected_output_dims=*/{1, 16, 1, 1}, + /*expected_output=*/CreateVectorIota(16), + }, + { + /*input_shape=*/{1, 4, 4, 2}, + /*input_value=*/CreateVectorIota(32), + /*block_size=*/2, + /*data_format=*/"NHWC", + /*expected_output_dims=*/{1, 2, 2, 8}, + /*expected_output=*/{0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, + 7, 12, 13, 14, 15, 16, 17, 18, 19, 24, 25, + 26, 27, 20, 21, 22, 23, 28, 29, 30, 31}, }, }; + TestConvertDepthSpaceShuffle(this, params); +} 
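The DepthToSpace and SpaceToDepth expectations above are inverse permutations of each other: because the input is an iota vector, each expected_output is literally the gather permutation the op applies, so composing the two NCHW block_size=2 vectors gives the identity, and each output index follows the DCR interpretation of the depth dimension (output position (c', y, x) reads input channel ((y % b) * b + x % b) * oc + c'). A small standalone check of both properties, independent of TensorRT and not part of the patch:

  #include <cassert>
  #include <vector>

  int main() {
    // Expected outputs of the NCHW, block_size=2 cases above (input = iota 0..15).
    std::vector<int> d2s = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15};
    std::vector<int> s2d = {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15};
    // SpaceToDepth undoes DepthToSpace: composing the permutations is the identity.
    for (int k = 0; k < 16; ++k) assert(d2s[s2d[k]] == k);
    // DepthToSpace (DCR): input shape (C=4, H=2, W=2) maps to output shape (1, 4, 4).
    const int b = 2, H = 2, W = 2, oc = 1;
    for (int y = 0; y < H * b; ++y) {
      for (int x = 0; x < W * b; ++x) {
        const int ic = ((y % b) * b + x % b) * oc;        // source channel (c' = 0)
        const int in_idx = (ic * H + y / b) * W + x / b;  // flat NCHW input index
        assert(d2s[y * (W * b) + x] == in_idx);
      }
    }
    return 0;
  }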
- for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetClipByValueNodeDef(dtype); - test->AddTestTensor("t", params[i].dims, 1, TfDataTypeToTrt(dtype)); - test->AddTestWeights("clip_value_min", {1}, - {params[i].clip_value_min}); - test->AddTestWeights("clip_value_max", {1}, - {params[i].clip_value_max}); - test->RunValidationAndConversion(node_def); +TEST_P(OpConverter_FP32_FP16_Test, ConvertClipByValue) { + Scope s = Scope::NewRootScope(); + auto t = ops::Placeholder(s.WithOpName("t"), tf_type_); + auto clip_value_min = + ops::Placeholder(s.WithOpName("clip_value_min"), tf_type_); + auto clip_value_max = + ops::Placeholder(s.WithOpName("clip_value_max"), tf_type_); + auto clip = ops::ClipByValue(s.WithOpName("my_clip"), t, clip_value_min, + clip_value_max); + const NodeDef& node_def = clip.operation.node()->def(); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_clip", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].dims, output.tensor()->getDimensions()); - - DataVec input_data{{"t", test::AsTensor(params[i].input_value)}}; - DataVec output_data{ - {"my_clip", ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} + nvinfer1::DataType trt_type_; + TF_ASSERT_OK(TfTypeToTrtType(tf_type_, &trt_type_)); -TEST_F(OpConverterTest, ConvertClipByValue) { { // Input is a weight, should fail. Reset(); - NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT); - AddTestWeights("t", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestWeights("clip_value_min", {1}, {1}); - AddTestWeights("clip_value_max", {1}, {5}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + AddTestWeights("t", {1, 2, 3}, {1, 2, 3, 4, 5, 6}, tf_type_); + AddTestWeights("clip_value_min", {1}, {1}, tf_type_); + AddTestWeights("clip_value_max", {1}, {5}, tf_type_); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"t\" for ClipByValue must be a " - "tensor, at my_clip"); + "tensor"); } { // Clip min is a tensor, should fail. Reset(); - NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT); AddTestTensor("t", {1, 2, 3}); AddTestTensor("clip_value_min", {1}); - AddTestWeights("clip_value_max", {1}, {1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + AddTestWeights("clip_value_max", {1}, {1}, tf_type_); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"clip_value_min\" for ClipByValue " - "must be a constant, at my_clip"); + "must be a constant"); } { // Clip max is a tensor, should fail. 
Reset(); - NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT); AddTestTensor("t", {1, 2, 3}); - AddTestWeights("clip_value_min", {1}, {1}); + AddTestWeights("clip_value_min", {1}, {1}, tf_type_); AddTestTensor("clip_value_max", {1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"clip_value_max\" for ClipByValue " - "must be a constant, at my_clip"); + "must be a constant"); } - TestConvertClipByValue(this); - TestConvertClipByValue(this); + struct TestParams { + std::vector dims; + int clip_value_min; + int clip_value_max; + std::vector expected_output; + }; + + const std::vector common_input = CreateVectorIota(6); + + std::vector params = {{ + /*dims=*/{6}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 6}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 2, 3}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 2, 3, 1}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 1, 3, 1, 2}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 1, 3, 1, 2, 1}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{2, 1, 3}, + /*clip_value_min=*/-1, + /*clip_value_max=*/8, + /*expected_output=*/common_input, + }}; + + for (auto p : params) { + Reset(); + + AddTestTensor("t", p.dims, tf_type_, common_input); + AddTestWeights("clip_value_min", {1}, {p.clip_value_min}, tf_type_); + AddTestWeights("clip_value_max", {1}, {p.clip_value_max}, tf_type_); + + TestOpConverter(node_def, p.dims, + /*expected_conversion_status=*/Status::OK(), + /*expected_runtime_status=*/Status::OK(), + /*matcher=*/ElementsAreArray(p.expected_output)); + } } -#endif // IS_TRT_VERSION_GE(5, 1, 2, 0) // Get the NodeDef for SquaredDifference. NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { @@ -5994,222 +8707,197 @@ NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { return squared_diff.operation.node()->def(); } -template -void TestConvertSquaredDifference(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +TEST_P(OpConverter_FP32_FP16_Test, ConvertSquaredDifference) { + { + // Input is a weight, should fail. + Reset(); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type_); + AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); + AddTestTensor("y", {1, 1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"x\" for SquaredDifference must be " + "a tensor"); + } struct TestParams { std::vector dims_x; std::vector dims_y; - std::vector value_x; - std::vector value_y; + std::vector value_x; + std::vector value_y; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; + Status status; + Status runtime_status; }; - const std::vector common_input = InitTestVector(6); + const std::vector common_input = CreateVectorIota(6); std::vector params = { + {/*dims_x=*/{1, 2, 3}, + /*dims_y=*/{1, 7, 5}, + /*value_x=*/common_input, + /*value_y=*/std::vector(7 * 5, 0), + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/common_input, + trt_mode_ == TrtTestMode::kDynamicShape + ? 
Status::OK() + : errors::InvalidArgument("Infeasible broadcast scheme"), + errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile.")}, { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 2, 3}, + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 2, 3}, /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, -1, 3, 0, 10, -7}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 4, 1, 9, 36, 144}), + /*value_y=*/{0, -1, 3, 0, 10, -7}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 4, 1, 9, 36, 144}, }, { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 1, 3}, + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 1, 3}, /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, 1, 2}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 0, 0, 9, 9, 9}), + /*value_y=*/{0, 1, 2}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 0, 0, 9, 9, 9}, }, }; - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetSquaredDifferenceNodeDef(dtype); - test->AddTestTensor("x", params[i].dims_x, 1, TfDataTypeToTrt(dtype)); - test->AddTestTensor("y", params[i].dims_y, 1, TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_squared_diff", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - - DataVec input_data{{"x", test::AsTensor(params[i].value_x)}, - {"y", test::AsTensor(params[i].value_y)}}; - DataVec output_data{ - {"my_squared_diff", - ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -TEST_F(OpConverterTest, ConvertSquaredDifference) { - { - // Input is a weight, should fail. - Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); - AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestTensor("y", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"x\" for SquaredDifference must be " - "a tensor, at my_squared_diff"); - } - { - // Shapes are not broadcastable, should fail. 
+ for (auto p : params) { Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); - AddTestTensor("x", {2, 3}); - AddTestTensor("y", {7, 5}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Infeasible broadcast scheme"); + const NodeDef node = GetSquaredDifferenceNodeDef(tf_type_); + AddTestTensor("x", p.dims_x, p.value_x); + AddTestTensor("y", p.dims_y, p.value_y); + TestOpConverter(node, p.expected_output_dims, p.status, p.runtime_status, + ElementsAreArray(p.expected_output)); } - - TestConvertSquaredDifference(this); - TestConvertSquaredDifference(this); } -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -// TODO: @mconley @jdekhtiar - Reactivate when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS template -NodeDef MakeResizeNodeDef(std::string name, DataType dtype, - bool align_corners) { +NodeDef MakeResizeNodeDef(DataType dtype, bool align_corners) { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), dtype); auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32); auto attrs = typename OpType::Attrs().AlignCorners(align_corners); - auto resize = OpType(s.WithOpName(name), input, size, attrs); + auto resize = OpType(s.WithOpName("my_resize"), input, size, attrs); return resize.operation.node()->def(); } -template struct ResizeTestParams { std::vector input_dims; std::vector output_resize_dims; - std::vector input_values; + std::vector input_value; + bool size_as_tensor; bool align_corners; std::vector expected_output_dims; - std::vector expected_nearest_output_values; - std::vector expected_bilinear_output_values; + std::vector expected_nearest_output_values; + std::vector expected_bilinear_output_values; + Status status; }; -template -void TestConvertResize(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - std::vector> params{ - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*output_resize_dims=*/{2, 3}, // H_out, W_out - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*align_corners=*/false, - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_nearest_output_values=*/ - CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), - /*expected_bilinear_output_values=*/ - CastTestVector({2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}), - }, - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*output_resize_dims=*/{2, 3}, // H_out, W_out - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*align_corners=*/true, - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_nearest_output_values=*/ - CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), - /*expected_bilinear_output_values=*/ - CastTestVector({2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}), - }}; - -// This use case is not supported as of TRT version 7.1 -#if IS_TRT_VERSION_GE(7, 1, 0, 0) - if (OpType == ops::ResizeBilinear) { - params.erase(params.begin()); +template +void TestConvertResize(ParameterizedOpConverterTestBase* test, + ResizeTestParams& p) { + test->Reset(); + // Create resize node. + NodeDef node_def = + MakeResizeNodeDef(test->get_tf_type(), p.align_corners); + + test->AddTestTensor("input", p.input_dims, test->get_tf_type(), + p.input_value); + // Create output size. + if (p.size_as_tensor) { + std::vector size_dims{2}; + std::vector size_values{p.output_resize_dims}; + test->AddTestTensor("size", size_dims, DT_INT32, size_values, size_dims); + } else { + test->AddTestWeights("size", {2}, p.output_resize_dims, DT_INT32); } -#endif - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - // Create resize node. 
- NodeDef node_def = - MakeResizeNodeDef("my_resize", dtype, params[i].align_corners); - // Create input tensor - test->AddTestTensor("input", params[i].input_dims, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - // Create output size. - test->AddTestWeights("size", {2}, params[i].output_resize_dims); - - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_resize", &output)); + std::vector expected_out; - // Create input data for tensors. - const DataVec input_data{ - {"input", test::AsTensor(params[i].input_values)}}; - DataVec output_data{ - {"my_resize", ConstructTensor( - params[i].expected_nearest_output_values.size())}}; - - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - - if (node_def.op() == "ResizeBilinear") { - ExpectArrayAlmostEqual(params[i].expected_bilinear_output_values, - GetSpanForData(output_data[0]), - CType(1e-3)); - } else if (node_def.op() == "ResizeNearestNeighbor") { - ExpectArrayAlmostEqual(params[i].expected_nearest_output_values, - GetSpanForData(output_data[0]), - CType(1e-3)); - } + if (node_def.op() == "ResizeBilinear") { + expected_out = p.expected_bilinear_output_values; + } else if (node_def.op() == "ResizeNearestNeighbor") { + expected_out = p.expected_nearest_output_values; + } else { + ASSERT_TRUE(false); } + + test->TestOpConverter(node_def, p.expected_output_dims, + /*expected_conversion_status=*/p.status, + /*expected_runtime_status=*/p.status, + /*matcher=*/ElementsAreArray(expected_out), + /*out_tf_types=*/{DT_FLOAT}); } -TEST_F(OpConverterTest, ConvertResize) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertResize) { { // First input is weight, should fail. Reset(); - NodeDef node_def = - MakeResizeNodeDef("my_resize", DT_FLOAT, true); + NodeDef node_def = MakeResizeNodeDef(tf_type_, + /*align_corners=*/ + true); AddTestWeights("input", {1, 2}, {1, 2}); AddTestWeights("size", {1, 2}, {1, 2}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, + node_def, absl::StatusCode::kUnimplemented, "The input \"input\" for ResizeBilinear must be a " - "tensor, at my_resize"); + "tensor"); + } + + std::vector params{ + {/*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/{2.0f, -1.0f}, + /*size_as_tensor=*/false, + /*align_corners=*/false, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_nearest_output_values=*/ + {2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}, + /*expected_bilinear_output_values=*/ + {2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}, + /*status=*/Status::OK()}, + {/*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/{2.0f, -1.0f}, + /*size_as_tensor=*/false, + /*align_corners=*/true, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_nearest_output_values=*/ + {2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}, + /*expected_bilinear_output_values=*/ + {2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}, + /*status=*/Status::OK()}}; + + if (trt_mode_ != TrtTestMode::kImplicitBatch) { + // Size as a tensor is not supported in implicit batch mode. 
+ params.push_back({/*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/{2.0f, -1.0f}, + /*size_as_tensor=*/true, + /*align_corners=*/true, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_nearest_output_values=*/ + {2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}, + /*expected_bilinear_output_values=*/ + {2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}, + /*status=*/Status::OK()}); + } + + for (auto p : params) { + TestConvertResize(this, p); + +// This use case is not supported as of TRT version 7.1 +#if IS_TRT_VERSION_GE(7, 1, 0, 0) + if (!p.align_corners) { + p.status = errors::InvalidArgument( + "Cannot Convert Bilinear Resize when align_corners=False"); + } +#endif + + TestConvertResize(this, p); } - { - // output dimension is a tensor, should fail. - Reset(); - NodeDef node_def = - MakeResizeNodeDef("my_resize", DT_FLOAT, true); - AddTestTensor("input", {1, 2}); - AddTestTensor("size", {1, 2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"size\" for ResizeBilinear must be a " - "constant, at my_resize"); - } - TestConvertResize(this); - TestConvertResize(this); - TestConvertResize(this); - TestConvertResize(this); } -#endif // TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) NodeDef MakePadNodeDef(std::string name, DataType dtype) { Scope s = Scope::NewRootScope(); @@ -6219,88 +8907,42 @@ NodeDef MakePadNodeDef(std::string name, DataType dtype) { return pad.operation.node()->def(); } -template struct PadTestParams { std::vector input_dims; std::vector pad_dims; - std::vector input_values; + std::vector pad_values; + std::vector input_values; std::vector expected_output_dims; - std::vector expected_output_values; + std::vector expected_output_values; + Status status; }; -template -void TestConvertPad(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - std::vector> params{ - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_output_values=*/ - CastTestVector({0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}), - }, - }; - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - // Create pad node. - NodeDef node_def = MakePadNodeDef("my_pad", dtype); - // Create input tensor - test->AddTestTensor("input", params[i].input_dims, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - // Create output size. - test->AddTestWeights("padding", params[i].pad_dims, - {0, 0, 1, 0, 0, 1, 0, 0}); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("padding", &output)); - - // Create input data for tensors. - const DataVec input_data{ - {"input", test::AsTensor(params[i].input_values)}}; - DataVec output_data{ - {"my_pad", - ConstructTensor(params[i].expected_output_values.size())}}; - - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - ExpectArrayAlmostEqual(params[i].expected_output_values, - GetSpanForData(output_data[0]), CType(1e-5)); - } -} - -TEST_F(OpConverterTest, ConvertPad) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertPad) { { // First input is weight, should fail. 
Reset(); - NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); - AddTestWeights("input", {1, 2}, {1, 2}); + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); + AddTestWeights("input", {1, 2}, {1, 2}, tf_type_); AddTestWeights("padding", {1, 2}, {1, 2}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"tensor\" for Pad must be a " "tensor"); } { // padding is a tensor, should fail. Reset(); - NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); AddTestTensor("input", {1, 2}); AddTestTensor("padding", {1, 2}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"paddings\" for Pad must be a " "constant"); } - TestConvertPad(this); - TestConvertPad(this); { // Make sure that ranges are inferred across a Pad. Reset(); - NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); - AddTestTensor("input", {1, 2, 1}); + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); + AddTestTensor("input", {1, 1, 2, 1}); AddTestWeights("padding", {4, 2}, {0, 0, 1, 0, 0, 1, 0, 0}); TRT_TensorOrWeights input; TRT_TensorOrWeights output; @@ -6309,17 +8951,758 @@ TEST_F(OpConverterTest, ConvertPad) { TF_EXPECT_OK(GetTensorOrWeights("my_pad", &output)); ITensorProxyPtr input_tensor = input.tensor(); converter_->ProvideQuantizationRange(&input_tensor, -5.0f, 5.0f); - // Input range should be inferred across pad. - PropagateQuantizationRanges(); auto ranges = quantization_ranges(); EXPECT_EQ(5.0f, ranges[input.tensor()->trt_tensor()]); - EXPECT_EQ(5.0f, ranges[output.tensor()->trt_tensor()]); } + + std::vector params{ + // 1 padding dim + { + /*input_dims=*/{1, 1, 3, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 0, 0, 0, 1, 0, 0}, + /*input_values=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_dims=*/{1, 1, 4, 2}, // N, H, W, C + /*expected_output_values=*/ + {1, 2, 3, 4, 5, 6, 0, 0}, + }, + { + /*input_dims=*/{1, 1, 3, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 0, 0, 0, 0, 0, 1}, + /*input_values=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_dims=*/{1, 1, 3, 3}, // N, H, W, C + /*expected_output_values=*/ + {1, 2, 0, 3, 4, 0, 5, 6, 0}, + }, + { + /*input_dims=*/{1, 1, 3, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 0, 0, 0}, + /*input_values=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_dims=*/{1, 2, 3, 2}, // N, H, W, C + /*expected_output_values=*/ + {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6}, + }, + // 2 padding dims + { + /*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 0, 0}, + /*input_values=*/{2.0f, -1.0f}, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_output_values=*/ + {0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 0, 0}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 2, 3, 2}, // N, H, W, C + /*expected_output_values=*/ + {0, 0, 0, 0, 0, 0, 2, -1, 3, 4, 0, 0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1, 2}, // N, C, H, W, D + /*pad_dims=*/{5, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 0, 0, 
0, 0}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 2, 3, 1, 2}, // N, H, W, C + /*expected_output_values=*/ + {0, 0, 0, 0, 0, 0, 2, -1, 3, 4, 0, 0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1, 2}, // N, C, H, W, D + /*pad_dims=*/{5, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 0, 1, 0, 0, 1, 1, 0, 0}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 2, 2, 3, 2}, // N, H, W, C + /*expected_output_values=*/ + {0., 0., 2., -1., 0., 0., 0., 0., 3., 4., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {1, 0, 0, 0, 0, 1, 0, 0}, + /*input_values=*/{2.0f, -1.0f}, + /*expected_output_dims=*/{2, 1, 3, 1}, // N, H, W, C + /*expected_output_values=*/{0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::InvalidArgument("Padding layer does not support " + "padding on batch dimension") + : Status::OK()}, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 1, 1}, + /*input_values=*/{2.0f, -1.0f}, + /*expected_output_dims=*/{}, // N, H, W, C + /*expected_output_values=*/{}, + errors::InvalidArgument("Padding layer does not support padding on " + "> 2")}, + PadTestParams{ + /*input_dims=*/{1, 2, 2}, // N, H, W + /*pad_dims=*/{3, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 3, 3}, // N, H, W, C + /*expected_output_values=*/ + {0., 0., 0., 2., -1., 0., 3., 4., 0.}, + errors::InvalidArgument("Convertpad requires at least 4D input")}}; + + for (auto p : params) { + Reset(); + // Create pad node. + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); + // Create input tensor. + AddTestTensor("input", p.input_dims, p.input_values); + // Create output size. 
+ AddTestWeights("padding", p.pad_dims, p.pad_values); + TestOpConverter(node_def, p.expected_output_dims, p.status, p.status, + ElementsAreArray(p.expected_output_values)); + } +} + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +class OpConverter_Select : public ParameterizedOpConverterTestBase { + public: + void RunTest(const string& opName); +}; + +void OpConverter_Select::RunTest(const string& opName) { + const auto testing_SelectV2 = opName == "SelectV2"; + const int maxVal = 32; + const std::array par_name = {"cond", "then", "else"}; + std::array par_type = {DT_BOOL, tf_type_, tf_type_}; + std::vector config(3, 0); + std::array*, 3> par_dims; + std::vector data_then(1, 0), data_else(1, maxVal), + expected_output(1, maxVal); + std::array*, 3> par_value = {nullptr, &data_then, + &data_else}; + std::vector data_cond(1, 0); + + auto set_parameters = [&](DataType cond_type = DT_BOOL) { + Reset(); + if (config[0]) { + AddTestTensor(par_name[0], *par_dims[0], cond_type, data_cond); + } else { + AddTestWeights(par_name[0], {1}, data_cond, cond_type); + } + for (int i = 1; i < 3; i++) { + if (config[i]) { + AddTestTensor(par_name[i], *par_dims[i], par_type[i], *par_value[i]); + } else { + AddTestWeights(par_name[i], {1}, *par_value[i], par_type[i]); + } + } + }; + + auto set_dimension = [this](const nvinfer1::Dims* dims, + std::vector& dims_param, + std::string* comment = nullptr) { + const auto nbDims = dims->nbDims; + if (comment) { + *comment = "batch_dim: " + std::to_string(nbDims + 1) + ", " + + DebugString(*dims); + } + + dims_param.resize(nbDims); + for (int i = 0; i < nbDims; i++) dims_param[i] = dims->d[i]; + }; + + auto adjust_comments = [this](const nvinfer1::Dims* p_dims, + std::string* p_comment) { + if (p_dims[0].nbDims == p_dims[1].nbDims) return; + + const int idx = p_dims[0].nbDims < p_dims[1].nbDims ? 0 : 1; + + nvinfer1::Dims dims; + dims.nbDims = p_dims[1 - idx].nbDims; + int i = 0; + for (; i < dims.nbDims - p_dims[idx].nbDims; i++) dims.d[i] = 1; + + for (int j = i; i < dims.nbDims; i++) dims.d[i] = p_dims[idx].d[i - j]; + + *(p_comment + idx) = + "batch_dim: " + std::to_string(1) + ", " + DebugString(dims); + *(p_comment + 1 - idx) = + "batch_dim: " + std::to_string(p_dims[idx].nbDims + 1) + ", " + + DebugString(p_dims[1 - idx]); + }; + + auto assign_values = [this]( + const std::array*, 3>& dims, + std::array*, 3> par_value, + std::vector& data_cond, int use_indices = 0, + const std::vector* expected_out = nullptr, + std::vector* expect_dims_pntr = nullptr) { + size_t rank[3]; + const auto dim_len = + dims[0]->size() > dims[1]->size() ? dims[0]->size() : dims[1]->size(); + std::vector exp_dims; + if (!expect_dims_pntr) expect_dims_pntr = &exp_dims; + + auto& expect_dims = *expect_dims_pntr; + expect_dims.resize(dim_len); + expect_dims.assign(dim_len, 0); + for (int i = 0; i < 3; i++) { + if (dims[i]) { + const auto& dim = *dims[i]; + for (auto j = 0; j < dims[i]->size(); j++) { + if (expect_dims[j] < dim[j]) expect_dims[j] = dim[j]; + } + + rank[i] = std::accumulate(std::begin(dim), std::end(dim), 1, + std::multiplies()); + } else { + assert(i >= 2); + rank[i] = rank[i - 1]; + } + } + + // Create data for ConvertSelectV2 testing. + for (int k = 1; k <= 2; k++) { + auto& data = *par_value[k]; + data.resize(rank[k]); + if (use_indices) { + const int mult = k == 1 ? 1 : -1; + for (int i = 0; i < rank[k]; i++) { + data[i] = mult * (i + 1); + } + } else { + for (int i = 0; i < rank[k]; i++) { + data[i] = k == 1 ? 
data[i >> 1] + i % 2 : maxVal - (*par_value[1])[i]; + } + } + } + + data_cond.resize(rank[0]); + data_cond[0] = 0; + for (int i = 0; i < rank[0]; i++) { + data_cond[i] = i % 2 ? 1 - data_cond[i >> 1] : data_cond[i >> 1]; + } + + if (!expected_out || expected_out->size() > 0) { + auto& expected_output = *par_value[0]; + const auto rank_out = + std::accumulate(std::begin(expect_dims), std::end(expect_dims), 1, + std::multiplies()); + + assert(rank_out == (expected_out ? expected_out->size() + : rank[use_indices >= 0 ? 0 : 1])); + + expected_output.resize(rank_out); + const auto& data_then = *par_value[1]; + const auto& data_else = *par_value[2]; + const auto div = use_indices >= 0 ? 1 : rank_out / rank[0]; + for (int i = 0; i < rank_out; i++) { + expected_output[i] = + expected_out ? (*expected_out)[i] + : data_cond[i / div] ? data_then[i] : data_else[i]; + } + } + }; + + auto shape_error_msg = [&](const NodeDef& node, bool same_then_else = true) { + nvinfer1::Dims shape[3]; + const auto j = same_then_else ? 0 : 1; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Creating dynamic shapes corresponding to 'cond' and 'then' parameters. + for (int i = 0; i < 2; i++) { + for (int j = shape[i].nbDims = par_dims[i]->size(); j--;) { + shape[i].d[j] = -1; + } + } + } else { + for (int i = 0; i < 2; i++) { + DimsAdapter(*par_dims[i + j]).TrtDims(&shape[i + j]); + } + } + + return input_shapes_error_msg(shape[j], shape[j + 1], node, + !same_then_else); + }; + + auto run_test = [&](const NodeDef& node, const std::vector& exp_dims) { + const bool same_then_else_shapes = *par_dims[1] == *par_dims[2]; + const bool same_cond_chape = *par_dims[0] == *par_dims[1]; + const auto nMax = testing_SelectV2 ? 2 : 1; + for (int n = 0; n < nMax; n++) { + set_parameters(); + if (testing_SelectV2 || (same_then_else_shapes && same_cond_chape)) { + TestOpConverter(node, exp_dims, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + const auto err_msg = shape_error_msg(node, same_then_else_shapes); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + + if (!n) { + // Changing the condition and expected_output. + for (auto idx = data_cond.size(); idx--;) + data_cond[idx] = 1 - data_cond[idx]; + + // Compare of the shapes if the tensors "then" and "else". + if (!same_then_else_shapes) { + // Shapes are different: + // assigning +1's and -1's to the elements + // of the tensors "then" and "else", respectively + for (int p = 1; p <= 2; p++) { + auto& values = *par_value[p]; + const auto val = p == 1 ? 1 : -1; + for (auto idx = values.size(); idx--;) values[idx] = val; + } + // and set the appropriate expected values. + for (auto idx = expected_output.size(); idx--;) + expected_output[idx] = expected_output[idx] > 0 ? -1 : 1; + } else { + // Shapes are the same: + // just change the signs of the expected values. + for (auto idx = expected_output.size(); idx--;) + expected_output[idx] = -expected_output[idx]; + } + } + } + }; + + std::array data_types = {DT_FLOAT, DT_HALF, DT_INT32}; + NodeDef node; + TF_CHECK_OK(NodeDefBuilder("op", opName) + .Input("cond", 0, DT_BOOL) + .Input("then", 0, tf_type_) + .Input("else", 0, tf_type_) + .Finalize(&node)); + + const std::vector> dims_params = { + {8}, {8, 2, 4}, {32, 32, 3200}}; + + // All parameters passed as the weights OR 1-element tensors. 
+ par_dims = {&dims_params[0], &dims_params[0], &dims_params[0]}; + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + const auto& err = convert_not_supported_implicit(node.op(), node.name()); + do { + set_parameters(); + RunValidationAndConversion(node, absl::StatusCode::kUnimplemented, err); + } while (nextTensorWeightConfiguration(config)); + return; + } + + // Parameter 'cond' can only be of type DT_BOOL. + do { + for (auto cond_type : {DT_INT32, DT_FLOAT, DT_HALF}) { + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(cond_type, &trt_type)); + const auto error_msg = + unexpected_type_error_msg(trt_type, nvinfer1::DataType::kBOOL, node); + set_parameters(cond_type); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + error_msg); + } + } while (nextTensorWeightConfiguration(config)); + + std::string err_msg = bool_weight_error_msg(node); + + std::vector dims_const = {1}; + par_dims = {&dims_const, &dims_const, &dims_const}; + // Loop when condition is reversed and the expected_output + // should change from 'else' to 'then'. + for (int i = 0; i < 2; i++) { + do { + set_parameters(); + if (config[0]) { + TestOpConverter(node, {1}, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + } while (nextTensorWeightConfiguration(config)); + + // Changing the condition and expected_output. + data_cond[0] = 1 - data_cond[0]; + expected_output[0] = (*par_value[1 + i])[0]; + } + + // All parameters passed as the tensors. + for (int i = 0; i < 3; i++) { + config[i] = 1; + } + + par_value[0] = &expected_output; + if (trt_mode_ == TrtTestMode::kExplicitBatch) { + // Testing infeasible broadcast schemes. + // For that subtest dims('then') will be equal to dims('else'). + std::string bc_comment[2]; + std::vector dims[4]; + par_dims = {dims, dims + 1, dims + 1}; + const nvinfer1::Dims infeasible_dims[] = { + {3, {4, 3, 2}}, {4, {4, 3, 2, 5}}, {3, {4, 1, 3}}, + {3, {4, 3, 2}}, {3, {4, 3, 2}}, {5, {4, 3, 2, 5, 2}}}; + + auto iMax = sizeof(infeasible_dims) / sizeof(infeasible_dims[0]); + // Loop for all pairs of nvinfer1::Dims from infeasible_dims. + for (int i = 0; i < iMax; i += 2) { + // Loop for all permutations on 2 elements which will assign + // each pairs of nvinfer1::Dims from infeasible_dims to + // (dims('cond'), dims('then')) and (dims('then'), dims('cond')), + // respectively. + for (int k = 0; k < 2; k++) { + for (int j = 0; j < 2; j++) { + set_dimension(infeasible_dims + i + (j + k) % 2, dims[j], + bc_comment + (j + k) % 2); + } + + if (testing_SelectV2) { + adjust_comments(infeasible_dims + i, bc_comment); + err_msg = "Infeasible broadcast scheme (" + bc_comment[k] + " vs " + + bc_comment[1 - k]; + } else { + err_msg = shape_error_msg(node); + } + + set_parameters(); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + } + + // Tests for exactly two identical dims for any two out of 3 tensors. + const nvinfer1::Dims feasible_dims_2[] = { + {3, {1, 3, 2}}, {3, {4, 3, 2}}, {3, {4, 1, 2}}, {3, {4, 3, 2}}, + {3, {4, 3, 1}}, {3, {4, 3, 2}}, {3, {1, 1, 2}}, {3, {4, 3, 2}}, + {3, {1, 3, 1}}, {3, {4, 3, 2}}, {3, {4, 1, 1}}, {3, {4, 3, 2}}, + {3, {1, 1, 1}}, {3, {4, 3, 2}}, {3, {1, 3, 2}}, {3, {4, 1, 2}}, + }; + + // Expected values will be definded directly. + const std::vector expected_val_2[] = { + // Expected values for all feasible ordered pairs of dims + // for dims('then') == dims('else'), dims('then') != dims('cond'). 
+ {-1, 2, 3, -4, 5, -6, -7, 8, 9, -10, 11, -12, + -13, 14, 15, -16, 17, -18, -19, 20, 21, -22, 23, -24}, + {-1, 2, 3, -4, 5, -6, -1, 2, 3, -4, -5, 6, + -1, 2, 3, -4, 5, -6, -1, 2, -3, 4, 5, -6}, + {-1, 2, -3, 4, -5, 6, 7, -8, 9, -10, 11, -12, + 13, -14, 15, -16, 17, -18, -19, 20, -21, 22, -23, 24}, + {-1, 2, 1, -2, 1, -2, -3, 4, 3, -4, -3, 4, + -5, 6, 5, -6, 5, -6, -7, 8, -7, 8, 7, -8}, + {-1, -2, 3, 4, 5, 6, -7, -8, 9, 10, -11, -12, + -13, -14, 15, 16, 17, 18, -19, -20, -21, -22, 23, 24}, + {-1, 1, 2, -2, 3, -3, -4, 4, 5, -5, -6, 6, + -7, 7, 8, -8, 9, -9, -10, 10, -11, 11, 12, -12}, + {-1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, + -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24}, + {-1, 2, 1, -2, 1, -2, -1, 2, 1, -2, -1, 2, + -1, 2, 1, -2, 1, -2, -1, 2, -1, 2, 1, -2}, + {-1, -2, 3, 4, 5, 6, -7, -8, 9, 10, 11, 12, + -13, -14, 15, 16, 17, 18, -19, -20, 21, 22, 23, 24}, + {-1, 1, 2, -2, 3, -3, -1, 1, 2, -2, -3, 3, + -1, 1, 2, -2, 3, -3, -1, 1, -2, 2, 3, -3}, + {-1, -2, -3, -4, -5, -6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, -19, -20, -21, -22, -23, -24}, + {-1, 1, 1, -1, 1, -1, -2, 2, 2, -2, -2, 2, + -3, 3, 3, -3, 3, -3, -4, 4, -4, 4, 4, -4}, + {-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, + -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24}, + {-1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, 1, + -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1}, + {-1, 2, 1, -2, 1, -2, -3, 4, 3, -4, 3, -4, + -5, 6, 5, -6, 5, -6, -7, 8, 7, -8, 7, -8}, + {-1, 2, -3, 4, -5, 6, 1, -2, 3, -4, 5, -6, + 1, -2, 3, -4, 5, -6, -1, 2, -3, 4, -5, 6}, + // Expected values for all feasible ordered pairs of dims + // for dims('cond') == dims('else'), dims('then') != dims('else'). + {-1, 2, 3, -4, 5, -6, -7, 2, 3, -10, -11, 6, + -13, 2, 3, -16, 5, -18, -19, 2, -21, 4, 5, -24}, + {-1, 2, 3, -4, 5, -6, -1, 8, 9, -4, 11, -6, + -1, 14, 15, -4, 17, -6, -1, 20, 21, -4, 23, -6}, + {-1, 2, 1, -4, 1, -6, -7, 4, 3, -10, -11, 4, + -13, 6, 5, -16, 5, -18, -19, 8, -21, 8, 7, -24}, + {-1, 2, -1, 4, -1, 6, 7, -4, 9, -4, 11, -4, + 13, -6, 15, -6, 17, -6, -7, 20, -7, 22, -7, 24}, + {-1, 1, 2, -4, 3, -6, -7, 4, 5, -10, -11, 6, + -13, 7, 8, -16, 9, -18, -19, 10, -21, 11, 12, -24}, + {-1, -1, 3, 4, 5, 6, -4, -4, 9, 10, -6, -6, + -7, -7, 15, 16, 17, 18, -10, -10, -11, -11, 23, 24}, + {-1, 2, 1, -4, 1, -6, -7, 2, 1, -10, -11, 2, + -13, 2, 1, -16, 1, -18, -19, 2, -21, 2, 1, -24}, + {-1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, + -1, 14, -1, 16, -1, 18, -1, 20, -1, 22, -1, 24}, + {-1, 1, 2, -4, 3, -6, -7, 1, 2, -10, -11, 3, + -13, 1, 2, -16, 3, -18, -19, 1, -21, 2, 3, -24}, + {-1, -1, 3, 4, 5, 6, -1, -1, 9, 10, 11, 12, + -1, -1, 15, 16, 17, 18, -1, -1, 21, 22, 23, 24}, + {-1, 1, 1, -4, 1, -6, -7, 2, 2, -10, -11, 2, + -13, 3, 3, -16, 3, -18, -19, 4, -21, 4, 4, -24}, + {-1, -1, -1, -1, -1, -1, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, -4, -4, -4, -4, -4, -4}, + {-1, 1, 1, -4, 1, -6, -7, 1, 1, -10, -11, 1, + -13, 1, 1, -16, 1, -18, -19, 1, -21, 1, 1, -24}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {-1, 2, -1, 4, -1, 6, 1, -4, 3, -4, 5, -4, + 1, -6, 3, -6, 5, -6, -7, 2, -7, 4, -7, 6}, + {-1, 2, 1, -4, 1, -6, -1, 4, 3, -4, 3, -6, + -1, 6, 5, -4, 5, -6, -1, 8, 7, -4, 7, -6}}; + + const auto exp_dims = dims + 3; + const int kMax2 = 2; // number of permutations on 2 elements + iMax = sizeof(feasible_dims_2) / sizeof(feasible_dims_2[0]); + assert(kMax2 * iMax / 3 == + sizeof(expected_val_2) / sizeof(expected_val_2[0])); + // Broadcast shapes defined for `cond` OR for `then` and 
`else`. + // Loop for all pairs of nvinfer1::Dims from feasible_dims_2. + for (int i = 0; i < iMax; i += 2) { + // Loop for all permutations on 2 elements. + for (int k = 0; k < kMax2; k++) { + // Constructing dims for tensors 'cond' and 'then'. + // NOTE: dims('else') will be the same as dims('then'). + for (int j = 0; j < 2; j++) + set_dimension(feasible_dims_2 + i + (j + k) % 2, dims[j]); + + const std::vector* expect = expected_val_2 + i + k; + // Loop where the tensor shapes for 'cond' and 'then' are swapping. + for (int m = 0; m < 2; m++) { + assign_values(par_dims, par_value, data_cond, 1, expect, exp_dims); + run_test(node, *exp_dims); + + // Swapping dims for 'cond' and 'then' tensors. + const auto tmp = par_dims[0]; + par_dims[0] = par_dims[1]; + par_dims[1] = tmp; + expect += iMax; + } + } + } + + // Tests for pairwise different dims('cond'), dims('then'), dims('else'). + const nvinfer1::Dims feasible_dims_3[] = { + {2, {3, 2}}, {2, {3, 1}}, {2, {1, 1}}, {3, {2, 2, 1}}, + {3, {2, 1, 2}}, {3, {1, 2, 2}}, {3, {2, 1, 1}}, {3, {2, 1, 2}}, + {3, {1, 2, 2}}, {3, {2, 1, 1}}, {3, {1, 1, 2}}, {3, {1, 2, 1}}, + }; + + const std::vector expected_val_3[] = { + {-1, 1, 2, -1, 3, -1}, {-1, 1, 1, -2, 1, -3}, + {-1, -1, 3, 4, 5, 6}, {-1, -2, 1, 1, 1, 1}, + {-1, -1, -2, -2, -3, -3}, {-1, -2, -3, -4, -5, -6}, + {-1, -2, 1, 2, 3, 4, -3, -4}, {-1, -2, 3, 4, 1, 2, -3, -4}, + {-1, 1, -3, 2, 3, -2, 4, -4}, {-1, 2, -2, 4, 1, -3, 3, -4}, + {-1, 1, 2, -2, -3, 3, 4, -4}, {-1, 2, 1, -2, -3, 4, 3, -4}, + {-1, -2, -3, -4, 3, 4, 3, 4}, {-1, -2, -1, -2, 1, 2, 3, 4}, + {-1, 1, -3, 1, 2, -2, 2, -4}, {-1, 2, -1, 4, 1, -2, 3, -2}, + {-1, 1, 1, -2, -3, 2, 2, -4}, {-1, 2, 1, -1, -2, 4, 3, -2}, + {-1, -1, -2, -2, 1, 2, 1, 2}, {-1, -2, -1, -2, 1, 1, 2, 2}, + {-1, 1, -2, 1, -1, 2, -2, 2}, {-1, 1, -1, 2, -2, 1, -2, 2}, + {-1, -2, 1, 1, -1, -2, 2, 2}, {-1, -1, 1, 2, -2, -2, 1, 2}, + }; + + const int kMax3 = 6; // number of permutations on 3 elements + const std::array perm[kMax3] = {{0, 1, 2}, {0, 2, 1}, {1, 0, 2}, + {1, 2, 0}, {2, 0, 1}, {2, 1, 0}}; + par_dims = {dims, dims + 1, dims + 2}; + iMax = sizeof(feasible_dims_3) / sizeof(feasible_dims_3[0]); + assert(kMax3 * iMax / 3 == + sizeof(expected_val_3) / sizeof(expected_val_3[0])); + // Loop for all triples of nvinfer1::Dims from feasible_dims_3. + for (int i = 0; i < iMax; i += 3) { + // Loop for all permutations on 3 elements. + for (int k = 0; k < kMax3; k++) { + // Constructing dims for tensors 'cond', 'then' and 'else`. + for (int j = 0; j < 3; j++) + set_dimension(feasible_dims_3 + i + perm[k][j], dims[j]); + + const auto* expect = expected_val_3 + kMax3 * (i / 3) + k; + assign_values(par_dims, par_value, data_cond, 1, expect, exp_dims); + run_test(node, *exp_dims); + } + } + + if (!testing_SelectV2) { + // Tests for `cond` passed as a vector with N elements, where N is a batch + // size. The subtest should not pass a ConvertSelect::Validate() when one + // of following is true: + // (a) N is NOT equal to the first dimention of dims('then'); + // (b dims('cond').nbDims > 1. + // + // For all these subtest dims('then') == dims('else'). + const nvinfer1::Dims vect_dim[] = { + {1, {4}}, {3, {5, 2, 3}}, {2, {5, 2}}, {3, {5, 2, 3}}, + {1, {5}}, {3, {5, 2, 3}}, {1, {4}}, {4, {4, 3, 5, 2}}, + }; + + std::vector dims[4]; + par_dims = {dims, dims + 1, dims + 1}; + auto iMax = sizeof(vect_dim) / sizeof(vect_dim[0]); + // Loop for all pairs of nvinfer1::Dims from vector_dims. 
+ for (int i = 0; i < iMax; i += 2) { + err_msg = + vect_dim[i].nbDims != 1 || vect_dim[i].d[0] != vect_dim[i + 1].d[0] + ? input_shapes_error_msg(vect_dim[i], vect_dim[i + 1], node) + : ""; + + for (int j = 0; j < 2; j++) { + set_dimension(vect_dim + i + j, dims[j]); + } + + assign_values(par_dims, par_value, data_cond, -1); + set_parameters(); + if (err_msg.empty()) { + TestOpConverter(node, dims[1], Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + } + } + } // trt_mode_ == TrtTestMode::kExplicitBatch + + // Tests for dims('cond') == dims('then') == dims('else'). + for (auto dims : dims_params) { + par_dims = {&dims, &dims, &dims}; + assign_values(par_dims, par_value, data_cond); + + // Loop over all possible values of type_else (type_then = tf_type_). + for (const auto type_else : data_types) { + par_type[2] = type_else; + set_parameters(); + if ((par_type[1] == DT_INT32 || par_type[2] == DT_INT32) && + par_type[1] != par_type[2]) { + // ConvertSelectV2::Validation() should fail when exactly one of + // (type_then, type_else) is equal to nvinfer1::DataType::kINT32. + nvinfer1::DataType trt_type[2]; + for (int i = 0; i < 2; i++) { + TF_ASSERT_OK(TfTypeToTrtType(par_type[i + 1], trt_type + i)); + } + + err_msg = then_else_dtypes_error_msg(trt_type[0], trt_type[1], node); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } else { + TestOpConverter(node, dims, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } + } + + // Restoring the original value. + par_type[2] = tf_type_; + } + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + std::vector values_then{1, 2, 3, 4, 5, 6}; + std::vector values_else{-1, -2, -3, -4, -5, -6}; + std::vector expected_output{1, -2, 3, 4, -5, 6}; + data_cond = std::vector{1, 0, 1}; + const std::vector cond_dims{1, 3}, input_dims{1, 2, 3}; + par_dims = {&cond_dims, &input_dims, &input_dims}; + // Loop when condition is reversed and the expected_output + // should change from 'else' to 'then'. + const auto len_cond = data_cond.size(); + for (int i = 0; i < 2; i++) { + par_value[i + 1] = &values_then; + par_value[2 - i] = &values_else; + for (int j = 0; j < values_then.size(); j++) { + expected_output[j] = par_value[2 - data_cond[j % len_cond]]->at(j); + } + + set_parameters(); + if (testing_SelectV2) { + TestOpConverter(node, input_dims, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + const auto err_msg = shape_error_msg(node); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + // Changing the condition and expected_output. + for (int j = len_cond; j--;) { + data_cond[j] = 1 - data_cond[j]; + } + } + } +} + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_Select, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF, DT_INT32), + ::testing::Values(TrtPrecisionMode::FP32))); + +TEST_P(OpConverter_Select, ConvertSelectV2) { RunTest("SelectV2"); } + +TEST_P(OpConverter_Select, Convert_Select) { RunTest("Select"); } + +TEST_F(OpConverterTest, DuplicateSqueeze) { + // Define a custom converter which performs multiple squeezes. + auto op_converter = [](const OpConverterParams* params) -> Status { + if (params->validation_only) return Status::OK(); + auto input = params->inputs.at(0).tensor(); + ITensorProxyPtr output; + // Squeeze the first dimension. 
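    // (Interpretation, not verified against the SqueezeTensor API: a zero entry
    // in the dims vector below appears to mark a dimension to be dropped, with
    // the remaining entries giving the surviving sizes, so {0, 1, 2, 3} removes
    // the leading size-1 dim of the {1, 1, 2, 3} input and {0, 2, 3} then
    // removes the next one.)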
+ std::vector new_dims = {0, 1, 2, 3}; + TF_EXPECT_OK(params->converter->SqueezeTensor( + /*input=*/input, /*input_dims=*/&new_dims, /*params=*/params, + /*output=*/&output, /*op_instance=*/0)); + // Squeeze the second dimension. + new_dims = {0, 2, 3}; + TF_EXPECT_OK(params->converter->SqueezeTensor( + /*input=*/output, /*input_dims=*/&new_dims, /*params=*/params, + /*output=*/&output, /*op_instance=*/1)); + params->outputs->push_back(TRT_TensorOrWeights(output)); + return Status::OK(); + }; + // Use a simple unary op for the custom converter and add an input. + NodeDef node_def = CreateUnaryOp(DataType::DT_FLOAT); + AddTestTensor("input", {1, 1, 2, 3}); + // Override the converter for Abs to use the custom converter for this test + // only, and run conversion. + GetOpConverterRegistry()->Register("Abs", kDefaultConverterPriority + 1, + op_converter); + RunValidationAndConversion(node_def); + // Set up the inputs and outputs. + DataVec input_data; + DataVec output_data; + InputOutputData abs_input{ + "input", ConstructTensor(/*data_size=*/6, /*value=*/0, + /*tf_type=*/DataType::DT_FLOAT)}; + InputOutputData abs_output{ + "my_unary", ConstructTensor(/*data_size=*/6, /*value=*/0, + /*tf_type=*/DataType::DT_FLOAT)}; + input_data.push_back(abs_input); + output_data.push_back(abs_output); + // Build and run the cuda engine. + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); } +#endif + } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +int main(int argc, char** argv) { +// TRT >= 8.2 optimizes memory management in the builder. When all builders +// are destroyed, it unloads many resources. This test fixture will create and +// destroy hundreds of builders when run sequentially for parameterized +// tests. We can hold open an IBuilder in order to prevent TRT from unloading +// shared resources between engine builds when using TRT shared library. This +// greatly speeds up unit tests and is safe to do. +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + // This builder holds a copy of cask::KernelLibrary, which is shared with + // other builders. Other builders used during testing won't trigger costly + // loading of cask::KernelLibrary. + std::unique_ptr const holder{ + nvinfer1::createInferBuilder(*tensorflow::tensorrt::Logger::GetLogger())}; +#endif + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#else +int main(int, char**) { return 0; } +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc new file mode 100644 index 00000000000..07c9c2f1ea0 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc @@ -0,0 +1,60 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" + +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace tensorrt { + +class LoggerRegistryImpl : public LoggerRegistry { + Status Register(const string& name, nvinfer1::ILogger* logger) override { + mutex_lock lock(mu_); + if (!registry_.emplace(name, std::unique_ptr(logger)) + .second) { + return errors::AlreadyExists("Logger ", name, " already registered"); + } + return Status::OK(); + } + + nvinfer1::ILogger* LookUp(const string& name) override { + mutex_lock lock(mu_); + const auto found = registry_.find(name); + if (found == registry_.end()) { + return nullptr; + } + return found->second.get(); + } + + private: + mutable mutex mu_; + mutable std::unordered_map> + registry_ TF_GUARDED_BY(mu_); +}; + +LoggerRegistry* GetLoggerRegistry() { + static LoggerRegistryImpl* registry = new LoggerRegistryImpl; + return registry; +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h new file mode 100644 index 00000000000..2a265cf7caa --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +class LoggerRegistry { + public: + virtual Status Register(const string& name, nvinfer1::ILogger* logger) = 0; + virtual nvinfer1::ILogger* LookUp(const string& name) = 0; + virtual ~LoggerRegistry() {} +}; + +LoggerRegistry* GetLoggerRegistry(); + +class RegisterLogger { + public: + RegisterLogger(const string& name, nvinfer1::ILogger* logger) { + TF_CHECK_OK(GetLoggerRegistry()->Register(name, logger)); + } +}; + +#define REGISTER_TENSORRT_LOGGER(name, logger) \ + REGISTER_TENSORRT_LOGGER_UNIQ_HELPER(__COUNTER__, name, logger) +#define REGISTER_TENSORRT_LOGGER_UNIQ_HELPER(ctr, name, logger) \ + REGISTER_TENSORRT_LOGGER_UNIQ(ctr, name, logger) +#define REGISTER_TENSORRT_LOGGER_UNIQ(ctr, name, logger) \ + static ::tensorflow::tensorrt::RegisterLogger register_trt_logger##ctr \ + TF_ATTRIBUTE_UNUSED = \ + ::tensorflow::tensorrt::RegisterLogger(name, logger) + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc new file mode 100644 index 00000000000..01921297b98 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc @@ -0,0 +1,34 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +namespace { + +class TestLogger : public nvinfer1::ILogger { + void log(nvinfer1::ILogger::Severity severity, const char* msg) override {} +}; + +TestLogger test_logger; + +REGISTER_TENSORRT_LOGGER("test_logger", &test_logger); + +TEST(LoggerRegistryTest, RegistersCorrectly) { + auto registered_logger = GetLoggerRegistry()->LookUp("test_logger"); + EXPECT_THAT(registered_logger, Eq(&test_logger)); +} + +} // namespace diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter.h b/tensorflow/compiler/tf2tensorrt/convert/op_converter.h new file mode 100644 index 00000000000..e6f21cbed1d --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter.h @@ -0,0 +1,225 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class Converter; + +// Specifies the expected type taken by a TRT_TensorOrWeights input during op +// conversion. +// kResource is only used for resource variable ops. For an operation like +// Add(tensor, ReadVariableOp(...)), the second operand of Add is the result of +// the ReadVariableOp, which is a kWeight. +enum class TrtInputArg { kTensor = 1, kWeight = 2, kBoth = 3, kResource = 4 }; + +// Parameters for each op converter. +struct OpConverterParams { + // Constructor used for validation only. + OpConverterParams(const NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs, + TrtWeightStore* weight_store, + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision); + + // Constructor used for conversion. + OpConverterParams(Converter* converter, const NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs, + TrtWeightStore* weight_store); + + Converter* converter = nullptr; + const NodeDef& node_def; + const std::vector& inputs; + std::vector* outputs; + const bool validation_only; + TrtWeightStore* weight_store; + const TrtPrecisionMode precision_mode; + const bool use_calibration; + const bool use_implicit_batch; + const bool use_explicit_precision; +}; + +// Operation converter function specification. +using OpConverter = std::function; + +struct InputArgSpec { + absl::string_view name; + TrtInputArg allowed_roles; + + static constexpr InputArgSpec Create(absl::string_view n, TrtInputArg role) { + return InputArgSpec{n, role}; + } +}; + +template +std::string convert_not_supported_dtype_msg(const T& allowed_types, + DataType tf_type, + const NodeDef& node) { + string allowed_types_string = + absl::StrJoin(allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + + return absl::StrCat("Data type ", DataTypeString(tf_type), + " is not supported for ", node.op(), ", must be one of [", + allowed_types_string, "]"); +} + +std::string convert_not_supported_implicit(const std::string& pOpName, + const std::string& pNodeName, + const char* pOpType = NULL); + +// A Curiously recurring template pattern (CRTP) template class for operation +// converters. +template +class OpConverterBase { + public: + explicit OpConverterBase(const OpConverterParams* params, + const std::vector& data_types = + {DataType::DT_FLOAT, DataType::DT_HALF}) + : params_(params), + node_def_attrs_(params->node_def), + allowed_dtypes_(data_types) {} + + // Default NodeDef attribute name to inspect in order to determine node data + // type. 
The Impl class can override this by implementing the same function. + static constexpr const char* NodeDefDataTypeAttributeName() { return "T"; } + + // Validate data type of the given NodeDef against allowed types. + Status ValidateNodeDefDataType() { + // If the attribute name is empty, we should skip this check. + if (absl::string_view(Impl::NodeDefDataTypeAttributeName()).empty()) { + return Status::OK(); + } + + // Get the NodeDef data type. + auto dtype = GetAttrValue(Impl::NodeDefDataTypeAttributeName()); + if (!dtype.ok()) { + return errors::InvalidArgument("Attribute with name ", + Impl::NodeDefDataTypeAttributeName(), + " not found."); + } + + // Check allowed data types.; + if (std::find(allowed_dtypes_.begin(), allowed_dtypes_.end(), + dtype.ValueOrDie()) == allowed_dtypes_.end()) { + return errors::Unimplemented(convert_not_supported_dtype_msg( + allowed_dtypes_, dtype.ValueOrDie(), params_->node_def)); + } + return Status::OK(); + } + + static constexpr bool HasFixNumberOfInputs() { return true; } + + // Validates input argument roles and data types. + Status ValidateInputs() { + const NodeDef& node_def = params_->node_def; + const auto& inputs = params_->inputs; + if (Impl::HasFixNumberOfInputs()) { + TRT_ENSURE(inputs.size() == Impl::InputSpec().size()); + } else { + TRT_ENSURE(inputs.size() <= Impl::InputSpec().size()); + } + for (int i = 0; i < inputs.size(); i++) { + const InputArgSpec arg_spec = Impl::InputSpec()[i]; + if (arg_spec.allowed_roles == TrtInputArg::kWeight && + inputs.at(i).is_tensor()) { + return errors::Unimplemented("The input \"", arg_spec.name, "\" for ", + node_def.op(), " must be a constant, at ", + node_def.name()); + } + if (arg_spec.allowed_roles == TrtInputArg::kTensor && + inputs.at(i).is_weights()) { + return errors::Unimplemented("The input \"", arg_spec.name, "\" for ", + node_def.op(), " must be a tensor, at ", + node_def.name()); + } + } + return Status::OK(); + } + + Status operator()() { + // Validate data type and inputs. + TF_RETURN_IF_ERROR(this->ValidateNodeDefDataType()); + TF_RETURN_IF_ERROR(this->ValidateInputs()); + + // Perform op-level validation. + TF_RETURN_IF_ERROR(reinterpret_cast(this)->Validate()); + if (params_->validation_only) { + return Status::OK(); + } + + // Perform conversion. + return reinterpret_cast(this)->Convert(); + } + + protected: + Status NotSupportedInImplicitBatch(const char* pOpType = nullptr) { + if (params_->use_implicit_batch) { + const auto& op = params_->node_def.op(); + const auto& nodeName = params_->node_def.name(); + const auto& error = convert_not_supported_implicit(op, nodeName, pOpType); + return errors::Unimplemented(error); + } + return Status::OK(); + } + + void AddOutput(const TRT_TensorOrWeights& out) { + params_->outputs->push_back(out); + } + + template + ::stream_executor::port::StatusOr GetAttrValue( + absl::string_view key) const { + T result; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def_attrs_, key, &result)); + return result; + } + + const OpConverterParams* const params_; + const AttrSlice node_def_attrs_; + const std::vector allowed_dtypes_; +}; + +// Constructs and returns a converter function for a given operation converter +// class T. This requires T to be a derived class of StructuredOpConverter. 
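// Illustrative sketch (not part of this patch): a converter class written
// against OpConverterBase is typically wired up by handing
// MakeConverterFunction<T>() to the registry, mirroring the registration call
// made in the DuplicateSqueeze test. ConvertMyOp and "MyOp" below are
// hypothetical names used only for this example.
//
//   class ConvertMyOp : public OpConverterBase<ConvertMyOp> {
//    public:
//     explicit ConvertMyOp(const OpConverterParams* params)
//         : OpConverterBase<ConvertMyOp>(params) {}
//     static std::vector<InputArgSpec> InputSpec() {
//       return {InputArgSpec::Create("input", TrtInputArg::kTensor)};
//     }
//     Status Validate() { return Status::OK(); }
//     Status Convert() {
//       // Pass the (validated) input straight through, for illustration only.
//       AddOutput(params_->inputs.at(0));
//       return Status::OK();
//     }
//   };
//
//   GetOpConverterRegistry()->Register("MyOp", kDefaultConverterPriority,
//                                      MakeConverterFunction<ConvertMyOp>());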
+template +OpConverter MakeConverterFunction() { + return [](const OpConverterParams* params) -> Status { + T converter(params); + return converter(); + }; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc new file mode 100644 index 00000000000..6c0ea1e3e00 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc @@ -0,0 +1,158 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +struct OpConverterRegistration { + OpConverter converter; + int priority; +}; +class OpConverterRegistry::Impl { + public: + ~Impl() = default; + + InitOnStartupMarker Register(const string& name, const int priority, + OpConverter converter) { + mutex_lock lock(mu_); + auto item = registry_.find(name); + if (item != registry_.end()) { + const int existing_priority = item->second.priority; + if (priority <= existing_priority) { + LOG(WARNING) << absl::StrCat( + "Ignoring TF->TRT ", name, " op converter with priority ", + existing_priority, " due to another converter with priority ", + priority); + return {}; + } else { + LOG(WARNING) << absl::StrCat( + "Overwriting TF->TRT ", name, " op converter with priority ", + existing_priority, " using another converter with priority ", + priority); + registry_.erase(item); + } + } + registry_.insert({name, OpConverterRegistration{converter, priority}}); + return {}; + } + + ::stream_executor::port::StatusOr LookUp(string name) { + // Fetch the user-provide TF operations denylisted for conversion by TF-TRT. + static const absl::flat_hash_set tftrt_op_fakelist = [] { + string tftrt_op_fakelist_str; + TF_CHECK_OK(ReadStringFromEnvVar("TF_TRT_OP_FAKELIST", + /*default_value=*/"", + &tftrt_op_fakelist_str)); + absl::flat_hash_set tftrt_op_fakelist{}; + for (const auto& x : str_util::Split(tftrt_op_fakelist_str, ",")) { + tftrt_op_fakelist.insert(x); + } + // Force a rehash of the flat hash set + tftrt_op_fakelist.rehash(0); + return tftrt_op_fakelist; + }(); + + // In case the TensorFlow OP `name` matches any of the names passed to + // TF_TRT_OP_FAKELIST environment variable, force ::LookUp to resolves to + // ConvertFake OP converter. 
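The TF_TRT_OP_FAKELIST value is parsed once into a static hash set and then consulted on every lookup. A standalone sketch of that one-time, comma-separated parse, with std::getenv and std::unordered_set standing in for ReadStringFromEnvVar and absl::flat_hash_set (function name FakelistedOps is hypothetical):

// Sketch only: parse a comma-separated op list from an env var exactly once.
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>

const std::unordered_set<std::string>& FakelistedOps() {
  // Initialized on first use, like the static lambda in the registry above.
  static const std::unordered_set<std::string>* ops = [] {
    auto* set = new std::unordered_set<std::string>();
    const char* raw = std::getenv("TF_TRT_OP_FAKELIST");
    std::stringstream ss(raw ? raw : "");
    std::string op;
    while (std::getline(ss, op, ',')) {
      if (!op.empty()) set->insert(op);
    }
    return set;
  }();
  return *ops;
}

int main() {
  // e.g. TF_TRT_OP_FAKELIST=Conv2D,Relu ./a.out
  std::cout << "Conv2D fakelisted: " << FakelistedOps().count("Conv2D") << "\n";
  return 0;
}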
+ if (tftrt_op_fakelist.contains(name)) { + LOG_FIRST_N(INFO, 2) << "Emulating OP Converter: `" << name << "`. It " + << "will cause TRT engine building to fail. This " + << "feature is only intended to be used for " + << "TF-TRT graph segmentation experiments. This " + << "feature is controlled using: " + << "`TF_TRT_OP_FAKELIST=OpName1,OpName2`."; + // Forces ::LookUp to resolve to `ConvertFake` registred to `FakeOp`. + mutex_lock lock(mu_); + return registry_.find("FakeOp")->second.converter; + } + + mutex_lock lock(mu_); + auto found = registry_.find(name); + if (found != registry_.end()) { + return found->second.converter; + } + return errors::NotFound("No converter for op ", name); + } + + void Clear(const std::string& name) { + mutex_lock lock(mu_); + auto itr = registry_.find(name); + if (itr == registry_.end()) { + return; + } + registry_.erase(itr); + } + + std::vector ListRegisteredOps() const { + mutex_lock lock(mu_); + std::vector result; + result.reserve(registry_.size()); + for (const auto& item : registry_) { + result.push_back(item.first); + } + return result; + } + + private: + mutable mutex mu_; + mutable std::unordered_map registry_ + TF_GUARDED_BY(mu_); +}; + +OpConverterRegistry::OpConverterRegistry() : impl_(std::make_unique()) {} + +::stream_executor::port::StatusOr OpConverterRegistry::LookUp( + const string& name) { + return impl_->LookUp(name); +} + +InitOnStartupMarker OpConverterRegistry::Register(const string& name, + const int priority, + OpConverter converter) { + return impl_->Register(name, priority, converter); +} + +std::vector OpConverterRegistry::ListRegisteredOps() const { + return impl_->ListRegisteredOps(); +} + +void OpConverterRegistry::Clear(const std::string& name) { impl_->Clear(name); } + +OpConverterRegistry* GetOpConverterRegistry() { + static OpConverterRegistry* registry = new OpConverterRegistry(); + return registry; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h new file mode 100644 index 00000000000..cba4e907a39 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ + +#include +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class OpConverterRegistry { + public: + OpConverterRegistry(); + ~OpConverterRegistry() = default; + + InitOnStartupMarker Register(const string& name, const int priority, + OpConverter converter); + + InitOnStartupMarker Register(const std::initializer_list& names, + const int priority, OpConverter converter) { + for (const auto& name : names) { + Register(name, priority, converter); + } + return {}; + } + + template ::value>::type* = nullptr> + InitOnStartupMarker Register(const T& names, const int priority, + OpConverter converter) { + for (const auto& name : names) { + Register(name, priority, converter); + } + return {}; + } + + // Clear all registered converters for the given Tensorflow operation name. + void Clear(const std::string& name); + + ::stream_executor::port::StatusOr LookUp(const string& name); + + std::vector ListRegisteredOps() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +OpConverterRegistry* GetOpConverterRegistry(); + +class RegisterOpConverter { + public: + RegisterOpConverter(const string& name, const int priority, + OpConverter converter) { + GetOpConverterRegistry()->Register(name, priority, converter); + } +}; + +constexpr int kDefaultConverterPriority = 1; + +} // namespace convert +} // namespace tensorrt + +#define REGISTER_TRT_OP_CONVERTER_IMPL(ctr, func, priority, ...) \ + static ::tensorflow::InitOnStartupMarker const \ + register_trt_op_converter##ctr TF_ATTRIBUTE_UNUSED = \ + TF_INIT_ON_STARTUP_IF(true) \ + << tensorrt::convert::GetOpConverterRegistry()->Register( \ + __VA_ARGS__, priority, func) + +#define REGISTER_TRT_OP_CONVERTER(func, priority, ...) \ + TF_NEW_ID_FOR_INIT(REGISTER_TRT_OP_CONVERTER_IMPL, func, priority, \ + __VA_ARGS__) + +#define REGISTER_DEFAULT_TRT_OP_CONVERTER(func, ...) \ + REGISTER_TRT_OP_CONVERTER( \ + func, tensorrt::convert::kDefaultConverterPriority, __VA_ARGS__) + +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc new file mode 100644 index 00000000000..af3f8d7b6cc --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc @@ -0,0 +1,67 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" + +#include +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +TEST(TestOpConverterRegistry, TestOpConverterRegistry) { + bool flag{false}; + + auto set_true_func = [&flag](const OpConverterParams*) -> Status { + flag = true; + return Status::OK(); + }; + + auto set_false_func = [&flag](const OpConverterParams*) -> Status { + flag = false; + return Status::OK(); + }; + + GetOpConverterRegistry()->Register("FakeFunc", kDefaultConverterPriority, + set_true_func); + + // Lower priority fails to override. + GetOpConverterRegistry()->Register("FakeFunc", kDefaultConverterPriority - 1, + set_false_func); + + // The lookup should return set_true_func (default). + auto func = GetOpConverterRegistry()->LookUp("FakeFunc"); + EXPECT_TRUE(func.ok()); + EXPECT_TRUE(((*func)(nullptr)).ok()); + EXPECT_TRUE(flag); + + // Override with higher priority. + GetOpConverterRegistry()->Register("FakeFunc", kDefaultConverterPriority + 1, + set_false_func); + func = GetOpConverterRegistry()->LookUp("FakeFunc"); + EXPECT_TRUE(func.ok()); + EXPECT_TRUE((*func)(nullptr).ok()); + EXPECT_FALSE(flag); + + // After clearing the op, lookup should return an error. + GetOpConverterRegistry()->Clear("FakeFunc"); + EXPECT_FALSE(GetOpConverterRegistry()->LookUp("FakeFunc").ok()); +} +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc b/tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc new file mode 100644 index 00000000000..3d09ea00b7c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc @@ -0,0 +1,123 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" + +#include +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/status_matchers.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +using ::tensorflow::testing::IsOk; +using ::tensorflow::testing::StatusIs; +using ::testing::HasSubstr; + +class ExampleOpConverter : public OpConverterBase { + public: + explicit ExampleOpConverter(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_FLOAT}) {} + + static constexpr const char* NodeDefDataTypeAttributeName() { + return "data_type"; + } + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input_tensor", TrtInputArg::kTensor), + InputArgSpec::Create("weight", TrtInputArg::kWeight)}; + } + + Status Validate() { return Status::OK(); } + + Status Convert() { + AddOutput(TRT_TensorOrWeights(nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{1, {1, 1, 1}}, 1)); + return Status::OK(); + } +}; + +TEST(TestOpConverterBase, TestOpConverterBase) { + // Register a converter which uses the base converter class. + GetOpConverterRegistry()->Register( + "FakeFunc", 1, MakeConverterFunction()); + + NodeDef def; + def.set_op("FakeFunc"); + auto converter = Converter::Create(TrtPrecisionMode::FP32, false, + Logger::GetLogger(), false, "test_engine"); + EXPECT_THAT(converter, IsOk()); + + // Base class should check attribute with key given by + // Impl::NodeDefDataTypeAttributeName(). + Status conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT(conversion_status, + StatusIs(error::INVALID_ARGUMENT, + HasSubstr("Attribute with name data_type not found"))); + + // Add partial inputs to the node and make the converter aware. + def.mutable_input()->Add("input1"); + conversion_status = (*converter) + ->AddInputTensor("input1", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{4, {1, 1, 1, 1}}, 1); + EXPECT_THAT(conversion_status, IsOk()); + + // Base class method should check number of inputs. + AddNodeAttr("data_type", DT_FLOAT, &def); + conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT(conversion_status, StatusIs(error::INTERNAL)); + + // Add second input to the node and make the converter aware. + def.mutable_input()->Add("input2"); + conversion_status = (*converter) + ->AddInputTensor("input2", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{4, {1, 1, 1, 1}}, 1); + EXPECT_THAT(conversion_status, IsOk()); + + // Base class validation should check the type (Constant or Tensor) of the + // inputs. + conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT( + conversion_status, + StatusIs(error::UNIMPLEMENTED, + HasSubstr("input \"weight\" for FakeFunc must be a constant"))); + + // Correct input2 so that it is a weight. + (*converter)->TensorsMap().erase("input2"); + (*converter) + ->TensorsMap() + .insert(std::make_pair("input2", TRT_TensorOrWeights(TRT_ShapedWeights( + nvinfer1::DataType::kFLOAT)))); + + // With the correct input types, check that the converter is called and sets + // one output tensor. 
+ conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT(conversion_status, IsOk()); + EXPECT_EQ((*converter)->TensorsMap().size(), 3U); + + GetOpConverterRegistry()->Clear("FakeFunc"); +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc new file mode 100644 index 00000000000..d611920717c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc @@ -0,0 +1,235 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +const BinaryOperationMapType* BinaryOperationMap() { + static const auto* map = new BinaryOperationMapType({ + {"Add", nvinfer1::ElementWiseOperation::kSUM}, + {"AddV2", nvinfer1::ElementWiseOperation::kSUM}, + {"Mul", nvinfer1::ElementWiseOperation::kPROD}, + {"Sub", nvinfer1::ElementWiseOperation::kSUB}, + {"Div", nvinfer1::ElementWiseOperation::kDIV}, + {"FloorDiv", nvinfer1::ElementWiseOperation::kFLOOR_DIV}, + {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, + {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, + {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, + {"Pow", nvinfer1::ElementWiseOperation::kPOW}, +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + {"Greater", nvinfer1::ElementWiseOperation::kGREATER}, + {"Less", nvinfer1::ElementWiseOperation::kLESS}, + {"Equal", nvinfer1::ElementWiseOperation::kEQUAL}, + // Operators are implemented as NOT Less and NOT Greater, respectively. + {"GreaterEqual", nvinfer1::ElementWiseOperation::kLESS}, + {"LessEqual", nvinfer1::ElementWiseOperation::kGREATER}, +#endif + }); + return map; +} + +const BinaryOperationMapType* BinaryBooleanOperationMap() { + static const auto* map = new BinaryOperationMapType({ + {"LogicalOr", nvinfer1::ElementWiseOperation::kOR}, + {"LogicalAnd", nvinfer1::ElementWiseOperation::kAND}, + }); + return map; +} + +namespace { +class ConvertBinaryImpl { + protected: + ConvertBinaryImpl(const BinaryOperationMapType* pOperMap) + : pOperMap_(pOperMap) {} + + Status ValidateImpl( + const OpConverterParams& params, + const std::vector& implicit_batch_not_supported_ops = {}, + bool both_tensors = false) { + const auto& node_def = params.node_def; + const auto& op = node_def.op(); + const auto op_pair = pOperMap_->find(op); + if (op_pair == pOperMap_->end()) { + return errors::Unimplemented("Binary op: ", op, " not supported"); + } + + // Constant folding should have been done by TensorFlow. 
+ const auto& inputs = params.inputs; + if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { + return errors::Unimplemented( + "Constant folding is falled back to TensorFlow, binary op '", op, + "' received both input as constant"); + } + + if ((convertToBool_ = find_name(op, implicit_batch_not_supported_ops))) { + if (params.use_implicit_batch) { + return errors::Unimplemented( + convert_not_supported_implicit(op, node_def.name(), "Binary")); + } + } + + if (both_tensors) { + if (inputs.at(0).is_weights() || inputs.at(1).is_weights()) { + return errors::InvalidArgument("Both inputs of '", op, + "' are expected to be tensors"); + } + // No need to convert the output of "LogicalOr" and "LogicalAnd" + convertToBool_ = false; + } + + nvinfer1::Dims broadcasted_dims[2]; + TF_RETURN_IF_ERROR(GetTrtBroadcastShape( + inputs.at(0), inputs.at(1), true, params.use_implicit_batch, + broadcasted_dims, broadcasted_dims + 1)); + + for (int i = 0; i < tensor_.size(); i++) { + // This will also convert constants to tensors. + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params.converter, inputs.at(i), broadcasted_dims[i], + params.validation_only, &tensor_[i], node_def, i)); + } + operation_ = op_pair->second; + return Status::OK(); + } + + Status ConvertImpl(const OpConverterParams& params, + const std::vector& revert_bool_ops = {}) { + const auto& node_def = params.node_def; + // Add ElementWise layer. + auto* network = params.converter->network(); + nvinfer1::ILayer* layer = network->addElementWise( + *tensor_[0]->trt_tensor(), *tensor_[1]->trt_tensor(), operation_); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + + if (params.use_explicit_precision) { + layer->setPrecision(nvinfer1::DataType::kFLOAT); + } + + params.converter->SetLayerName(layer, node_def); + const auto& output = layer->getOutput(0); + if (convertToBool_) { + output->setType(nvinfer1::DataType::kBOOL); + if (find_name(node_def.op(), revert_bool_ops)) { + nvinfer1::IUnaryLayer* unary_layer = + network->addUnary(*output, nvinfer1::UnaryOperation::kNOT); + TFTRT_RETURN_ERROR_IF_NULLPTR(unary_layer, node_def.name()); + params.outputs->push_back( + TRT_TensorOrWeights(unary_layer->getOutput(0))); + return Status::OK(); + } + } + + params.outputs->push_back(TRT_TensorOrWeights(output)); + return Status::OK(); + } + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("x", TrtInputArg::kBoth), + InputArgSpec::Create("y", TrtInputArg::kBoth)}; + } + + private: + const BinaryOperationMapType* pOperMap_; + std::array tensor_{nullptr, nullptr}; + nvinfer1::ElementWiseOperation operation_; + bool convertToBool_; +}; + +class ConvertBinary : public OpConverterBase, + protected ConvertBinaryImpl { + public: + explicit ConvertBinary(const OpConverterParams* params) + : OpConverterBase( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}), + ConvertBinaryImpl(BinaryOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertBinaryImpl::InputSpec(); + } + + Status Validate() { + const std::vector implicit_batch_not_supported_ops { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + "Greater", "Less", "Equal", "GreaterEqual", "LessEqual" +#endif + }; + return ValidateImpl(*params_, implicit_batch_not_supported_ops); + } + Status Convert() { + const std::vector implemented_with_reverted_ops { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + "GreaterEqual", "LessEqual" +#endif + }; + return ConvertImpl(*params_, implemented_with_reverted_ops); + } +}; + +class ConvertBooleanBinary : 
public OpConverterBase, + public ConvertBinaryImpl { + public: + explicit ConvertBooleanBinary(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_BOOL}), + ConvertBinaryImpl(BinaryBooleanOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertBinaryImpl::InputSpec(); + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "LogicalOr" + input: "..." + input: "..." + attr { + key: "_output_shapes" + ... + } + } + */ + return ""; + } + Status Validate() { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + return ValidateImpl(*params_, {"LogicalOr", "LogicalAnd"}, true); +#else + return errors::Unimplemented("Boolean op: ", params_->node_def.op(), + " is not supported in TRT version < 8.2"); +#endif + } + Status Convert() { return ConvertImpl(*params_); } +}; +} // namespace + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + GetOperationNames(*BinaryOperationMap())); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), + GetOperationNames(*BinaryBooleanOperationMap())); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc new file mode 100644 index 00000000000..348d478aaeb --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc @@ -0,0 +1,179 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +int get_spatial_dim_count(string format) { + // Spatial dimensions are the dimensions besides NC, and here we assume NC + // always appear in the format string. 
+ return format.size() - 2; +} + +class ConvertDataFormatVecPermute + : public OpConverterBase { + public: + ConvertDataFormatVecPermute(const OpConverterParams* params) + : OpConverterBase(params, + {DataType::DT_INT32}) {} + + struct DataFormatVecPermuteAttributes { + string dst_format; + string src_format; + int x_dim_count; + }; + + static constexpr std::array InputSpec() { + return {InputArgSpec::Create("x", TrtInputArg::kBoth)}; + } + + Status Validate() { + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + const auto& inputs = params_->inputs; + const auto& nodeName = params_->node_def.name(); + + x_input_ = inputs.at(0); + + // Check input rank. + const auto x_dims = x_input_.GetTrtDims(); + int input_rank = x_dims.nbDims; + if (input_rank != 1 && input_rank != 2) { + return errors::InvalidArgument( + "Input must be a vector or matrix, but got rank ", input_rank, + ", at ", nodeName); + } + + // Verify and consume node attributes. + ::stream_executor::port::StatusOr dst_format = + GetAttrValue("dst_format"); + ::stream_executor::port::StatusOr src_format = + GetAttrValue("src_format"); + TRT_ENSURE_OK(dst_format); + TRT_ENSURE_OK(src_format); + + // Check input dims. + const int full_dim_count = src_format.ValueOrDie().size(); + const int spatial_dim_count = + get_spatial_dim_count(src_format.ValueOrDie()); + if (input_rank == 1) { + if (x_dims.d[0] != spatial_dim_count && x_dims.d[0] != full_dim_count) { + return errors::InvalidArgument( + "1D input must be of size ", spatial_dim_count, " or ", + full_dim_count, ", but got size ", x_dims.d[0], ", at ", nodeName); + } + } else if (input_rank == 2) { + if (x_dims.d[0] != spatial_dim_count && x_dims.d[0] != full_dim_count) { + return errors::InvalidArgument( + "First dimension of 2D input must be of size ", spatial_dim_count, + " or ", full_dim_count, ", but got shape (", x_dims.d[0], ", ", + x_dims.d[1], "), at ", nodeName); + } + if (x_dims.d[1] != 2) { + return errors::InvalidArgument( + "Second dimension of 2D input must be of size 2, but got shape (", + x_dims.d[0], ", ", x_dims.d[1], "), at ", nodeName); + } + } + + // Set custom attributes. + attrs_.x_dim_count = x_dims.d[0]; + attrs_.dst_format = dst_format.ValueOrDie(); + attrs_.src_format = src_format.ValueOrDie(); + + return Status::OK(); + } + + Status Convert() { + // Copy format strings in case they need to be modified. + string dst_format = attrs_.dst_format; + string src_format = attrs_.src_format; + const int& spatial_dim_count = get_spatial_dim_count(src_format); + + // If the input is a vector of size spatial_dim_count, treat the elements + // as spatial dimensions. + if (attrs_.x_dim_count == spatial_dim_count) { + auto keep_only_spatial_dimensions = + [spatial_dim_count](string* format_str) -> void { + auto new_end = std::remove_if(format_str->begin(), format_str->end(), + [spatial_dim_count](const char dim) { + return dim == 'N' || dim == 'C'; + }); + format_str->erase(new_end, format_str->end()); + }; + keep_only_spatial_dimensions(&src_format); + keep_only_spatial_dimensions(&dst_format); + } + + // Create indices for the gather layer and make weights out of them. 
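The Convert() body that follows builds gather indices so that output position j takes the element of x whose axis letter in src_format matches dst_format[j]. A small sketch of just that index computation, with a hypothetical helper PermutationIndices over plain strings:

// Sketch only: DataFormatVecPermute-style gather indices.
#include <iostream>
#include <string>
#include <vector>

std::vector<int> PermutationIndices(const std::string& src, const std::string& dst) {
  std::vector<int> idx(dst.size(), 0);
  for (size_t i = 0; i < src.size(); ++i) {
    for (size_t j = 0; j < dst.size(); ++j) {
      if (src[i] == dst[j]) {
        idx[j] = static_cast<int>(i);
        break;
      }
    }
  }
  return idx;
}

int main() {
  // Permuting an NHWC-ordered vector into NCHW order gathers indices [0, 3, 1, 2].
  for (int i : PermutationIndices("NHWC", "NCHW")) std::cout << i << " ";
  std::cout << "\n";
  return 0;
}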
+ std::vector dst_indices(attrs_.x_dim_count); + for (int i = 0; i < attrs_.x_dim_count; ++i) { + for (int j = 0; j < attrs_.x_dim_count; ++j) { + if (src_format[i] == dst_format[j]) { + dst_indices[j] = i; + break; + } + } + } + nvinfer1::Dims indices_dims = {1, {attrs_.x_dim_count}}; + ::stream_executor::port::StatusOr indices_weights = + params_->weight_store->GetTempWeights(nvinfer1::DataType::kINT32, + indices_dims); + TRT_ENSURE_OK(indices_weights); + int32* indices_ptr = indices_weights.ValueOrDie().GetPointer(); + std::copy(dst_indices.data(), dst_indices.data() + attrs_.x_dim_count, + indices_ptr); + ITensorProxyPtr x_tensor = + x_input_.is_weights() ? params_->converter->CreateConstantLayer( + x_input_.weights(), x_input_.GetTrtDims()) + : x_input_.tensor(); + ITensorProxyPtr indices_tensor = params_->converter->CreateConstantLayer( + indices_weights.ValueOrDie(), indices_dims); + + // Gather layer with 1D indices on axis 0, conserves shape. + nvinfer1::IGatherLayer* layer = params_->converter->network()->addGather( + *x_tensor->trt_tensor(), *indices_tensor->trt_tensor(), 0); + TRT_ENSURE(layer); + params_->converter->SetLayerName(layer, params_->node_def); + + ITensorProxyPtr output_tensor = layer->getOutput(0); + + params_->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } + + private: + TRT_TensorOrWeights x_input_; + DataFormatVecPermuteAttributes attrs_{}; +}; +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), + {"DataFormatVecPermute"}); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc new file mode 100644 index 00000000000..96e5558532e --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc @@ -0,0 +1,316 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +template +class ConvertFillBase : public OpConverterBase { + public: + explicit ConvertFillBase(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT32}) {} +}; + +class ConvertFill : public ConvertFillBase { + public: + explicit ConvertFill(const OpConverterParams* params) + : ConvertFillBase(params) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("dims", TrtInputArg::kBoth), + InputArgSpec::Create("value", TrtInputArg::kBoth)}; + } + + Status Validate() { + const auto& params = *this->params_; + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + + const auto& inputs = params.inputs; + const auto& node_def = params.node_def; + const TRT_TensorOrWeights& dims_input = inputs.at(0); + + const auto dims_type = dims_input.TrtDType(); + if (dims_type != nvinfer1::DataType::kINT32) { + return errors::InvalidArgument("The dims parameter of ", node_def.op(), + " operation in ", node_def.name(), + " is expected to be of type ", + DebugString(nvinfer1::DataType::kINT32), + " type, got ", DebugString(dims_type)); + } + + const auto nbDims = dims_input.GetTrtDims().nbDims; + if (nbDims < 0) { + return errors::InvalidArgument("The shape of parameter ", node_def.op(), + " operation in ", node_def.name(), + " cannot be partial."); + } + return Status::OK(); + } + + Status Convert() { + const auto& params = *this->params_; + auto* network = params.converter->network(); + const auto& inputs = params.inputs; + + const bool is_dims_static = inputs[0].is_weights(); + const bool is_value_static = inputs[1].is_weights(); + + const TRT_TensorOrWeights& dims_input = inputs.at(0); + const TRT_TensorOrWeights& value_input = inputs.at(1); + + int nbDims = dims_input.GetTrtDims().d[0]; + + nvinfer1::Dims trt_dims{0}; + if (is_dims_static) { + const auto dims_weights = dims_input.weights(); + DimsAdapter dims_adapter(dims_weights.GetSpan()); + dims_adapter.TrtDims(&trt_dims); + } + + auto builder = TRTNetworkBuilder::Create(network, params.weight_store); + ::stream_executor::port::StatusOr layer = + builder.ValueOrDie().AddFill(value_input, dims_input, is_value_static, + is_dims_static, nbDims, trt_dims); + ITensorProxyPtr output_tensor = layer.ValueOrDie()->getOutput(0); + this->AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +class ConvertRange : public ConvertFillBase { + public: + explicit ConvertRange(const OpConverterParams* params) + : ConvertFillBase(params) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("start", TrtInputArg::kBoth), + InputArgSpec::Create("limit", TrtInputArg::kBoth), + InputArgSpec::Create("delta", TrtInputArg::kBoth)}; + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "Range" + ... 
+ attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + } + */ + return "Tidx"; + } + Status Validate() { + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + const auto& params = *this->params_; + const auto& inputs = params.inputs; + const auto& node_def = params.node_def; + + float param[3]; + all_weights_ = all_integers_ = true; + for (int i = 0; i < 3; i++) { + const auto& input = inputs.at(i); + all_integers_ &= input.TrtDType() == nvinfer1::DataType::kINT32; + if (input.is_weights()) { + switch (input.TrtDType()) { + case nvinfer1::DataType::kFLOAT: + param[i] = get_input_param(input); + break; + case nvinfer1::DataType::kHALF: + param[i] = get_input_param(input); + break; + case nvinfer1::DataType::kINT32: + param[i] = get_input_param(input); + break; + default: + return errors::InvalidArgument( + "Unsupported data type ", DebugString(input.TrtDType()), + " used for '", InputSpec()[i].name, "'"); + } + } else { + all_weights_ = false; + } + } + + if (!(all_weights_ || all_integers_)) { + // As of 06/03/2022, when at least one of the (start, limit, delta) + // is passed as a tensor, they must all be of type kINT32 + return errors::Unimplemented(convert_range_expected_msg(node_def)); + } + + if (inputs.at(2).is_weights()) { + if ((delta_ = param[2]) == 0) { + return errors::InvalidArgument("The delta parameter of ", node_def.op(), + " operation cannot be equal to 0"); + } + + if (!all_weights_ && delta_ < 0) { + return errors::InvalidArgument( + "The delta parameter of Range operation " + "cannot be negative, when one of (start, limit) is passed as " + "a tensor, but got ", + delta_); + } + } + + for (int i = 0; i < 3; i++) { + const auto& input = inputs.at(i); + const auto& dims = input.GetTrtDims(); + if (dims.nbDims != 1 || dims.d[0] != 1) { + return errors::InvalidArgument("Dimension for '", InputSpec()[i].name, + "' of ", node_def.op(), " operator ", + "should be equal to 1"); + } + } + + if (all_weights_) { + const auto num_intervals_float = + (param[1] - (start_ = param[0])) / delta_; + if (num_intervals_float < 0) { + const auto error = convert_range_error_msg(start_, param[1], delta_); + return errors::InvalidArgument(error); + } + + num_values_ = static_cast(num_intervals_float); + if (start_ + delta_ * num_values_ != param[1]) { + num_values_++; + } + } + + return Status::OK(); + } + + Status Convert() { + const auto& params = *this->params_; + const auto& inputs = params.inputs; + const TRT_TensorOrWeights& input = inputs.at(0); + TRT_TensorOrWeights value_input; + nvinfer1::Dims trt_dims{1}; + auto builder = TRTNetworkBuilder::Create(params.converter->network(), + params.weight_store); + TRT_ENSURE_OK(builder); + ITensorProxyPtr dims_input_tensor = nullptr; + ITensorProxyPtr beta_tensor = nullptr; + ITensorProxyPtr scalar_tensor = nullptr; + if (!all_weights_) { + ITensorProxyPtr tensors[3]; + for (int i = 0; i < 3; i++) { + TF_RETURN_IF_ERROR(builder.ValueOrDie().get_tensor4TensorOrWeights( + inputs.at(i), tensors + i)); + } + + ::stream_executor::port::StatusOr num = + builder.ValueOrDie().Sub(/*limit*/ tensors[1]->trt_tensor(), + /*start*/ tensors[0]->trt_tensor()); + + TRT_ENSURE_PTR_OK(num); + ::stream_executor::port::StatusOr ceil_div = + builder.ValueOrDie().FloorDiv( + num.ValueOrDie()->getOutput(0), + (beta_tensor = tensors[2])->trt_tensor()); + TRT_ENSURE_PTR_OK(ceil_div); + dims_input_tensor = ceil_div.ValueOrDie()->getOutput(0); + dims_input_tensor->setType(nvinfer1::DataType::kINT32); + + nvinfer1::Dims scalar_dims{0}; + 
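When start, limit and delta are all weights, the validation above derives the output length the way tf.range does, ceil((limit - start) / delta), implemented as truncation plus a correction when delta does not divide the interval exactly. A standalone sketch of that arithmetic (RangeSize is a hypothetical name):

// Sketch only: element count of Range(start, limit, delta) via truncate-then-correct.
#include <cassert>

int RangeSize(float start, float limit, float delta) {
  const float num_intervals = (limit - start) / delta;  // assumed non-negative here
  int num_values = static_cast<int>(num_intervals);
  if (start + delta * num_values != limit) {
    ++num_values;  // a partial last step still produces one more element
  }
  return num_values;
}

int main() {
  assert(RangeSize(0.f, 5.f, 2.f) == 3);    // 0, 2, 4
  assert(RangeSize(0.f, 6.f, 2.f) == 3);    // 0, 2, 4
  assert(RangeSize(10.f, 4.f, -3.f) == 2);  // 10, 7
  return 0;
}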
TF_RETURN_IF_ERROR(PrepareTensorForShape( + params.converter, params.inputs.at(0), scalar_dims, false, + &scalar_tensor, params.node_def)); + } else { + DimsAdapter value_input_dims(std::vector{1}); + ::stream_executor::port::StatusOr value_weights = + params.weight_store->GetTempWeights(input.TrtDType(), + value_input_dims); + + TF_RETURN_IF_ERROR(value_weights.status()); + TF_RETURN_IF_ERROR(value_weights.ValueOrDie().SetValues(start_)); + value_input = TRT_TensorOrWeights(value_weights.ValueOrDie()); + + trt_dims.d[0] = num_values_; + ::stream_executor::port::StatusOr const_layer = + builder.ValueOrDie().ConstantShape(value_input_dims); + TRT_ENSURE_PTR_OK(const_layer); + dims_input_tensor = const_layer.ValueOrDie()->getOutput(0); + } + + TRT_TensorOrWeights dims_input(dims_input_tensor); + + ::stream_executor::port::StatusOr layer = + builder.ValueOrDie().AddFill(value_input, dims_input, all_weights_, + all_weights_, 1, trt_dims, scalar_tensor, + beta_tensor, delta_); + + ITensorProxyPtr output_tensor = layer.ValueOrDie()->getOutput(0); + if (all_integers_) { + output_tensor->setType(nvinfer1::DataType::kINT32); + } + + this->AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } + + private: + template + float get_input_param(const TRT_TensorOrWeights& input) { + return static_cast(*input.weights().GetPointer()); + } + + float start_; + float delta_; + int num_values_; + bool all_weights_; + bool all_integers_; +}; + +std::string convert_range_error_msg(float start, float limit, float delta) { + constexpr const char* format_string = + "For parameters (start, limit) = (%.2f, %.2f) " + "of the Range operation delta cannot be %s, got %.2f"; + return absl::StrFormat(format_string, start, limit, + start < limit ? "negative" : "positive", delta); +} + +std::string convert_range_expected_msg(const NodeDef& node_def) { + return "When at least one of parameters (start, limit, delta) of " + + node_def.op() + " operation in " + node_def.name() + + " is passed as a tensor, they must all be of type kINT32"; +} + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), "Fill"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "Range"); + +#endif // IS_TRT_VERSION_GE(8, 2, 0, 0) + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h new file mode 100644 index 00000000000..458b8e8191d --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h @@ -0,0 +1,736 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { + +namespace convert { + +// Facilitates the creation of TensorRT layers inside a network. The user +// provides a INetworkDefinition pointer during construction. They can then add +// operations to the network through the provided functions. Each function +// returns a struct which contains the symbolic result of the operation (ITensor +// pointer) as well as a pointer to the last TensorRT ILayer created. Some +// operations may create multiple layers in order to accomplish the desired +// result (e.g. Sign). +class TRTNetworkBuilder { + public: + static ::stream_executor::port::StatusOr Create( + nvinfer1::INetworkDefinition* network, TrtWeightStore* weight_store) { + TRT_ENSURE(network); + TRT_ENSURE(weight_store); + return TRTNetworkBuilder(network, weight_store); + } + + private: + TRTNetworkBuilder(nvinfer1::INetworkDefinition* network, + TrtWeightStore* weight_store) + : network_(network), weight_store_(weight_store) {} + + public: + // Adds an Add operation to the network. + ::stream_executor::port::StatusOr Add( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUM); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an elementwise min(lhs, rhs) operation to the network. The output has + // the same data type as the input. + ::stream_executor::port::StatusOr Min( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kMIN); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an elementwise max(lhs, rhs) operation to the network. The output has + // the same datatype as the input. + ::stream_executor::port::StatusOr Max( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kMAX); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an absolute value operation to the network. Note that this unary + // operation will do an implict float conversion. For int32 tensors, use + // "AbsInt". + ::stream_executor::port::StatusOr AbsFloat( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + TRT_ENSURE(input->getType() != nvinfer1::DataType::kFLOAT && + input->getType() != nvinfer1::DataType::kHALF); + nvinfer1::IUnaryLayer* layer = + network_->addUnary(*input, nvinfer1::UnaryOperation::kABS); + TRT_ENSURE(layer); + return layer; + } + + // Performs Abs without implict float conversion. The input should be of type + // kInt32. For float datatypes, use "Abs". 
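The SignInt/AbsInt helpers below avoid the float-only unary kABS by clamping with elementwise max/min, sign(x) = min(max(x, -1), 1), and then multiplying, abs(x) = x * sign(x). The same trick on plain scalar ints, as a sketch:

// Sketch only: integer sign() from a max/min clamp, abs() as x * sign(x).
#include <algorithm>
#include <cassert>

int SignOf(int x) { return std::min(std::max(x, -1), 1); }
int AbsOf(int x) { return x * SignOf(x); }

int main() {
  assert(SignOf(-7) == -1 && SignOf(0) == 0 && SignOf(12) == 1);
  assert(AbsOf(-7) == 7 && AbsOf(0) == 0 && AbsOf(12) == 12);
  return 0;
}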
+ ::stream_executor::port::StatusOr AbsInt( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + TRT_ENSURE(input->getType() == nvinfer1::DataType::kINT32); + ::stream_executor::port::StatusOr sign = + this->SignInt(input); + return this->Mul(input, sign.ValueOrDie()->getOutput(0)); + } + + // Returns elementwise sign(x) for int32 input tensors where sign(x) is + // defined as 1 where x > 0, -1 where x < 0 and 0 where x == 0. + ::stream_executor::port::StatusOr SignInt( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + + // Create constants +1 and -1. + ::stream_executor::port::StatusOr one = + this->Constant(1, input->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(one); + + ::stream_executor::port::StatusOr neg_one = + this->Constant(-1, input->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(neg_one); + + // Turn all negaitve elements into -1, positive and zero elements + // unaffected. + ::stream_executor::port::StatusOr max = + this->Max(input, neg_one.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(max); + + // Turn all positive elements into +1, negative and zero elements + // unaffected. + ::stream_executor::port::StatusOr min = + this->Min(max.ValueOrDie()->getOutput(0), + one.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(min); + return min; + } + + // Adds a Sub operation to the network. + ::stream_executor::port::StatusOr Sub( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUB); + TRT_ENSURE(layer); + return layer; + } + + // Adds an Greater operation to the network. + ::stream_executor::port::StatusOr Greater( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kGREATER); + TRT_ENSURE(layer); + return layer; + } + + // Adds an Equal operation to the network. + ::stream_executor::port::StatusOr Equal( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kEQUAL); + TRT_ENSURE(layer); + return layer; + } + + // Adds a FloorDiv operation to the network. + ::stream_executor::port::StatusOr FloorDiv( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kFLOOR_DIV); + TRT_ENSURE(layer); + return layer; + } + + // Returns the equivalent of ceil_divide(abs(x)/abs(y))) operation. The inputs + // "lhs" and "rhs" should be int32 tensors. 
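AbsCeilDivInt below computes ceil(|lhs| / |rhs|) with integer layers only, relying on the identity ceil(a / b) = floor((a + b - 1) / b) for positive a and b. A scalar sketch of the same abs, add, subtract-one, floor-divide sequence (AbsCeilDiv is a hypothetical name):

// Sketch only: ceiling division built from abs/add/sub/floor-div.
#include <cassert>
#include <cstdlib>

int AbsCeilDiv(int a, int b) {
  const int abs_a = std::abs(a);
  const int abs_b = std::abs(b);            // assumed non-zero
  const int numerator = abs_a + abs_b - 1;  // the Add then Sub(1) steps
  return numerator / abs_b;                 // floor division on non-negative operands
}

int main() {
  assert(AbsCeilDiv(7, 2) == 4);    // ceil(3.5)
  assert(AbsCeilDiv(8, 2) == 4);    // exact division
  assert(AbsCeilDiv(-7, -2) == 4);  // signs stripped first
  return 0;
}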
+ ::stream_executor::port::StatusOr AbsCeilDivInt( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + TRT_ENSURE(lhs->getType() == nvinfer1::DataType::kINT32); + TRT_ENSURE(rhs->getType() == nvinfer1::DataType::kINT32); + + ::stream_executor::port::StatusOr rhs_abs = + this->AbsInt(rhs); + TRT_ENSURE_PTR_OK(rhs_abs); + ::stream_executor::port::StatusOr lhs_abs = + this->AbsInt(lhs); + TRT_ENSURE_PTR_OK(lhs_abs); + ::stream_executor::port::StatusOr add1 = + this->Add(lhs_abs.ValueOrDie()->getOutput(0), + rhs_abs.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(add1); + ::stream_executor::port::StatusOr one_const = + this->Constant(1, rhs->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(one_const); + ::stream_executor::port::StatusOr numerator = + this->Sub(add1.ValueOrDie()->getOutput(0), + one_const.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(numerator); + return FloorDiv(numerator.ValueOrDie()->getOutput(0), + rhs_abs.ValueOrDie()->getOutput(0)); + } + + // Adds an elementwise multiplication operation to the network. + ::stream_executor::port::StatusOr Mul( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kPROD); + TRT_ENSURE(layer); + return layer; + } + + // Adds a sequence of elementwise multiplication operations to the network. + // The returned layer's output contains the cumulative elementwise product of + // all tensors in the input. + ::stream_executor::port::StatusOr CumulativeProd( + absl::Span inputs) noexcept { + TRT_ENSURE(!absl::c_any_of( + inputs, [](nvinfer1::ITensor* x) { return x == nullptr; })); + nvinfer1::ILayer* out = nullptr; + if (inputs.size() == 1) { + out = network_->addIdentity(*inputs[0]); + TRT_ENSURE(out != nullptr); + return out; + } + nvinfer1::ITensor* last = inputs[0]; + for (int i = 1; i < inputs.size(); i++) { + ::stream_executor::port::StatusOr mul = + this->Mul(last, inputs[i]); + TRT_ENSURE_PTR_OK(mul); + out = mul.ValueOrDie(); + last = mul.ValueOrDie()->getOutput(0); + } + return out; + } + + // Adds a Constant layer whose output is a TensorRT shape tensor. The shape + // tensor's size and values correspond to dim's nbDims and d[], respectively. + ::stream_executor::port::StatusOr ConstantShape( + const DimsAdapter& shape_data) noexcept { + TRT_ENSURE(shape_data.NumDims() > 0); + nvinfer1::Dims shape_dims; + shape_dims.nbDims = 1; + shape_dims.d[0] = shape_data.NumDims(); + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(nvinfer1::DataType::kINT32, shape_dims); + TRT_ENSURE_OK(const_weights); + absl::c_copy(shape_data, const_weights.ValueOrDie().GetPointer()); + ::stream_executor::port::StatusOr trt_dims = + const_weights.ValueOrDie().Shape().AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = network_->addConstant( + trt_dims.ValueOrDie(), const_weights.ValueOrDie().GetTrtWeights()); + TRT_ENSURE(const_layer); + nvinfer1::ITensor* output = const_layer->getOutput(0); + TRT_ENSURE(output); + TRT_ENSURE(output->getType() == nvinfer1::DataType::kINT32); + return const_layer; + } + + // Adds a Constant layer whose output is a TensorRT shape tensor. The shape + // tensor's size and values correspond to dim's nbDims and d[], respectively. 
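CumulativeProd above chains Mul layers so the final output is the elementwise product of every input tensor (or an Identity when there is only one input). A sketch of the same fold over plain, equally sized vectors:

// Sketch only: cumulative elementwise product, the reduction expressed above
// with chained kPROD layers.
#include <cassert>
#include <vector>

std::vector<int> ElementwiseProduct(const std::vector<std::vector<int>>& inputs) {
  std::vector<int> out = inputs.at(0);  // single input: identity
  for (size_t i = 1; i < inputs.size(); ++i) {
    for (size_t j = 0; j < out.size(); ++j) out[j] *= inputs[i][j];
  }
  return out;
}

int main() {
  std::vector<int> r = ElementwiseProduct({{1, 2, 3}, {4, 5, 6}, {2, 2, 2}});
  assert(r == (std::vector<int>{8, 20, 36}));
  return 0;
}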
+ ::stream_executor::port::StatusOr Constant( + const std::vector& data) noexcept { + nvinfer1::Dims shape_dims; + shape_dims.nbDims = 1; + shape_dims.d[0] = data.size(); + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(nvinfer1::DataType::kINT32, shape_dims); + TRT_ENSURE_OK(const_weights); + int32* values = const_weights.ValueOrDie().GetPointer(); + for (int i = 0; i < data.size(); i++) { + values[i] = static_cast(data[i]); + } + ::stream_executor::port::StatusOr trt_dims = + const_weights.ValueOrDie().Shape().AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = network_->addConstant( + trt_dims.ValueOrDie(), const_weights.ValueOrDie().GetTrtWeights()); + TRT_ENSURE(const_layer); + nvinfer1::ITensor* output = const_layer->getOutput(0); + TRT_ENSURE(output); + TRT_ENSURE(output->getType() == nvinfer1::DataType::kINT32); + TRT_ENSURE(const_layer); + return const_layer; + } + + // Adds a Constant layer that produces a tensor of shape "shape", + // type "data_type" and filled with value "scalar". + template + ::stream_executor::port::StatusOr Constant( + const T value, nvinfer1::Dims shape, + nvinfer1::DataType data_type) noexcept { + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(data_type, shape); + TRT_ENSURE_OK(const_weights); + TRT_ENSURE(const_weights.ValueOrDie().SetValues(value).ok()); + nvinfer1::IConstantLayer* const_layer = network_->addConstant( + shape, const_weights.ValueOrDie().GetTrtWeights()); + TRT_ENSURE(const_layer); + return const_layer; + } + + // Adds a Constant layer that produces a tensor with a single value "scalar". + // The tensor has "nb_dims" dimensions and each dimension has only one + // element. The data type of the tensor is determined by the data type of + // "scalar". + template ::value>::type* = nullptr> + ::stream_executor::port::StatusOr Constant( + const T scalar, const int nb_dims) noexcept { + TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); + auto data_type = nvinfer1::DataType::kINT32; + if (std::is_floating_point::value) { + data_type = nvinfer1::DataType::kFLOAT; + } + nvinfer1::Dims zero_shape; + zero_shape.nbDims = nb_dims; + std::fill_n(zero_shape.d, nb_dims, 1); + return Constant(scalar, zero_shape, data_type); + } + + // Adds a Constant layer from a TRT_ShapedWeights object. + ::stream_executor::port::StatusOr + WeightsToConstant(const nvinfer1::Weights& weights, + const DimsAdapter& dims) noexcept { + ::stream_executor::port::StatusOr vol = dims.Volume(); + TRT_ENSURE_OK(vol); + TRT_ENSURE(vol.ValueOrDie() == weights.count); + ::stream_executor::port::StatusOr trt_dims = + dims.AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = + network_->addConstant(trt_dims.ValueOrDie(), weights); + TRT_ENSURE(const_layer); + return const_layer; + } + + Status get_tensor4TensorOrWeights(const TRT_TensorOrWeights& input, + ITensorProxyPtr* pTensor) { + if (input.is_weights()) { + ::stream_executor::port::StatusOr const_layer = + WeightsToConstant(input.weights().GetTrtWeights(), + input.GetTrtDims()); + if (!const_layer.status().ok()) return const_layer.status(); + *pTensor = const_layer.ValueOrDie()->getOutput(0); + } else { + *pTensor = input.tensor(); + } + return Status::OK(); + } + + // Creates a nvinfer1::Weights object containing a single scalar. 
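The scalar Constant overload above picks kFLOAT versus kINT32 from std::is_floating_point<T>, so call sites can pass an int or a float and get a matching constant. A tiny sketch of that compile-time dispatch with a stand-in DType enum (names DType/DTypeFor are hypothetical):

// Sketch only: map a C++ scalar type to a data-type tag at compile time.
#include <iostream>
#include <type_traits>

enum class DType { kINT32, kFLOAT };

template <typename T,
          typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr>
constexpr DType DTypeFor() {
  return std::is_floating_point<T>::value ? DType::kFLOAT : DType::kINT32;
}

int main() {
  static_assert(DTypeFor<int>() == DType::kINT32, "int maps to kINT32");
  static_assert(DTypeFor<float>() == DType::kFLOAT, "float maps to kFLOAT");
  std::cout << "ok\n";
  return 0;
}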
+ template ::value>::type* = nullptr> + ::stream_executor::port::StatusOr ScalarWeights( + const T scalar, const int nb_dims) noexcept { + TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); + auto data_type = nvinfer1::DataType::kINT32; + if (std::is_floating_point::value) { + data_type = nvinfer1::DataType::kFLOAT; + } + nvinfer1::Dims weights_shape; + weights_shape.nbDims = nb_dims; + std::fill_n(weights_shape.d, nb_dims, 1); + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(data_type, weights_shape); + TRT_ENSURE_OK(const_weights); + const_weights.ValueOrDie().GetPointer()[0] = scalar; + return const_weights.ValueOrDie().GetTrtWeights(); + } + + // Adds a TensorRT Slice operation to the network. + ::stream_executor::port::StatusOr Slice( + nvinfer1::ITensor* input, const nvinfer1::Dims& begin, + const nvinfer1::Dims& size, const nvinfer1::Dims& stride) noexcept { + nvinfer1::ISliceLayer* layer = + network_->addSlice(*input, begin, size, stride); + TRT_ENSURE(layer); + return layer; + } + + // Adds a TensorRT Concatenate operation to the network. + ::stream_executor::port::StatusOr Concat( + absl::Span inputs, const int axis) { + for (nvinfer1::ITensor* input : inputs) { + TRT_ENSURE(input); + } + nvinfer1::IConcatenationLayer* layer = network_->addConcatenation( + inputs.data(), static_cast(inputs.size())); + TRT_ENSURE(layer); + layer->setAxis(axis); + return layer; + } + + // Adds a TensorRT Concatenate operation to the network. + ::stream_executor::port::StatusOr Concat( + const std::vector& inputs, const int axis) { + return this->Concat(absl::MakeSpan(inputs), axis); + } + + // Adds a TensorRT Shape operation, which determines the runtime shape of the + // input tensor, to the network. + ::stream_executor::port::StatusOr Shape( + nvinfer1::ITensor* input) { + TRT_ENSURE(input); + nvinfer1::IShapeLayer* layer = network_->addShape(*input); + TRT_ENSURE(layer); + return layer; + } + + // Creates a Gather operation on the shape of the input tensor. The output of + // the gather operation is a 1D shape tensor where output[i] = (!sub_one ? + // input_shape[i] : input_shape[i] -1) if i is in "indices", otherwise zero. + ::stream_executor::port::StatusOr GetPartialShapeOf( + nvinfer1::ITensor* input, absl::InlinedVector indices, + bool sub_one = false) { + TRT_ENSURE(input); + TRT_ENSURE(indices.size() <= nvinfer1::Dims::MAX_DIMS); + + // Get the runtime shape of input; + ::stream_executor::port::StatusOr shape_layer = + this->Shape(input); + TRT_ENSURE_PTR_OK(shape_layer); + nvinfer1::ITensor* runtime_shape = shape_layer.ValueOrDie()->getOutput(0); + + if (sub_one) { + ::stream_executor::port::StatusOr ones = + this->Constant(1, 1); + TRT_ENSURE_PTR_OK(ones); + ::stream_executor::port::StatusOr sub = + this->Sub(runtime_shape, ones.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(sub); + runtime_shape = sub.ValueOrDie()->getOutput(0); + } + + // Create a constant tensor containing the gather indices. + // For any dim not in "indices", we mark it size to gather a zero. 
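GetPartialShapeOf keeps only the requested axes of the runtime shape: every axis not listed in "indices" is pointed at a zero appended to the shape tensor, so the gather returns 0 there. A sketch of that sentinel-index trick on a plain shape vector (PartialShape is a hypothetical name):

// Sketch only: keep shape[i] for i in `indices`, zero elsewhere, by gathering
// from the shape with an appended 0 as the sentinel target.
#include <cassert>
#include <vector>

std::vector<int> PartialShape(std::vector<int> shape, const std::vector<int>& indices) {
  const int rank = static_cast<int>(shape.size());
  std::vector<int> gather_idx(rank, rank);  // default: point at the sentinel
  for (int i : indices) gather_idx[i] = i;  // keep the requested axes
  shape.push_back(0);                       // sentinel zero at position `rank`
  std::vector<int> out(rank);
  for (int i = 0; i < rank; ++i) out[i] = shape[gather_idx[i]];
  return out;
}

int main() {
  // Keep axes 1 and 3 of shape [2, 3, 5, 7] -> [0, 3, 0, 7].
  assert(PartialShape({2, 3, 5, 7}, {1, 3}) == (std::vector<int>{0, 3, 0, 7}));
  return 0;
}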
+ const int input_nb_dims = input->getDimensions().nbDims; + std::vector indices_all(input_nb_dims, input_nb_dims); + for (auto idx : indices) { + TRT_ENSURE(idx < input_nb_dims); + indices_all[idx] = idx; + } + + ::stream_executor::port::StatusOr + indices_result = this->Constant(indices_all); + TRT_ENSURE_PTR_OK(indices_result); + nvinfer1::ITensor* gather_indices = + indices_result.ValueOrDie()->getOutput(0); + TRT_ENSURE(gather_indices->getDimensions().nbDims == 1); + TRT_ENSURE(gather_indices->getType() == nvinfer1::DataType::kINT32); + + // Append a zero to the shape tensor. + ::stream_executor::port::StatusOr zero_result = + this->Constant(std::vector{0}); + TRT_ENSURE_PTR_OK(zero_result); + std::array cat_inputs = { + runtime_shape, zero_result.ValueOrDie()->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat_layer = + network_->addConcatenation(cat_inputs.data(), cat_inputs.size()); + TRT_ENSURE(cat_layer); + nvinfer1::ITensor* gather_input = cat_layer->getOutput(0); + TRT_ENSURE(gather_input); + + // Finally, gather the indices from the input. + nvinfer1::IGatherLayer* gather = + network_->addGather(*gather_input, *gather_indices, 0); + TRT_ENSURE(gather); + return gather; + } + + // Adds a scale layer that uniformly scales the input tensor by the specified + // amount. + ::stream_executor::port::StatusOr AddUniformScale( + nvinfer1::ITensor* input, float scale, const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); + ::stream_executor::port::StatusOr weight = + this->ScalarWeights(scale, 1); + TRT_ENSURE_OK(weight); + const nvinfer1::Weights empty_weights = + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IScaleLayer* scale_layer = + network_->addScale(*input, nvinfer1::ScaleMode::kUNIFORM, empty_weights, + weight.ValueOrDie(), empty_weights); + TRT_ENSURE(scale_layer != nullptr); + scale_layer->setName(name.c_str()); + TRT_ENSURE((*scale_layer).getPower().count == 0); + TRT_ENSURE((*scale_layer).getShift().count == 0); + TRT_ENSURE((*scale_layer).getScale().count == 1); + return scale_layer; + } + + ::stream_executor::port::StatusOr AddFill( + const TRT_TensorOrWeights& value_input, + const TRT_TensorOrWeights& dims_input, bool is_value_static, + bool is_dims_static, int nbDims, const nvinfer1::Dims& trt_dims, + ITensorProxyPtr scalar_tensor = nullptr, + ITensorProxyPtr beta_tensor = nullptr, const float delta = 0) { + // TensorRT IFillLayer requires a rank 0 scalar. 
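+    // With FillOperation::kLINSPACE the layer consumes an output shape
+    // (input 0, or trt_dims when static), a rank-0 start value (input 1) and
+    // a per-dimension delta (input 2); the code below prepares the latter two
+    // as "scalar_tensor" and "beta_tensor".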
+ nvinfer1::Dims scalar_dims; + scalar_dims.nbDims = 0; + if (is_value_static) { + ::stream_executor::port::StatusOr const_layer = + WeightsToConstant(value_input.weights().GetTrtWeights(), scalar_dims); + if (!const_layer.status().ok()) return const_layer.status(); + scalar_tensor = const_layer.ValueOrDie()->getOutput(0); + } else { + if (scalar_tensor == nullptr) { + ::stream_executor::port::StatusOr + shuffler_layer = + Reshape(value_input.tensor()->trt_tensor(), scalar_dims); + if (!shuffler_layer.status().ok()) return shuffler_layer.status(); + scalar_tensor = shuffler_layer.ValueOrDie()->getOutput(0); + } + } + + if (beta_tensor == nullptr) { + nvinfer1::Dims beta_shape{1, {nbDims}}; + ::stream_executor::port::StatusOr const_layer = + Constant(delta, beta_shape, value_input.TrtDType()); + TF_RETURN_IF_ERROR(const_layer.status()); + beta_tensor = const_layer.ValueOrDie()->getOutput(0); + } + + nvinfer1::IFillLayer* layer = + network_->addFill(trt_dims, nvinfer1::FillOperation::kLINSPACE); + TRT_ENSURE(layer); + if (!is_dims_static) { + layer->setInput(0, *dims_input.tensor()->trt_tensor()); + } + layer->setInput(1, *scalar_tensor->trt_tensor()); + layer->setInput(2, *beta_tensor->trt_tensor()); + return layer; + } + + // Adds a quantization layer that uniformly scales the input tensor + // by the given multiplicative "scaling_factor", then rounds + // (round-to-nearest-ties-to-even) to the nearest integer and clamps in the + // range of [-128, 127]. + ::stream_executor::port::StatusOr Quantize( + nvinfer1::ITensor* input, const float scaling_factor, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); + // Preprocessor usage here is unavoidable because TRT8 API is new. +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + // The TensorRT IQuantizeLayer divides by the scale factor rather than + // multiplies. To be consistent, in this function we expect a multiplicative + // scale factor, so we take the reciprical. + ::stream_executor::port::StatusOr scaling_const = + this->Constant(1.0f / scaling_factor, 1); + TRT_ENSURE_PTR_OK(scaling_const); + scaling_const.ValueOrDie()->setDimensions(nvinfer1::Dims{0, {}}); + nvinfer1::IQuantizeLayer* quant_layer = network_->addQuantize( + *input, *scaling_const.ValueOrDie()->getOutput(0)); + TRT_ENSURE(quant_layer); + quant_layer->setAxis(1); + return quant_layer; +#else + ::stream_executor::port::StatusOr result = + this->AddUniformScale(input, scaling_factor, name); + TRT_ENSURE_PTR_OK(result); + (*result)->setOutputType(0, nvinfer1::DataType::kINT8); + (*result)->setPrecision(nvinfer1::DataType::kFLOAT); + return result; +#endif + } + + // Adds a dequantize layer that casts the input tensor to TensorRT float type + // and scales it uniformly by the given multiplicative "scaling_factor". 
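+  // Together with Quantize() above, an (approximately) inverse pair is formed
+  // when the two multiplicative factors are reciprocals of each other, i.e.
+  // Dequantize(Quantize(x, s), 1/s) ~= x up to rounding and clamping.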
+ ::stream_executor::port::StatusOr Dequantize( + nvinfer1::ITensor* input, const float scaling_factor, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + ::stream_executor::port::StatusOr scaling_const = + this->Constant(scaling_factor, 1); + TRT_ENSURE_PTR_OK(scaling_const); + scaling_const.ValueOrDie()->setDimensions(nvinfer1::Dims{0, {}}); + nvinfer1::IDequantizeLayer* dequant_layer = network_->addDequantize( + *input, *scaling_const.ValueOrDie()->getOutput(0)); + dequant_layer->setAxis(1); + TRT_ENSURE(dequant_layer); + return dequant_layer; +#else + ::stream_executor::port::StatusOr result = + this->AddUniformScale(input, scaling_factor, name); + TRT_ENSURE_PTR_OK(result); + (*result)->setOutputType(0, nvinfer1::DataType::kFLOAT); + (*result)->setPrecision(nvinfer1::DataType::kINT8); + return result; +#endif + } + + // Adds TensorRT Q/DQ operations. This is for explicit precision mode. + ::stream_executor::port::StatusOr + UniformQuantizeDequantizeExplicit(nvinfer1::ITensor* input, + float quantize_scale, + float dequantize_scale, + const std::string& name) { + TRT_ENSURE(input); + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(network_->hasExplicitPrecision()); + } + TRT_ENSURE(IS_TRT_VERSION_GE(7, 1, 0, 0)); + + static int count = 0; + TRT_ENSURE(input->getType() == nvinfer1::DataType::kFLOAT); + std::string quant_name = absl::StrCat(input->getName(), "_quant_", count); + + ::stream_executor::port::StatusOr quant = + this->Quantize(input, quantize_scale, quant_name); + TRT_ENSURE_PTR_OK(quant); + + std::string dequant_name = + absl::StrCat(input->getName(), "_dequant_", count); + ::stream_executor::port::StatusOr dequant = + this->Dequantize(quant.ValueOrDie()->getOutput(0), dequantize_scale, + dequant_name); + TRT_ENSURE_PTR_OK(dequant); + + count++; + return dequant; + } + + ::stream_executor::port::StatusOr Reshape( + nvinfer1::ITensor* input, const nvinfer1::Dims& new_shape) { + TRT_ENSURE(input); + nvinfer1::IShuffleLayer* layer = network_->addShuffle(*input); + TRT_ENSURE(layer); + layer->setReshapeDimensions(new_shape); + return layer; + } + + ::stream_executor::port::StatusOr FindProducerOf( + const nvinfer1::ITensor* tensor) { + const char* name = tensor->getName(); + const int num_layers = network_->getNbLayers(); + for (int i = 0; i < num_layers; i++) { + nvinfer1::ILayer* layer = network_->getLayer(i); + const int num_outputs = layer->getNbOutputs(); + for (int j = 0; j < num_outputs; j++) { + nvinfer1::ITensor* t = layer->getOutput(j); + if (std::string(t->getName()) == name) { + return layer; + } + } + } + return errors::NotFound("could not find producing layer of ", name); + } + + ::stream_executor::port::StatusOr UniqueParentOf( + const nvinfer1::ILayer* layer, int input_idx = 0) { + return FindProducerOf(layer->getInput(input_idx)); + } + + nvinfer1::INetworkDefinition* Network() { return network_; } + + private: + nvinfer1::INetworkDefinition* network_; + TrtWeightStore* weight_store_; +}; + +class ShuffleBuilder { + private: + explicit ShuffleBuilder(TRTNetworkBuilder* builder, nvinfer1::ITensor* input) + : builder_(builder) { + layer_ = builder->Network()->addShuffle(*input); + } + + public: + static ::stream_executor::port::StatusOr Create( + TRTNetworkBuilder* builder, nvinfer1::ITensor* input) { + TRT_ENSURE(builder != nullptr); + TRT_ENSURE(input != nullptr); + return ShuffleBuilder(builder, input); + } + + ShuffleBuilder& SetReshape(const nvinfer1::Dims& dims) { + 
layer_->setReshapeDimensions(dims); + return *this; + } + + ShuffleBuilder& SetReshape(nvinfer1::ITensor* shape) { + layer_->setInput(1, *shape); + return *this; + } + + ShuffleBuilder& SetFirstTranspose(const nvinfer1::Permutation& perm) { + layer_->setFirstTranspose(perm); + return *this; + } + + ShuffleBuilder& SetSecondTranspose(const nvinfer1::Permutation& perm) { + layer_->setSecondTranspose(perm); + return *this; + } + + ::stream_executor::port::StatusOr Output() { + TRT_ENSURE(layer_ != nullptr); + TRT_ENSURE(layer_->getOutput(0) != nullptr); + return layer_->getOutput(0); + } + + private: + TRTNetworkBuilder* builder_; + nvinfer1::IShuffleLayer* layer_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc new file mode 100644 index 00000000000..7a40d9aa9b1 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc @@ -0,0 +1,95 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +template +class ConvertLikeOps : public OpConverterBase> { + public: + explicit ConvertLikeOps(const OpConverterParams *params) + : OpConverterBase>( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input", TrtInputArg::kBoth), + }; + } + Status Validate() { return ConvertLikeOps::NotSupportedInImplicitBatch(); } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + auto *network = params.converter->network(); + + const TRT_TensorOrWeights &input = inputs.at(0); + nvinfer1::Dims dims(input.GetTrtDims()); + + const std::vector value_input_dims_data = {1}; + const DimsAdapter value_input_dims(value_input_dims_data); + ::stream_executor::port::StatusOr value_weights = + params.weight_store->GetTempWeights(input.TrtDType(), value_input_dims); + TF_RETURN_IF_ERROR(value_weights.status()); + TF_RETURN_IF_ERROR(value_weights.ValueOrDie().SetValues(V)); + TRT_TensorOrWeights value_input(value_weights.ValueOrDie()); + + const auto is_dims_static = HasStaticShape(dims); + auto builder = TRTNetworkBuilder::Create(network, params.weight_store); + ITensorProxyPtr dims_input_tensor; + if (!is_dims_static) { + ::stream_executor::port::StatusOr shape_layer = + 
builder.ValueOrDie().Shape(input.tensor()->trt_tensor()); + TF_RETURN_IF_ERROR(shape_layer.status()); + dims_input_tensor = shape_layer.ValueOrDie()->getOutput(0); + dims.nbDims = 0; + } + + TRT_TensorOrWeights dims_input(dims_input_tensor); + ::stream_executor::port::StatusOr layer = + builder.ValueOrDie().AddFill(value_input, dims_input, true, + is_dims_static, input.GetTrtDims().nbDims, + dims); + ITensorProxyPtr output_tensor = layer.ValueOrDie()->getOutput(0); + this->AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "zeros_like"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "ones_like"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "ZerosLike"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "OnesLike"); + +#endif // IS_TRT_VERSION_GE(8, 2, 0, 0) + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc new file mode 100644 index 00000000000..d29b5481643 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc @@ -0,0 +1,104 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class ConvertLogSoftmax : public OpConverterBase { + public: + explicit ConvertLogSoftmax(const OpConverterParams *params) + : OpConverterBase(params) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("logits", TrtInputArg::kTensor)}; + } + + Status Validate() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + if (!num_trt_dims && params.use_implicit_batch) { + return errors::InvalidArgument( + "TensorRT LogSoftmax cannot apply on the batch dimension"); + } + + return Status::OK(); + } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + const auto &node_def = params.node_def; + + // Perform LogSoftmax operation: + // `logsoftmax = logits - log(reduce_sum(exp(logits), axis))` + + // Get the logits tensor. + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + + // Exponent of logits. 
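+    // (Numeric sanity check: for logits = [1, 2, 3] the formula above yields
+    // approximately [-2.41, -1.41, -0.41].)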
+ nvinfer1::IUnaryLayer *exp = params.converter->network()->addUnary( + *logits_tensor->trt_tensor(), nvinfer1::UnaryOperation::kEXP); + TFTRT_RETURN_ERROR_IF_NULLPTR(exp, node_def.name()); + params.converter->SetLayerName(exp, node_def, "exp"); + + // Reduce-sum operation across the final dimension. + nvinfer1::IReduceLayer *reduced_sum = + params.converter->network()->addReduce( + *exp->getOutput(0), nvinfer1::ReduceOperation::kSUM, + (1 << (num_trt_dims - 1)), /*Reduce across final dimension*/ + true /*Keep reduced dims*/); + params.converter->SetLayerName(reduced_sum, node_def, "reduced_sum"); + + // Logarithm of reduced_sum. + nvinfer1::IUnaryLayer *log_reduced_sum = + params.converter->network()->addUnary(*reduced_sum->getOutput(0), + nvinfer1::UnaryOperation::kLOG); + TFTRT_RETURN_ERROR_IF_NULLPTR(log_reduced_sum, node_def.name()); + params.converter->SetLayerName(log_reduced_sum, node_def, + "log_reduced_sum"); + + // Finally, get the output by subtracting log_reduced_sum from logits. + nvinfer1::IElementWiseLayer *sub = + params.converter->network()->addElementWise( + *logits_tensor->trt_tensor(), *log_reduced_sum->getOutput(0), + nvinfer1::ElementWiseOperation::kSUB); + TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name()); + params.converter->SetLayerName(sub, node_def, "sub"); + + params.outputs->push_back(TRT_TensorOrWeights(sub->getOutput(0))); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "LogSoftmax"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc new file mode 100644 index 00000000000..c6622f88345 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc @@ -0,0 +1,426 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" + +#include "absl/strings/str_format.h" +#include "tensorflow/cc/ops//array_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +bool IsQuantizeAndDequantizeOp(const Node* node) { + return absl::c_find(kQuantizationOpNames, node->def().op()) != + kQuantizationOpNames.end(); +} + +namespace { + +// Provides quantizing and dequantizing tensor scales for a given dynamic range. 
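+// As a worked example: for signed input with num_bits = 8, narrow_range =
+// false and the range [-1, 1], the quantized range is [-128, 127]; the max
+// side determines the scale, so quantize_scale = 127, dequantize_scale =
+// 1/127, and min_range is widened to -128/127.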
+// Borrowed from TF quantization kernel logic. +template +QuantizationScales ComputeQuantizationRange(bool signed_input, + int num_bits, + bool narrow_range, + T* min_range, T* max_range) { + // Calculate the range for the simulated integer quantization: + // e.g. [-127,127] for signed = true, narrow_range = true, num_bits = 8, + // or [-128,127] for signed = true, narrow_range = false, num_bits = 8, + // or [0, 255] for signed = false, num_bits = 8. + const int64_t min_quantized = + signed_input ? narrow_range ? -(1ULL << (num_bits - 1)) + 1 + : -(1ULL << (num_bits - 1)) + : 0; + const int64_t max_quantized = + signed_input ? (1ULL << (num_bits - 1)) - 1 : (1ULL << num_bits) - 1; + // Determine the maximum scaling factor that would scale + // [min_range, max_range] to not exceed [min_quantized, max_quantized], + // while keeping 0 unchanged. + const T scale_from_min_side = (min_quantized * *min_range > 0) + ? min_quantized / *min_range + : std::numeric_limits::max(); + const T scale_from_max_side = (max_quantized * *max_range > 0) + ? max_quantized / *max_range + : std::numeric_limits::max(); + + QuantizationScales scales; + // Note: Avoids changing the side of the range that determines scale. + if (scale_from_min_side < scale_from_max_side) { + scales.quantize_scale[0] = scale_from_min_side; + scales.dequantize_scale[0] = *min_range / min_quantized; + *max_range = max_quantized * scales.dequantize_scale[0]; + } else { + scales.quantize_scale[0] = scale_from_max_side; + scales.dequantize_scale[0] = *max_range / max_quantized; + *min_range = min_quantized * scales.dequantize_scale[0]; + } + return scales; +} + +// Prepares the input for a QDQ node in explicit precision mode, returning a +// ITensor pointer. If the input is weights, we convert it to a ITensor by +// adding a constant layer. +::stream_executor::port::StatusOr ExlicitQDQInputToTensor( + TRTNetworkBuilder* builder, const OpConverterParams* params, + const TRT_TensorOrWeights& input) { + if (input.is_tensor()) { + return input.tensor()->trt_tensor(); + } + if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && input.weights().count() > 1) { + LOG(WARNING) << absl::StrCat( + "QDQ per-channel for weights not " + "implemented, assuming uniform scaling"); + } + TRT_ShapedWeights trt_weights = input.weights(); + ::stream_executor::port::StatusOr weights_const = + builder->WeightsToConstant(trt_weights.GetTrtWeights(), + trt_weights.Shape()); + TRT_ENSURE_PTR_OK(weights_const); + params->converter->SetLayerName(weights_const.ValueOrDie(), params->node_def, + "const"); + nvinfer1::ITensor* qdq_input = weights_const.ValueOrDie()->getOutput(0); + std::string name = + absl::StrCat(weights_const.ValueOrDie()->getName(), "_output"); + qdq_input->setName(name.c_str()); + return qdq_input; +} + +} // namespace + +// Carries traits for each specific quantization op type for conversion. +// Specialization for template parameter T should be given for each TF C++ +// quantization op. 
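+// Each specialization supplies an InputSpec() describing the op's inputs, an
+// Attrs struct for the parsed attributes and ranges, and the
+// ValidateQDQForExplicitPrecision()/ConvertExplicit() entry points used by
+// ConvertQDQ further below.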
+template +struct QDQOpSpec {}; + +template <> +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + InputArgSpec::Create("input_min", TrtInputArg::kWeight), + InputArgSpec::Create("input_max", TrtInputArg::kWeight), + }; + } + + struct Attrs { + float min_range; + float max_range; + bool narrow_range; + std::string round_mode; + UniformQuantizationScales scales; + }; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "round_mode", &args->round_mode)); + if (args->round_mode != "HALF_TO_EVEN") { + LOG(WARNING) << node_def.op() << ": " << node_def.name() + << " has round_mode=" << args->round_mode + << ", but for TensorRT conversion, " + "round_mode=HALF_TO_EVEN is recommended."; + } + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "narrow_range", &args->narrow_range)); + if (args->narrow_range) { + LOG(WARNING) << node_def.op() << ": " << node_def.name() + << " has narrow_range=true, but for TensorRT conversion, " + "narrow_range=false is recommended."; + } + args->min_range = inputs.at(1).weights().template GetPointer()[0]; + args->max_range = inputs.at(2).weights().template GetPointer()[0]; + const int num_bits = 8; + args->scales = ComputeQuantizationRange( + /*signed_input=*/true, num_bits, args->narrow_range, &args->min_range, + &args->max_range); + TRT_ENSURE(args->scales.dequantize_scale[0] != 0); + TRT_ENSURE(args->scales.quantize_scale[0] != 0); + return Status::OK(); + } + + // Converts in explicit precision mode. In this mode, QDQ operations are + // directly converted into TensorRT quantizing and dequantizing scale + // operations. + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + const auto& node_def = params->node_def; + + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + + ::stream_executor::port::StatusOr qdq_input = + ExlicitQDQInputToTensor(&builder.ValueOrDie(), params, + params->inputs.at(0)); + TRT_ENSURE_PTR_OK(qdq_input); + + // TODO(cbate): check this condition exists for TRT8? Outline this block to + // a "reshape policy". + const int required_dims = params->use_implicit_batch ? 
3 : 4; + const nvinfer1::Dims idims = qdq_input.ValueOrDie()->getDimensions(); + nvinfer1::Dims intermediate_dims = idims; + TRT_ENSURE(idims.nbDims > 0); + if (idims.nbDims < required_dims) { + const int nb_extra_dims = required_dims - idims.nbDims; + intermediate_dims.nbDims = required_dims; + std::vector ones(nb_extra_dims, 1); + TRT_ENSURE(ones.size() == nb_extra_dims && nb_extra_dims > 0); + + if (!params->use_implicit_batch) { + intermediate_dims.d[0] = idims.d[0]; + std::copy(ones.begin(), ones.end(), intermediate_dims.d + 1); + std::copy_n(idims.d + 1, idims.nbDims - 1, + intermediate_dims.d + ones.size() + 1); + } else { + std::copy(ones.begin(), ones.end(), intermediate_dims.d); + std::copy_n(idims.d, idims.nbDims, intermediate_dims.d + ones.size()); + } + + LOG(WARNING) << absl::StrCat( + node_def.name(), ":", node_def.op(), ": tensor ", + qdq_input.ValueOrDie()->getName(), " has shape ", DebugString(idims), + " but TRT scale layer requires at least 3 dims excluding batch dim, " + "trying to recover by inserting 1's to create shape ", + DebugString(intermediate_dims)); + ::stream_executor::port::StatusOr reshape = + builder.ValueOrDie().Reshape(qdq_input.ValueOrDie(), + intermediate_dims); + TRT_ENSURE_PTR_OK(reshape); + qdq_input.ValueOrDie() = reshape.ValueOrDie()->getOutput(0); + } + + VLOG(1) << "[ExplicitPrecision]" << node_def.op() << ": " << node_def.name() + << " computed scales: " << args.scales << " from min/max ranges " + << args.min_range << "/" << args.max_range; + + ::stream_executor::port::StatusOr qdq = + builder.ValueOrDie().UniformQuantizeDequantizeExplicit( + qdq_input.ValueOrDie(), args.scales.quantize_scale[0], + args.scales.dequantize_scale[0], node_def.name()); + TRT_ENSURE_PTR_OK(qdq); + ITensorProxyPtr final_output = qdq.ValueOrDie()->getOutput(0); + if (idims.nbDims != intermediate_dims.nbDims) { + ::stream_executor::port::StatusOr undo_reshape = + builder.ValueOrDie().Reshape(qdq_input.ValueOrDie(), idims); + TRT_ENSURE_PTR_OK(undo_reshape); + final_output = undo_reshape.ValueOrDie()->getOutput(0); + } + params->outputs->push_back(final_output); + return Status::OK(); + } +}; + +template <> + +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + InputArgSpec::Create("min", TrtInputArg::kWeight), + InputArgSpec::Create("max", TrtInputArg::kWeight), + InputArgSpec::Create("num_bits", TrtInputArg::kWeight), + }; + } + // Use same attributes and conversion functions as QDQV2. 
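+  // (At this level the only difference is that V3 receives num_bits as a
+  // fourth weight input; the shared validation does not inspect it and
+  // assumes 8 bits.)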
+ using Attrs = QDQOpSpec::Attrs; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + return QDQOpSpec< + ops::QuantizeAndDequantizeV2>::ValidateQDQForExplicitPrecision(inputs, + node_def, + args); + } + + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + return QDQOpSpec::ConvertExplicit(params, + args); + } +}; + +template <> + +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + InputArgSpec::Create("min", TrtInputArg::kWeight), + InputArgSpec::Create("max", TrtInputArg::kWeight), + }; + } + struct Attrs { + int num_bits; + bool narrow_range; + }; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + return errors::Unimplemented(""); + } + + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + return errors::Unimplemented(""); + } +}; + +template <> + +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + }; + } + + struct Attrs { + float min; + float max; + int num_bits; + bool narrow_range; + }; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + return errors::Unimplemented(""); + } + + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + return errors::Unimplemented(""); + } +}; + +// Converts QDQ operations in non-explicit precision mode. This is the original +// "ConvertQuantize" function. In this mode, Q/DQ operations are no-ops and are +// instead used to set the dynamic range of the input tensor. +Status ConvertDynamicRangeMode(const OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + float min_range = 0.0f; + float max_range = 0.0f; + const auto& op_name = node_def.op(); + if (op_name == "FakeQuantWithMinMaxArgs") { + AttrSlice attrs(node_def); + // Get ranges via node attributes. + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "min", &min_range)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "max", &max_range)); + } else if (op_name == "FakeQuantWithMinMaxVars" || + op_name == "QuantizeAndDequantizeV2" || + op_name == "QuantizeAndDequantizeV3") { + // Get ranges via inputs. + auto get_weights_value = [&inputs](int index) { + const auto* raw_weights = inputs.at(index).weights().GetPointer(); + return raw_weights[0]; + }; + min_range = get_weights_value(1); + max_range = get_weights_value(2); + } else { + return errors::InvalidArgument("Unknown quantization op ", op_name, ", at ", + node_def.name()); + } + if (params->validation_only) { + return Status::OK(); + } + + // Store ranges for tensor + ITensorProxyPtr input0 = inputs.at(0).tensor(); + params->converter->ProvideQuantizationRange(&input0, min_range, max_range); + // Sometimes, TRT may not quantize a tensor, either because it chooses to + // execute a higher precision kernel or because of op fusion. In these + // cases, accuracy will suffer if the model was trained to expect + // quantization at that tensor. We should consider adding a clip(tensor, + // min_range, max_range) operation here to ensure that any arbitrarily + // placed quantize node will execute as expected. However, this will + // negatively affect performance. 
If users train their models in a way which + // models inference as close as possible (i.e. not quantizing in place where + // fusion will occur), then there is no problem with the current + // implementation. + params->outputs->push_back(inputs.at(0)); + return Status::OK(); +} + +template +class ConvertQDQ : public OpConverterBase> { + public: + explicit ConvertQDQ(const OpConverterParams* params) + : OpConverterBase>(params) {} + + static constexpr auto InputSpec() { return QDQOpSpec::InputSpec(); } + + // Disable the non-applicable data type check by providing empty string. + static constexpr const char* NodeDefDataTypeAttributeName() { return ""; } + + Status ValidateDynamicRangeINT8Mode() { + // The condition ensures we only call the conversion once. We should break + // this function up into validation and conversion. + if (this->params_->validation_only) { + return ConvertDynamicRangeMode(this->params_); + } + return Status::OK(); + } + + Status Validate() { + if (!this->params_->use_explicit_precision) { + return ValidateDynamicRangeINT8Mode(); + } + return OpSpec::ValidateQDQForExplicitPrecision( + this->params_->inputs, this->params_->node_def, &attrs_); + } + + Status Convert() { + if (!this->params_->use_explicit_precision) { + return ConvertDynamicRangeMode(this->params_); + } + return OpSpec::ConvertExplicit(this->params_, attrs_); + } + + using OpSpec = QDQOpSpec; + using OpSpecAttrs = typename QDQOpSpec::Attrs; + OpSpecAttrs attrs_; +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "QuantizeAndDequantizeV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "QuantizeAndDequantizeV3"); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "FakeQuantWithMinMaxVars"); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "FakeQuantWithMinMaxArgs"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h new file mode 100644 index 00000000000..280dc1e79f5 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+constexpr std::array<const char*, 4> kQuantizationOpNames = {
+    "QuantizeAndDequantizeV2",
+    "QuantizeAndDequantizeV3",
+    "FakeQuantWithMinMaxVars",
+    "FakeQuantWithMinMaxArgs",
+};
+
+// Operations with supported conversion to Q/DQ ops in TensorRT explicit
+// precision mode.
+constexpr std::array<const char*, 1> kExplicitQuantizationOpNames = {
+    "QuantizeAndDequantizeV2",
+};
+
+// Contains two scaling factors for quantization and dequantization
+// respectively. A shift factor is omitted as TensorRT only supports symmetric
+// quantization.
+template <typename T, size_t N>
+struct QuantizationScales {
+  std::array<T, N> quantize_scale;
+  std::array<T, N> dequantize_scale;
+};
+
+// In TensorRT 7 and 8, only uniform tensor scaling is supported for
+// activations.
+using UniformQuantizationScales = QuantizationScales<float, 1>;
+
+// Per-channel scaling is supported for weights in TensorRT version >= 8.0.
+template <int ChannelDimSize>
+using PerChannelQuantizationScales = QuantizationScales<float, ChannelDimSize>;
+
+template <typename T, size_t N>
+std::ostream& operator<<(std::ostream& os,
+                         const QuantizationScales<T, N>& scales) {
+  os << absl::StrFormat("QuantizationScales[quantize={%s},dequantize={%s}]",
+                        absl::StrJoin(scales.quantize_scale, ","),
+                        absl::StrJoin(scales.dequantize_scale, ","));
+  return os;
+}
+
+// Returns true if the Tensorflow node is a quantize and dequantize operation.
+bool IsQuantizeAndDequantizeOp(const Node*);
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc
new file mode 100644
index 00000000000..578fae3577b
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc
@@ -0,0 +1,619 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" + +#include +#include +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/linalg_ops.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/status_matchers.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +namespace ops = ::tensorflow::ops; +using ::tensorflow::testing::StatusIs; + +// This anonymous namespace contains helper functions for instatiating small TF +// building blocks. These are used below to construct specific graph patterns +// which test end-to-end conversion of the TF graph to an explciit-precision +// enabled TensorRT network. +namespace { + +enum class ConvEpilogueType { + kNone, + kReLU, + kBatchNorm, + kReLUBatchnorm, + kBatchnormReLU +}; + +std::ostream& operator<<(std::ostream& os, ConvEpilogueType epilogue) { + switch (epilogue) { + case ConvEpilogueType::kNone: + return os << "None"; + case ConvEpilogueType::kReLU: + return os << "ReLU only"; + case ConvEpilogueType::kBatchNorm: + return os << "BatchNorm Only"; + case ConvEpilogueType::kReLUBatchnorm: + return os << "ReLU+Batchnorm"; + case ConvEpilogueType::kBatchnormReLU: + return os << "BatchNorm+ReLU"; + } +} + +std::string DebugString(ConvEpilogueType epilogue) { + std::stringstream ss; + ss << epilogue; + return ss.str(); +} + +// Adds a 2D 3x3, single channel input with specified data_format. data_format +// must be NHWC,NCHW or NHW. +ops::Placeholder AddInput(Scope scope, int input_idx, + const std::string data_format, + std::array size_chw = {1, 3, 3}) { + PartialTensorShape input_shape; + if (data_format == "NCHW") { + input_shape = + PartialTensorShape({1, size_chw[0], size_chw[1], size_chw[2]}); + } else if (data_format == "NHWC") { + input_shape = + PartialTensorShape({1, size_chw[1], size_chw[2], size_chw[0]}); + } else if (data_format == "NHW") { + input_shape = PartialTensorShape({1, size_chw[1], size_chw[2]}); + } else { + LOG(FATAL) << "Unknown input shape type " << data_format; + } + auto input_attrs = ops::Placeholder::Attrs().Shape(input_shape); + return ops::Placeholder(scope.WithOpName(absl::StrCat("input_", input_idx)), + DT_FLOAT, input_attrs); +} + +// Adds QDQ op with min = -1.0f, max = 1.0f. +Output AddQDQV2(Scope scope, Input input) { + // Create scaling factors. 
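+  // The symmetric range [-1, 1] below maps, for 8 signed bits, to
+  // quantize_scale = 127 and dequantize_scale = 1/127 in explicit-precision
+  // conversion.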
+ auto input_min = + ops::Const(scope.WithOpName("in_min"), -1.0f, TensorShape{}); + auto input_max = + ops::Const(scope.WithOpName("in_max"), 1.0f, TensorShape{}); + return ops::QuantizeAndDequantizeV2(scope.WithOpName("qdq"), input, input_min, + input_max); +} + +Output AddOutput(Scope scope, Output input, int idx, bool add_qdq) { + Output out = input; + if (add_qdq) { + out = AddQDQV2(scope, input); + } + return ops::Identity(scope.WithOpName(StrCat("output_", idx)), out); +} + +// Adds a 3x3x1x1 Conv2D op and optional bias weights, followed by ReLU +// activation. Puts QDQ between (weights, op). Puts QDQ between (input, op) +// when qdq_on_output=false. Otherwise, puts QDQ between (op, output). +Output AddConv2D(Scope scope, Input input, int in_channels, int out_channels, + std::array filter_size = {1, 1}, + std::array stride = {1, 1}, + const std::string& data_format = "NCHW", bool with_bias = true, + ConvEpilogueType epilogue = ConvEpilogueType::kBatchnormReLU, + bool qdq_on_output = false) { + // Create 3x3 non-quantized weights weights. + auto weights_const = ops::Const( + scope.WithOpName("weights"), 1.0f, + TensorShape({filter_size[0], filter_size[1], in_channels, out_channels})); + + // Add QDQ to input if we don't add QDQ to output. + auto conv_input = + !qdq_on_output ? AddQDQV2(scope.WithOpName("qdq_input"), input) : input; + + Output result = ops::Conv2D( + scope.WithOpName("conv2d"), conv_input, AddQDQV2(scope, weights_const), + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"SAME", ops::Conv2D::Attrs().DataFormat(data_format)); + + if (with_bias) { + auto bias_const = ops::Const(scope.WithOpName("bias_weights"), 1.0f, + TensorShape({ + out_channels, + })); + result = ops::BiasAdd(scope.WithOpName("bias"), result, bias_const, + ops::BiasAdd::Attrs().DataFormat(data_format)); + } + + auto add_bn = [scope, data_format](Input input, + const int channels) -> Output { + TensorShape constant_shape = TensorShape({channels}); + auto bn_scale = + ops::Const(scope.WithOpName("bn_scale"), 1.0f, constant_shape); + auto bn_offset = + ops::Const(scope.WithOpName("bn_offset"), 1.0f, constant_shape); + auto bn_mean = + ops::Const(scope.WithOpName("bn_mean"), 0.1f, TensorShape({channels})); + auto bn_var = + ops::Const(scope.WithOpName("bn_var"), 1.0f, TensorShape({channels})); + Input conv_bn_input = IS_TRT_VERSION_GE(8, 0, 1, 0) + ? input + : AddQDQV2(scope.WithOpName("qdq_input"), input); + return ops::FusedBatchNormV3( + scope.WithOpName("bn"), conv_bn_input, bn_scale, bn_offset, + bn_mean, bn_var, + ops::FusedBatchNormV3::Attrs().IsTraining(false).DataFormat( + data_format)) + .y; + }; + + switch (epilogue) { + case ConvEpilogueType::kBatchNorm: { + result = add_bn(result, out_channels); + break; + } + case ConvEpilogueType::kReLU: { + result = ops::Relu(scope.WithOpName("relu"), result); + break; + } + case ConvEpilogueType::kReLUBatchnorm: { + result = ops::Relu(scope.WithOpName("relu"), result); + result = add_bn(result, out_channels); + break; + } + case ConvEpilogueType::kBatchnormReLU: { + result = add_bn(result, out_channels); + result = ops::Relu(scope.WithOpName("relu"), result); + break; + } + case ConvEpilogueType::kNone: + break; + } + + if (qdq_on_output) { + result = AddQDQV2(scope.WithOpName("qdq_out"), result); + } + return result; +} + +// Adds a batch matrix multiplication V2 operation, which commonly appears in +// fully connected layers. Puts QDQ between (input, op) as well as between +// (weights, op). 
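+// The resulting pattern is BatchMatMulV2(QDQ(input), QDQ(weights)) with a
+// constant 3x3 all-ones weights matrix, matching the {1, 3, 3} "NHW" test
+// input used by TestMatMulBasic.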
+ops::BatchMatMulV2 AddMatMul(Scope scope, const std::string& name, + Input input) { + // Add QDQ to input. + auto input_qdq = AddQDQV2(scope, input); + + // Add 3x3 weights with QDQ. + auto weights_const = + ops::Const(scope.WithOpName(name + "_weights"), + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, + TensorShape({3, 3})); + auto weights_qdq = AddQDQV2(scope.WithOpName("weights_qdq"), weights_const); + return ops::BatchMatMulV2(scope.WithOpName(name), input_qdq, weights_qdq); +} +} // namespace + +struct QDQTestOptions { + bool conv_has_bias{true}; + + // TRT7 may have issues with optimizing redundant transpose operations between + // QDQ and Op introduced by TF-TRT when format is not "NCHW". This allows to + // test both cases as well as WAR feasibility. + std::string data_format{"NCHW"}; + + // Tests whether placing QDQ on outputs rather than inputs is handled + // correctly. + bool qdq_on_output{false}; + + // Option for testing whether TRT build succeeds without a final QDQ before + // the output. + bool final_qdq{true}; + + // Whether to add activations (relu) to conv operations + ConvEpilogueType conv_epilogue; + + // TF-TRT API Options + TfTrtConversionParams conversion_params{}; +}; + +std::ostream& operator<<(std::ostream& os, const QDQTestOptions opts) { + return os << absl::StrCat( + "QDQTestOptions(conv_has_bias=", + static_cast(opts.conv_has_bias), + ", qdq_on_output=", static_cast(opts.qdq_on_output), + ", data_format=", opts.data_format, + ", conv_epilogue=", DebugString(opts.conv_epilogue), + ", final_qdq=", opts.final_qdq, ")"); +} + +std::vector EnumerateQDQTestOptions() { + std::vector result; + for (const absl::string_view data_format : {"NCHW", "NHWC"}) { + for (auto use_bias : {true, false}) { + for (auto qdq_on_output : {false, true}) { + // For now, always append a QDQ before output. For small single-op tests + // (besides QDQ), TensorRT7 sometimes has trouble. + for (auto final_qdq : {true, false}) { + for (auto conv_epilogue : + {ConvEpilogueType::kReLU, ConvEpilogueType::kNone, + ConvEpilogueType::kBatchnormReLU}) { + // Currently batch norm converter only supports NHWC. + if (data_format == "NHWC" && + (conv_epilogue == ConvEpilogueType::kBatchnormReLU || + conv_epilogue == ConvEpilogueType::kBatchNorm || + conv_epilogue == ConvEpilogueType::kBatchnormReLU)) { + continue; + } + QDQTestOptions opts{}; + opts.conv_has_bias = use_bias; + opts.data_format = data_format; + opts.qdq_on_output = qdq_on_output; + opts.final_qdq = final_qdq; + opts.conv_epilogue = conv_epilogue; + result.push_back(opts); + } + } + } + } + } + return result; +} + +// This class is a test fixture for running graph conversion and evaluating +// numerical results. 
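+// It wraps the graph in a MetaGraphDef with a "serving_default" signature,
+// runs shape inference, converts with ConvertAndBuild at INT8 precision
+// without calibration, and checks that exactly one static TRTEngineOp with a
+// non-empty serialized segment is produced.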
+class QDQExplicitTest : public ::testing::Test, + public ::testing::WithParamInterface { + public: + static ::stream_executor::port::StatusOr GetShape( + const std::string& name, const GraphShapeInfo& shapes) { + TRT_ENSURE(shapes.find(name) != shapes.end()); + TRT_ENSURE(shapes.at(name).size() == 1); + return shapes.at(name)[0].shape; + } + + ::stream_executor::port::StatusOr GetModel( + const GraphDef& graph_def, const std::vector& inputs, + const std::vector& outputs, + const GraphShapeInfo& shapes) { + TRT_ENSURE(!inputs.empty()); + TRT_ENSURE(!outputs.empty()); + + MetaGraphDef out; + out.mutable_graph_def()->CopyFrom(graph_def); + + SignatureDef signature_def; + auto& mutable_inputs = *signature_def.mutable_inputs(); + for (int i = 0; i < inputs.size(); i++) { + std::string input_name = inputs[i]->name(); + auto& input = mutable_inputs[input_name]; + input.set_name(input_name); + input.set_dtype(DT_FLOAT); + TRT_ENSURE(shapes.find(input_name) != shapes.end()); + TRT_ENSURE(shapes.at(input_name).size() == 1); + PartialTensorShape input_shape = shapes.at(input_name)[0].shape; + input_shape.AsProto(input.mutable_tensor_shape()); + } + + auto& mutable_outputs = *signature_def.mutable_outputs(); + for (int i = 0; i < outputs.size(); i++) { + std::string output_name = outputs[i]->name(); + auto& output = mutable_outputs[output_name]; + output.set_name(output_name); + output.set_dtype(DT_FLOAT); + TRT_ENSURE(shapes.find(output_name) != shapes.end()); + TRT_ENSURE(shapes.at(output_name).size() == 1); + PartialTensorShape output_shape = shapes.at(output_name)[0].shape; + output_shape.AsProto(output.mutable_tensor_shape()); + } + + (*out.mutable_signature_def())["serving_default"] = signature_def; + return out; + } + + // Confirms that we have a TRT node with the correct attributes. + static Status CheckTrtNode(const GraphDef& converted_graph_def) { + int n_trt_ops = 0; + string op_name{"TRTEngineOp"}; + for (const auto& node : converted_graph_def.node()) { + if (op_name == node.op()) { + n_trt_ops++; + const auto& attr = node.attr(); + TRT_ENSURE(attr.at("static_engine").b()); + VLOG(2) << "Found serialized segment with size " + << attr.at("serialized_segment").s().size(); + TRT_ENSURE(!attr.at("serialized_segment").s().empty()); + } + } + TRT_ENSURE(n_trt_ops == 1); + return Status::OK(); + } + + Status ConvertAndRun(Scope* scope) { + std::vector inputs; + std::vector outputs; + + GraphDef gdef; + TF_RETURN_IF_ERROR(scope->ToGraphDef(&gdef)); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_RETURN_IF_ERROR(scope->ToGraph(graph.get())); + + GraphShapeInfo shape_info; + TF_RETURN_IF_ERROR(InferShapes(graph.get(), /*arg_shapes=*/{}, + /*fnlib_def=*/nullptr, &shape_info)); + + for (const NodeDef& node : gdef.node()) { + if (absl::StartsWith(node.name(), "input_")) { + inputs.push_back(&node); + } else if (absl::StartsWith(node.name(), "output_")) { + outputs.push_back(&node); + } + } + + ::stream_executor::port::StatusOr meta_graph_def = + GetModel(gdef, inputs, outputs, shape_info); + TRT_ENSURE_OK(meta_graph_def); + + // Create a list of input tensors, they will be used to build the engines. 
+ std::vector input_tensors; + std::vector input_names; + for (const auto& input : inputs) { + input_names.push_back(input->name()); + + ::stream_executor::port::StatusOr input_shape = + GetShape(input->name(), shape_info); + TRT_ENSURE_OK(input_shape); + + TensorShape shape; + input_shape.ValueOrDie().AsTensorShape(&shape); + Tensor tensor(DT_FLOAT, shape); + test::FillIota(&tensor, 1.0f); + input_tensors.push_back(tensor); + } + + std::vector output_names; + for (const auto& output : outputs) { + output_names.push_back(output->name()); + } + + TfTrtConversionParams conversion_params; + conversion_params.allow_build_at_runtime = true; + conversion_params.precision_mode = TrtPrecisionMode::INT8; + conversion_params.use_calibration = false; + conversion_params.convert_to_static_engine = true; + TRT_ENSURE(input_names.size() == input_tensors.size()); + ::stream_executor::port::StatusOr converted_gdef = + tensorrt::ConvertAndBuild(meta_graph_def.ValueOrDie().graph_def(), + input_names, output_names, {input_tensors}, + conversion_params); + TRT_ENSURE_OK(converted_gdef); + return CheckTrtNode(converted_gdef.ValueOrDie()); + } + + protected: + TfTrtConversionParams params_; + TrtUniquePtrType engine_; +}; + +class TestQDQSuite : public QDQExplicitTest {}; + +#define EXPECT_QDQ_ON_OUTPUT_FAILURE(params, scope) \ + if ((params).qdq_on_output) { \ + EXPECT_THAT(ConvertAndRun(&(scope)), StatusIs(error::INTERNAL)); \ + return; \ + } +#define EXPECT_NO_FINAL_QDQ_FAILURE(params, scope) \ + if (!(params).final_qdq) { \ + EXPECT_THAT(ConvertAndRun(&(scope)), StatusIs(error::INTERNAL)); \ + return; \ + } + +#define EXPECT_BUILD_OK(scope) TF_EXPECT_OK(ConvertAndRun(&(scope))) + +#define POLICY_TRT7(params, scope) \ + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { \ + EXPECT_QDQ_ON_OUTPUT_FAILURE(params, scope); \ + EXPECT_NO_FINAL_QDQ_FAILURE(params, scope); \ + EXPECT_BUILD_OK(scope); \ + } + +#define POLICY_TRT8(params, scope) \ + if (IS_TRT_VERSION_GE(8, 0, 0, 0)) { \ + if (((params).conv_epilogue == ConvEpilogueType::kBatchNorm || \ + (params).conv_epilogue == ConvEpilogueType::kBatchnormReLU || \ + (params).conv_epilogue == ConvEpilogueType::kReLUBatchnorm) && \ + (params).data_format == "NHWC") { \ + EXPECT_THAT(ConvertAndRun(&(scope)), StatusIs(error::UNIMPLEMENTED)); \ + return; \ + } \ + EXPECT_BUILD_OK(scope); \ + } + +#define SKIP_TRT7(x) \ + if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && (x)) { \ + GTEST_SKIP(); \ + } + +// Tests single convolution operation conversion. +TEST_P(TestQDQSuite, TestConv2DBasic) { + SKIP_TRT7(GetParam().qdq_on_output); + SKIP_TRT7(GetParam().data_format != "NCHW"); + SKIP_TRT7(!GetParam().final_qdq); + + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, GetParam().data_format, {3, 28, 28}); + + Output out = input; + const int num_conv = 1; + std::array in_channels = {3, 16}; + std::array out_channels = {16, 32}; + for (int i = 0; i < num_conv; i++) { + out = AddConv2D(scope.WithOpName(absl::StrCat("conv_", i)), out, + in_channels[i], out_channels[i], /*filter_size=*/{3, 3}, + /*stride=*/{1, 1}, GetParam().data_format, + GetParam().conv_has_bias, GetParam().conv_epilogue, + GetParam().qdq_on_output); + } + out = AddOutput(scope, out, 0, GetParam().final_qdq); + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +// Tests single convolution operation conversion. +TEST_P(TestQDQSuite, TestMatMulBasic) { + // Some param's don't apply, so pick one combination and skip otherwise. 
+ if (GetParam().data_format != "NCHW" || !GetParam().conv_has_bias || + GetParam().qdq_on_output || + GetParam().conv_epilogue != ConvEpilogueType::kReLU) { + GTEST_SKIP(); + } + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, "NHW"); + auto matmul_op = AddMatMul(scope, "matmul", input); + auto out = AddOutput(scope, matmul_op, 0, GetParam().final_qdq); + + TF_EXPECT_OK(ConvertAndRun(&scope)); +} + +// A single input goes through two different Conv2D. Outputs of Conv2D are +// added together, with QQQ on both branches of ADD. +TEST_P(TestQDQSuite, AddBothBranchesQDQConvSingleInput) { + SKIP_TRT7(!GetParam().final_qdq); + SKIP_TRT7(GetParam().data_format != "NCHW"); + + Scope scope = Scope::NewRootScope(); + auto input1 = AddInput(scope, 0, GetParam().data_format, + /*size_chw=*/{3, 28, 28}); + + auto conv1 = + AddConv2D(scope, input1, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/{1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + + auto conv2 = + AddConv2D(scope, input1, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/ + {1, 1}, GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + + // In the case of "qdq on output", we don't need to add QDQ. + auto add = + ops::Add(scope.WithOpName("add"), + !GetParam().qdq_on_output ? AddQDQV2(scope, conv1) : conv1, + !GetParam().qdq_on_output ? AddQDQV2(scope, conv2) : conv2); + + auto conv3 = + AddConv2D(scope.WithOpName("conv3"), conv2, 16, 16, {1, 1}, {1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + + auto out = + AddOutput(scope.WithOpName("output"), conv3, 0, GetParam().final_qdq); + + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +// Tests adding a single tensor to itself, with QQQ on both branches of ADD. +TEST_P(TestQDQSuite, AddBothBranchesQDQMultipleInput) { + // TRT7 QDQ optimizer makes single-input restriction. + SKIP_TRT7(true); + + Scope scope = Scope::NewRootScope(); + auto input1 = AddInput(scope, 0, GetParam().data_format); + auto input2 = AddInput(scope, 1, GetParam().data_format); + auto add = + ops::Add(scope.WithOpName("add"), + !GetParam().qdq_on_output ? AddQDQV2(scope, input1) : input1, + !GetParam().qdq_on_output ? 
AddQDQV2(scope, input2) : input2); + auto output = AddOutput(scope, add, 0, true); + TF_EXPECT_OK(ConvertAndRun(&scope)); +} + +// Tests Conv-MaxPool combination +TEST_P(TestQDQSuite, TestConvMaxpool) { + SKIP_TRT7(!GetParam().final_qdq); + SKIP_TRT7(GetParam().data_format != "NCHW"); + + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, GetParam().data_format, + /*size_chw=*/{3, 28, 28}); + auto conv1 = + AddConv2D(scope, input, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/{1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + ops::MaxPool maxpool = + ops::MaxPool(scope.WithOpName("maxpool"), + AddQDQV2(scope.WithOpName("mp_qdq_in"), conv1), {1, 1, 1, 1}, + {1, 1, 1, 1}, "SAME", + ops::MaxPool::Attrs().DataFormat(GetParam().data_format)); + auto output = + AddOutput(scope.WithOpName("output"), maxpool, 0, GetParam().final_qdq); + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +// Tests QDQ(Conv(QDQ(MaxPool(Conv(QDQ(x)))))) +TEST_P(TestQDQSuite, TestConvMaxpoolConv) { + SKIP_TRT7(!GetParam().final_qdq); + SKIP_TRT7(GetParam().data_format != "NCHW"); + + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, GetParam().data_format, + /*size_chw=*/{3, 28, 28}); + auto conv1 = + AddConv2D(scope, input, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/{1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + ops::MaxPool maxpool = + ops::MaxPool(scope.WithOpName("maxpool"), + AddQDQV2(scope.WithOpName("mp_qdq_in"), conv1), {1, 1, 1, 1}, + {1, 1, 1, 1}, "SAME", + ops::MaxPool::Attrs().DataFormat(GetParam().data_format)); + auto conv2 = AddConv2D(scope, maxpool, 16, 16, {3, 3}, {1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + auto output = + AddOutput(scope.WithOpName("out"), conv2, 0, GetParam().final_qdq); + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +INSTANTIATE_TEST_SUITE_P(TestQDQSuiteInst, TestQDQSuite, + ::testing::ValuesIn(EnumerateQDQTestOptions())); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // IS_TRT_VERSION_GE(8, 0, 0, 0) +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc new file mode 100644 index 00000000000..0e6736f3cf9 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc @@ -0,0 +1,220 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) +/* The ConvertSelectV2 is working only for cond_input passed as a boolean + * tensor, which could be created only for TRT >= 8.2. + */ +class ConvertSelectBase : public OpConverterBase { + public: + explicit ConvertSelectBase(const OpConverterParams* params, + const std::string& layer_name) + : OpConverterBase( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}), + layer_name_(layer_name) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("cond", TrtInputArg::kBoth), + InputArgSpec::Create("then", TrtInputArg::kBoth), + InputArgSpec::Create("else", TrtInputArg::kBoth)}; + } + + Status Validate() { + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + + const auto& params = *this->params_; + const auto& inputs = params.inputs; + const auto& i_cond = inputs.at(0); + const auto& node = params.node_def; + TF_RETURN_IF_ERROR( + check_type(i_cond.TrtDType(), nvinfer1::DataType::kBOOL, node)); + + if (i_cond.is_weights()) { + // According to + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#constant-layer + // Boolean weights are not supported in TRT version 8.4. + return errors::InvalidArgument(bool_weight_error_msg(node)); + } + + const auto& i_then = inputs.at(1); + const auto& i_else = inputs.at(2); + const auto type_then = i_then.TrtDType(); + const auto type_else = i_else.TrtDType(); + if (type_then != type_else && (type_then == nvinfer1::DataType::kINT32 || + type_else == nvinfer1::DataType::kINT32)) { + // Both or none of (type_then, type_else) should be equal to kINT32. + return errors::InvalidArgument( + then_else_dtypes_error_msg(type_then, type_else, node)); + } + + bool cond_is_vector = false; + const auto& shape_cond = i_cond.GetTrtDims(); + if (layer_name_ == "select") { + const auto& shape_then = i_then.GetTrtDims(); + const auto& shape_else = i_else.GetTrtDims(); + TF_RETURN_IF_ERROR(compare_shapes(shape_then, shape_else)); + TF_RETURN_IF_ERROR( + compare_shapes(shape_cond, shape_then, &cond_is_vector)); + } + + nvinfer1::Dims cond_dims(shape_cond); + if (cond_is_vector) { + cond_dims.nbDims = i_then.GetTrtDims().nbDims; + const std::vector ones(cond_dims.d[0], 1); + std::copy(ones.begin(), ones.end(), cond_dims.d + 1); + } + + const TRT_TensorOrWeights new_cond(nvinfer1::DataType::kBOOL, cond_dims, + i_cond.batch_size()); + nvinfer1::Dims broadcasted_dims[3]; + for (int i = 1; i < 3; i++) { + TF_RETURN_IF_ERROR(GetTrtBroadcastShape(new_cond, inputs.at(i), true, + false, broadcasted_dims, + broadcasted_dims + i)); + } + + for (int i = 0; i < tensor_.size(); i++) { + // This will also convert constants to tensors. 
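+      // (ApplyBroadcast reshapes each wrapped input to the common rank in
+      // broadcasted_dims computed above; the assumption is that TensorRT's
+      // ISelectLayer, added later in Convert(), requires rank-aligned
+      // cond/then/else inputs and handles the remaining unit-dimension
+      // broadcasting itself.)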
+ tensor_[i] = std::make_unique(inputs.at(i)); + TF_RETURN_IF_ERROR( + ApplyBroadcast(tensor_[i], broadcasted_dims[i], this->params_, 0)); + } + + return Status::OK(); + } + + Status Convert() { + const auto& params = *this->params_; + auto* converter = params.converter; + + nvinfer1::ISelectLayer* select_layer = converter->network()->addSelect( + *tensor_[0].get()->as_tensor(params_)->trt_tensor(), // cond_tensor + *tensor_[1].get()->as_tensor(params_)->trt_tensor(), // then_tensor + *tensor_[2].get()->as_tensor(params_)->trt_tensor() // else_tensor + ); + + converter->SetLayerName(select_layer, params.node_def.name(), layer_name_); + AddOutput(TRT_TensorOrWeights(select_layer->getOutput(0))); + return Status::OK(); + } + + private: + Status compare_shapes(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + bool* cond_is_vector = nullptr) const { + const bool then_vs_else = cond_is_vector == nullptr; + bool same_shapes = shape1 == shape2; + if (!same_shapes && shape1.nbDims == shape2.nbDims) { + // We can't check size equivalent when dynamic shapes are involved. + // In this case, the two shapes should be equal at runtime. Therefore, + // the shapes still should be considered as equal if at least one of + // them is a tensor with dynamic shape, + same_shapes = DynamicShapeInput(this->params_->inputs, then_vs_else); + } + if (!same_shapes) { + if (then_vs_else || !(*cond_is_vector = (shape1.nbDims == 1 && + shape1.d[0] == shape2.d[0]))) { + const auto err = input_shapes_error_msg( + shape1, shape2, this->params_->node_def, then_vs_else); + return errors::InvalidArgument(err); + } + } + return Status::OK(); + } + + bool DynamicShapeInput(const std::vector& inputs, + bool then_vs_else) const { + const int idx = then_vs_else ? 1 : 0; + for (int i = 0; i < 2; ++i) { + const auto& input = inputs.at(i + idx); + if (input.is_tensor() && !HasStaticShape(input.GetTrtDims())) { + return true; + } + } + return false; + } + + std::array, 3> tensor_; + const std::string layer_name_; +}; + +class ConvertSelect : public ConvertSelectBase { + public: + explicit ConvertSelect(const OpConverterParams* params) + : ConvertSelectBase(params, "select") {} +}; + +class ConvertSelectV2 : public ConvertSelectBase { + public: + explicit ConvertSelectV2(const OpConverterParams* params) + : ConvertSelectBase(params, "selectv2") {} +}; + +std::string op_node_info(const NodeDef& node) { + return " of the '" + node.op() + "' operation at the node '" + node.name() + + "' "; +} + +std::string bool_weight_error_msg(const NodeDef& node) { + return "The boolean parameter '" + node.input(0) + "'" + op_node_info(node) + + "cannot be passed as a weight in TRT version 8.4."; +} + +std::string then_else_dtypes_error_msg(nvinfer1::DataType type_then, + nvinfer1::DataType type_else, + const NodeDef& node) { + return "DataTypes (" + DebugString(type_then) + ", " + + DebugString(type_else) + ") of parameters (" + node.input(1) + ", " + + node.input(2) + ")" + op_node_info(node) + "are incompatible."; +} + +std::string input_shapes_error_msg(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + const NodeDef& node, bool then_vs_else) { + const std::string& param_names = + then_vs_else ? 
"'then' and 'else'" : "'cond' and 'then'"; + std::string error_msg = "The shapes of the " + param_names + " parameters" + + op_node_info(node) + "must be the same"; + if (!then_vs_else) { + error_msg += + " OR 'cond' must be a vector with N elements, " + "where N is a batch size (the first shape dimension for 'then')"; + } + return error_msg + ", got " + DebugString(shape1) + " vs. " + + DebugString(shape2) + "."; +} + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "Select"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "SelectV2"); +#endif + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc new file mode 100644 index 00000000000..dcbf992b08e --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc @@ -0,0 +1,81 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class ConvertSoftmax : public OpConverterBase { + public: + explicit ConvertSoftmax(const OpConverterParams *params) + : OpConverterBase(params) {} + + static constexpr std::array AllowedDataTypes() { + return {DataType::DT_FLOAT, DataType::DT_HALF}; + } + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("logits", TrtInputArg::kTensor)}; + } + + Status Validate() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + if (!num_trt_dims && params.use_implicit_batch) { + return errors::InvalidArgument( + "TensorRT Softmax cannot apply on the batch dimension"); + } + return Status::OK(); + } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + const auto &node_def = params.node_def; + + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + + // Perform Softmax operation: + nvinfer1::ISoftMaxLayer *layer = + params.converter->network()->addSoftMax(*logits_tensor->trt_tensor()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params.converter->SetLayerName(layer, node_def); + // Tensorflow SoftMax applies softmax operation over the last dimension. 
+ layer->setAxes(1 << (num_trt_dims - 1)); + + ITensorProxyPtr output_tensor = layer->getOutput(0); + params.outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "Softmax"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc new file mode 100644 index 00000000000..84961670b33 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc @@ -0,0 +1,208 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class ConvertTile : public OpConverterBase { + public: + explicit ConvertTile(const OpConverterParams *params) + : OpConverterBase( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input_tensor", TrtInputArg::kBoth), + InputArgSpec::Create("weight", TrtInputArg::kBoth)}; + } + + Status Validate() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + + const auto &repl = inputs.at(1); + if (params.use_implicit_batch && repl.is_tensor()) { + return errors::InvalidArgument( + "Conversion for Tile is not implemented for multipliers " + "passed as a tensor in implicit batch mode."); + } + + nvinfer1::DataType dtype; + const int *multiplies; + if (repl.is_weights()) { + TFTRT_CHECK_SHAPE_TENSOR(repl.weights().GetTensor()); + dtype = repl.weights().TrtDType(); + multiplies = repl.weights().GetPointer(); + } else { + dtype = repl.tensor()->getType(); + multiplies = nullptr; + } + + const auto &node = params.node_def; + TF_RETURN_IF_ERROR(check_type(dtype, nvinfer1::DataType::kINT32, node, 1)); + + const auto dims = inputs.at(0).GetTrtDims(); + const auto nb_dims = + dims.nbDims + + (params.use_implicit_batch && inputs.at(0).is_tensor() ? 
1 : 0); + if (multiplies) { + const int mult_numb = repl.weights().count(); + if (mult_numb != nb_dims) { + return errors::InvalidArgument( + "The length of the replication vector (", mult_numb, + ") of the Tile operation in '", node.name(), + "' is expected to be equal to the rank of the input vector (", + nb_dims, ")."); + } + + if (std::any_of(multiplies, multiplies + nb_dims, + [](int i) { return i <= 0; })) { + const auto &mul = absl::StrJoin(multiplies, multiplies + nb_dims, ", "); + return errors::InvalidArgument( + "All replications of the Tile operation in '", node.name(), + "' should be positive, got (", mul, ")."); + } + + if (params.use_implicit_batch && multiplies[0] > 1) { + return errors::Unimplemented( + "The Tile operation along the batch dimension in '", node.name(), + "' is not implemented."); + } + } else { + const auto &repl_dims = repl.GetTrtDims(); + if (repl_dims.nbDims != 1) { + return errors::InvalidArgument( + "When replications are defined as a tensor, that tensor must be " + "1-dimensional. Got ", + repl_dims.nbDims, "-dimensional tensor."); + } + + // Check the number of elements in multiplyer for tensors with non-dynamic + // shape + if (repl_dims.d[0] >= 0 && repl_dims.d[0] != nb_dims) { + return errors::InvalidArgument( + "When replications are defined as a tensor, " + "the number of its elements (", + repl_dims.d[0], ") must be equal to the rank of the input tensor (", + nb_dims, ")."); + } + } + + return Status::OK(); + } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + auto *converter = params.converter; + auto *network = converter->network(); + const auto &tensor = inputs.at(0); + const auto &replics = inputs.at(1); + const auto dims = tensor.GetTrtDims(); + const auto nb_dims = dims.nbDims; + + nvinfer1::Dims output_size{nb_dims, {1}}; + bool dynamic_flag = replics.is_tensor() || !HasStaticShape(dims); + + if (!dynamic_flag) { + // If input0 is a tensor, and we're in implicit batch mode, then we need + // dim_offset. + const auto dim_offset = + params.use_implicit_batch && tensor.is_tensor() ? 
1 : 0; + const auto *input_size = dims.d; + const int *pReplics = replics.weights().GetPointer() + dim_offset; + for (int i = 0; i < nb_dims; i++) + output_size.d[i] = pReplics[i] * input_size[i]; + } + + ::stream_executor::port::StatusOr builder; + if (tensor.is_weights() || (dynamic_flag && replics.is_weights())) { + builder = + TRTNetworkBuilder::Create(converter->network(), params.weight_store); + TRT_ENSURE_OK(builder); + } + + ITensorProxyPtr input_tensor; + if (tensor.is_weights()) { + ::stream_executor::port::StatusOr + weights_const = builder.ValueOrDie().WeightsToConstant( + tensor.weights().GetTrtWeights(), dims); + TRT_ENSURE_PTR_OK(weights_const); + input_tensor = weights_const.ValueOrDie()->getOutput(0); + } else { + input_tensor = tensor.tensor(); + } + + auto &input_trt_tensor = *input_tensor->trt_tensor(); + nvinfer1::ITensor *target_shape = nullptr; + if (dynamic_flag) { + nvinfer1::ITensor *mult; + if (replics.is_weights()) { + ::stream_executor::port::StatusOr + weights_const = builder.ValueOrDie().WeightsToConstant( + replics.weights().GetTrtWeights(), replics.GetTrtDims()); + TRT_ENSURE_PTR_OK(weights_const); + mult = weights_const.ValueOrDie()->getOutput(0); + } else { + const ITensorProxyPtr multiplies = replics.tensor()->trt_tensor(); + mult = multiplies->trt_tensor(); + } + + nvinfer1::ITensor *shape = + network->addShape(input_trt_tensor)->getOutput(0); + target_shape = network + ->addElementWise(*shape, *mult, + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + } + + nvinfer1::Dims start{nb_dims, {}}; + DimsAdapter stride(std::vector(nb_dims, 1)); + auto layer = network->addSlice(input_trt_tensor, start, output_size, + stride.AsTrtDims()); + layer->setMode(nvinfer1::SliceMode::kWRAP); + if (target_shape) layer->setInput(2, *target_shape); + + converter->SetLayerName(layer, params.node_def.name(), "to_tile"); + ITensorProxyPtr output_tensor = layer->getOutput(0); + if (tensor.is_weights() && params.use_implicit_batch) { + // Reshape output tensor by removing first dimension. + DimsAdapter adap(output_tensor->getDimensions()); + TF_RETURN_IF_ERROR(adap.RemoveBatchDimension()); + + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params.converter, TRT_TensorOrWeights(output_tensor), + adap.AsTrtDims(), false, &output_tensor, params.node_def)); + } + + AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), "Tile"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc new file mode 100644 index 00000000000..45bade296f6 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc @@ -0,0 +1,251 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +const UnaryOperationMapType* UnaryOperationMap() { + static auto* const m = new UnaryOperationMapType({ + {"Exp", nvinfer1::UnaryOperation::kEXP}, + {"Log", nvinfer1::UnaryOperation::kLOG}, + {"Sqrt", nvinfer1::UnaryOperation::kSQRT}, + {"Rsqrt", nvinfer1::UnaryOperation::kSQRT}, + {"Reciprocal", nvinfer1::UnaryOperation::kRECIP}, + {"Abs", nvinfer1::UnaryOperation::kABS}, + {"Neg", nvinfer1::UnaryOperation::kNEG}, + {"Sin", nvinfer1::UnaryOperation::kSIN}, + {"Cos", nvinfer1::UnaryOperation::kCOS}, + {"Tan", nvinfer1::UnaryOperation::kTAN}, + {"Sinh", nvinfer1::UnaryOperation::kSINH}, + {"Cosh", nvinfer1::UnaryOperation::kCOSH}, + {"Asin", nvinfer1::UnaryOperation::kASIN}, + {"Acos", nvinfer1::UnaryOperation::kACOS}, + {"Atan", nvinfer1::UnaryOperation::kATAN}, + {"Asinh", nvinfer1::UnaryOperation::kASINH}, + {"Acosh", nvinfer1::UnaryOperation::kACOSH}, + {"Atanh", nvinfer1::UnaryOperation::kATANH}, + {"Ceil", nvinfer1::UnaryOperation::kCEIL}, + {"Floor", nvinfer1::UnaryOperation::kFLOOR}, + {"Erf", nvinfer1::UnaryOperation::kERF}, +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + {"Round", nvinfer1::UnaryOperation::kROUND}, + {"Sign", nvinfer1::UnaryOperation::kSIGN}, +#endif + }); + return m; +} + +const UnaryOperationMapType* UnaryBooleanOperationMap() { + static auto* const m = new UnaryOperationMapType({ + {"LogicalNot", nvinfer1::UnaryOperation::kNOT}, + }); + return m; +} + +const ActivationTypeMapType* ActivationTypeMap() { + static auto* const m = new ActivationTypeMapType({ + {"LeakyRelu", nvinfer1::ActivationType::kLEAKY_RELU}, + {"Relu", nvinfer1::ActivationType::kRELU}, + {"Relu6", nvinfer1::ActivationType::kCLIP}, + {"Sigmoid", nvinfer1::ActivationType::kSIGMOID}, + {"Tanh", nvinfer1::ActivationType::kTANH}, + {"Elu", nvinfer1::ActivationType::kELU}, + {"Selu", nvinfer1::ActivationType::kSELU}, + {"Softsign", nvinfer1::ActivationType::kSOFTSIGN}, + {"Softplus", nvinfer1::ActivationType::kSOFTPLUS}, + }); + return m; +} + +template +class ConvertUnaryImpl { + protected: + ConvertUnaryImpl(const OperationMap* pOperMap) : pOperMap_(pOperMap) {} + + Status ValidateImpl(const OpConverterParams& params, + const std::vector& not_supported_ops = {}) { + const auto& node = params.node_def; + const auto& op = node.op(); + if (pOperMap_->find(op) == pOperMap_->end()) { + return errors::Unimplemented("Unary op: ", op, " not supported"); + } + DimsAdapter input_dims(params.inputs.at(0).GetTrtDims()); + if (!input_dims.NumDims()) { + return errors::InvalidArgument( + "At least 1 dimension is required for UNARY operation '", op, "'"); + } + + if (!not_supported_ops.empty() && params.use_implicit_batch) { + const auto& end = not_supported_ops.end(); + if (std::find(not_supported_ops.begin(), end, op) != end) { + const auto& err = + convert_not_supported_implicit(op, node.name(), "Unary"); + return errors::Unimplemented(err); + } + } + + return Status::OK(); + } + + Status ConvertImpl(const OpConverterParams& params) { + const auto& node_def = params.node_def; + auto* converter = params.converter; + const auto op_pair = pOperMap_->find(node_def.op()); + ITensorProxyPtr tensor = params.inputs.at(0).tensor(); + nvinfer1::IUnaryLayer* layer = + 
converter->network()->addUnary(*tensor->trt_tensor(), op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + converter->SetLayerName(layer, node_def); + if (node_def.op() == "Rsqrt") { + layer = converter->network()->addUnary(*layer->getOutput(0), + nvinfer1::UnaryOperation::kRECIP); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + converter->SetLayerName(layer, node_def, "recip"); + } + params.outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); + } + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("x", TrtInputArg::kTensor)}; + } + + protected: + const OperationMap* pOperMap_; +}; + +class ConvertUnary : public OpConverterBase, + protected ConvertUnaryImpl { + public: + explicit ConvertUnary(const OpConverterParams* params) + : OpConverterBase( + params, + params->node_def.op() == "Sign" + ? std::vector{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8, DT_INT32} + : std::vector{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8}), + ConvertUnaryImpl(UnaryOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertUnaryImpl::InputSpec(); + } + + Status Validate() { return ValidateImpl(*params_, {"Sign", "Round"}); } + Status Convert() { return ConvertImpl(*params_); } +}; + +class ConvertBooleanUnary : public OpConverterBase, + public ConvertUnaryImpl { + public: + explicit ConvertBooleanUnary(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_BOOL}), + ConvertUnaryImpl(UnaryBooleanOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertUnaryImpl::InputSpec(); + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "LogicalNot" + input: "..." + } + */ + return ""; + } + Status Validate() { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + return ValidateImpl(*params_, {"LogicalNot"}); +#else + return errors::Unimplemented("Boolean op: ", params_->node_def.op(), + " is not supported in TRT version < 8.2"); +#endif + } + Status Convert() { return ConvertImpl(*params_); } +}; + +class ConvertActivation : public OpConverterBase, + protected ConvertUnaryImpl { + public: + explicit ConvertActivation(const OpConverterParams* params) + : OpConverterBase(params), + ConvertUnaryImpl(ActivationTypeMap()) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input", TrtInputArg::kTensor)}; + } + + Status Validate() { + TF_RETURN_IF_ERROR(ValidateImpl(*params_)); + const auto& node_def = params_->node_def; + if (node_def.op() == "LeakyRelu") { + return GetNodeAttr(AttrSlice(node_def), "alpha", &alpha_); + } + alpha_ = 1.0f; + return Status::OK(); + } + Status Convert() { + auto* converter = params_->converter; + const auto& inputs = params_->inputs; + const auto& node_def = params_->node_def; + const auto& op = node_def.op(); + const auto op_pair = pOperMap_->find(op); + nvinfer1::IActivationLayer* layer = converter->network()->addActivation( + *inputs.at(0).tensor()->trt_tensor(), op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + converter->SetLayerName(layer, node_def, "activation"); + ITensorProxyPtr output_tensor = layer->getOutput(0); + // Set parameters. 
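+    // (Background: IActivationLayer interprets alpha/beta per activation
+    // type. kCLIP, used here for Relu6, clamps to [alpha, beta]; kSELU
+    // applies the standard SELU alpha/scale constants; kSOFTPLUS computes
+    // alpha * log(exp(beta * x) + 1). The values set below are therefore
+    // chosen to match the TensorFlow definitions of these ops.)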
+ if (op == "Selu") { + // From tensorflow/core/kernels/relu_op_functor.h + alpha_ = 1.7580993408473768599402175208123f; + layer->setBeta(1.0507009873554804934193349852946f); + } else if (op == "Softplus") { + layer->setBeta(1.0f); + } else if (op == "Relu6") { + layer->setBeta(6.0f); + converter->ProvideQuantizationRange(&output_tensor, alpha_ = 0.0f, 6.0f); + } + layer->setAlpha(alpha_); + params_->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } + + private: + float alpha_ = 0.f; +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + GetOperationNames(*UnaryOperationMap())); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), + GetOperationNames(*UnaryBooleanOperationMap())); + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + GetOperationNames(*ActivationTypeMap())); +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc new file mode 100644 index 00000000000..3df027e803f --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc @@ -0,0 +1,370 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +struct VarAttributes { + TensorShapeProto shape_proto; + TensorShape shape; + string name; + DataType dtype; + string shared_name; + string container; +}; + +template +Status ReadVariableHelper(const OpConverterParams* params, + const VarAttributes& attrs, + TRT_ShapedWeights* weights) { + Tensor tensor(attrs.dtype, attrs.shape); + auto ctx = params->converter->context(); + TRT_ENSURE(ctx != nullptr); + auto tensor_flat = tensor.flat(); + + // Clone function library runtime in order to get a mutable library + // definition to add and run a function with the variable operation. + auto lib = ctx->function_library(); + std::unique_ptr lib_def; + std::unique_ptr lib_pflr; + FunctionLibraryRuntime* lib_clone; // Not owned. 
+ TF_RETURN_IF_ERROR(lib->Clone(&lib_def, &lib_pflr, &lib_clone)); + + // Create function definition. + FunctionDef fdef; + std::vector args; + string func_name = attrs.name + "/func"; + if (is_resource) { + // Create input tensor with the resource handle. + const auto& inputs = params->inputs; + const TRT_TensorOrWeights& handle = inputs.at(0); + args.emplace_back(handle.resource()); + + fdef = FunctionDefHelper::Define( + func_name, // Name + {"in: resource"}, // Args + {absl::StrCat("out: ", DataTypeString(attrs.dtype))}, // Returns + {}, // Attr def + // Nodes + {{{attrs.name}, + "ReadVariableOp", + {"in"}, // Name of the Placeholder or VarHandleOp + {{"dtype", attrs.dtype}}}, + {{"out"}, "Identity", {attrs.name}, {{"T", attrs.dtype}}}}); + } else { + fdef = FunctionDefHelper::Define( + func_name, // Name + {}, // Args + {absl::StrCat("out: ", DataTypeString(attrs.dtype))}, // Returns + {}, // Attr def + // Nodes + {{{attrs.name}, + "VariableV2", + {}, + {{"dtype", attrs.dtype}, + {"shape", attrs.shape_proto}, + {"container", attrs.container}, + {"shared_name", attrs.shared_name}}}, + {{"out"}, "Identity", {attrs.name}, {{"T", attrs.dtype}}}}); + } + + // Add function definition to the library. + TF_RETURN_IF_ERROR(lib_def->AddFunctionDef(fdef)); + + // Instantiate function. + FunctionLibraryRuntime::Handle func_handle; + FunctionLibraryRuntime::InstantiateOptions inst_ops; + inst_ops.state_handle = ""; + inst_ops.target = ctx->device()->name(); + AttrValueMap attr_list; + TF_RETURN_IF_ERROR(lib_clone->Instantiate(func_name, AttrSlice(&attr_list), + inst_ops, &func_handle)); + + FunctionLibraryRuntime::Options opts; + opts.rendezvous = ctx->rendezvous(); + opts.cancellation_manager = ctx->cancellation_manager(); + opts.runner = ctx->runner(); + + std::vector* rets = new std::vector(); + std::unique_ptr> outputs_wrapper(rets); + + // Run the new function synchronously. + Status s_dry_run; + Notification done_dry_run; + lib_clone->Run(opts, func_handle, args, rets, + [&s_dry_run, &done_dry_run](const Status& s) { + s_dry_run = s; + done_dry_run.Notify(); + }); + done_dry_run.WaitForNotification(); + TF_RETURN_IF_ERROR(s_dry_run); + TRT_ENSURE(ctx->op_device_context() != nullptr); + TRT_ENSURE(ctx->op_device_context()->stream() != nullptr); + + // Copy tensor. + cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); + + auto ret = cudaMemcpyAsync(tensor_flat.data(), rets->at(0).flat().data(), + rets->at(0).NumElements() * sizeof(T), + cudaMemcpyDeviceToHost, stream); + if (ret != 0) { + return errors::Internal("Could not copy the variable ", attrs.name); + } + cudaStreamSynchronize(stream); + + TF_RETURN_IF_ERROR( + TfTensorToTrtWeights(tensor, params->weight_store, weights)); + + return Status::OK(); +} + +class ConvertVariableV2 : public OpConverterBase { + public: + ConvertVariableV2(const OpConverterParams* params) + : OpConverterBase(params) {} + + static constexpr std::array InputSpec() { return {}; } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "VariableV2" + ... + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + ... + } + */ + return "dtype"; + } + + template + Status ValidateImpl() { + const auto& node_def = params_->node_def; + + // Verify and consume node attributes. 
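+    // (Overview: validation never reads the variable's actual value. The
+    // attributes below only describe its shape/dtype; a zero-filled tensor of
+    // that shape is converted to placeholder TRT weights so that downstream
+    // validation sees an output of the right type, while the real value is
+    // fetched later in Convert() through ReadVariableHelper.)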
+ ::stream_executor::port::StatusOr shape_proto = + GetAttrValue("shape"); + ::stream_executor::port::StatusOr shared_name = + GetAttrValue("shared_name"); + ::stream_executor::port::StatusOr container = + GetAttrValue("container"); + TRT_ENSURE_OK(shape_proto); + TRT_ENSURE_OK(shared_name); + TRT_ENSURE_OK(container); + + attrs_.shape_proto = shape_proto.ValueOrDie(); + attrs_.shape = TensorShape(shape_proto.ValueOrDie()); + attrs_.name = node_def.name(); + attrs_.shared_name = shared_name.ValueOrDie(); + attrs_.container = container.ValueOrDie(); + + Tensor tensor(attrs_.dtype, attrs_.shape); + auto tensor_flat = tensor.flat(); + for (int64_t i = 0; i < tensor_flat.size(); i++) { + tensor_flat(i) = T(0.0f); + } + + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR( + TfTensorToTrtWeights(tensor, params_->weight_store, &weights)); + + // Only push outputs during validation and when outputs are expected. + if (params_->validation_only && params_->outputs != nullptr) { + AddOutput(TRT_TensorOrWeights(weights)); + } + return Status::OK(); + } + + Status Validate() { + const auto& node_def = params_->node_def; + ::stream_executor::port::StatusOr dtype = + GetAttrValue("dtype"); + TRT_ENSURE_OK(dtype); + attrs_.dtype = dtype.ValueOrDie(); + + switch (attrs_.dtype) { + case DT_FLOAT: + return ValidateImpl(); + case DT_HALF: + return ValidateImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + template + Status ConvertImpl() { + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR(ReadVariableHelper(params_, attrs_, &weights)); + AddOutput(TRT_TensorOrWeights(weights)); + return Status::OK(); + } + + Status Convert() { + const auto& node_def = params_->node_def; + + switch (attrs_.dtype) { + case DT_FLOAT: + return ConvertImpl(); + case DT_HALF: + return ConvertImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + private: + VarAttributes attrs_{}; +}; +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + {"VariableV2"}); + +class ConvertReadVariableOp : public OpConverterBase { + public: + ConvertReadVariableOp(const OpConverterParams* params) + : OpConverterBase(params) {} + + static constexpr std::array InputSpec() { + return {InputArgSpec::Create("resource", TrtInputArg::kResource)}; + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + return "dtype"; + } + + template + Status ValidateImpl() { + const auto& node_def = params_->node_def; + + // Verify and consume node attributes. 
+ ::stream_executor::port::StatusOr shape_proto = + GetAttrValue("_shape"); + TRT_ENSURE_OK(shape_proto); + + attrs_.shape_proto = shape_proto.ValueOrDie(); + attrs_.shape = TensorShape(shape_proto.ValueOrDie()); + attrs_.name = node_def.name(); + + Tensor tensor(attrs_.dtype, attrs_.shape); + auto tensor_flat = tensor.flat(); + for (int64_t i = 0; i < tensor_flat.size(); i++) { + tensor_flat(i) = T(0.0f); + } + + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR( + TfTensorToTrtWeights(tensor, params_->weight_store, &weights)); + + // Only push outputs during validation and when outputs are expected. + if (params_->validation_only && params_->outputs != nullptr) { + AddOutput(TRT_TensorOrWeights(weights)); + } + return Status::OK(); + } + + Status Validate() { + const auto& node_def = params_->node_def; + if (params_->use_implicit_batch) { + return errors::Unimplemented("Implicit batch mode not supported, at ", + node_def.name()); + } + + ::stream_executor::port::StatusOr dtype = + GetAttrValue("dtype"); + TRT_ENSURE_OK(dtype); + attrs_.dtype = dtype.ValueOrDie(); + + switch (attrs_.dtype) { + case DT_FLOAT: + return ValidateImpl(); + case DT_HALF: + return ValidateImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + template + Status ConvertImpl() { + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR(ReadVariableHelper(params_, attrs_, &weights)); + AddOutput(TRT_TensorOrWeights(weights)); + return Status::OK(); + } + + Status Convert() { + const auto& node_def = params_->node_def; + + switch (attrs_.dtype) { + case DT_FLOAT: + return ConvertImpl(); + case DT_HALF: + return ConvertImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + private: + VarAttributes attrs_{}; +}; +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), {"ReadVariableOp"}); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc new file mode 100644 index 00000000000..423e5eb6c17 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc @@ -0,0 +1,87 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#include "tensorflow/compiler/tf2tensorrt/convert/timing_cache.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
+#include "tensorflow/core/platform/errors.h"
+#include "third_party/tensorrt/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+::stream_executor::port::StatusOr<TimingCacheRegistry::TimingCachePtr>
+TimingCacheRegistry::LookUp(const string& name,
+                            nvinfer1::IBuilderConfig* builder_config) {
+#if IS_TRT_VERSION_GE(8, 0, 0, 0)
+  TRT_ENSURE(builder_config != nullptr);
+  mutex_lock scoped_lock(mu_);
+  if (map_.find(name) != map_.end()) {
+    const std::vector<uint8_t>& data = map_[name];
+    return std::unique_ptr<TimingCache>(
+        builder_config->createTimingCache(data.data(), data.size()));
+  }
+
+  // If no such timing cache exists, create a new timing cache.
+  return std::unique_ptr<TimingCache>(
+      builder_config->createTimingCache(nullptr, 0));
+#endif  // IS_TRT_VERSION_GE(8, 0, 0, 0)
+  return errors::Unimplemented(
+      "serializable timing cache does not exist in TensorRT versions < 8.0");
+}
+
+void TimingCacheRegistry::Upsert(const string& name, TimingCache* cache) {
+#if IS_TRT_VERSION_GE(8, 0, 0, 0)
+  nvinfer1::IHostMemory* memory = cache->serialize();
+  if (memory == nullptr) {
+    return;
+  }
+
+  if (map_.find(name) == map_.end()) {
+    // If the timing cache with the given name does not exist, emplace the
+    // serialized buffer.
+    std::vector<uint8_t> mem(memory->size());
+    std::copy_n(static_cast<const uint8_t*>(memory->data()), memory->size(),
+                mem.begin());
+    {
+      mutex_lock scoped_lock(mu_);
+      map_.emplace(name, std::move(mem));
+    }
+  } else {
+    // If the timing cache does exist, use the existing buffer.
+    mutex_lock scoped_lock(mu_);
+    std::vector<uint8_t>& mem = map_[name];
+    mem.resize(memory->size());
+    std::copy_n(static_cast<const uint8_t*>(memory->data()), memory->size(),
+                mem.begin());
+  }
+  memory->destroy();
+#endif  // IS_TRT_VERSION_GE(8, 0, 0, 0)
+}
+
+TimingCacheRegistry* GetTimingCacheRegistry() {
+  static TimingCacheRegistry* registry = new TimingCacheRegistry();
+  return registry;
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h
new file mode 100644
index 00000000000..27992dd5fe0
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h
@@ -0,0 +1,70 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// A registry for holding serialized TensorRT autotuner timing caches. +// For TensorRT versions < 8.0, the timing cache is not serializable, so these +// operations become no-ops. +class TimingCacheRegistry { + public: + TimingCacheRegistry() = default; + ~TimingCacheRegistry() = default; + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + using TimingCache = nvinfer1::ITimingCache; + using TimingCachePtr = std::unique_ptr; +#else + struct TimingCache {}; + using TimingCachePtr = std::unique_ptr; +#endif + + // Insert or update a registry into the map using the given name. The cache + // will be serialized before being placed into the map. + void Upsert(const string& name, TimingCache* cache); + + // Find a timing cache using the given name. The provided BuilderConfig is + // used to deserialize the cache. If no timing cache is found, a new timing + // cache is returned. + ::stream_executor::port::StatusOr LookUp( + const string& name, nvinfer1::IBuilderConfig* builder_config); + + private: + using SerializedTimingCache = std::vector; + + mutex mu_; + std::unordered_map map_; +}; + +TimingCacheRegistry* GetTimingCacheRegistry(); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc new file mode 100644 index 00000000000..eda360da7a3 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc @@ -0,0 +1,97 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h"
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/casts.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stacktrace.h"
+
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+using absl::AsciiStrToUpper;
+using absl::StrAppend;
+using absl::StrCat;
+
+TRTLayoutOptimizationPass::TRTLayoutOptimizationPass(const string& name)
+    : name_(name),
+      trt_logger_name_("DefaultLogger"),
+      minimum_segment_size_(3),
+      is_dynamic_op_(false),
+      max_cached_batches_(1),
+      max_workspace_size_bytes_(256LL << 20) {
+  VLOG(1) << "Constructing " << name_;
+}
+
+Status TRTLayoutOptimizationPass::Optimize(grappler::Cluster* cluster,
+                                           const grappler::GrapplerItem& item,
+                                           GraphDef* optimized_graph) {
+  GraphDef modified_graph_def = item.graph;
+
+  // Construct a GrapplerItem using the modified graph_def and the input item.
+  grappler::GrapplerItem grappler_item =
+      item.WithGraph(std::move(modified_graph_def));
+  const GraphDef& graph_def = grappler_item.graph;
+
+  // Convert graphdef to graph.
+  FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library());
+  Graph graph(flib);
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph));
+
+  // Algorithm steps:
+  // 1. We iterate over the graph to find any Conv (or other layout sensitive
+  //    op)
+  // 2. If found, we continue, else we return
+  // 3. We iterate over the nodes and replace the layout-sensitive params
+  // 4. We add Transpose before the inputs and after the outputs
+
+  grappler::GraphProperties static_graph_properties(grappler_item);
+
+  VLOG(2) << "TRTLayoutOptimizationPass: reading nodes...";
+  for (Node* node : graph.nodes()) {
+    VLOG(2) << node->name();
+  }
+
+  // TODO: apply the layout transformation and assign the transformed graph.
+  // For now the pass is a no-op and forwards the graph unchanged.
+  *optimized_graph = graph_def;
+  return Status::OK();
+}
+
+Status TRTLayoutOptimizationPass::Init(
+    const RewriterConfig_CustomGraphOptimizer* config) {
+  VLOG(1) << "Do nothing for now";
+  return Status::OK();
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h
new file mode 100644
index 00000000000..e91b3cd8e5f
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h
@@ -0,0 +1,69 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/platform/logging.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#if !IS_TRT_VERSION_GE(7, 0, 0, 0) +#error From version 2.6, we only support NVIDIA TensorRT version 7 or newer. +#error Please update your environment and relaunch the compilation. +#endif + +namespace tensorflow { +namespace tensorrt { +namespace convert { +class TRTLayoutOptimizationPass : public grappler::CustomGraphOptimizer { + public: + TRTLayoutOptimizationPass(const string& name = "TRTLayoutOptimizationPass"); + + string name() const override { return name_; }; + + bool UsesFunctionLibrary() const override { return true; } + + Status Init( + const RewriterConfig_CustomGraphOptimizer* config = nullptr) override; + + Status Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* optimized_graph) override; + + /* void PrintDebugInfo(grappler::Cluster* cluster, + const grappler::GrapplerItem& item); + */ + + private: + const string name_; + string trt_logger_name_; + int minimum_segment_size_; + bool is_dynamic_op_; + int max_cached_batches_; + int64_t max_workspace_size_bytes_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 35a8c6340f8..3ee9e5d98e1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -14,221 +14,203 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h" +#include + #include "absl/strings/ascii.h" #include "absl/strings/escaping.h" +#include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" +#include "tensorflow/core/grappler/utils/functions.h" +#include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stacktrace.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -// TODO(sami): Remove VLOG messages once the code matures using absl::AsciiStrToUpper; using absl::StrAppend; using absl::StrCat; +namespace { + +bool ShouldUseExplicitPrecision(const GraphDef& gdef) { + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + return false; + } + return absl::c_any_of(gdef.node(), [](const auto& node) { + return (absl::c_find(kExplicitQuantizationOpNames, node.op()) != + kExplicitQuantizationOpNames.end()); + }); +} + +::stream_executor::port::StatusOr ShouldConvertFunction( + const grappler::GrapplerItem& item) { + if (item.id == "tf_graph") { + return false; + } + const auto& func_item = + static_cast(item); + const AttrSlice& attr = func_item.func_attr(); + const AttrValue* attr_value = attr.Find("_tftrt_convert_function"); + if (attr_value != nullptr) { + bool result = false; + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_convert_function", &result)); + return result; + } + VLOG(1) << "Attribute _tftrt_convert_function was not found."; + return false; +} + +// Converts function conversion attributes to conversion parameters. 
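+// (Note: the `_tftrt_*` attributes read below are assumed to be attached to
+// the function by the TF-TRT converter when per-function conversion is
+// requested; each attribute overrides the corresponding ConversionParams
+// field for that function only.)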
+Status UpdateFunctionSpecificConversionParams( + TRTOptimizationPass::ConversionParams& cp, + const tensorflow::AttrSlice& attr) { + auto get_size_attr = [](const AttrSlice& attr, absl::string_view name, + size_t* dst) -> Status { + int tmp = 0; + TF_RETURN_IF_ERROR(GetNodeAttr(attr, name, &tmp)); + *dst = static_cast(tmp); + return Status::OK(); + }; + + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_trt_logger_name", &cp.trt_logger_name)); + TF_RETURN_IF_ERROR( + get_size_attr(attr, "_tftrt_max_batch_size", &cp.max_batch_size)); + TF_RETURN_IF_ERROR(get_size_attr(attr, "_tftrt_max_workspace_size_bytes", + &cp.max_workspace_size_bytes)); + std::string precision_mode; + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_precision_mode", &precision_mode)); + TF_RETURN_IF_ERROR( + TrtPrecisionModeFromName(precision_mode, &cp.precision_mode)); + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_minimum_segment_size", + &cp.minimum_segment_size)); + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_is_dyn_op", &cp.is_dynamic_op)); + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_max_cached_engines", &cp.max_cached_engines)); + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_use_calibration", &cp.use_calibration)); + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_use_implicit_batch", &cp.use_implicit_batch)); + std::string profile_strategy; + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_profile_strategy", &profile_strategy)); + TF_RETURN_IF_ERROR( + ProfileStrategyFromName(profile_strategy, &cp.profile_strategy)); + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_allow_build_at_runtime", + &cp.allow_build_at_runtime)); + return Status::OK(); +} +} // namespace + Status TRTOptimizationPass::Init( const RewriterConfig_CustomGraphOptimizer* config) { - VLOG(1) << "Called INIT for " << name_ << " with config = " << config; if (config == nullptr) { return Status::OK(); } const auto params = config->parameter_map(); if (params.count("minimum_segment_size")) { - minimum_segment_size_ = params.at("minimum_segment_size").i(); + params_.minimum_segment_size = params.at("minimum_segment_size").i(); } if (params.count("max_batch_size")) { - maximum_batch_size_ = params.at("max_batch_size").i(); + params_.max_batch_size = params.at("max_batch_size").i(); } if (params.count("is_dynamic_op")) { - is_dynamic_op_ = params.at("is_dynamic_op").b(); + params_.is_dynamic_op = params.at("is_dynamic_op").b(); } if (params.count("maximum_cached_engines")) { - max_cached_batches_ = params.at("maximum_cached_engines").i(); + params_.max_cached_engines = params.at("maximum_cached_engines").i(); } if (params.count("max_workspace_size_bytes")) { - max_workspace_size_bytes_ = params.at("max_workspace_size_bytes").i(); + params_.max_workspace_size_bytes = + params.at("max_workspace_size_bytes").i(); } if (params.count("precision_mode")) { TF_RETURN_IF_ERROR(TrtPrecisionModeFromName( - AsciiStrToUpper(params.at("precision_mode").s()), &precision_mode_)); + AsciiStrToUpper(params.at("precision_mode").s()), + ¶ms_.precision_mode)); } if (params.count("use_calibration")) { - use_calibration_ = params.at("use_calibration").b(); - } - return Status::OK(); -} - -void TRTOptimizationPass::PrintDebugInfo(grappler::Cluster* cluster, - const grappler::GrapplerItem& item) { - LOG(INFO) << "Cluster = " << cluster; - string offset(" "); - string offset2 = StrCat(offset, offset); - string offset3 = StrCat(offset2, offset); - string offset4 = StrCat(offset2, offset2); - if (cluster) { - LOG(INFO) << offset << "type = " << cluster->type(); - 
LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps(); - const auto dev_names = cluster->GetDeviceNames(); - if (!dev_names.empty()) { - LOG(INFO) << offset << " Device names:"; - for (const auto s : dev_names) { - LOG(INFO) << offset2 << s; - } - } - std::unordered_map peak_mem; - auto status = cluster->GetPeakMemoryUsage(&peak_mem); - if (status == Status::OK()) { - LOG(INFO) << offset << "Peak Memory Usage :"; - for (auto s : peak_mem) { - LOG(INFO) << offset2 << s.first << " = " << s.second; - } - } - - const auto dev_props = cluster->GetDevices(); - if (!dev_props.empty()) { - LOG(INFO) << offset << "Device properties:"; - for (auto k : dev_props) { - LOG(INFO) << offset2 << k.first; - const auto& dt = k.second; - LOG(INFO) << offset3 << "type = " << dt.type(); - LOG(INFO) << offset3 << "vendor = " << dt.vendor(); - LOG(INFO) << offset3 << "model = " << dt.model(); - LOG(INFO) << offset3 << "frequency = " << dt.frequency(); - LOG(INFO) << offset3 << "num cores = " << dt.num_cores(); - LOG(INFO) << offset3 << "num registers = " << dt.num_registers(); - LOG(INFO) << offset3 << "L1 cache size = " << dt.l1_cache_size(); - LOG(INFO) << offset3 << "L2 cache size = " << dt.l2_cache_size(); - LOG(INFO) << offset3 << "L3 cache size = " << dt.l3_cache_size(); - LOG(INFO) << offset3 << "SHMem per SMP = " - << dt.shared_memory_size_per_multiprocessor(); - LOG(INFO) << offset3 << "memory size = " << dt.memory_size(); - LOG(INFO) << offset3 << "bandwidth = " << dt.bandwidth(); - if (dt.environment_size()) { - LOG(INFO) << offset3 << "environment :"; - for (const auto e : dt.environment()) { - LOG(INFO) << offset4 << e.first << " = " << e.second; - } - } - } - } + params_.use_calibration = params.at("use_calibration").b(); } - LOG(INFO) << "item: " << item.id; - if (!item.feed.empty()) { - LOG(INFO) << offset << "Feeds :"; - for (const auto& f : item.feed) { - const auto& shape = f.second.shape(); - LOG(INFO) << offset2 << f.first << " = shaped " << shape.DebugString(); - } - } else { - LOG(INFO) << offset << "No Feeds"; - } - if (!item.fetch.empty()) { - LOG(INFO) << offset << "Fetches :"; - for (const auto& f : item.fetch) { - LOG(INFO) << offset2 << f; - } - } else { - LOG(INFO) << offset << "No Fetches"; + if (params.count("trt_logger")) { + params_.trt_logger_name = params.at("trt_logger").s(); } - - if (!item.init_ops.empty()) { - LOG(INFO) << offset << "init ops :"; - for (const auto& f : item.init_ops) { - LOG(INFO) << offset2 << f; - } - } else { - LOG(INFO) << offset << "No init ops"; + if (params.count("allow_build_at_runtime")) { + params_.allow_build_at_runtime = params.at("allow_build_at_runtime").b(); } - LOG(INFO) << "Save Op = " << item.save_op; - LOG(INFO) << "Restore Op = " << item.restore_op; - LOG(INFO) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor; - if (!item.keep_ops.empty()) { - LOG(INFO) << offset << "keep ops :"; - for (const auto& f : item.keep_ops) { - LOG(INFO) << offset2 << f; - } - } else { - LOG(INFO) << offset << "No keep ops"; + if (params.count("use_implicit_batch")) { + params_.use_implicit_batch = params.at("use_implicit_batch").b(); } - for (const auto dev : cluster->GetDeviceSet()->devices()) { - const auto& pname = dev->parsed_name(); - LOG(INFO) << "Device name= " << dev->name() - << " parsedname job= " << pname.job << " id= " << pname.id - << " has_id: " << pname.has_id << " has_job: " << pname.has_job - << "has_type: " << pname.has_type << " type =" << pname.type; + if (params.count("profile_strategy")) { + 
TF_RETURN_IF_ERROR(ProfileStrategyFromName( + params.at("profile_strategy").s(), &params_.profile_strategy)); } + return Status::OK(); +} + +static bool ExplicitPrecisionModePolicy() { + return IS_TRT_VERSION_GE(8, 0, 0, 0); } Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, const grappler::GrapplerItem& item, GraphDef* optimized_graph) { - VLOG(1) << "Called TRTOptimization Pass " << name_; - // This is a hack to workaround optimizer issue. MetaOptimizer calls - // optimization passes on function objects as well, we should not modify - // generated funcdefs! This is fragile but we don't have any other option - // until framework fixes it. - if (item.id != "tf_graph") { - LOG(WARNING) << name_ - << " is probably called on funcdef! This optimizer must *NOT* " "be called on function objects."; + VLOG(1) << "Called TRTOptimization Pass " << name_ + << " on a grappler item with id=" << item.id; + // TF_ASSIGN_OR_RETURN(bool do_function_conversion, + // ShouldConvertFunction(item)); + // Optimizing the main graph (identified with `item.id == "tf_graph"`) with + // `minimum_segment_size == -1` indicates skipping main graph conversion. + if ((params_.minimum_segment_size == -1 && item.id == "tf_graph") || + (item.id != "tf_graph")) { + VLOG(1) << "Not optimizing this grappler item: " << item.id; *optimized_graph = item.graph; return Status::OK(); } - if (VLOG_IS_ON(3)) { - LOG(INFO) << CurrentStackTrace(); - PrintDebugInfo(cluster, item); + + if (params_.use_calibration && + params_.precision_mode != TrtPrecisionMode::INT8) { + LOG(WARNING) << "Calibration with FP32 or FP16 is not implemented. " + << "Falling back to use_calibration = False. " + << "Note that the default value of use_calibration is True."; + params_.use_calibration = false; } - if (!is_dynamic_op_) { - int max_batch_dim = -1; - if (!item.feed.empty()) { - for (const auto& f : item.feed) { - const auto& shape = f.second.shape(); - if (shape.dims() > 0) { - if (shape.dim_size(0) > max_batch_dim) - max_batch_dim = shape.dim_size(0); - VLOG(2) << "Setting max_batch_dim to " << max_batch_dim - << " using batch dimension of " << f.first << " with shape " - << shape; - } - } - } - if (max_batch_dim > maximum_batch_size_) { - return errors::InvalidArgument( - "Specified max_batch_size=", maximum_batch_size_, - " is less than maximum batch dimension of inputs (", max_batch_dim, - "). ", "To continue, set max_batch_size to >= ", max_batch_dim); - } else if (max_batch_dim < maximum_batch_size_) { - LOG(INFO) << "Specified max_batch_size=" << maximum_batch_size_ - << " is larger than maximum batch dimension of inputs (" - << max_batch_dim << "). " - << "This can result in poor performance."; + + params_.use_explicit_precision = ShouldUseExplicitPrecision(item.graph); + if (params_.use_explicit_precision) { + LOG(INFO) << "[TF-TRT] Using explicit QDQ mode"; + if (params_.precision_mode != TrtPrecisionMode::INT8 || + params_.use_calibration) { + LOG(WARNING) + << "Explicit precision mode with calibration or FP32/FP16 mode is " "not supported." + << " Setting precision mode to INT8 and calibration to false."; + params_.precision_mode = TrtPrecisionMode::INT8; + params_.use_calibration = false; } } - grappler::GraphProperties static_graph_properties(item); - TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - ConversionParams cp; - if (use_calibration_ && precision_mode_ != TrtPrecisionMode::INT8) { - VLOG(1) << "Calibration with FP32 or FP16 is not implemented. " - << "Falling back to use_calibration = False."
- << "Note that the default value of use_calibration is True."; - use_calibration_ = false; - } + // Create a copy of the graph to optimize. + grappler::GrapplerItem optimized_item(item); std::vector nodes_to_preserve; - for (const auto& n : item.NodesToPreserve()) { + const auto& old_nodes_to_preserve = item.NodesToPreserve(); + nodes_to_preserve.reserve(old_nodes_to_preserve.size()); + for (const auto& n : old_nodes_to_preserve) { auto tokens = str_util::Split(n, ":"); string s = tokens.at(0); for (int i = 1; i < tokens.size() - 1; ++i) { @@ -243,21 +225,16 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, } nodes_to_preserve.push_back(s); } - cp.input_graph_def = &item.graph; - cp.output_names = &nodes_to_preserve; - cp.max_batch_size = maximum_batch_size_; - cp.max_workspace_size_bytes = max_workspace_size_bytes_; - cp.output_graph_def = optimized_graph; - cp.precision_mode = precision_mode_; - cp.minimum_segment_size = minimum_segment_size_; - cp.graph_properties = &static_graph_properties; - cp.cluster = cluster; - cp.is_dyn_op = is_dynamic_op_; - cp.max_cached_engines = max_cached_batches_; - cp.use_calibration = use_calibration_; - auto status = ConvertAfterShapes(cp); - VLOG(1) << "Returning from " << name_; - return status; + + if (item.id != "tf_graph") { + const grappler::GrapplerFunctionItem& func_item = + static_cast(item); + TF_RETURN_IF_ERROR( + UpdateFunctionSpecificConversionParams(params_, func_item.func_attr())); + } + + return ConvertGraph(params_, optimized_item, nodes_to_preserve, cluster, + optimized_graph); } void TRTOptimizationPass::Feedback(grappler::Cluster* cluster, @@ -289,5 +266,4 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar( } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index 35a92341ee9..0976dd157d8 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -16,15 +16,23 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ +#include #include +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#if !IS_TRT_VERSION_GE(7, 0, 0, 0) +#error From version 2.6, we only support NVIDIA TensorRT version 7 or newer. +#error Please update your environment and relaunch the compilation. 
+#endif namespace tensorflow { namespace tensorrt { @@ -32,17 +40,25 @@ namespace convert { class TRTOptimizationPass : public grappler::CustomGraphOptimizer { public: + struct ConversionParams { + string trt_logger_name = "DefaultLogger"; + size_t max_batch_size = -1; + size_t max_workspace_size_bytes = 1 << 30; + TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; + int minimum_segment_size = 3; + // Whether to create engine on conversion or execution time + bool is_dynamic_op = false; + // maximum number of cached engines + int max_cached_engines = 1; + bool use_calibration = true; + bool use_implicit_batch = true; + ProfileStrategy profile_strategy = ProfileStrategy::kRange; + bool allow_build_at_runtime = true; + bool use_explicit_precision = false; + }; + TRTOptimizationPass(const string& name = "TRTOptimizationPass") - : name_(name), - minimum_segment_size_(3), - precision_mode_(TrtPrecisionMode::FP32), - maximum_batch_size_(-1), - is_dynamic_op_(false), - max_cached_batches_(1), - max_workspace_size_bytes_(256LL << 20), - use_calibration_(true) { - VLOG(1) << "Constructing " << name_; - } + : name_(name) {} string name() const override { return name_; }; @@ -58,26 +74,17 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { void Feedback(grappler::Cluster* cluster, const grappler::GrapplerItem& item, const GraphDef& optimized_graph, double result) override; - void PrintDebugInfo(grappler::Cluster* cluster, - const grappler::GrapplerItem& item); - private: const string name_; - int minimum_segment_size_; - TrtPrecisionMode precision_mode_; - int maximum_batch_size_; - bool is_dynamic_op_; - std::vector batches_; - int max_cached_batches_; - int64_t max_workspace_size_bytes_; - bool use_calibration_; + ConversionParams params_; + + std::vector batches_; }; } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc new file mode 100644 index 00000000000..c85d119cc81 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace tensorrt { + +Status TrtPrecisionModeToName(const TrtPrecisionMode mode, string* name) { + const char* kUnknown = "UNKNOWN"; + *name = *kUnknown; + switch (mode) { + case TrtPrecisionMode::FP32: + *name = "FP32"; + break; + case TrtPrecisionMode::FP16: + *name = "FP16"; + break; + case TrtPrecisionMode::INT8: + *name = "INT8"; + break; + } + if (name->compare(kUnknown) == 0) + return errors::OutOfRange("Unknown precision mode"); + return Status::OK(); +} + +Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode) { + if (name == "FP32") { + *mode = TrtPrecisionMode::FP32; + } else if (name == "FP16") { + *mode = TrtPrecisionMode::FP16; + } else if (name == "INT8") { + *mode = TrtPrecisionMode::INT8; + } else { + return errors::InvalidArgument("Invalid precision mode name: ", name); + } + return Status::OK(); +} + +string DebugString(const TrtPrecisionMode mode) { + string mode_str; + TF_CHECK_OK(TrtPrecisionModeToName(mode, &mode_str)); + return absl::StrCat("TrtPrecisionMode::", mode_str); +} + +string ProfileStrategyToName(const ProfileStrategy strategy) { + switch (strategy) { + case ProfileStrategy::kRange: + return "Range"; + case ProfileStrategy::kOptimal: + return "Optimal"; + case ProfileStrategy::kRangeOptimal: + return "Range+Optimal"; + case ProfileStrategy::kImplicitBatchModeCompatible: + return "ImplicitBatchModeCompatible"; + } + return "Unknown"; +} + +Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy) { + string name_lowercase(name); + std::transform(name.begin(), name.end(), name_lowercase.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (name_lowercase == "range") { + *strategy = ProfileStrategy::kRange; + } else if (name_lowercase == "optimal") { + *strategy = ProfileStrategy::kOptimal; + } else if (name_lowercase == "range+optimal") { + *strategy = ProfileStrategy::kRangeOptimal; + } else if (name_lowercase == "implicitbatchmodecompatible") { + *strategy = ProfileStrategy::kImplicitBatchModeCompatible; + } else { + return errors::InvalidArgument("Invalid profile strategy: ", name); + } + return Status::OK(); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h new file mode 100644 index 00000000000..3f44bb5f199 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h @@ -0,0 +1,72 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tensorrt { + +// The PrecisionMode controls the precision used in TRT converted parts of the +// model. Setting PrecisionMode other than FP32 enables TensorRT to select +// lower-precision implementations when searching for the fastest kernels. +// +// For regularized models whose input dynamic range is approximately one, this +// typically produces significant speedups with negligible change in accuracy. +// There is additional complexity when working with INT8, see Calibration. +// +// - FP32 +// - FP16 Enable FP16 layer selection, with FP32 fallback. +// - INT8 Enable Int8 layer selection, with FP32 and FP16 fallback. +// +// Note that TensorRT will still choose a higher-precision kernel if it results +// in overall lower runtime, or if no low-precision implementation exists. +enum class TrtPrecisionMode { FP32, FP16, INT8 }; + +Status TrtPrecisionModeToName(const TrtPrecisionMode mode, string* name); + +Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode); + +string DebugString(const TrtPrecisionMode mode); + +// Optimization profile generation strategies. +// - `kRange`: create one profile that works for inputs with dimension values +// in the range of [min_dims, max_dims] where min_dims and max_dims are +// derived from the provided inputs. +// - `kOptimal`: create one profile for each input. The profile only works for +// inputs with the same dimensions as the input it is created for. The GPU +// engine will be run with optimal performance with such inputs. +// - `kRangeOptimal`: create the profiles for both `Range` and `Optimal`. +// - `kImplicitBatchModeCompatible`: create the profiles that will produce the +// same GPU engines as the implicit_batch_mode would produce. +enum class ProfileStrategy { + kRange, + kOptimal, + kRangeOptimal, + kImplicitBatchModeCompatible, +}; + +string ProfileStrategyToName(const ProfileStrategy strategy); +Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index ca21c193d63..18bfb2997df 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -15,41 +15,266 @@ limitations under the License. 
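Editor's note: a minimal usage sketch of the trt_parameters helpers declared above (illustrative only, not part of the patch; the wrapper function name and the literal strings are made up for the example):

#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace tensorrt {

// Hypothetical helper showing how the string <-> enum conversions compose.
Status ParseTrtParamsExample() {
  TrtPrecisionMode precision;
  // TrtPrecisionModeFromName expects the upper-case spellings "FP32", "FP16"
  // or "INT8"; TRTOptimizationPass::Init applies AsciiStrToUpper first.
  TF_RETURN_IF_ERROR(TrtPrecisionModeFromName("INT8", &precision));

  ProfileStrategy strategy;
  // ProfileStrategyFromName lower-cases its input, so matching is
  // case-insensitive ("Range+Optimal" and "range+optimal" both work).
  TF_RETURN_IF_ERROR(ProfileStrategyFromName("range+optimal", &strategy));

  // Logs "TrtPrecisionMode::INT8 / Range+Optimal".
  VLOG(1) << DebugString(precision) << " / " << ProfileStrategyToName(strategy);
  return Status::OK();
}

}  // namespace tensorrt
}  // namespace tensorflow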
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "absl/strings/ascii.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { -Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name) { - switch (mode) { - case TrtPrecisionMode::FP32: - *name = "FP32"; +string DebugString(const nvinfer1::Dims& dims) { + string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d="); + for (int i = 0; i < std::max(dims.nbDims, 0); ++i) { + StrAppend(&out, dims.d[i]); + StrAppend(&out, ","); + } + StrAppend(&out, ")"); + return out; +} + +string DebugString(const DataType tf_type) { + switch (tf_type) { + case DT_FLOAT: + return "DT_FLOAT"; + case DT_HALF: + return "DT_HALF"; + case DT_INT32: + return "DT_INT32"; + case DT_INT8: + return "DT_INT8"; + case DT_BOOL: + return "DT_BOOL"; + case DT_UINT8: + return "DT_UINT8"; + default: + return "Unknow TF DataType"; + } +} + +string DebugString(const nvinfer1::DataType trt_dtype) { + switch (trt_dtype) { + case nvinfer1::DataType::kFLOAT: + return "kFLOAT"; + case nvinfer1::DataType::kHALF: + return "kHALF"; + case nvinfer1::DataType::kINT8: + return "kINT8"; + case nvinfer1::DataType::kINT32: + return "kINT32"; + case nvinfer1::DataType::kBOOL: + return "kBOOL"; +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + return "kUINT8"; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + return "kFP8"; +#endif + default: + return "Invalid TRT data type"; + } +} + +string DebugString(const nvinfer1::Permutation& permutation, int len) { + string out = "nvinfer1::Permutation("; + for (int i = 0; i < len; ++i) { + StrAppend(&out, permutation.order[i], ","); + } + StrAppend(&out, ")"); + return out; +} + +string DebugString(const ITensorProxyPtr& tensor) { + return StrCat( + tensor->is_trt_tensor() ? "nvinfer1::ITensor(@" : "SimpleItensor(@", + reinterpret_cast(&tensor), ", name=", tensor->getName(), + ", dtype=", DebugString(tensor->getType()), + ", dims=", DebugString(tensor->getDimensions()), ")"); +} + +string DebugString(const nvinfer1::ITensor& tensor) { + return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), + ", name=", tensor.getName(), + ", dtype=", DebugString(tensor.getType()), + ", dims=", DebugString(tensor.getDimensions()), ")"); +} + +string DebugString(const std::vector& dimvec) { + return absl::StrCat("[", + absl::StrJoin(dimvec, ",", + [](std::string* out, nvinfer1::Dims in) { + out->append(DebugString(in)); + }), + "]"); +} + +string DebugString(const std::vector& shapes) { + return TensorShapeUtils::ShapeListString(shapes); +} + +string DebugString(const std::vector& shapes) { + return PartialTensorShapeUtils::PartialShapeListString(shapes); +} + +// Checks whether actual_shapes are compatible with cached_shapes. This should +// only be used in implicit batch mode (in explicit batch mode one needs to +// check the profile ranges). Therefore implicit batch mode is assumed. +// It is also assumed that both actual_shapes and cached_shapes have been +// verified by TRTEngineOp::VerifyInputShapes, which ensures that the batch size +// for all tensors are the same. +bool AreShapesCompatible(const std::vector& actual_shapes, + const std::vector& cached_shapes) { + auto match_shape = [](const TensorShape& actual_shape, + const TensorShape& cached_shape) { + // Match the rank. 
+ if (actual_shape.dims() != cached_shape.dims()) return false; + // Match the batch size. In implicit batch mode cached_shape.dim_size(0) is + // the max batch size, which can be larger than the actual batch size. + if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false; + // Match remaining dimensions. + for (int i = 1; i < actual_shape.dims(); ++i) { + if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false; + } + return true; + }; + for (int i = 0; i < actual_shapes.size(); ++i) { + if (!match_shape(actual_shapes[i], cached_shapes[i])) { + return false; + } + } + return true; +} +Status GetNetworkInputShapes(const nvinfer1::INetworkDefinition* network, + std::vector* input_shapes) { + const int n_inputs = network->getNbInputs(); + input_shapes->resize(n_inputs); + for (int i = 0; i < n_inputs; i++) { + const ITensorProxyPtr input = network->getInput(i); + TF_RETURN_IF_ERROR(DimsAdapter(input->getDimensions()) + .PartialTensorShape(&input_shapes->at(i))); + } + return Status::OK(); +} + +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; break; - case TrtPrecisionMode::FP16: - *name = "FP16"; +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + case DT_BOOL: + *trt_type = nvinfer1::DataType::kBOOL; break; - case TrtPrecisionMode::INT8: - *name = "INT8"; +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case DT_UINT8: + *trt_type = nvinfer1::DataType::kUINT8; break; +#endif default: - return errors::OutOfRange("Unknown precision mode"); + return errors::InvalidArgument("Unsupported tensorflow data type ", + DataTypeString(tf_type)); } return Status::OK(); } -Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode) { - if (name == "FP32") { - *mode = TrtPrecisionMode::FP32; - } else if (name == "FP16") { - *mode = TrtPrecisionMode::FP16; - } else if (name == "INT8") { - *mode = TrtPrecisionMode::INT8; - } else { - return errors::InvalidArgument("Invalid precision mode name: ", name); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + case nvinfer1::DataType::kBOOL: + *tf_type = DT_BOOL; + break; +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + *tf_type = DT_UINT8; + break; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + *tf_type = DT_FLOAT8_E4M3FN; + break; +#endif + default: + return errors::InvalidArgument("Invalid TRT data type"); } return Status::OK(); } +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { + int n_bindings = engine->getNbBindings(); + int n_input = 0; + for (int i = 0; i < n_bindings; i++) { + if (engine->bindingIsInput(i)) n_input++; + } + // According to TensorRT 7 doc: "If the engine has been built for K profiles, + // the first getNbBindings() / K bindings are used by profile number 0, the + // following getNbBindings() / K bindings are used by profile number 1 etc." + // Therefore, to get the number of input tensors, we need to divide by the + // the number of profiles. 
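// Editorial note (not part of the patch), a worked example of the binding
// arithmetic described above: an engine built with 2 optimization profiles for
// a network with 2 inputs and 1 output exposes 2 * (2 + 1) = 6 bindings; the
// loop above counts 2 * 2 = 4 input bindings, and dividing by the 2 profiles
// below recovers the 2 network inputs.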
+ int n_profiles = engine->getNbOptimizationProfiles(); + return n_input / n_profiles; +} + +absl::string_view GetDeviceName(const Node* node) { + if (node->has_assigned_device_name()) { + return node->assigned_device_name(); + } + return node->requested_device(); +} + +absl::optional GetDeviceParsedName( + const Node* node) { + absl::string_view device_name = GetDeviceName(node); + DeviceNameUtils::ParsedName parsed_name; + if (!DeviceNameUtils::ParseFullName(device_name, &parsed_name)) { + return absl::nullopt; + } + return parsed_name; +} + +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, + const DeviceNameUtils::ParsedName& b) { + DeviceNameUtils::ParsedName merged_name = a; + if (!DeviceNameUtils::MergeDevNames(&merged_name, b, + /*allow_soft_placement=*/false) + .ok()) { + return absl::nullopt; + } + return merged_name; +} + +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, absl::string_view b) { + DeviceNameUtils::ParsedName b_parsed_name; + if (!DeviceNameUtils::ParseFullName(b, &b_parsed_name)) { + return absl::nullopt; + } + + return MergeIfCompatible(a, b_parsed_name); +} + } // namespace tensorrt } // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index eb60829d31d..cd701ed0066 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -16,36 +16,387 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ +#include +#include #include +#include +#include +#include "absl/algorithm/container.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +#define TFTRT_ERROR(func, ...) \ + do { \ + return func("TFTRT::", __FUNCTION__, ":", __LINE__, ": ", __VA_ARGS__); \ + } while (0) + +#define TFTRT_CHECK_SHAPE_TENSOR(tensor) \ + if (!IsTrtShapeTensorCompatible(tensor)) { \ + TFTRT_ERROR(errors::InvalidArgument, "Tensor of type ", \ + DebugString(tensor.dtype()), " having shape ", \ + tensor.shape().DebugString(), " is not TRT compatible"); \ + } namespace tensorflow { namespace tensorrt { -class IONamePrefixes { - public: - static constexpr const char* const kInputPHName = "TensorRTInputPH_"; - static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; -}; +static constexpr char kCastOutputTypeAttrName[] = "DstT"; +#if !IS_TRT_VERSION_GE(8, 2, 0, 0) template struct TrtDestroyer { void operator()(T* t) { if (t) t->destroy(); } }; - template using TrtUniquePtrType = std::unique_ptr>; +#else +template +using TrtUniquePtrType = std::unique_ptr; +#endif + +// Define a hash function for vector because it is used as the key +// for the engine cache. 
+struct VectorTensorShapeHasher { + std::size_t operator()(const std::vector& key) const { + return std::hash()(TensorShapeUtils::ShapeListString(key)); + } +}; + +using absl::StrAppend; +using absl::StrCat; + +// This utility template converts an arithmetic type to a string. This function +// is necessary to allow the following function to behave recursively: +// `string DebugString(const std::vector&)`. +template ::value, CType>::type> +string DebugString(const CType& el) { + string el_str = std::to_string(el); + // Prettify std::to_string which can sometimes returns 1.50000 instead of 1.5. + // In short it removes trailing 0s in a string-formatted number. + el_str.erase(el_str.find_last_not_of('0') + 1, std::string::npos); + return el_str; +} +// This utility template converts nested vectors to a string for debug purposes. +template +string DebugString(const std::vector& vector) { + string tmp_s = ""; + for (const auto el : vector) { + StrAppend(&tmp_s, StrCat(DebugString(el), ", ")); + } + return StrCat("{", tmp_s.substr(0, tmp_s.length() - 2), "}"); +} +string DebugString(const nvinfer1::Dims& dims); +string DebugString(const nvinfer1::DataType trt_dtype); +string DebugString(const DataType tf_type); +string DebugString(const nvinfer1::Permutation& permutation, int len); +string DebugString(const ITensorProxyPtr& tensor); +string DebugString(const nvinfer1::ITensor& tensor); +string DebugString(const std::vector& dimvec); +string DebugString(const std::vector& shapes); +string DebugString(const std::vector& shapes); + +template +string DebugString(const absl::InlinedVector& data) { + return absl::StrCat("[", absl::StrJoin(data, ","), "]"); +} + +inline bool HasStaticShape(const nvinfer1::Dims& dims) { + if (dims.nbDims < 0) return false; + for (int d = 0; d < dims.nbDims; ++d) { + if (dims.d[d] < 0) return false; + } + return true; +} + +template +bool HasStaticShape(const T& dims) { + return !absl::c_any_of(dims, [](int i) { return i < 0; }); +} + +// Returns whether a shape is compatible with a TRT shape tensor. +template +inline bool IsTrtShapeTensorCompatible(const TensorShapeType& shape) { + return ( + shape.dims() == 0 || + (shape.dims() == 1 && shape.num_elements() <= nvinfer1::Dims::MAX_DIMS)); +} + +// Returns whether a TF tensor could be interpreted as a TRT shape tensor. +inline bool IsTrtShapeTensorCompatible(const Tensor& tensor) { + return tensor.dtype() == DT_INT32 && + IsTrtShapeTensorCompatible(tensor.shape()); +} + +// Adapts various representations of shape (TF Shape, TRT Dims, plain +// containers) and provides methods for properties (length, volume) and +// conversion between types. Note that unlike TF's TensorShape, the underlying +// storage will only contain active dimensions. In the case of scalar shapes, +// `NumDims` is allowed to return 0 or 1, but the `storage_` vector will contain +// 1 element in both cases. In the non-scalar case, `NumDims() == +// storage_.size()`. +class DimsAdapter { + public: + using StorageType = absl::InlinedVector; + + private: + template + using EnableIfNotTensorShapeType = + std::enable_if_t, T>::value>; + + template + using EnableIfInt = std::enable_if_t::value && + std::is_integral::value>; + + public: + //----- Constructors ------ + + // Constructs from an absl::Span. + template + explicit DimsAdapter(absl::Span shape) + : num_dims_(static_cast(shape.size())) { + absl::c_copy(shape, std::back_inserter(storage_)); + } + + // Constructs from an absl::Span. 
+ template + explicit DimsAdapter(const std::vector& shape) + : num_dims_(static_cast(shape.size())) { + absl::c_copy(shape, std::back_inserter(storage_)); + } + + // Constructs from a TRT dims object. + DimsAdapter(const nvinfer1::Dims& dims) : num_dims_(dims.nbDims) { + absl::c_copy(absl::MakeSpan(dims.d, dims.d + std::max(dims.nbDims, 0)), + std::back_inserter(storage_)); + } + + // Constructs explicitly specifing num_dims and storage data. + DimsAdapter(int32_t num_dims, StorageType data) + : num_dims_(num_dims), storage_(std::forward(data)) {} + + // Constructs from a TensorShape or PartialTensorShape. + template + static ::stream_executor::port::StatusOr Create( + const TensorShapeBase& shape, bool ignore_first_dim = false) { + if (shape.dims() > nvinfer1::Dims::MAX_DIMS) + return errors::InvalidArgument("dims of TensorShape exceed MAX_DIMS"); + if (ignore_first_dim && shape.dims() <= 0) + return errors::InvalidArgument( + "removing first dim requires explicit batch dimension"); + if (shape.dims() == -1) { + return DimsAdapter(-1, StorageType{}); + } + if (shape.dims() == 0) { + return DimsAdapter(0, StorageType{1}); + } + auto offt = (ignore_first_dim ? 1 : 0); + return DimsAdapter( + absl::MakeSpan(shape.dim_sizes().begin() + offt, shape.dims() - offt)); + } + + // Constructs from a container. + template > + static ::stream_executor::port::StatusOr Create( + const InputSequence& shape, bool ignore_first_dim = false) { + if (ignore_first_dim && shape.size() <= 0) { + return errors::InvalidArgument( + "removing first dim requires explicit batch dimension"); + } + return DimsAdapter( + absl::MakeSpan(shape).subspan(ignore_first_dim ? 1 : 0, shape.size())); + } + + //----- Conversion Utilities ------ + + // Converts to an nvinfers::Dims and assign the result to the object passed + // in via the result pointer. + void TrtDims(nvinfer1::Dims* result) const { + result->nbDims = num_dims_; + absl::c_copy(storage_, static_cast(result->d)); + } + + // Converts to an nvinfer1::Dims and return by value. + nvinfer1::Dims AsTrtDims() const { + nvinfer1::Dims result; + TrtDims(&result); + return result; + } + + // Converts to a TensorShape and assigns the result to the object passed in + // via the shape pointer. + Status TensorShape(TensorShape* shape, + absl::optional batch_size = absl::nullopt) const { + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + reinterpret_cast(storage_.data()), storage_.size(), + shape)); + if (batch_size) shape->InsertDim(0, *batch_size); + return Status::OK(); + } + + // Converts to a PartialTensorShape and assigns the result to the object + // passed in via the shape pointer. + Status PartialTensorShape( + PartialTensorShape* shape, + absl::optional batch_size = absl::nullopt) const { + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + reinterpret_cast(storage_.data()), storage_.size(), + shape)); + if (batch_size) shape->InsertDim(0, *batch_size); + return Status::OK(); + } + + // Copies the dimension values to the vector passed in via the shape pointer. + template > + Status Vector(std::vector* shape) const { + shape->clear(); + absl::c_copy(storage_, std::back_inserter(*shape)); + return Status::OK(); + } + + //----- Property Accessors ------ + + // Returns true if the shape has no dynamic dimensions. + bool IsStatic() const { + return !absl::c_any_of(storage_, [](auto i) { return i < 0; }); + } + + // Returns product of all dimensions. 
+ int64_t Volume() const { + return absl::c_accumulate(storage_, static_cast(1), + std::multiplies<>()); + } + + int32_t NumDims() const { return num_dims_; } + + // Returns true if the shape should be interpreted as a scalar. This follows + // TensorRT conversions: a scalar shape can have NumDims()==1 or NumDims()==0, + // but the underlying storage_ container has a single dimension of size 1. + bool IsScalar() const { + return (num_dims_ == 0 || num_dims_ == 1) && storage_.size() == 1 && + storage_[0] == 1; + } + + // Returns true if the dimension storage is empty. This indicates an empty + // shape in both the scalar and non-scalar case. + bool IsEmpty() const { return storage_.empty(); } + + string DebugString() const { + auto vol = absl::c_accumulate(storage_, static_cast(1), + std::multiplies<>()); + return absl::StrCat("DimsAdapter(num_dims=", num_dims_, ",shape=[", + absl::StrJoin(storage_, ","), "],", "vol=", vol, ")"); + } + + // Returns beginning iterator for the underlying storage. + StorageType::const_iterator begin() const { return storage_.begin(); } + + // Returns ending iterator for the underlying storage. + StorageType::const_iterator end() const { return storage_.end(); } + + // Returns the size of the dimension at `idx`. + StorageType::value_type dim(size_t idx) const { return storage_[idx]; } + + // Returns a references to the dimension at `idx`. + StorageType::value_type& dim(size_t idx) { return storage_[idx]; } + + //----- Non-Const Operators ------ + + DimsAdapter& Append(int32_t dim) { + ::stream_executor::port::StatusOr is_scalar = IsScalar(); + if (!is_scalar.ok()) return *this; + num_dims_ = is_scalar.ValueOrDie() ? 2 : num_dims_ + 1; + storage_.push_back(dim); + return *this; + } + + DimsAdapter& Prepend(absl::optional dim) { + if (dim) { + num_dims_ = IsScalar() ? 2 : num_dims_ + 1; + storage_.insert(storage_.begin(), *dim); + } + return *this; + } + + Status RemoveBatchDimension() { + if (storage_.empty()) + return errors::InvalidArgument( + "attempted to remove batch dim from scalar"); + num_dims_ -= 1; + storage_.erase(storage_.begin()); + return Status::OK(); + } + + //----- Comparison Operators ------ + + bool operator==(const DimsAdapter& rhs) const { + if (rhs.num_dims_ != num_dims_) return false; + for (int i = 0; i < num_dims_; i++) { + if (rhs.storage_[i] != storage_[i]) return false; + } + return true; + } + + bool operator!=(const DimsAdapter& rhs) const { return !(*this == rhs); } + + private: + int32_t num_dims_{0}; + StorageType storage_{}; +}; + +Status GetNetworkInputShapes(const nvinfer1::INetworkDefinition* network, + std::vector* input_shapes); + +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + +// Returns true if an engine built for cached_shapes can also run actual_shapes. +bool AreShapesCompatible(const std::vector& actual_shapes, + const std::vector& cached_shapes); + +// Returns the number of inputs for the engine, which also correspends to the +// number of input tensors for the network. This can differ from the number of +// input bindings, because the number of total input bindings equals the number +// of profiles times the number of engine inputs. +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine); -enum class TrtPrecisionMode { FP32, FP16, INT8 }; +// Returns the string representation for the assigned device or the requested +// device of the given node. 
+absl::string_view GetDeviceName(const Node* node); -Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name); +// Returns the ParsedName representation for the assigned device or the +// requested device string of the given node. If the device string is invalid, +// returns absl::nullopt. +absl::optional GetDeviceParsedName( + const Node* node); -Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode); +// If the given two device assignments as compatible, returns the merge of the +// two assignments. Otherwise, returns absl::nullopt. +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, const DeviceNameUtils::ParsedName& b); +// Similar to the above, except that the second device assignment is represented +// by a string_view. +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, absl::string_view b); } // namespace tensorrt } // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/weights.cc b/tensorflow/compiler/tf2tensorrt/convert/weights.cc new file mode 100644 index 00000000000..eb15351134d --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/weights.cc @@ -0,0 +1,216 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" + +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +namespace convert { + +TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type) + : shape_(0, DimsAdapter::StorageType{}), type_(type), volume_(0) {} + +::stream_executor::port::StatusOr +TRT_ShapedWeights::CreateWithTensor(nvinfer1::DataType type, DimsAdapter dims, + Tensor tensor) { + TRT_ShapedWeights weights(type); + weights.shape_ = dims; + weights.tensor_ = std::forward(tensor); + weights.volume_ = weights.shape_.Volume(); + if (weights.shape_.NumDims() == 0) { + DCHECK(weights.shape_.IsEmpty() || weights.shape_.IsScalar()); + } + return weights; +} + +nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const { + return nvinfer1::Weights{type_, GetPointer(), volume_}; +} + +Status TRT_ShapedWeights::SetShape(DimsAdapter dims) { + if (volume_ != dims.Volume()) { + VLOG(2) << "Changing shape from " << shape_.DebugString() << ", to " + << dims.DebugString(); + return errors::Internal("SetShape would change number of elements"); + } + shape_ = std::move(dims); + return Status::OK(); +} + +size_t TRT_ShapedWeights::size_bytes() const { + size_t data_type_size = -1; + switch (type_) { + case nvinfer1::DataType::kFLOAT: + case nvinfer1::DataType::kINT32: + data_type_size = 4; + break; + case nvinfer1::DataType::kHALF: + data_type_size = 2; + break; +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: +#endif + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kBOOL: + data_type_size = 1; + break; + } + return volume_ * data_type_size; +} + +string TRT_ShapedWeights::DebugString() const { + return absl::StrCat( + "TRT_ShapedWeights(shape=", shape_.DebugString(), + ", type=", tensorflow::tensorrt::DebugString(type_), + ", values=", reinterpret_cast(GetPointer()), ")"); +} + +TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor) + : tensor_proxy_ptr_(tensor), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size) + : tensor_proxy_ptr_(tensor), + batch_size_(batch_size), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor, + int batch_size) + : tensor_proxy_ptr_(tensor), + batch_size_(batch_size), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, + const nvinfer1::Dims& trt_dims, + int batch_size) + : tensor_proxy_ptr_(new SimpleITensor(trt_dtype, trt_dims)), + batch_size_(batch_size), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights) + : weights_(weights), + initialized_(true), + arg_type_(TRT_ArgumentType::WEIGHTS) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(const ResourceHandle& resource) + : resource_(resource), + initialized_(true), + arg_type_(TRT_ArgumentType::RESOURCE) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs) + : tensor_proxy_ptr_(rhs.tensor_proxy_ptr_), + batch_size_(rhs.batch_size_), + resource_(rhs.resource_), + weights_(rhs.weights_), + 
initialized_(rhs.initialized_), + arg_type_(rhs.arg_type_) {} + +void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) { + tensor_proxy_ptr_ = rhs.tensor_proxy_ptr_; + batch_size_ = rhs.batch_size_; + weights_ = rhs.weights_; + resource_ = rhs.resource_; + initialized_ = rhs.initialized_; + arg_type_ = rhs.arg_type_; +} + +ITensorProxyPtr TRT_TensorOrWeights::tensor() const { + DCHECK(is_tensor()); + return tensor_proxy_ptr_; +} + +ResourceHandle TRT_TensorOrWeights::resource() const { + DCHECK(is_resource()); + return resource_; +} + +nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { + switch (arg_type_) { + case TRT_ArgumentType::TENSOR: + return tensor()->getDimensions(); + case TRT_ArgumentType::WEIGHTS: + return weights().Shape().AsTrtDims(); + case TRT_ArgumentType::RESOURCE: + return {0, {}}; // Scalar. + } +} + +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (!initialized_) { + return errors::Internal("The object is not initialized"); + } + switch (arg_type_) { + case TRT_ArgumentType::TENSOR: { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + case TRT_ArgumentType::WEIGHTS: + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + case TRT_ArgumentType::RESOURCE: + *tf_type = DataType::DT_RESOURCE; + return Status::OK(); + } +} + +string TRT_TensorOrWeights::DebugString() const { + string output = "TRT_TensorOrWeights(type="; + if (is_tensor()) { + absl::StrAppend(&output, + "tensor=", tensorflow::tensorrt::DebugString(tensor()), + ", batch_size=", batch_size_); + } else { + absl::StrAppend(&output, "weights=", weights_.DebugString()); + } + absl::StrAppend(&output, ")"); + return output; +} + +::stream_executor::port::StatusOr +TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype, + const DimsAdapter& dims) { + DataType tf_dtype; + TF_RETURN_IF_ERROR(TrtTypeToTfType(trt_dtype, &tf_dtype)); + TensorShape shape; + TF_RETURN_IF_ERROR(dims.TensorShape(&shape)); + // TODO(jie): check weights size_bytes. 0 means type error + Tensor tensor(tf_dtype, shape); + ::stream_executor::port::StatusOr weights = + TRT_ShapedWeights::CreateWithTensor(trt_dtype, dims, tensor); + TRT_ENSURE_OK(weights); + store_.emplace_back(std::move(tensor)); + return weights; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/weights.h b/tensorflow/compiler/tf2tensorrt/convert/weights.h new file mode 100644 index 00000000000..02c26e711df --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/weights.h @@ -0,0 +1,295 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// Class to convert TF compile-time constants (e.g. Const nodes) to TRT weight. +class TRT_ShapedWeights { + public: + explicit TRT_ShapedWeights( + nvinfer1::DataType type = nvinfer1::DataType::kFLOAT); + + // Constructs a weights from another weights. + // + // NOTE: this does not copy the underlying buffer but only increase its + // reference count. + TRT_ShapedWeights(const TRT_ShapedWeights& rhs) = default; + + nvinfer1::Weights GetTrtWeights() const; + + const Tensor& GetTensor() const { return tensor_; } + + // Returns a pointer of type const T to the underlying buffer of the tensor. + template + const T* GetPointer() const { + int64 num_elem = + (tensor_.NumElements() * DataTypeSize(tensor_.dtype())) / sizeof(T); + return tensor_.bit_casted_shaped({num_elem}).data(); + } + + // Returns a pointer of type T to the underlying buffer of the tensor. + template + T* GetPointer() { + int64 num_elem = + (tensor_.NumElements() * DataTypeSize(tensor_.dtype())) / sizeof(T); + return tensor_.bit_casted_shaped({num_elem}).data(); + } + + // Fills all the weight values with value. + template + Status SetValues(T value) { + switch (type_) { + case nvinfer1::DataType::kFLOAT: { + float* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, value); + break; + } + case nvinfer1::DataType::kHALF: { + Eigen::half* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, Eigen::half(value)); + break; + } + case nvinfer1::DataType::kINT32: { + int32* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, value); + break; + } + default: + return errors::InvalidArgument( + "Unsupported data type ", tensorflow::tensorrt::DebugString(type_)); + } + return Status::OK(); + } + + Status SetShape(DimsAdapter dims); + void SetShapeUnsafe(DimsAdapter dims) { shape_ = std::move(dims); } + + // Returns total number of elements. Returning 0 means either some dim is 0 + // or the number of dims is 0. Note that a TF scalar constant is marked as + // Dims{0, {1}}, and has a count() == 1. + int64_t count() const { return volume_; } + + size_t size_bytes() const; + + string DebugString() const; + + template + absl::Span GetSpan() const { + return absl::Span(tensor_.flat().data(), volume_); + } + + template + std::vector ToVector() const { + auto span = GetSpan(); + return std::vector(span.data(), span.data() + span.size()); + } + + nvinfer1::DataType TrtDType() const { return type_; } + + const DimsAdapter& Shape() const { return shape_; } + DimsAdapter& Shape() { return shape_; } + + private: + // The shape of the weights. Defaults to the empty shape. + DimsAdapter shape_; + + // This creation method is only used by TrtWeightStore, which creates the + // underlying buffer. 
+ static ::stream_executor::port::StatusOr CreateWithTensor( + nvinfer1::DataType type, DimsAdapter dims, Tensor tensor); + + nvinfer1::DataType type_; + + // All weights should be stored inside TrtWeightStore to make sure lifetime of + // all the underlying tensors are available until the engine is built. For + // this reason, tensor_ should never be reassigned to a different value that + // is not already present in the TrtWeightStore. + Tensor tensor_; + // Contains the volume of the weight's shape. + int64_t volume_; + + friend class TrtWeightStore; +}; + +// Container for TRT_ShapedWeights. We need this container because TRT does not +// manage the lifetime of the weights buffer, it only keeps a pointer to it and +// requires that the data referenced by the pointer be available until the +// building of engine is complete. For more information see +// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html +// +// TODO(laigd): consider adding garbage collection to the unused weights. +class TrtWeightStore { + public: + // Gets a TRT_ShapedWeights with 'type' and 'dims'. + ::stream_executor::port::StatusOr GetTempWeights( + nvinfer1::DataType trt_type, const DimsAdapter& dims); + + // Gets a TRT_ShapedWeights with the same data type and dimensions as + // 'weights'. + ::stream_executor::port::StatusOr GetTempWeights( + const TRT_ShapedWeights& weights) { + return GetTempWeights(weights.TrtDType(), weights.Shape()); + } + + private: + // The backend storage of the TRT_ShapedWeights. + std::vector store_; +}; + +// Enumerates the possible types of arguments of a converter. This determines +// what object is contained in TRT_TensorOrWeights, and converters can require +// a specific type for each of their arguments. +enum class TRT_ArgumentType { + TENSOR = 0, + WEIGHTS = 1, + RESOURCE = 2, +}; + +struct OpConverterParams; + +// Represents a TRT-style input to a TF node, it can be either a +// ITensorProxyPtr (representing nvinfer1::ITensor* or SimpleITensor), +// or TRT_ShapedWeights which is compile-time constant. +// +// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument. +class TRT_TensorOrWeights { + public: + TRT_TensorOrWeights() {} + TRT_TensorOrWeights(ITensorProxyPtr); + TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size); + + // Constructs a wrapper for the given ITensor. + // This is used by Converter when building the TRT network, where the ITensor + // is owned by the TRT network being built. See comment for 'trt_tensor_' + // in trt_proxy_tensor.h. + explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1); + + // Creates a SimpleITensor for trt_dtype and trt_dims and takes ownership of + // the object. Constructs a wrapper for the SimpleITensor. This is used by + // TrtNodeValidator to encapsulate the type and shape information for + // validation of graph nodes, and the created ITensor is fake and temporary, + // and should not be used to build any TRT network. See comment for + // 'simple_tensor_' in trt_proxy_tensor.h. + explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, + const nvinfer1::Dims& trt_dims, int batch_size); + + // Constructs a wrapper for the given weights. + explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights); + + // Constructs a wrapper for the given resource handle. 
+ explicit TRT_TensorOrWeights(const ResourceHandle& resource); + + TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs); + + void operator=(const TRT_TensorOrWeights& rhs); + + bool is_tensor() const { + return initialized_ && arg_type_ == TRT_ArgumentType::TENSOR; + } + bool is_weights() const { + return initialized_ && arg_type_ == TRT_ArgumentType::WEIGHTS; + } + bool is_resource() const { + return initialized_ && arg_type_ == TRT_ArgumentType::RESOURCE; + } + + ITensorProxyPtr tensor() const; + + ResourceHandle resource() const; + + ITensorProxyPtr as_tensor(const OpConverterParams* params); + + TRT_ShapedWeights& weights() { + DCHECK(is_weights()); + return weights_; + } + + const TRT_ShapedWeights& weights() const { + DCHECK(is_weights()); + return weights_; + } + + nvinfer1::Dims GetTrtDims() const; + + Status GetTfType(DataType* tf_type) const; + + int batch_size() const { return batch_size_; } + + string DebugString() const; + + nvinfer1::DataType TrtDType() const { + if (arg_type_ == TRT_ArgumentType::RESOURCE) { + VLOG(0) << "Calling TrtDType() with a RESOURCE argument is undefined " + "behavior."; + } + return arg_type_ == TRT_ArgumentType::TENSOR ? tensor_proxy_ptr_->getType() + : weights_.TrtDType(); + } + + private: + void set_batch_size(int batch_size) { batch_size_ = batch_size; } + + // First dimension of the TF tensor (NOT tensor_) that is represented by + // tensor_ is treated as the "batch dimension" by TRT, and tensor_'s + // dimensions (obtained via tensor_->getDimensions()) do not contain the batch + // dimension. For example, when a TF tensor with shape (A,B,C) is represented + // in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A. + // + // This requires that all tensors in the subgraph that is converted to a TRT + // engine have the same batch size are represented by the first dimension of + // their shape, and Converter will verify this during conversion. The drawback + // is that currently it cannot convert a graph that doesn't have the batch + // size represented in the shapes or the batch sizes are different. See + // b/118387490 for more details. + // + // If use_implicit_batch is false, batch_size_ is unused and + // tensor_->getDimensions() will contain the entire shape (A,B,C). + // + // tensor_proxy_ptr_ is used when arg_type_ == TENSOR. + ITensorProxyPtr tensor_proxy_ptr_ = nullptr; + int batch_size_ = -1; + + // For DT_RESOURCE arguments (there is no corresponding type in TRT). + // resource_ is used when arg_type_ == RESOURCE. + ResourceHandle resource_; + + // weights_ is used when arg_type_ == WEIGHTS. + TRT_ShapedWeights weights_; + bool initialized_ = false; + TRT_ArgumentType arg_type_ = TRT_ArgumentType::WEIGHTS; + + friend class Converter; +}; +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc index 3143b06817e..76fb40b9520 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc @@ -22,8 +22,7 @@ limitations under the License. 
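Editor's note: to make the weights abstractions above concrete, here is a minimal sketch (illustrative only, not part of the patch) of how a converter could materialize a small constant through TrtWeightStore and wrap it as a converter argument; the helper name AddUnitConstantExample and the 2x3 shape are invented for the example:

#include "tensorflow/compiler/tf2tensorrt/convert/weights.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {

// Hypothetical helper: builds a 2x3 FP32 constant filled with 1.0f.
Status AddUnitConstantExample(TrtWeightStore* store, TRT_TensorOrWeights* out) {
  // Describe the shape; DimsAdapter can be constructed directly from
  // an nvinfer1::Dims.
  nvinfer1::Dims dims;
  dims.nbDims = 2;
  dims.d[0] = 2;
  dims.d[1] = 3;

  // The store owns the backing Tensor until the engine is built.
  auto weights_or =
      store->GetTempWeights(nvinfer1::DataType::kFLOAT, DimsAdapter(dims));
  TF_RETURN_IF_ERROR(weights_or.status());
  TRT_ShapedWeights weights = weights_or.ValueOrDie();

  // Fill the six floats with 1.0f; SetValues dispatches on the TRT dtype.
  TF_RETURN_IF_ERROR(weights.SetValues(1.0f));

  // Wrap as a converter argument; is_weights() is true for this object.
  *out = TRT_TensorOrWeights(weights);
  VLOG(2) << out->DebugString();
  return Status::OK();
}

}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow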
#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/refcount.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -67,5 +66,4 @@ REGISTER_KERNEL_BUILDER(Name("GetCalibrationDataOp").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 7e0e40ceedc..4b53587e75b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -20,11 +20,14 @@ limitations under the License. #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/function.h" @@ -34,6 +37,8 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/clusters/utils.h" +#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -42,42 +47,93 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/env_var.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { +namespace { Logger& logger = *Logger::GetLogger(); using absl::StrAppend; using absl::StrCat; using ::nvinfer1::IRuntime; -using ::stream_executor::port::StatusOr; -// A helper class to call done() when destructed for asynchronous execution. -// Helps simultaneous execution of native and TRT engines. +#define LOG_FIRST_FEW_WARNING_WITH_PREFIX \ + LOG_FIRST_N(WARNING, 5) << "TF-TRT Warning: " -class AsyncHelper : public core::RefCounted { +// Allocates device memory for an execution context to execute a TensorRT +// engine and records the relevant information for deallocating the memory when +// the engine finishes execution. 
+class ContextDeviceMemory { public: - AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} - - ~AsyncHelper() override { this->operator()(); } + ContextDeviceMemory() + : execution_context_(nullptr), + device_memory_allocator_(nullptr), + device_memory_(nullptr) {} + + ~ContextDeviceMemory() { + if (device_memory_) { + device_memory_allocator_->free(device_memory_); + } + } - void operator()() { - if (!called_) { - done_(); - called_ = true; + Status AllocateDeviceMemory(nvinfer1::IExecutionContext* execution_context, + TRTBaseAllocator* device_memory_allocator, + size_t device_memory_size) { + execution_context_ = execution_context; + device_memory_allocator_ = device_memory_allocator; + device_memory_ = nullptr; + VLOG(2) << "Device memory size for TensorRT engine " << device_memory_size; + if (device_memory_size > 0) { + device_memory_ = device_memory_allocator_->allocate( + device_memory_size, + /*unused alignment=*/0, /*flags=*/0); + if (device_memory_ == nullptr) { + return errors::InvalidArgument( + "Out of GPU memory for execution context"); + } } + { + tensorflow::profiler::TraceMe activity( + "setDeviceMemory", tensorflow::profiler::TraceMeLevel::kInfo); + execution_context_->setDeviceMemory(device_memory_); + } + return Status::OK(); } + private: + nvinfer1::IExecutionContext* execution_context_; + TRTBaseAllocator* device_memory_allocator_; + void* device_memory_; +}; + +// Macros for asynchronous execution, such as OP_REQUIRES_OK_ASYNC, require an +// object with operator(). Provides such an object with a noop operator() +// because we don't need such macros to invoke the DoneCallback for the +// TRTEngineOp. +struct DummyAsyncHelper { + void operator()() {} +}; + +// A helper class to call the DoneCallback for the TRTEngineOp when the object +// is destructed, to support asynchronous execution of the native segment and +// TRT engines for the TRTEngineOp. +class AsyncHelper : public core::RefCounted { + public: + AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} + + ~AsyncHelper() override { done_(); } + private: AsyncOpKernel::DoneCallback done_; - bool called_ = false; // Has `done_` been called? }; +} // end anonymous namespace + // This OP can construct TRTEngine on the fly and if construction of engine // fails, executes equivalent subgraph as a TensorFlow function. class TRTEngineOp : public AsyncOpKernel { @@ -88,50 +144,63 @@ class TRTEngineOp : public AsyncOpKernel { AsyncOpKernel::DoneCallback done) override; private: - using CacheType = - LRUCache, std::unique_ptr, - VectorTensorShapeHasher>; - - // Execute calibration + // Executes calibration asynchronously. void ExecuteCalibration(OpKernelContext* ctx, TRTEngineCacheResource* cache_res, - AsyncHelper* helper); - - // Construct a function handle for executing native funcdef graph - // These are the exact same function. - - Status ConstructFunctionHandle(FunctionLibraryRuntime* lib, - const string& device_name); - - // Execute replaced native segment as function Op. - void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); - - // Execute the tensorrt engine. Returns whether we need to retry by running - // the native segment. - bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context); - - // Allocate necessary resources for calibration + AsyncHelper* async_helper); + + // Constructs a function handle for the segment of the TRTEngineOp.
+ ::stream_executor::port::StatusOr + ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name, + bool allow_soft_placement = false, + size_t num_inputs = 0, size_t num_outputs = 0); + + // Imports the GraphDef for the segment of the TRTEngineOp to + // segment_graph_def_. + Status ImportSegmentGraphDef(FunctionLibraryRuntime* lib, + const string& device_name); + + // Executes the native segment as function Op asynchronously. + void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* async_helper); + + // Allocates the device memory for the execution context and enqueues the + // TensorRT engine for execution. Also deallocates the device memory. Returns + // a non-OK status if execution fails, in which case the caller may fall back + // to running the native segment. + Status ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context, + int trt_context_idx, + const TrtShapeOptimizationProfile& profiles, + TRTBaseAllocator* allocator); + + // Allocates necessary resources for calibration. Status AllocateCalibrationResources(OpKernelContext* ctx, TRTEngineCacheResource* cache_res); Status GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res); - // Get engine for the input shape - StatusOr GetEngine( - const std::vector& input_shapes, OpKernelContext* ctx, - TRTEngineCacheResource* cache_res); + // Returns a pair of 1) An EngineContext object that is compatible with the + // input and 2) The index of the IExecutionContext compatible with the input. + // If a cuda engine for the given input shapes can't be found, returns + // (nullptr, 0) to allow native engine execution. Returns an error code for + // any problem that would prevent both TensorRT engine execution and native + // segment execution. + ::stream_executor::port::StatusOr> GetEngine( + const std::vector& input_concrete_shapes, + OpKernelContext* ctx, TRTEngineCacheResource* cache_resource); + + // Builds and returns a cuda engine for the input shapes. If building the + // engine fails, enters a dummy entry into the cache_resource cache so we + // don't continually try to build the same failing engine. + ::stream_executor::port::StatusOr> + BuildEngine(const std::vector& input_concrete_shapes, + int batch_size, bool use_calibration, + TRTInt8Calibrator* calibrator, + TRTEngineCacheResource* cache_resource, OpKernelContext* ctx); // Verify that the input shapes are consistent and can be handled by this op. Status VerifyInputShapes(const std::vector& shapes); - // Return engine batch in cached_engne_batch_sizes_ which is closest to input - // batch. - Status GetEngineInputShapes( - const CacheType& cache, - const std::vector& actual_input_shapes, - std::vector* engine_input_shapes); - std::vector input_nodes_; std::vector output_nodes_; @@ -142,7 +211,7 @@ class TRTEngineOp : public AsyncOpKernel { NameAttrList func_; // GraphDef representation of the segment. - GraphDef segment_graph_; + GraphDef segment_graph_def_; // Engine Precision mode. TrtPrecisionMode precision_mode_; @@ -154,12 +223,35 @@ class TRTEngineOp : public AsyncOpKernel { // Whether to calibrate INT8 engine. bool calibration_mode_; - // Maximum number of cached engines + // Whether to use implicit batch dimension for TensorRT. + bool use_implicit_batch_; + + // Whether to collect optimization profiles for TensorRT, only used when + // use_implicit_batch_=false. + bool profile_generation_mode_; + + // Optimization profile generation strategy. + ProfileStrategy profile_strategy_; + + // Whether the TRTEngineOp has any input with unknown dimensions.
+ bool has_dynamic_shape_input_; + + // Whether to build TensorRT engines at runtime. + bool allow_build_at_runtime_; + + // Whether to allow soft placement when the graph is executed with native + // TensorFlow. + bool allow_soft_placement_; + + // Maximum number of cached engines. int max_cached_engines_; + // Flag to detect whether native segment nodes have been deleted from graph + bool native_segment_absent_; + int64 workspace_size_; mutex engine_mutex_; - FunctionLibraryRuntime::Handle func_handle_; + FunctionLibraryRuntime::Handle native_execution_func_handle_; // The finalized calibrator for inference. std::unique_ptr calibrator_; @@ -167,20 +259,45 @@ class TRTEngineOp : public AsyncOpKernel { // If true, create calibration graph for INT8 mode. Otherwise, we are using // user-provided quantization ranges. bool use_calibration_; + + tensorflow::grappler::Cluster* cluster_; + + // Array of all input shapes, collected from the input_shapes attribute when + // constructing the TRTEngineOp. The input_shapes attribute is set during + // graph conversion time. This data is used to retrieve which input dimensions + // could be unknown. During inference time this information is not available + // otherwise (all shapes are known (concrete) shapes when we run inference). + std::vector input_partial_shapes_; + // Shapes, excluding resource inputs. + std::vector input_partial_shapes_filtered_; + + // The TF node can have more inputs than the TRT engine: resource inputs are + // saved as weight in the engine, instead of passing that as engine input. + // Input mask is true for those TF input that are TRT engine inputs. + std::vector input_mask_; + + // Whether to use explicit precision (QDQ) mode. + bool use_explicit_precision_; }; -#define TYPECASE(dt, X, Y) \ +#define TYPECASE(dt, X) \ case dt: { \ return (void*)X->flat::Type>().data(); \ } void* GetTensorAddress(const Tensor* tensor_ptr) { - auto tensor_type = tensor_ptr->dtype(); + const auto tensor_type = tensor_ptr->dtype(); switch (tensor_type) { - TYPECASE(DT_FLOAT, tensor_ptr, dest_ptr); - TYPECASE(DT_HALF, tensor_ptr, dest_ptr); - TYPECASE(DT_INT8, tensor_ptr, dest_ptr); - TYPECASE(DT_INT32, tensor_ptr, dest_ptr); + TYPECASE(DT_FLOAT, tensor_ptr); + TYPECASE(DT_HALF, tensor_ptr); + TYPECASE(DT_INT8, tensor_ptr); + TYPECASE(DT_INT32, tensor_ptr); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + TYPECASE(DT_BOOL, tensor_ptr); +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + TYPECASE(DT_UINT8, tensor_ptr); +#endif default: { LOG(ERROR) << "Unsupported Data type " << DataTypeString(tensor_type); return nullptr; @@ -232,8 +349,14 @@ static Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, return Status::OK(); } -Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, - const string& device_name) { +::stream_executor::port::StatusOr +TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name, + bool allow_soft_placement, + size_t num_inputs, size_t num_outputs) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ConstructFunctionHandle", + tensorflow::profiler::TraceMeLevel::kInfo); VLOG(1) << "Constructing function handle"; if (lib == nullptr) { return errors::Internal("Context function library is null"); @@ -241,12 +364,55 @@ Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.state_handle = ""; inst_ops.target = device_name; - return lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), 
inst_ops, - &func_handle_); + if (!native_segment_absent_ && allow_soft_placement) { + const FunctionDef* fdef = + lib->GetFunctionLibraryDefinition()->Find(func_.name()); + if (!fdef) { + return errors::Internal( + StrCat("Can't find FunctionDef for ", func_.name())); + } + bool ints_on_device = + fdef->attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 && + fdef->attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b(); + // kIntsOnDeviceAttr is not compatible with is_multi_device_function which + // is needed to support allow_soft_placement. + if (ints_on_device) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Function " << name() + << " has attribute kIntsOnDeviceAttr=true " + "and will be executed natively with allow_soft_placement=false. " + "If this is a problem, please re-generate your SavedModel with " + "the TF-TRT runtime you are using."; + } else { + inst_ops.is_multi_device_function = true; + inst_ops.input_devices.resize(num_inputs, device_name); + inst_ops.output_devices.resize(num_outputs, device_name); + inst_ops.config_proto.set_allow_soft_placement(true); + } + } + FunctionLibraryRuntime::Handle func_handle; + Status status = lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), + inst_ops, &func_handle); + if (status.ok()) { + return func_handle; + } + return status; +} + +Status TRTEngineOp::ImportSegmentGraphDef(FunctionLibraryRuntime* lib, + const string& device_name) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ImportSegmentGraphDef", + tensorflow::profiler::TraceMeLevel::kInfo); + TF_ASSIGN_OR_RETURN(FunctionLibraryRuntime::Handle func_handle, + ConstructFunctionHandle(lib, device_name)); + return FunctionDefToGraphDef(func_handle, lib, &segment_graph_def_); } TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : AsyncOpKernel(context) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::TRTEngineOp", tensorflow::profiler::TraceMeLevel::kInfo); // read serialized_engine OP_REQUIRES_OK(context, context->GetAttr("serialized_segment", &serialized_segment_)); @@ -262,21 +428,61 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("calibration_data", &calibration_data)); OP_REQUIRES_OK(context, context->GetAttr("segment_func", &func_)); - OP_REQUIRES(context, !func_.name().empty(), - errors::InvalidArgument( - "The TF function for the TRT segment could not be empty")); OP_REQUIRES_OK(context, TrtPrecisionModeFromName(precision_string, &precision_mode_)); OP_REQUIRES_OK(context, context->GetAttr("use_calibration", &use_calibration_)); - func_handle_ = kInvalidHandle; - if (!static_engine_) { - FunctionLibraryRuntime* lib = context->function_library(); - OP_REQUIRES_OK(context, - ConstructFunctionHandle(lib, context->device()->name())); - OP_REQUIRES_OK(context, - FunctionDefToGraphDef(func_handle_, lib, &segment_graph_)); + OP_REQUIRES_OK(context, + context->GetAttr("input_shapes", &input_partial_shapes_)); + auto status = + context->GetAttr("_allow_build_at_runtime", &allow_build_at_runtime_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _allow_build_at_runtime in " + << context->device()->name() + << ", thus setting _allow_build_at_runtime=true"; + allow_build_at_runtime_ = true; + } else { + OP_REQUIRES_OK(context, status); + } + + // Get a mask of non-resource inputs. 
+ std::vector in_types; + input_mask_.resize(input_partial_shapes_.size()); + OP_REQUIRES_OK(context, context->GetAttr("InT", &in_types)); + for (int i = 0; i < input_mask_.size(); i++) { + input_mask_[i] = (in_types[i] != DataType::DT_RESOURCE); + } + + // Filter the shapes to exclude resources. + for (int i = 0; i < input_partial_shapes_.size(); i++) { + if (input_mask_[i]) { + input_partial_shapes_filtered_.push_back(input_partial_shapes_[i]); + } + } + + status = context->GetAttr("_allow_soft_placement", &allow_soft_placement_); + if (status.code() == tensorflow::error::NOT_FOUND) { + allow_soft_placement_ = true; + } else { + OP_REQUIRES_OK(context, status); + } + + status = context->GetAttr("use_explicit_precision", &use_explicit_precision_); + if (!status.ok()) { + use_explicit_precision_ = false; + } + + // When a TF-TRT converted model without native segments is loaded, + // func_ can be empty. + native_segment_absent_ = (func_.name() == ""); + native_execution_func_handle_ = kInvalidHandle; + if (!native_segment_absent_) { + if (!static_engine_) { + OP_REQUIRES_OK(context, ImportSegmentGraphDef(context->function_library(), + context->device()->name())); + } } + // TODO(laigd): calibration_data is used in TF v1.x and we keep it only for // backward compatibility reasons. Remove it once all known users switch to // 2.0. @@ -289,178 +495,423 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) } OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); + + status = context->GetAttr("_use_implicit_batch", &use_implicit_batch_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _use_implicit_batch in " << context->device()->name() + << ", thus setting _use_implicit_batch=true"; + use_implicit_batch_ = true; + } + + status = + context->GetAttr("_profile_generation_mode", &profile_generation_mode_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _profile_generation_mode in " + << context->device()->name() + << ", thus setting _profile_generation_mode=false"; + profile_generation_mode_ = false; + } + if (static_engine_) { + if (profile_generation_mode_) profile_generation_mode_ = false; + } + if (use_implicit_batch_) { + OP_REQUIRES(context, !profile_generation_mode_, + errors::InvalidArgument( + "profile_generation_mode_=true is only supported if " + "use_implicit_batch=false")); + if (input_partial_shapes_.empty()) { + VLOG(1) << "Attribute input_shapes is not set. This happens probably " + << "because you are using a model that is already converted " + << "to TensorRT with a previous version of TF-TRT (i.e. includes " + << "TRTEngineOp in graph). This is not an error. If you convert " + << "the original model again to TensorRT, the attributes " + << "input_shapes will be set automatically."; + } + } else { + OP_REQUIRES( + context, !input_partial_shapes_.empty(), + errors::InvalidArgument( + "Explicit batch mode requires attribute input_shapes to be set." + "If you are using a model that was converted to TensorRT by a " + "previous version of TF-TRT, (i.e. 
includes TRTEngineOp in graph " + "without the input_shapes attribute), then you need to convert the " + "original model again to TensorRT in order to set the attribute " + "input_shapes.")); + + string profile_strategy_name; + status = context->GetAttr("profile_strategy", &profile_strategy_name); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found strategy in " << context->device()->name() + << ", thus setting profile_strategy='Range'"; + profile_strategy_ = ProfileStrategy::kRange; + } else { + OP_REQUIRES_OK(context, ProfileStrategyFromName(profile_strategy_name, + &profile_strategy_)); + } + } + has_dynamic_shape_input_ = absl::c_any_of( + input_partial_shapes_filtered_, + [](PartialTensorShape shape) { return !shape.IsFullyDefined(); }); + VLOG(2) << "TRTEngineOp has_dynamic_shape_input_: " + << has_dynamic_shape_input_; +} + +// Copies input tensor ctx->input(i) (which is in device memory) to the host, +// and place the resulting host tensor to the back of native_inputs. +Status CopyToHostAsync(OpKernelContext* ctx, std::vector* native_inputs, + int i, const cudaStream_t stream) { + // The TRTEngineOp has all ctx->inputs on the device. In contrast, the + // native segment expects to find int32 inputs on the host. We copy int32 + // inputs from device to host. + + AllocatorAttributes allocator_attr; + allocator_attr.set_on_host(true); + Tensor t; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + ctx->input_dtype(i), ctx->input(i).shape(), &t, allocator_attr)); + native_inputs->push_back(t); + const Tensor& gpu_tensor = ctx->input(i); + auto ret = cudaMemcpyAsync( + t.flat().data(), gpu_tensor.flat().data(), + t.NumElements() * sizeof(int32), cudaMemcpyDeviceToHost, stream); + if (ret != 0) { + return errors::Internal("Could not copy tensor for native segment input"); + } + return Status::OK(); +} + +// Copies native_tensor, which is in host memory to ctx->output(t), which is in +// device memory. 
+Status CopyToDeviceAsync(OpKernelContext* ctx, const Tensor& native_tensor, + int t, cudaStream_t stream) { + Tensor* gpu_tensor; + TF_RETURN_IF_ERROR( + ctx->allocate_output(t, native_tensor.shape(), &gpu_tensor)); + auto ret = cudaMemcpyAsync(gpu_tensor->flat().data(), + native_tensor.flat().data(), + native_tensor.NumElements() * sizeof(int32), + cudaMemcpyHostToDevice, stream); + if (ret != 0) { + return errors::Internal("Could not copy tensor for native segment output"); + } + return Status::OK(); } void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, - AsyncHelper* helper) { - std::vector inputs; - std::vector* outputs = new std::vector(); - if (func_handle_ == kInvalidHandle) { - OP_REQUIRES_OK_ASYNC( - ctx, - ConstructFunctionHandle(ctx->function_library(), ctx->device()->name()), - *helper); + AsyncHelper* async_helper) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ExecuteNativeSegment", + tensorflow::profiler::TraceMeLevel::kInfo); + std::vector native_inputs; + std::vector* native_outputs = new std::vector(); + DummyAsyncHelper dummy_async_helper; + if (native_execution_func_handle_ == kInvalidHandle) { + ::stream_executor::port::StatusOr + status_or_handle = ConstructFunctionHandle( + ctx->function_library(), ctx->device()->name(), + allow_soft_placement_, ctx->num_inputs(), ctx->num_outputs()); + OP_REQUIRES_OK_ASYNC(ctx, status_or_handle.status(), dummy_async_helper); + native_execution_func_handle_ = status_or_handle.ValueOrDie(); } + auto lib = ctx->function_library(); FunctionLibraryRuntime::Options opts; opts.rendezvous = ctx->rendezvous(); opts.cancellation_manager = ctx->cancellation_manager(); opts.runner = ctx->runner(); - inputs.reserve(ctx->num_inputs()); + native_inputs.reserve(ctx->num_inputs()); + int n_copies = 0; + cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); for (int i = 0; i < ctx->num_inputs(); i++) { - inputs.push_back(ctx->input(i)); + if (ctx->input_dtype(i) != DT_INT32) { + native_inputs.push_back(ctx->input(i)); + } else { + OP_REQUIRES_OK_ASYNC(ctx, CopyToHostAsync(ctx, &native_inputs, i, stream), + dummy_async_helper); + n_copies++; + } + } + if (n_copies > 0) { + // If we have any int32 tensors, then wait until data is copied to host. + cudaStreamSynchronize(stream); } - helper->Ref(); // Increment count for calculating native graph VLOG(1) << "Executing native segment: " << name(); - lib->Run(opts, func_handle_, inputs, outputs, - [this, ctx, outputs, helper](const Status& s) { - core::ScopedUnref sc(helper); - OP_REQUIRES_OK_ASYNC(ctx, s, *helper); - VLOG(1) << "Native Segment completed"; - for (size_t t = 0; t < outputs->size(); ++t) { - ctx->set_output(t, outputs->at(t)); - } - delete outputs; - }); + // Increment the reference count of the async_helper by 1. When the native + // segment finishes execution asynchronously, we decrement the reference + // count of the object. 
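The Ref()/ScopedUnref pattern described just above amounts to shared ownership of the done callback: done() fires only when the last owner, either the enclosing call or the asynchronous native-segment callback, releases its reference. A rough standalone equivalent built on std::shared_ptr (illustrative only; core::RefCounted is not implemented this way, and the names below are made up):

#include <functional>
#include <memory>
#include <thread>

// Fires the callback in its destructor, mirroring AsyncHelper's
// "call done() when destructed" behavior.
struct DoneHolder {
  explicit DoneHolder(std::function<void()> done) : done_(std::move(done)) {}
  ~DoneHolder() { done_(); }
  std::function<void()> done_;
};

void RunAsync(std::function<void()> work, std::function<void()> done) {
  auto holder = std::make_shared<DoneHolder>(std::move(done));
  // The lambda holds a copy of the shared_ptr, so done() runs only after both
  // this function has returned and the asynchronous work has completed.
  std::thread([holder, work]() { work(); }).detach();
}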
+ async_helper->Ref(); + lib->Run( + opts, native_execution_func_handle_, native_inputs, native_outputs, + [this, ctx, native_outputs, async_helper, stream](const Status& s) { + core::ScopedUnref sc(async_helper); + DummyAsyncHelper dummy_async_helper; + std::unique_ptr> outputs_wrapper(native_outputs); + OP_REQUIRES_OK_ASYNC(ctx, s, dummy_async_helper); + VLOG(1) << "Native Segment completed"; + int n_copies = 0; + for (size_t t = 0; t < native_outputs->size(); ++t) { + if (native_outputs->at(t).dtype() == DT_INT32) { + OP_REQUIRES_OK_ASYNC( + ctx, CopyToDeviceAsync(ctx, native_outputs->at(t), t, stream), + dummy_async_helper); + n_copies++; + } else { + ctx->set_output(t, native_outputs->at(t)); + } + } + if (n_copies > 0) { + cudaStreamSynchronize(stream); + } + }); } void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, TRTEngineCacheResource* cache_res, - AsyncHelper* helper) { + AsyncHelper* async_helper) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ExecuteCalibration", + tensorflow::profiler::TraceMeLevel::kInfo); VLOG(1) << "Executing TRT calibration: " << name(); - helper->Ref(); - core::ScopedUnref sc(helper); + DummyAsyncHelper dummy_async_helper; CalibrationContext* calib_ctx = cache_res->calib_ctx_.get(); const int num_inputs = ctx->num_inputs(); // TODO(laigd): need to check that input shape matches. // Pass input data to calibrator std::unordered_map input_data; + bool input_size_ok = true; for (int i = 0; i < num_inputs; i++) { const Tensor& t = ctx->input(i); void* data_address = GetTensorAddress(&t); OP_REQUIRES_ASYNC(ctx, data_address, errors::InvalidArgument( "Unsupported data type encountered in input ", i), - *helper); + dummy_async_helper); // Check the allocated buffer is sufficient for input - const auto device_tensor = - calib_ctx->device_tensors_.at(i).AccessTensor(ctx); - CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); + const auto device_tensor = &calib_ctx->device_tensors_.at(i); + if (t.TotalBytes() != device_tensor->TotalBytes()) { + // This can happen if the network has data dependent shapes. + input_size_ok = false; + VLOG(2) << "Size differs for input " << i + << ", skipping calibration for this input."; + break; + } input_data.emplace(StrCat(IONamePrefixes::kInputPHName, i), data_address); } - VLOG(2) << "Filled map for sending"; - // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files - const cudaStream_t* stream = CHECK_NOTNULL( - reinterpret_cast(ctx->op_device_context() - ->stream() - ->implementation() - ->GpuStreamMemberHack())); - // If calibrator is terminated before, it means an error has occurred. - // - // Note: setBatch() will wait until TRTInt8Calibrator::getBatch() is called - // the first time before proceeding, so if buildCudaEngine() returns an error, - // it means getBatch() is never called, and the setBatch() here will hang - // until setDone() is called later by the calibration thread in - // AllocateCalibrationResources(). In that case, this setBatch() will always - // be able to detect the error and return false. - OP_REQUIRES_ASYNC(ctx, calib_ctx->calibrator_->setBatch(input_data, *stream), - errors::Internal("Failed to feed calibration data"), - *helper); - VLOG(2) << "Passed calibration data"; - ExecuteNativeSegment(ctx, helper); + if (input_size_ok) { + VLOG(2) << "Filled map for sending"; + // Copied from gpu_kernel_helper.h as the header can only be used in *.cu.cc + // files. 
+ cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); + // TRTInt8Calibrator::setBatch will wait until TRTInt8Calibrator::getBatch + // is called before proceeding with feeding the calibration data to the + // calibrator. It returns true if the calibration data is accepted and + // returns false if calibration is terminated due to errors. + // + // If TRTInt8Calibrator::getBatch is never called, which could happen if + // there is any problem in building the cuda engine for calibration inside + // TensorRT, then the TRTInt8Calibrator::setBatch call here will hang until + // TRTInt8Calibrator::setDone is called by the calibration thread in + // AllocateCalibrationResources. + // + // In both of the above cases, setBatch here returns a boolean value to + // indicate the result of the calibration process. + if (!calib_ctx->calibrator_->setBatch(input_data, stream)) { + VLOG(2) << "Failed to feed calibration data"; + } else { + VLOG(2) << "Passed calibration data"; + } + } + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Calibration requires native segment, but is not found in " + "the graph."; + } } -Status TRTEngineOp::VerifyInputShapes(const std::vector& shapes) { - if (shapes.empty()) { +Status TRTEngineOp::VerifyInputShapes( + const std::vector& input_concrete_shapes) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::VerifyInputShapes", + tensorflow::profiler::TraceMeLevel::kInfo); + if (input_concrete_shapes.empty()) { return errors::InvalidArgument("Input shapes are empty, for ", name()); } - if (shapes[0].dims() < 1) { - return errors::InvalidArgument("Input shapes contain scalar, for ", name(), - ": ", - TensorShapeUtils::ShapeListString(shapes)); - } - const int batch_size = shapes[0].dim_size(0); - for (const TensorShape& shape : shapes) { - if (shape.dims() < 1 || batch_size != shape.dim_size(0)) { + if (input_partial_shapes_filtered_.empty()) { + if (!use_implicit_batch_) { return errors::InvalidArgument( - "Input shapes are inconsistent on the batch dimension, for ", name(), - ": ", TensorShapeUtils::ShapeListString(shapes)); + "Explicit batch mode requires input_partial_shapes_ ", + "to contain the dynamic input shapes to TRTEngineOp"); } - } - return Status::OK(); -} - -bool AreShapesCompatible(const std::vector& actual_shapes, - const std::vector& cached_shapes) { - auto match_shape = [](const TensorShape& actual_shape, - const TensorShape& cached_shape) { - // Match the rank. - if (actual_shape.dims() != cached_shape.dims()) return false; - // Match the batch size. - if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false; - // Match remaining dimensions. - for (int i = 1; i < actual_shape.dims(); ++i) { - if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false; + // If the graph was converted with an earlier version of TF-TRT, it can + // happen that the input_partial_shapes_ vector is not set (see + // input_shapes attribute handling in the TRTEngineOp constructor). + // In implicit batch mode it is allowed to have empty input_partial_shapes_, + // since it is only required in explicit batch mode (see the input_shapes + // attribute of ConvertGraphDefToEngine in TRTEngineOp::GetEngine. + } else { + // Additional consistency checks if input_partial_shapes_ is present. 
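The consistency checks below compare each concrete input shape against the partial shape recorded in the input_shapes attribute, where a dimension of -1 means unknown. A condensed standalone version of that rule, using plain integer vectors instead of TensorShape/PartialTensorShape and a hypothetical function name, would be:

#include <cstdint>
#include <vector>

// Returns true if `concrete` is an instance of `partial`, where -1 in
// `partial` means "any extent". The ranks must match exactly.
bool MatchesPartialShape(const std::vector<int64_t>& concrete,
                         const std::vector<int64_t>& partial) {
  if (concrete.size() != partial.size()) return false;
  for (size_t d = 0; d < concrete.size(); ++d) {
    if (partial[d] != -1 && concrete[d] != partial[d]) return false;
  }
  return true;
}

// e.g. MatchesPartialShape({8, 224, 224, 3}, {-1, 224, 224, 3}) is true, while
//      MatchesPartialShape({8, 224, 224, 3}, {-1, 112, 224, 3}) is false.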
+ const string error_msg = StrCat( + "Input shapes do not match input partial shapes stored in graph, for ", + name(), ": ", DebugString(input_concrete_shapes), + " != ", DebugString(input_partial_shapes_filtered_)); + if (input_concrete_shapes.size() != input_partial_shapes_filtered_.size()) { + return errors::InvalidArgument(error_msg); } - return true; - }; - for (int i = 0; i < actual_shapes.size(); ++i) { - if (!match_shape(actual_shapes[i], cached_shapes[i])) { - return false; + for (int i = 0; i < input_concrete_shapes.size(); i++) { + if (input_concrete_shapes[i].dims() != + input_partial_shapes_filtered_[i].dims()) { + return errors::InvalidArgument(error_msg); + } + } + for (int i = 0; i < input_concrete_shapes.size(); i++) { + for (int d = 0; d < input_concrete_shapes[i].dims(); d++) { + if (input_partial_shapes_filtered_[i].dim_size(d) != -1) { + if (input_concrete_shapes[i].dim_size(d) != + input_partial_shapes_filtered_[i].dim_size(d)) { + return errors::InvalidArgument(error_msg); + } + } + } } } - return true; -} -Status TRTEngineOp::GetEngineInputShapes( - const CacheType& cache, const std::vector& actual_input_shapes, - std::vector* engine_input_shapes) { - // VerifyInputShapes() already ensured that all input shapes have same - // batch size, and are not scalars. - *engine_input_shapes = actual_input_shapes; - int64 min_matched_batch_size = kint64max; - for (const auto& pair : cache) { - const std::vector& cached_input_shapes = pair.first; - // This should not happen, but just for safety. - if (actual_input_shapes.size() != cached_input_shapes.size()) { + if (use_implicit_batch_) { + if (input_concrete_shapes[0].dims() < 1) { return errors::InvalidArgument( - "Input shape list size mismatch for ", name(), - ", cached size: ", cached_input_shapes.size(), - " vs. actual size: ", actual_input_shapes.size()); - } - if (AreShapesCompatible(actual_input_shapes, cached_input_shapes)) { - const int cached_batch_size = cached_input_shapes[0].dim_size(0); - if (min_matched_batch_size > cached_batch_size) { - min_matched_batch_size = cached_batch_size; - *engine_input_shapes = cached_input_shapes; + "Input shapes contain scalar, for ", name(), ": ", + TensorShapeUtils::ShapeListString(input_concrete_shapes)); + } + const int batch_size = input_concrete_shapes[0].dim_size(0); + if (batch_size < 1) { + return errors::InvalidArgument( + "Incorrect batch dimension, for ", name(), ": ", + TensorShapeUtils::ShapeListString(input_concrete_shapes)); + } + for (const TensorShape& shape : input_concrete_shapes) { + if (batch_size != shape.dim_size(0)) { + return errors::InvalidArgument( + "Input shapes are inconsistent on the batch dimension, for ", + name(), ": ", + TensorShapeUtils::ShapeListString(input_concrete_shapes)); } } } return Status::OK(); } +static bool AllowEngineNativeSegmentExecution() { + bool value; + Status status = + ReadBoolFromEnvVar("TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION", + /*default_val=*/true, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; +} + void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, AsyncOpKernel::DoneCallback done) { - auto helper = new AsyncHelper(done); - core::ScopedUnref sc(helper); + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ComputeAsync", tensorflow::profiler::TraceMeLevel::kInfo); + + // Invoke DoneCallback when this object is destructed, which could be after + // this routine finishes execution, in particular, when native segment is + // executed. 
+ auto async_helper = new AsyncHelper(done); + core::ScopedUnref sc(async_helper); + + // For all async execution macros, use this object as there is no need to call + // DoneCallback from those macros. + DummyAsyncHelper dummy_async_helper; // Get TRT resource. TRTEngineCacheResource* cache_res = nullptr; - OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper); + OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), + dummy_async_helper); core::ScopedUnref unref_cache_res(cache_res); + // Get shapes of inputs to engine. + std::vector input_concrete_shapes; + input_concrete_shapes.reserve(ctx->num_inputs()); + std::vector input_concrete_shapes_filtered; + for (int i = 0; i < ctx->num_inputs(); ++i) { + input_concrete_shapes.push_back(ctx->input(i).shape()); + if (ctx->input(i).dtype() != DataType::DT_RESOURCE) { + input_concrete_shapes_filtered.push_back(ctx->input(i).shape()); + } + } + + /// TODO(lsugy): fix case of engine with only resource inputs. + Status verify_input_shape_status = + VerifyInputShapes(input_concrete_shapes_filtered); + // TODO(bixia): Fix the segmentation. + if (!verify_input_shape_status.ok() && !native_segment_absent_) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Running native segment for " << name() + << " due to failure in verifying input shapes: " + << verify_input_shape_status.error_message(); + ExecuteNativeSegment(ctx, async_helper); + return; + } + + if (!use_implicit_batch_ && + (has_dynamic_shape_input_ || cache_res->profiles_.HasShapeTensor())) { + OP_REQUIRES_OK_ASYNC(ctx, cache_res->profiles_.CollectShapeValues(ctx), + dummy_async_helper); + cache_res->profiles_.SetInputMask(input_mask_); + if (profile_generation_mode_) { + // Collecting new shapes for profiles can only be done once. After the + // shapes are converted to TRT profiles, no shapes can be collected + // anymore. + OP_REQUIRES_ASYNC(ctx, cache_res->profiles_.GetNumProfiles() == 0, + errors::Unimplemented("Cannot collect new shapes when " + "profiles are already created."), + dummy_async_helper); + // Just collect the input shape info and return. The shapes are used to + // generate optimization profiles during engine creation. + cache_res->profiles_.AddShape(input_concrete_shapes); + VLOG(1) + << "Native segment is used while collecting shapes for profiles."; + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Native segment is required for profile generation, " + "but it is not found in the graph."; + } + return; + } else if (cache_res->profiles_.GetNumProfiles() == 0 && !static_engine_) { + // Add current shape if we did not collect any shapes so far. + if (!cache_res->profiles_.HasShape()) { + cache_res->profiles_.AddShape(input_concrete_shapes); + } + // Create profiles out of collected shapes during profile generation. + cache_res->profiles_.InitProfiles(input_partial_shapes_, + profile_strategy_); + } + } + // Run calibration if in int8+calibration mode. // * Logic in TF 1.x: // - During conversion: calibration_mode_ is true and cache size is 0, so it // will run calibration. - // - During inference: calibration_data will be set, so calibration_mode_ is - // false and it won't trigger calibration. + // - During inference: calibration_data will be set, so calibration_mode_ + // is false and it won't trigger calibration. // * Logic in TF 2.0: // - During conversion: similar to 1.x. 
// - During inference: calibration_data will still be empty, but cache will - // contain the the calibrated engine, so it won't trigger calibration. + // contain the calibrated engine, so it won't trigger calibration. // // TODO(laigd): consider the following alternatives: // 1. Serialize the state (calibration or inference) using @@ -473,174 +924,158 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, // TODO(laigd): better encapsulation. mutex_lock lock(engine_mutex_); if (!cache_res->calib_ctx_) { + // Add profiles if we are in dynamic shape mode. + if (!use_implicit_batch_ && (has_dynamic_shape_input_ || + cache_res->profiles_.HasShapeTensor())) { + cache_res->profiles_.InitCalibProfile(input_concrete_shapes); + } OP_REQUIRES_OK_ASYNC(ctx, AllocateCalibrationResources(ctx, cache_res), - *helper); + dummy_async_helper); } } // TODO(laigd): check that the input shapes match the shapes of the // persistent tensor in the calibration resource. - ExecuteCalibration(ctx, cache_res, helper); + ExecuteCalibration(ctx, cache_res, async_helper); return; } - // Get shapes of inputs to engine. - std::vector input_shapes; - input_shapes.reserve(ctx->num_inputs()); - for (int i = 0; i < ctx->num_inputs(); ++i) { - input_shapes.push_back(ctx->input(i).shape()); - } - OP_REQUIRES_OK_ASYNC(ctx, VerifyInputShapes(input_shapes), *helper); - StatusOr status = GetEngine(input_shapes, ctx, cache_res); - OP_REQUIRES_OK_ASYNC(ctx, status.status(), *helper); - - EngineContext* engine_context = status.ValueOrDie(); - if (!engine_context->cuda_engine) { - VLOG(1) << "Engine retrieval for input shapes: " - << TensorShapeUtils::ShapeListString(input_shapes) - << " failed. Running native segment for " << name(); - ExecuteNativeSegment(ctx, helper); + ::stream_executor::port::StatusOr> status = + GetEngine(input_concrete_shapes, ctx, cache_res); + OP_REQUIRES_OK_ASYNC(ctx, status.status(), dummy_async_helper); + + EngineContext* engine_context = std::move(status.ValueOrDie()).first; + int trt_context_idx = std::move(status.ValueOrDie()).second; + auto may_execute_native_segment = [&] { + if (!native_segment_absent_ && !AllowEngineNativeSegmentExecution()) { + ctx->CtxFailure( + errors::Aborted("User disallowed engine native segment execution.")); + return false; + } else if (native_segment_absent_) { + ctx->CtxFailure( + errors::Aborted("Native segment execution is enabled but " + " native segment is not found in the graph.")); + return false; + } + return true; + }; + if (!engine_context->GetCudaEngine()) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Engine retrieval for input shapes: " + << TensorShapeUtils::ShapeListString(input_concrete_shapes) + << " failed. Running native segment for " << name(); + if (may_execute_native_segment()) { + ExecuteNativeSegment(ctx, async_helper); + } return; } - const bool retry = ExecuteTrtEngine(ctx, engine_context); - if (retry) { - LOG(WARNING) << "Failed to execute engine, " - << "retrying with native segment for " << name(); - ExecuteNativeSegment(ctx, helper); + Status stat = + ExecuteTrtEngine(ctx, engine_context, trt_context_idx, + cache_res->profiles_, cache_res->allocator_.get()); + if (stat.ok()) return; + + LOG_FIRST_FEW_WARNING_WITH_PREFIX << "Failed to execute engine: " << stat + << " Retrying with native segment for " + << name(); + if (!may_execute_native_segment()) { return; } + // When Native Segment execution is enabled, release any outputs that + // are allocated. ExecuteNativeSegment will re-allocate them and + // fail if they are currently allocated. 
+ // The Tensor pointer in the returned TensorValue must be explicitly + // deleted. + for (int i = 0; i < ctx->num_outputs(); i++) { + delete ctx->release_output(i).tensor; + } + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Native segment execution is enabled, " + "but native segment is not found in the graph."; + } } -bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, - EngineContext* engine_context) { +Status TRTEngineOp::ExecuteTrtEngine( + OpKernelContext* ctx, EngineContext* engine_context, int trt_context_idx, + const TrtShapeOptimizationProfile& profiles, TRTBaseAllocator* allocator) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ExecuteTrtEngine", + tensorflow::profiler::TraceMeLevel::kInfo); VLOG(1) << "Executing TRT engine: " << name(); - auto& cuda_engine = engine_context->cuda_engine; - const bool kRetry = true; - // All inputs must have the same batch size, so just get it from the first - // input. - const int num_batch = ctx->input(0).shape().dim_size(0); - const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + nvinfer1::ICudaEngine* cuda_engine = engine_context->GetCudaEngine(); + + if (VLOG_IS_ON(2)) { + VLOG(2) << " Network name: " << cuda_engine->getName(); + VLOG(2) << " Activation size: " << engine_context->GetDeviceMemorySize() + << " bytes"; +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + // getWorkspaceSize() is deprecated as of TRT 8 + VLOG(2) << " Workspace size: " << cuda_engine->getWorkspaceSize() + << " bytes"; +#endif // #if !IS_TRT_VERSION_GE(8, 0, 0, 0) + VLOG(2) << " Datatype of " << cuda_engine->getNbBindings() + << " inputs/outputs"; + string binding_types = ""; + for (int i = 0; i < cuda_engine->getNbBindings(); i++) { + binding_types += " " + string(cuda_engine->getBindingName(i)) + ": " + + DebugString(cuda_engine->getBindingDataType(i)) + "\n"; + } + VLOG(2) << binding_types; + } + const int num_binding = cuda_engine->getNbBindings(); std::vector buffers(num_binding); - for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(IONamePrefixes::kInputPHName, i); - const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); - if (binding_index == -1) { - const string msg = - StrCat("Input node ", input_name, " not found, at ", name()); - LOG(ERROR) << msg; - ctx->SetStatus(errors::NotFound(msg)); - return !kRetry; - } - - const Tensor& input_tensor = ctx->input(i); - const TensorShape& input_shape = input_tensor.shape(); - if (num_batch != input_shape.dim_size(0)) { - LOG(ERROR) << "Input data has inconsistent batch size: " << num_batch - << " vs " << input_shape.dim_size(0); - return kRetry; - } - auto dtype = cuda_engine->getBindingDataType(binding_index); - switch (dtype) { - case nvinfer1::DataType::kFLOAT: - buffers[binding_index] = - const_cast(input_tensor.flat().data()); - break; - case nvinfer1::DataType::kHALF: - buffers[binding_index] = - const_cast(input_tensor.flat().data()); - break; - case nvinfer1::DataType::kINT8: - LOG(ERROR) << "INT8 inputs are not supported yet!"; - return kRetry; - case nvinfer1::DataType::kINT32: - buffers[binding_index] = - const_cast(input_tensor.flat().data()); - break; - default: - LOG(ERROR) << "Unknown TRT data type: " << static_cast(dtype); - return kRetry; - } - } + // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex + // for it. 
+ mutex_lock lock(engine_context->mu); + nvinfer1::IExecutionContext* execution_context; + bool has_device_memory; + TF_RETURN_IF_ERROR(engine_context->GetExecutionContext( + trt_context_idx, &execution_context, &has_device_memory)); - for (int i = 0; i < ctx->num_outputs(); i++) { - // Create an output tensor - const string output_name = StrCat(IONamePrefixes::kOutputPHName, i); - const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); - Tensor* output_tensor = nullptr; - - TensorShape output_shape; - if (binding_index != -1) { - auto dims = cuda_engine->getBindingDimensions(binding_index); - std::vector trt_shape(dims.nbDims + 1); - trt_shape[0] = num_batch; - for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j]; - auto status = TensorShapeUtils::MakeShape( - trt_shape.data(), trt_shape.size(), &output_shape); - if (!status.ok()) { - LOG(ERROR) << "Failed to get output shape: " << status; - return kRetry; - } - } else { - const string msg = - StrCat("Ouput node ", output_name, " not found, at ", name()); - LOG(ERROR) << msg; - ctx->SetStatus(errors::NotFound(msg)); - return !kRetry; - } - auto status = ctx->allocate_output(i, output_shape, &output_tensor); - if (!status.ok()) { - LOG(ERROR) << "Allocating output failed with " << status; - ctx->SetStatus(status); - // Do not retry since we cannot allocate the same output twice. - // TODO(aaroey): ideally we should retry, fix this. - return !kRetry; - } - auto dtype = cuda_engine->getBindingDataType(binding_index); - switch (dtype) { - case nvinfer1::DataType::kFLOAT: - buffers[binding_index] = - const_cast(output_tensor->flat().data()); - break; - case nvinfer1::DataType::kHALF: - buffers[binding_index] = - const_cast(output_tensor->flat().data()); - break; - case nvinfer1::DataType::kINT8: - LOG(WARNING) << "int8 is not supported yet!"; - return kRetry; - case nvinfer1::DataType::kINT32: - buffers[binding_index] = - const_cast(output_tensor->flat().data()); - break; - default: - LOG(WARNING) << "Unknown TRT data type: " << static_cast(dtype); - return kRetry; - } - } - // Copied from cuda_kernel_helper since it seems only valid in *.cu.cc files + if (VLOG_IS_ON(2)) { + VLOG(2) << "Selected execution context: " << trt_context_idx; + } + const int num_batch = + use_implicit_batch_ ? ctx->input(0).shape().dim_size(0) : 0; + + TF_RETURN_IF_ERROR(SetTrtEngineInputs( + cuda_engine, execution_context, trt_context_idx, buffers, + use_implicit_batch_, num_batch, profiles, ctx)); + + TF_RETURN_IF_ERROR(SetTrtEngineOutputs(cuda_engine, execution_context, + trt_context_idx, buffers, + use_implicit_batch_, num_batch, ctx)); + // Copied from gpu_kernel_helper.h as the header can only be used in *.cu.cc + // files. const cudaStream_t* stream = CHECK_NOTNULL( reinterpret_cast(ctx->op_device_context() ->stream() ->implementation() ->GpuStreamMemberHack())); - // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex - // for it. - mutex_lock lock(engine_context->mu); - // TODO(jie): trt enqueue does not return error - auto ret = engine_context->execution_context->enqueue(num_batch, &buffers[0], - *stream, nullptr); - if (!ret) { - LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name(); - return kRetry; - } - // Synchronization will be done by TF. 
- return !kRetry; + ContextDeviceMemory context_device_memory; + if (!has_device_memory) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::AllocateDeviceMemory", + tensorflow::profiler::TraceMeLevel::kInfo); + // Allocate device memory for the TensorRT engine execution. The device + // memory will be released when context_device_memory goes out of scope. + TF_RETURN_IF_ERROR(context_device_memory.AllocateDeviceMemory( + execution_context, allocator, engine_context->GetDeviceMemorySize())); + } + + // Enqueue the TensorRT engine for execution. + return TrtEnqueue(execution_context, buffers, stream, use_implicit_batch_, + num_batch); } Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::GetEngineCachResource", + tensorflow::profiler::TraceMeLevel::kInfo); // Canonicalize the op name by removing the scopes if any. This is mainly // because in TFv2, the function graph can be instantiated in various ways and // it'll insert scope names to the name of the TRTEngineOps, which will result @@ -662,116 +1097,253 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, }}); } -StatusOr TRTEngineOp::GetEngine( - const std::vector& input_shapes, OpKernelContext* ctx, - TRTEngineCacheResource* cache_res) { - static EngineContext empty_context; +::stream_executor::port::StatusOr> +TRTEngineOp::BuildEngine(const std::vector& input_concrete_shapes, + int batch_size, bool use_calibration, + TRTInt8Calibrator* calibrator, + TRTEngineCacheResource* cache_resource, + OpKernelContext* ctx) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::BuildEngine", tensorflow::profiler::TraceMeLevel::kInfo); + TRT_ENSURE(cache_resource); + TRT_ENSURE(ctx); + // Use concrete shapes for implicit batch mode and partial shapes for + // explicit batch mode. + bool use_concrete_shapes = + use_implicit_batch_ || cache_resource->profiles_.IsStaticCompatible(); + const std::vector& conversion_input_shapes = + use_concrete_shapes + ? std::vector(input_concrete_shapes.begin(), + input_concrete_shapes.end()) + : input_partial_shapes_; + + VLOG(1) << "Building a new TensorRT engine for " << name() + << " with input shapes: " << DebugString(conversion_input_shapes); + + std::unordered_map device_map; + DeviceNameUtils::ParsedName full_parsed_name; + DeviceNameUtils::ParseFullName(ctx->device()->name(), &full_parsed_name); + device_map.emplace(ctx->device()->name(), + grappler::GetDeviceInfo(full_parsed_name)); + tensorflow::grappler::VirtualCluster cluster(device_map); + + TrtUniquePtrType engine; + auto status = convert::ConvertGraphDefToEngine( + segment_graph_def_, ctx, precision_mode_, batch_size, workspace_size_, + conversion_input_shapes, &logger, cache_resource->allocator_.get(), + calibrator, &engine, use_calibration, use_implicit_batch_, nullptr, + &cache_resource->profiles_, name(), use_explicit_precision_, &cluster, + ctx->device()->name()); + if (!status.ok()) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Engine creation for " << name() << " failed. " + << "The native segment will be used instead. " + << "Reason: " << status; + // Store an empty engine in the cache for these input shapes so we don't try + // to build the same failing engine again. 
+ cache_resource->cache_.emplace(input_concrete_shapes, + std::make_unique()); + return status; + } + return engine; +} +::stream_executor::port::StatusOr> +TRTEngineOp::GetEngine(const std::vector& input_concrete_shapes, + OpKernelContext* ctx, + TRTEngineCacheResource* cache_res) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::GetEngine", tensorflow::profiler::TraceMeLevel::kInfo); + static EngineContext empty_context; mutex_lock lock(engine_mutex_); - // Using first input to get batch size is reliable - VerifyInputShapes() has - // verified that. - const int batch_size = input_shapes[0].dim_size(0); + // Using first input to get batch size is reliable - VerifyInputShapes() + // guarantees that the first input is not a scalar. As such we can always use + // the first input to get the batch size for implicit batch mode. For explicit + // batch mode, this value is not used. + const int batch_size = input_concrete_shapes[0].dim_size(0); + // TODO(Tamas): remove the need for batch_size in explicit_batch mode auto& cache = cache_res->cache_; auto allocator = cache_res->allocator_.get(); if (allocator == nullptr) { - return &empty_context; + return std::pair(&empty_context, 0); } // Handle the static engine case. For static engines, the cache will have a // single element containing the only engine. if (static_engine_) { if (cache.size()) { - if (AreShapesCompatible(input_shapes, cache.begin()->first)) { - return cache.begin()->second.get(); + // TODO(laigd): need a better shape compatibility check for the case where + // implicit batch is disabled. + if (!use_implicit_batch_ || + AreShapesCompatible(input_concrete_shapes, cache.begin()->first)) { + int profile_id = 0; + if (!use_implicit_batch_) + profile_id = + cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + if (profile_id != -1) { + return std::pair(cache.begin()->second.get(), + profile_id); + } } - return &empty_context; + return std::pair(&empty_context, 0); } TrtUniquePtrType infer(nvinfer1::createInferRuntime(logger)); infer->setGpuAllocator(allocator); + // Need to initialize plugins in order to deserialize engines that contain + // plugins. + MaybeInitializeTrtPlugins(&logger); TrtUniquePtrType static_engine( infer->deserializeCudaEngine(serialized_segment_.c_str(), serialized_segment_.size(), nullptr)); + int profile_id = 0; + if (static_engine && !use_implicit_batch_) { + // load profiles + std::vector exec_contexts; + TF_RETURN_IF_ERROR(cache_res->profiles_.RestoreProfiles( + static_engine.get(), ctx->num_inputs())); + TF_RETURN_IF_ERROR(cache_res->profiles_.CreateExecutionContexts( + static_engine.get(), &exec_contexts)); + cache.emplace(input_concrete_shapes, + std::make_unique(std::move(static_engine), + std::move(exec_contexts))); + VLOG(1) << "Added new engine to cache of " << name() + << ". Cache size: " << cache.size(); + // Query which profile of the new engine matches the actual input. + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + if (profile_id == -1) { + return std::pair(&empty_context, 0); + } + EngineContext* engine_context = cache_res->GetEngineContext(profile_id); + return std::pair(engine_context, profile_id); + } + + if (!static_engine) { + if (!allow_build_at_runtime_) { + // Store an empty engine in the cache so we don't try to load the same + // failing engine again. 
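Caching an empty entry for shapes whose build failed is a form of negative caching: the next lookup for the same shapes hits the sentinel and the op falls back to the native segment immediately instead of repeating an expensive, failing build. A simplified sketch of the pattern, with a hypothetical FakeEngine type and a plain std::map instead of the LRUCache used by TRTEngineCacheResource:

#include <cstdint>
#include <map>
#include <memory>
#include <vector>

struct FakeEngine {};  // Hypothetical stand-in for a built TensorRT engine.

using ShapeKey = std::vector<std::vector<int64_t>>;  // One shape per input.

std::map<ShapeKey, std::unique_ptr<FakeEngine>> engine_cache;

// Returns the cached engine for `shapes`, which may be null if an earlier
// build for these shapes failed.
FakeEngine* GetOrBuild(const ShapeKey& shapes,
                       std::unique_ptr<FakeEngine> (*build)(const ShapeKey&)) {
  auto it = engine_cache.find(shapes);
  if (it != engine_cache.end()) return it->second.get();  // Hit or sentinel.
  std::unique_ptr<FakeEngine> engine = build(shapes);
  // On failure `engine` is null; cache it anyway so the same failing build is
  // not retried on every call, matching the behavior of the op above.
  auto inserted = engine_cache.emplace(shapes, std::move(engine));
  return inserted.first->second.get();
}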
+ cache.emplace(input_concrete_shapes, std::make_unique()); + return std::pair(&empty_context, 0); + } + if (segment_graph_def_.node().empty()) { + Status status = ImportSegmentGraphDef(ctx->function_library(), + ctx->device()->name()); + if (!status.ok()) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX << "Getting segment graph for " + << name() << " failed. " + << "Reason: " << status; + } + } + auto result = BuildEngine(input_concrete_shapes, batch_size, + /*use_calibration=*/false, + /*calibrator=*/nullptr, cache_res, ctx); + if (!result.ok()) { + return std::pair(&empty_context, 0); + } + static_engine = std::move(result.ValueOrDie()); + } + auto raw_static_engine = static_engine.get(); - const auto max_batch_size = raw_static_engine->getMaxBatchSize(); - // Static engine will have max_batch_size for batch size so that all inputs - // will map to this single engine. - std::vector engine_input_shapes(input_shapes); - for (int i = 0; i < engine_input_shapes.size(); i++) { - // TODO(tmorris): will all inputs have batch size as first dimension?? - engine_input_shapes[i].set_dim(0, max_batch_size); + std::vector engine_input_shapes(input_concrete_shapes); + + int max_batch_size = 1; + if (use_implicit_batch_) { + max_batch_size = raw_static_engine->getMaxBatchSize(); + // Static engine will have max_batch_size for batch size so that all + // inputs will map to this single engine. + for (int i = 0; i < engine_input_shapes.size(); i++) { + engine_input_shapes[i].set_dim(0, max_batch_size); + } } + + ExecutionContext context = ExecutionContext::Create(raw_static_engine); // TODO(laigd): here we assume engine_input_shapes matches the actual input // shapes of the engine, we should verify that. cache.emplace(engine_input_shapes, - absl::make_unique( - std::move(static_engine), - TrtUniquePtrType( - raw_static_engine->createExecutionContext()))); + std::make_unique(std::move(static_engine), + std::move(context))); // Runtime is safe to delete after engine creation VLOG(1) << "Size of serialized TRT engine: " << serialized_segment_.capacity(); string tmp; // Swap with temporary empty string to deallocate the CPU memory. serialized_segment_.swap(tmp); - if (max_batch_size < batch_size) { - return &empty_context; + if (use_implicit_batch_ && (max_batch_size < batch_size)) { + return std::pair(&empty_context, 0); } - return cache.at(engine_input_shapes).get(); + return std::pair(cache.at(engine_input_shapes).get(), + 0); } // static_engine_ - // Handle the dynamic engine case. See if there is a compatible engine cached. - std::vector engine_input_shapes; - TF_RETURN_IF_ERROR( - GetEngineInputShapes(cache, input_shapes, &engine_input_shapes)); + int profile_id = -1; + if (!use_implicit_batch_) { + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + // Since all profiles are already created at this point, finding no + // compatible profiles results in falling back to native TF. + if (profile_id == -1) { + return std::pair(&empty_context, 0); + } + } - // If matched, use that engine. Otherwise, we will look in cache for that - // exact shape and possibly create a new engine if it is not in cache. 
- if (!cache.count(engine_input_shapes)) { - TrtUniquePtrType engine; - bool convert_successfully = false; - LOG(INFO) << "Building a new TensorRT engine for " << name() - << " input shapes: " - << TensorShapeUtils::ShapeListString(engine_input_shapes); + EngineContext* engine_contexts; + if (use_implicit_batch_) { + engine_contexts = cache_res->GetEngineContext(input_concrete_shapes); + } else { + engine_contexts = cache_res->GetEngineContext(profile_id); + } - // Convert to partial shapes - std::vector partial_shapes(engine_input_shapes.begin(), - engine_input_shapes.end()); + // If cache does not have a compatible engine then create a new engine. + if (engine_contexts == nullptr) { + if (!allow_build_at_runtime_) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Found no engine in cache matching input shapes. " + << "Not building a new engine because " + << "allow_build_at_runtime=False. " + << "The native segment will be used instead."; + // Store an empty engine in the cache for these input shapes so we don't + // try to build the same failing engine again. + cache.emplace(input_concrete_shapes, std::make_unique()); + return std::pair(&empty_context, 0); + } // Up to this point, calibrator_ can never be empty, since otherwise it // means calibration_mode_ is true and this path won't get executed. - auto status = convert::ConvertGraphDefToEngine( - segment_graph_, precision_mode_, batch_size, workspace_size_, - partial_shapes, &logger, allocator, calibrator_.get(), &engine, - use_calibration_, &convert_successfully); - if (!status.ok()) { - LOG(WARNING) << "Engine creation for " << name() << " failed. " - << "The native segment will be used instead. " - << "Reason: " << status; - // Store an empty engine in the cache for these input shapes so we don't - // try to build the same failing engine again. - cache.emplace(engine_input_shapes, absl::make_unique()); - return &empty_context; + auto result = + BuildEngine(input_concrete_shapes, batch_size, use_calibration_, + calibrator_.get(), cache_res, ctx); + if (!result.ok()) { + return std::pair(&empty_context, 0); } - TrtUniquePtrType exec_context( - engine->createExecutionContext()); - cache.emplace(engine_input_shapes, - absl::make_unique(std::move(engine), - std::move(exec_context))); + TrtUniquePtrType engine = + std::move(result.ValueOrDie()); + std::vector exec_contexts; + TF_RETURN_IF_ERROR(cache_res->profiles_.CreateExecutionContexts( + engine.get(), &exec_contexts)); + cache.emplace(input_concrete_shapes, + std::make_unique(std::move(engine), + std::move(exec_contexts))); VLOG(1) << "Added new engine to cache of " << name() << ". Cache size: " << cache.size(); + engine_contexts = cache.at(input_concrete_shapes).get(); + // Query which profile of the new engine matches the actual input. + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); } - return cache.at(engine_input_shapes).get(); + return std::pair(engine_contexts, + use_implicit_batch_ ? 0 : profile_id); } // TODO(hinsu): Move this allocation to CalibrationContext constructor, if // possible. Status TRTEngineOp::AllocateCalibrationResources( OpKernelContext* ctx, TRTEngineCacheResource* cache_res) { - cache_res->calib_ctx_ = absl::make_unique(); + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::AllocateCalibrationResources", + tensorflow::profiler::TraceMeLevel::kInfo); + cache_res->calib_ctx_ = std::make_unique(); auto* cres = cache_res->calib_ctx_.get(); // Get the input shapes. + /// TODO(lsugy): support INT8 calibration in non-frozen mode. 
const int batch_size = ctx->input(0).dim_size(0); const int num_inputs = ctx->num_inputs(); std::vector shapes; @@ -779,46 +1351,62 @@ Status TRTEngineOp::AllocateCalibrationResources( VLOG(1) << "Constructing calibrator"; for (int i = 0; i < num_inputs; i++) { // allocate workspace on device for inputs + auto* input = &cres->device_tensors_.at(i); const Tensor& t = ctx->input(i); shapes.emplace_back(t.shape()); - Tensor* device_tensor; - TF_RETURN_IF_ERROR(ctx->allocate_persistent( - t.dtype(), t.shape(), &cres->device_tensors_.at(i), &device_tensor)); - CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - void* device_address = GetTensorAddress(device_tensor); + TF_RETURN_IF_ERROR(ctx->allocate_temp(t.dtype(), t.shape(), input)); + CHECK_EQ(t.TotalBytes(), input->TotalBytes()); // Crash OK + + void* device_address = GetTensorAddress(input); if (device_address == nullptr) { - return errors::InvalidArgument( - "Unsupported data type encountered in input ", i); + return errors::InvalidArgument("Unsupported data type [", + DebugString(t.dtype()), + "] encountered in input ", i); } cres->device_buffers_.emplace( StrCat(IONamePrefixes::kInputPHName, i), - std::pair(device_address, device_tensor->TotalBytes())); + std::pair(device_address, input->TotalBytes())); } cres->calibrator_.reset( new TRTInt8Calibrator(cres->device_buffers_, batch_size, name())); - const int platform_gpu_id = + const int platform_device_id = ctx->device()->tensorflow_gpu_device_info()->gpu_id; - if (platform_gpu_id < 0) { + if (platform_device_id < 0) { LOG(ERROR) << "Can't get gpu_device_info from context->device()"; return errors::InvalidArgument( "Context->device doesn't contain device info!"); } + bool use_concrete_shapes = + use_implicit_batch_ || cache_res->profiles_.IsStaticCompatible(); + const std::vector& conversion_input_shapes = + use_concrete_shapes + ? std::vector(shapes.begin(), shapes.end()) + : input_partial_shapes_; + cache_res->Ref(); - cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id, - cache_res]() { + string platform_device_name = ctx->device()->name(); + cres->thr_.reset(new std::thread([this, cres, shapes, conversion_input_shapes, + platform_device_id, platform_device_name, + cache_res, ctx]() { core::ScopedUnref sc(cache_res); - LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id - << ", Calibration Resource @ " << cres; - auto err = cudaSetDevice(platform_gpu_id); + VLOG(1) << "Starting calibration thread on device " << platform_device_id + << ", Calibration Resource @ " << cres; + auto err = cudaSetDevice(platform_device_id); if (err != cudaSuccess) { // TODO(aaroey): should return error here. - LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id + LOG(ERROR) << "Couldn't set cuda device to " << platform_device_id << " in calibration thread"; } - std::vector partial_shapes(shapes.begin(), - shapes.end()); + + std::unordered_map device_map; + DeviceNameUtils::ParsedName full_parsed_name; + DeviceNameUtils::ParseFullName(platform_device_name, &full_parsed_name); + device_map.emplace(platform_device_name, + grappler::GetDeviceInfo(full_parsed_name)); + tensorflow::grappler::VirtualCluster cluster(device_map); + // ConvertGraphDefToEngine() will try to build the engine. 
This thread // will loop inside buildCudaEngine() consuming the calibration data // that is set by the TF op, and drive the builder until calibrator @@ -828,25 +1416,39 @@ Status TRTEngineOp::AllocateCalibrationResources( // TODO(aaroey): maybe setting the max batch size using the python // calibration wrapper class. auto s = convert::ConvertGraphDefToEngine( - this->segment_graph_, TrtPrecisionMode::INT8, + this->segment_graph_def_, ctx, TrtPrecisionMode::INT8, cres->calibrator_->getBatchSize(), this->workspace_size_, - partial_shapes, &cache_res->GetLogger(), cache_res->allocator_.get(), - cres->calibrator_.get(), &cres->engine_, - /*use_calibration=*/true, - /*convert_successfully=*/nullptr); + conversion_input_shapes, &cache_res->GetLogger(), + cache_res->allocator_.get(), cres->calibrator_.get(), &cres->engine_, + /*use_calibration=*/true, this->use_implicit_batch_, + /*convert_successfully=*/nullptr, + /*profiles=*/&cache_res->profiles_, name(), + /*use_explicit_precision=*/use_explicit_precision_, + /*cluster=*/&cluster, platform_device_name); if (!s.ok()) { LOG(ERROR) << "Calibration failed: " << s; cres->calibrator_->setDone(); // Ignore further pushes + cache_res->cache_.emplace(shapes, std::make_unique()); } else { // Transfer the ownership of the engine to the engine cache, so we can // dump it out during conversion for TF 2.0. mutex_lock lock(this->engine_mutex_); this->calibrator_ = std::move(cres->calibrator_); - TrtUniquePtrType exec_context( - cres->engine_->createExecutionContext()); - cache_res->cache_.emplace( - shapes, absl::make_unique(std::move(cres->engine_), - std::move(exec_context))); + if (!use_implicit_batch_ && + (has_dynamic_shape_input_ || cache_res->profiles_.HasShapeTensor())) { + std::vector exec_contexts; + auto calib_result = cache_res->profiles_.CreateExecutionContexts( + cres->engine_.get(), &exec_contexts); + cache_res->cache_.emplace( + shapes, std::make_unique(std::move(cres->engine_), + std::move(exec_contexts))); + } else { + ExecutionContext context = + ExecutionContext::Create(cres->engine_.get()); + cache_res->cache_.emplace( + shapes, std::make_unique(std::move(cres->engine_), + std::move(context))); + } } VLOG(1) << "Calibration loop terminated " << this->name(); @@ -860,5 +1462,4 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 497a2710c24..317f3a54357 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -13,44 +13,62 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include - -#include +#include #include +#include #include #include #include +#include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/function_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/math_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" -#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/function.h" -#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/gtl/stl_util.h" -#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/public/version.h" +#include "tsl/framework/fixedpoint/FixedPoint.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT -#include "third_party/gpus/cuda/include/cuda_runtime_api.h" +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { using ::absl::StrCat; using ::testing::ElementsAre; +struct TestParam { + bool static_engine; +}; + class TRTEngineOpTestBase : public OpsTestBase { public: - void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1) { + void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1, + PartialTensorShape shape = PartialTensorShape({-1, -1}), + bool use_implicit_batch = true, + bool allow_build_at_runtime = true, + bool static_engine = false) { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); @@ -59,40 +77,86 @@ class TRTEngineOpTestBase : public OpsTestBase { Scope s = Scope::NewRootScope(); auto feed = ops::_Arg(s.WithOpName("TensorRTInputPH_0"), dtype, 0); auto add = ops::Add(s.WithOpName("add"), feed, feed); - ops::_Retval(s.WithOpName("TensorRTOutputPH_0"), add, 0); + ops::_Retval give_me_a_name(s.WithOpName("TensorRTOutputPH_0"), add, 0); // Serialize the graph. TRTEngineOp will convert it using dynamic mode. 
GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); Graph* graph = s.graph(); - const char* op_name = "myop"; - TF_ASSERT_OK( - convert::RegisterGraphToFunctionLibrary(graph_def, graph, op_name)); + TF_ASSERT_OK(convert::RegisterGraphToFunctionLibrary(graph_def, graph, + std::string(kOpName))); TF_ASSERT_OK(flib_def_->AddLibrary(graph->flib_def())); - PartialTensorShape shape({-1, -1}); + string segment_string; + if (static_engine) { + convert::TRTOptimizationPass::ConversionParams params; + convert::EngineInfo info; + info.segment_graph_def.CopyFrom(graph_def); + info.precision_mode = TrtPrecisionMode::FP32; + info.max_workspace_size_bytes = 1 << 20; + info.engine_name = "TRTEngineOP_000_000"; + params.use_implicit_batch = use_implicit_batch; + params.trt_logger_name = "DefaultLogger"; + + TrtShapeOptimizationProfile profile; + // We set the input mask to true (no resource inputs) + std::vector input_mask = {true}; + profile.SetInputMask(input_mask); + // We set profile 0 to be incompatible with the input used in the test. + // This way we ensure that profile selection is tested. + TensorShape my_shape; + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{4, 2}, &my_shape)); + profile.AddShape({my_shape, {}}); + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{1, 2}, &my_shape)); + profile.AddShape({my_shape, {}}); + + profile.InitProfiles({shape}, ProfileStrategy::kOptimal); + std::vector shape_vec{shape, {}}; + TF_CHECK_OK(convert::CreateStaticEngine( + params, info, 1, shape_vec, &profile, &segment_string, nullptr)); + } // Create the op. + // In implicit batch mode, the input shapes that we specify here are not + // used for engine creation, we use the concrete shapes during inference + // time for creating the engine. + // In explicit batch mode, the input shapes attribute is used to define + // the network for the TensorRT engine. OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); NameAttrList function; - function.set_name(StrCat(op_name, "_native_segment")); - TF_ASSERT_OK(NodeDefBuilder(op_name, "TRTEngineOp") + function.set_name(StrCat(std::string(kOpName), "_native_segment")); + // We disable allow_soft_placement when executing the native segment of the + // TRTEngineOp for the following reasons: + // OpsTestBase only allow one device in the device manager. + // We need to define the GPU device to test TRTEngineOp. + // When allow_soft_placement is true, the TensorFlow runtime produces an + // error if a CPU device is not defined + // (see ProcessFunctionLibraryRuntime::InstantiateMultiDevice). 
+ TF_ASSERT_OK(NodeDefBuilder(std::string(kOpName), "TRTEngineOp") .Input(FakeInput(1, dtype)) .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) - .Attr("static_engine", false) + .Attr("static_engine", static_engine) .Attr("segment_func", function) - .Attr("serialized_segment", "") + .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) .Attr("precision_mode", "FP32") .Attr("use_calibration", false) + .Attr("profile_strategy", "optimal") + .Attr("_use_implicit_batch", use_implicit_batch) + .Attr("_allow_build_at_runtime", allow_build_at_runtime) + .Attr("_allow_soft_placement", false) .Attr("OutT", {dtype}) .Finalize(OpsTestBase::node_def())); TF_ASSERT_OK(InitOpWithFunctionLibrary()); } + static const absl::string_view kOpName; + template void AddSimpleInput(const TensorShape& shape) { std::vector input(shape.num_elements()); @@ -102,22 +166,49 @@ class TRTEngineOpTestBase : public OpsTestBase { void ResetInputs() { inputs_.clear(); - gtl::STLDeleteElements(&tensors_); + for (auto& temp : tensors_) { + delete temp; + } + tensors_.clear(); } private: Status InitOpWithFunctionLibrary() { OpKernel* kernel = nullptr; - Status status = CreateOpKernel(device_type_, device_, allocator(), - pflr_->GetFLR(device_->name()), node_def_, - TF_GRAPH_DEF_VERSION, &kernel); + auto flr = pflr_->GetFLR(device_->name()); + std::shared_ptr props; + Status status = NodeProperties::CreateFromNodeDef( + node_def_, flr->GetFunctionLibraryDefinition(), &props); + if (status.ok()) { + status.Update(CreateOpKernel(device_type_, device_, allocator(), flr, + props, TF_GRAPH_DEF_VERSION, &kernel)); + } kernel_ = std::unique_ptr(kernel); if (kernel_ != nullptr) input_types_ = kernel_->input_types(); return status; } }; -TEST_F(TRTEngineOpTestBase, DynamicShapes) { +class TRTEngineOpTestWithParam + : public TRTEngineOpTestBase, + public ::testing::WithParamInterface { + public: + TRTEngineOpTestWithParam() : param_(GetParam()) {} + + protected: + TestParam param_; +}; + +const absl::string_view TRTEngineOpTestBase::kOpName = "myop"; + +constexpr std::array TestParameters{TestParam{false}, + TestParam{true}}; + +INSTANTIATE_TEST_CASE_P(TRTEngineOpTestInstantiation, TRTEngineOpTestWithParam, + ::testing::ValuesIn(TestParameters)); + +TEST_F(TRTEngineOpTestBase, DynamicEngines) { + // Test dynamic engine creation during inference time TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/4); // Execute the op with batch size > 1. @@ -126,8 +217,8 @@ TEST_F(TRTEngineOpTestBase, DynamicShapes) { // Get the engine cache. TRTEngineCacheResource* cache_resource = nullptr; - TF_ASSERT_OK( - device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource)); + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); core::ScopedUnref sc(cache_resource); // It should contain only one engine. 
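The TrtShapeOptimizationProfile used in the static-engine setup above ultimately drives TensorRT's optimization-profile API. As a point of reference, a single profile expressed directly against the public nvinfer1 builder interface looks roughly like the sketch below; the input name matches the placeholder naming used by the test, while the concrete min/opt/max shapes are illustrative assumptions.

#include "NvInfer.h"

// Sketch: register one optimization profile for a 2-D input whose batch
// dimension is dynamic.
void AddExampleProfile(nvinfer1::IBuilder* builder,
                       nvinfer1::IBuilderConfig* config) {
  nvinfer1::IOptimizationProfile* profile =
      builder->createOptimizationProfile();
  // kMIN/kOPT/kMAX bound the shapes this profile (and the execution context
  // bound to it) will accept at run time.
  profile->setDimensions("TensorRTInputPH_0",
                         nvinfer1::OptProfileSelector::kMIN,
                         nvinfer1::Dims2{1, 2});
  profile->setDimensions("TensorRTInputPH_0",
                         nvinfer1::OptProfileSelector::kOPT,
                         nvinfer1::Dims2{4, 2});
  profile->setDimensions("TensorRTInputPH_0",
                         nvinfer1::OptProfileSelector::kMAX,
                         nvinfer1::Dims2{8, 2});
  config->addOptimizationProfile(profile);
  // Before inference, the context bound to this profile still needs the
  // concrete shape via IExecutionContext::setBindingDimensions().
}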
@@ -166,6 +257,98 @@ TEST_F(TRTEngineOpTestBase, DynamicShapes) { EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, AllowBuildAtRuntime) { + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + PartialTensorShape({-1, -1}), + /*use_implicit_batch=*/true, + /*allow_build_at_runtime=*/false); + + // Execute the op + TensorShape input_shape({2, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); + core::ScopedUnref sc(cache_resource); + + // It should contain a placeholder with an empty cuda_engine (to mark that + // engine creation was not successful for the given input shape). + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_EQ(ectx->GetCudaEngine(), nullptr); +} + +TEST_P(TRTEngineOpTestWithParam, ExplicitBatch) { + // Test inference in explicit batch mode with static input shapes. Static + // shapes in this context means that the TensorRT knows all the input shapes + // during engine creation time. + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + /*shape=*/PartialTensorShape({1, 2}), + /*use_implicit_batch=*/false, + /*allow_build_at_runtime=*/true, + /*static_engine=*/param_.static_engine); + + TensorShape input_shape({1, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); + core::ScopedUnref sc(cache_resource); + + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_NE(ectx->GetCudaEngine(), nullptr); +} + +TEST_P(TRTEngineOpTestWithParam, DynamicShapes) { + // Test inference in explicit batch mode with dynamic input shapes. Dynamic + // shapes in this context means that some input shapes for TensorRT are + // unknown during engine creation time. When we create the network, the + // unknow shapes are repsesented as -1. Before we run inference, these shapes + // have to be specified by calling setBindingDimensions. + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + /*shape=*/PartialTensorShape({-1, -1}), + /*use_implicit_batch=*/false, + /*allow_build_at_runtime=*/true, + param_.static_engine); + + TensorShape input_shape({1, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); + core::ScopedUnref sc(cache_resource); + + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_NE(ectx->GetCudaEngine(), nullptr); + + // Execute the op with an incompatible shape. 
+ ResetInputs(); + TRTEngineOpTestBase::AddSimpleInput(TensorShape({1, 37})); + // Test that the op runs. This should fall back to native segment. + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + // We should still have a single engine that is not compatible with the input. + EXPECT_EQ(1, cache->size()); + EXPECT_EQ(0, cache->count({TensorShape({1, 37})})); +} + template class TRTEngineOpTest : public TRTEngineOpTestBase {}; @@ -191,5 +374,4 @@ TYPED_TEST(TRTEngineOpTest, Basic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 533dd02d460..6889b609d19 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -32,9 +32,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -48,6 +48,9 @@ class CreateTRTResourceHandle : public OpKernel { } void Compute(OpKernelContext* ctx) override { + tensorflow::profiler::TraceMe activity( + "CreateTRTResourceHandle::Compute", + tensorflow::profiler::TraceMeLevel::kInfo); { mutex_lock l(mutex_); if (!initialized_) { @@ -71,9 +74,10 @@ class CreateTRTResourceHandle : public OpKernel { string resource_name_; Tensor handle_; mutex mutex_; - bool initialized_ GUARDED_BY(mutex_) = false; + bool initialized_ TF_GUARDED_BY(mutex_) = false; - TF_DISALLOW_COPY_AND_ASSIGN(CreateTRTResourceHandle); + CreateTRTResourceHandle(const CreateTRTResourceHandle&) = delete; + void operator=(const CreateTRTResourceHandle&) = delete; }; REGISTER_KERNEL_BUILDER(Name("CreateTRTResourceHandle") @@ -89,6 +93,9 @@ class InitializeTRTResource : public OpKernel { } void Compute(OpKernelContext* ctx) override { + tensorflow::profiler::TraceMe activity( + "InitializeTRTResource::Compute", + tensorflow::profiler::TraceMeLevel::kInfo); ResourceHandle handle = HandleFromInput(ctx, 0); core::RefCountPtr resource; OP_REQUIRES_OK( @@ -116,19 +123,21 @@ class InitializeTRTResource : public OpKernel { // Parse the serialized engines and add them to the cache. 
std::unique_ptr file; OP_REQUIRES_OK(ctx, ctx->env()->NewRandomAccessFile(filename, &file)); - auto reader = absl::make_unique(file.get()); + auto reader = std::make_unique(file.get()); uint64 offset = 0; int num_loaded_engine = 0; do { - string record; + tstring record; Status status = reader->ReadRecord(&offset, &record); if (errors::IsOutOfRange(status)) break; TRTEngineInstance engine_instance; engine_instance.ParseFromString(record); std::vector engine_input_shapes; - for (const TensorShapeProto& shape : engine_instance.input_shapes()) { + const auto& input_shapes = engine_instance.input_shapes(); + engine_input_shapes.reserve(input_shapes.size()); + for (const TensorShapeProto& shape : input_shapes) { engine_input_shapes.emplace_back(shape); } @@ -140,11 +149,23 @@ class InitializeTRTResource : public OpKernel { engine_instance.serialized_engine().c_str(), engine_instance.serialized_engine().size(), nullptr)); auto raw_engine = engine.get(); - resource->cache_.emplace( - engine_input_shapes, - absl::make_unique( - std::move(engine), TrtUniquePtrType( - raw_engine->createExecutionContext()))); + std::vector ctx_vec; + if (num_loaded_engine == 0) { + // Restore profiles if there are any. Currently only 1 engine is allowed + // in dynamic mode therefore we call this only for the 0th engine. + // it is a no-op in implicit batch mode. + OP_REQUIRES_OK(ctx, resource->profiles_.RestoreProfiles( + raw_engine, engine_input_shapes.size())); + OP_REQUIRES_OK(ctx, resource->profiles_.CreateExecutionContexts( + raw_engine, &ctx_vec)); + } else { + // Multiple engines are only available in static mode. For each engine + // we have only a single execution context. + ctx_vec.push_back(ExecutionContext::Create(raw_engine)); + } + resource->cache_.emplace(engine_input_shapes, + std::make_unique( + std::move(engine), std::move(ctx_vec))); ++num_loaded_engine; } while (1); VLOG(1) << "Loaded " << num_loaded_engine << " TRT engines for op " @@ -156,7 +177,8 @@ class InitializeTRTResource : public OpKernel { // Maximum number of cached engines int max_cached_engines_; - TF_DISALLOW_COPY_AND_ASSIGN(InitializeTRTResource); + InitializeTRTResource(const InitializeTRTResource&) = delete; + void operator=(const InitializeTRTResource&) = delete; }; REGISTER_KERNEL_BUILDER(Name("InitializeTRTResource") @@ -168,9 +190,14 @@ class SerializeTRTResource : public OpKernel { public: explicit SerializeTRTResource(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("delete_resource", &delete_resource_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("save_gpu_specific_engines", + &save_gpu_specific_engines_)); } void Compute(OpKernelContext* ctx) override { + tensorflow::profiler::TraceMe activity( + "SerializeTRTResource::Compute", + tensorflow::profiler::TraceMeLevel::kInfo); const string& resource_name = ctx->input(0).scalar()(); const string& filename = ctx->input(1).scalar()(); OP_REQUIRES(ctx, !filename.empty(), @@ -178,9 +205,12 @@ class SerializeTRTResource : public OpKernel { // Lookup engine cache resource. TRTEngineCacheResource* resource = nullptr; - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup(std::string(kTfTrtContainerName), - resource_name, &resource)); + OP_REQUIRES( + ctx, + ctx->resource_manager() + ->Lookup(std::string(kTfTrtContainerName), resource_name, &resource) + .ok(), + errors::NotFound("TRTEngineCacheResource not yet created")); core::ScopedUnref unref_me(resource); // Terminate the calibration if any. 
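InitializeTRTResource above restores serialized engines from a record file and recreates one execution context per optimization profile for the first engine. Stripped down to the TensorRT calls involved, the restore path looks roughly like the sketch below; record parsing, ownership, and error handling are elided and the function name is illustrative.

#include <string>
#include <vector>

#include "NvInfer.h"

// Sketch: turn one serialized engine blob back into an engine plus one
// execution context per optimization profile. `logger` stands in for the
// TF-TRT logger; real code checks every returned pointer and manages
// ownership with smart pointers.
std::vector<nvinfer1::IExecutionContext*> RestoreEngineContexts(
    nvinfer1::ILogger& logger, const std::string& serialized_engine,
    nvinfer1::ICudaEngine** out_engine) {
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
  nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(
      serialized_engine.data(), serialized_engine.size(), nullptr);
  std::vector<nvinfer1::IExecutionContext*> contexts;
  for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) {
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    // Bind the i-th profile to the i-th context; context 0 uses profile 0 by
    // default, every other profile must be selected explicitly.
    if (i > 0) context->setOptimizationProfile(i);
    contexts.push_back(context);
  }
  *out_engine = engine;
  return contexts;
}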
@@ -189,29 +219,66 @@ class SerializeTRTResource : public OpKernel { // Serialize the engines and write them to file. std::unique_ptr file; OP_REQUIRES_OK(ctx, ctx->env()->NewWritableFile(filename, &file)); - auto writer = absl::make_unique(file.get()); + auto writer = std::make_unique(file.get()); int num_serialized_engines = 0; - for (const auto& pair : resource->cache_) { - // Ignore engines that failed to build. - const std::unique_ptr& engine = pair.second; - if (!engine || !engine->cuda_engine) continue; - - TRTEngineInstance engine_instance; - // Add input shapes. - const std::vector& engine_input_shapes = pair.first; - for (const TensorShape& shape : engine_input_shapes) { - shape.AsProto(engine_instance.add_input_shapes()); + if (save_gpu_specific_engines_) { + // If user requests TRT engines export, recursively create + // requisite directories. + const char* export_trt_engines_env = + getenv("TF_TRT_EXPORT_TRT_ENGINES_PATH"); + if (export_trt_engines_env) { + VLOG(1) << "Exporting TRT engines to directory: " + << export_trt_engines_env; + OP_REQUIRES_OK( + ctx, ctx->env()->RecursivelyCreateDir(export_trt_engines_env)); } - // Add the serialized engine. - TrtUniquePtrType engine_data( - engine->cuda_engine->serialize()); - engine_instance.set_serialized_engine(engine_data->data(), - engine_data->size()); - OP_REQUIRES_OK(ctx, - writer->WriteRecord(engine_instance.SerializeAsString())); - ++num_serialized_engines; + for (const auto& pair : resource->cache_) { + // Ignore engines that failed to build. + const std::unique_ptr& engine = pair.second; + if (!engine || !engine->GetCudaEngine()) continue; + + TRTEngineInstance engine_instance; + // Add input shapes. + const std::vector& engine_input_shapes = pair.first; + for (const TensorShape& shape : engine_input_shapes) { + shape.AsProto(engine_instance.add_input_shapes()); + } + // Add the serialized engine. 
+ TrtUniquePtrType engine_data( + engine->GetCudaEngine()->serialize()); + engine_instance.set_serialized_engine(engine_data->data(), + engine_data->size()); + + if (export_trt_engines_env) { + const std::string engine_filename = + std::string(export_trt_engines_env) + "/" + resource_name; + std::unique_ptr engine_file; + OP_REQUIRES_OK( + ctx, ctx->env()->NewWritableFile(engine_filename, &engine_file)); + OP_REQUIRES_OK(ctx, engine_file->Append(StringPiece( + static_cast(engine_data->data()), + engine_data->size()))); + + const std::string dims_filename = + std::string(export_trt_engines_env) + "/dims-" + resource_name; + std::unique_ptr dims_file; + OP_REQUIRES_OK( + ctx, ctx->env()->NewWritableFile(dims_filename, &dims_file)); + + for (const TensorShape& shape : engine_input_shapes) { + OP_REQUIRES_OK(ctx, + dims_file->Append(StringPiece(shape.DebugString()))); + } + } + + OP_REQUIRES_OK( + ctx, writer->WriteRecord(engine_instance.SerializeAsString())); + ++num_serialized_engines; + } + } else { + VLOG(1) << "TRT Engines are not serialized for op: " << resource_name; } VLOG(1) << "Serialized " << num_serialized_engines << " TRT engines for op " << resource_name << " on device " << ctx->device()->name() @@ -228,8 +295,10 @@ class SerializeTRTResource : public OpKernel { private: bool delete_resource_ = false; + bool save_gpu_specific_engines_ = true; - TF_DISALLOW_COPY_AND_ASSIGN(SerializeTRTResource); + SerializeTRTResource(const SerializeTRTResource&) = delete; + void operator=(const SerializeTRTResource&) = delete; }; REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), @@ -238,5 +307,4 @@ REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc index e82f89e9c2d..987b01eebcb 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc @@ -13,99 +13,231 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include - -#include +#include +#include +#include #include +#include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_instance.pb.h" // NOLINT #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/gtl/stl_util.h" -#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/record_reader.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { -class TRTEngineResourceOpsTest : public OpsTestBase { +struct TestParam { + nvinfer1::Dims dims; + bool dynamic_shape; + int n_inputs; +}; + +class TRTEngineResourceOpsTest + : public OpsTestBase, + public ::testing::WithParamInterface { + public: + TRTEngineResourceOpsTest() : param_(GetParam()) {} + protected: void Reset() { + for (auto& temp : tensors_) { + delete temp; + } + for (auto& temp : managed_outputs_) { + delete temp; + } + tensors_.clear(); + managed_outputs_.clear(); inputs_.clear(); - gtl::STLDeleteElements(&tensors_); - gtl::STLDeleteElements(&managed_outputs_); + } + + ITensorProxyPtr NetworkWith1Input(nvinfer1::INetworkDefinition* network, + ITensorProxyPtr input) { + // Add a unary layer. + nvinfer1::IUnaryLayer* layer = + network->addUnary(*input->trt_tensor(), nvinfer1::UnaryOperation::kEXP); + EXPECT_NE(nullptr, layer); + return layer->getOutput(0); + } + + // Constructs a network with two inputs, where the second input is a shape + // tensor. We take a slice of the first input with the size of the slice + // specified by the second input, assuming the first input is a 2D tensor. + // We then add the slice to itself to produce the output of the network. 
+ ITensorProxyPtr NetworkWith2Inputs(nvinfer1::INetworkDefinition* network, + ITensorProxyPtr input) { + nvinfer1::Dims dims2{1, {2}}; + ITensorProxyPtr input2 = + network->addInput(absl::StrCat(IONamePrefixes::kInputPHName, 1).c_str(), + nvinfer1::DataType::kINT32, dims2); + EXPECT_NE(nullptr, input2->trt_tensor()); + + nvinfer1::Dims start{2, {0, 0}}; + nvinfer1::Dims stride{2, {1, 1}}; + auto slice_layer = + network->addSlice(*input->trt_tensor(), start, stride, stride); + EXPECT_NE(nullptr, slice_layer); + + slice_layer->setInput(2, *input2->trt_tensor()); + ITensorProxyPtr sliced_input = slice_layer->getOutput(0); + EXPECT_NE(nullptr, sliced_input->trt_tensor()); + + auto layer = network->addElementWise(*sliced_input->trt_tensor(), + *sliced_input->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); + EXPECT_NE(nullptr, layer); + return layer->getOutput(0); } TrtUniquePtrType CreateTRTEngine() { TrtUniquePtrType builder( nvinfer1::createInferBuilder(logger_)); TrtUniquePtrType network; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - const uint32_t flags = 0U; +#if IS_TRT_VERSION_GE(8, 0, 0, 0) network = - TrtUniquePtrType(builder->createNetworkV2(flags)); + TrtUniquePtrType(builder->createNetworkV2( + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); #else - network = TrtUniquePtrType( - builder->createNetwork()); + network = + TrtUniquePtrType(builder->createNetworkV2( + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); #endif + // Add the input. - nvinfer1::Dims dims; - dims.nbDims = 1; - dims.d[0] = 1; + nvinfer1::Dims dims = this->param_.dims; + if (this->param_.dynamic_shape) { + std::fill(dims.d, dims.d + dims.nbDims, -1); + } + const std::string in_name = StrCat(IONamePrefixes::kInputPHName, 0); ITensorProxyPtr input = - network->addInput("input", nvinfer1::DataType::kFLOAT, dims); + network->addInput(in_name.c_str(), nvinfer1::DataType::kFLOAT, dims); EXPECT_NE(nullptr, input->trt_tensor()); - - // Add a unary layer. - nvinfer1::IUnaryLayer* layer = - network->addUnary(*input->trt_tensor(), nvinfer1::UnaryOperation::kEXP); - EXPECT_NE(nullptr, layer); - // Mark the output. - ITensorProxyPtr output = layer->getOutput(0); + ITensorProxyPtr output = + this->param_.n_inputs == 1 + ? this->NetworkWith1Input(network.get(), input) + : this->NetworkWith2Inputs(network.get(), input); output->setName("output"); network->markOutput(*output->trt_tensor()); // Build the engine + TrtUniquePtrType builder_config( + builder->createBuilderConfig()); + builder_config->setMaxWorkspaceSize(1 << 10); builder->setMaxBatchSize(1); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config(builder->createBuilderConfig()); - builder_config->setMaxWorkspaceSize(1 << 10); - TrtUniquePtrType engine( - builder->buildEngineWithConfig(*network, *builder_config)); -#else - builder->setMaxWorkspaceSize(1 << 10); + + if (this->param_.dynamic_shape) { + TrtShapeOptimizationProfile profile; + profile.SetShapeTensorMask(network.get()); + const int n_input = param_.n_inputs; + // Set the input mask to true (no resource input) + std::vector input_mask(n_input, true); + profile.SetInputMask(input_mask); + // The for loop defines three optimization profiles for the network. + for (int i = 1; i <= 3; i++) { + std::vector shape_vec(n_input); + // Define a shape with all dimensions set to 3*i. 
+ std::vector dimvec(this->param_.dims.nbDims, 3 * i); + TensorShape shape; + TF_CHECK_OK( + TensorShapeUtils::MakeShape(dimvec.data(), dimvec.size(), &shape)); + + const ITensorProxyPtr input = network->getInput(0); + const char* name = input->getName(); + VLOG(2) << "Defining profile for input " << name; + shape_vec[0] = shape; + if (this->param_.n_inputs == 2) { + // The shape of the shape tensor. + TF_CHECK_OK(TensorShapeUtils::MakeShape( + std::vector{param_.dims.nbDims}, &shape)); + shape_vec[1] = shape; + // Values of the shape tensor + Tensor shape_tensor(DT_INT32, shape); + // Define shape values {1, i}, where 1 is the value of the first dim, + // and i is the value of the second dimension. + std::vector vals{1, i}; + std::copy_n(vals.data(), vals.size(), + shape_tensor.flat().data()); + DataVec shape_values{{"one", {}}, {"two", shape_tensor}}; + TF_CHECK_OK(profile.CollectShapeValues(shape_values)); + } else { + TF_CHECK_OK(profile.CollectShapeValues({{"one", {}}})); + } + profile.AddShape(shape_vec); + } + std::vector input_partial_shapes; + TF_CHECK_OK(GetNetworkInputShapes(network.get(), &input_partial_shapes)); + profile.InitProfiles(input_partial_shapes, ProfileStrategy::kOptimal); + // Configure and build engine + TF_CHECK_OK(profile.ConfigureBuilder(builder.get(), builder_config.get(), + network.get())); + } + VLOG(2) << "ConfigureBuilder Finished"; TrtUniquePtrType engine( - builder->buildCudaEngine(*network)); -#endif + builder->buildEngineWithConfig(*network, *builder_config)); + VLOG(2) << "Engine constructed"; EXPECT_NE(nullptr, engine); return engine; } Logger& logger_ = *Logger::GetLogger(); + TestParam param_; }; -TEST_F(TRTEngineResourceOpsTest, Basic) { +#if IS_TRT_VERSION_GE(7, 1, 3, 0) +constexpr std::array TestParameters = { + TestParam{nvinfer1::Dims{1, {1}}, false, 1}, + TestParam{nvinfer1::Dims{1, {1}}, true, 1}, + TestParam{nvinfer1::Dims{2, {3, 3}}, true, 2}}; +#else +constexpr std::array TestParameters = { + TestParam{nvinfer1::Dims{1, {1}}, false, 1}, + TestParam{nvinfer1::Dims{1, {1}}, true, 1}}; +#endif + +INSTANTIATE_TEST_CASE_P(EngineResourceOpsTestInstantiation, + TRTEngineResourceOpsTest, + ::testing::ValuesIn(TestParameters)); + +TEST_P(TRTEngineResourceOpsTest, Basic) { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); ResourceMgr* rm = device->resource_manager(); SetDevice(DEVICE_GPU, std::move(device)); - // Create the resource handle. + // Create a resource handle. const string container(kTfTrtContainerName); const string resource_name = "myresource"; Reset(); @@ -117,11 +249,12 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { ResourceHandle handle = context_->mutable_output(0)->scalar()(); + // Check that a resource hasn't been created yet. TRTEngineCacheResource* resource = nullptr; EXPECT_TRUE( errors::IsNotFound(rm->Lookup(container, resource_name, &resource))); - // Create the resouce using an empty file with InitializeTRTResource. + // Create a resource and use an empty file to initialize the resource. 
Reset(); Env* env = Env::Default(); const string filename = io::JoinPath(testing::TmpDir(), "trt_engine_file"); @@ -136,21 +269,32 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { .Finalize(node_def())); TF_ASSERT_OK(InitOp()); AddInputFromArray(TensorShape({}), {handle}); - AddInputFromArray(TensorShape({}), {filename}); + AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); + + // Check that the resource is registered with the resource manager and the + // cache of the resource is empty. EXPECT_TRUE(rm->Lookup(container, resource_name, &resource).ok()); EXPECT_EQ(0, resource->cache_.size()); - // Create a serialized TRT engine file. + // Create an engine and add it to the cache of the resource. TrtUniquePtrType engine = CreateTRTEngine(); - TrtUniquePtrType context( - engine->createExecutionContext()); + ExecutionContext context = ExecutionContext::Create(engine.get()); + + std::vector engine_input_shape(1); + TF_ASSERT_OK(DimsAdapter(param_.dims).TensorShape(&(engine_input_shape[0]))); + if (param_.n_inputs > 1) { + engine_input_shape.push_back(TensorShape({1, 1})); + } resource->cache_.emplace( - std::vector{TensorShape({1, 1})}, - absl::make_unique(std::move(engine), std::move(context))); - resource->Unref(); + engine_input_shape, + std::make_unique(std::move(engine), std::move(context))); + // Check that the resource has multiple references before it is unregistered + // from the resource manager. + EXPECT_FALSE(resource->RefCountIsOne()); - // Serialize the engine using SerializeTRTResource op. + // Serialize the engine to a file and unregistered the resource from the + // resource manager. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "SerializeTRTResource") .Attr("delete_resource", true) @@ -161,8 +305,13 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {resource_name}); AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); + // Check that the resource now has only one reference. Detach the reference + // to the resource to destroy the resource. + EXPECT_TRUE(resource->RefCountIsOne()); + resource->Unref(); - // Make sure the cache is deleted. + // Check that unregistering the resource from the resource manager returns + // an error as the resource has already been unregistered. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "DestroyResourceOp") .Attr("ignore_lookup_error", false) @@ -172,22 +321,24 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {handle}); EXPECT_TRUE(errors::IsNotFound(RunOpKernel())); - // Verify the serialized engine file. + // Verify the file for the serialized engine. 
std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(filename, &file)); - auto reader = absl::make_unique(file.get()); + auto reader = std::make_unique(file.get()); uint64 offset = 0; - string record; + tstring record; TF_ASSERT_OK(reader->ReadRecord(&offset, &record)); TRTEngineInstance engine_instance; engine_instance.ParseFromString(record); - EXPECT_EQ(1, engine_instance.input_shapes_size()); - EXPECT_EQ(2, engine_instance.input_shapes(0).dim_size()); - EXPECT_EQ(1, engine_instance.input_shapes(0).dim(0).size()); - EXPECT_EQ(1, engine_instance.input_shapes(0).dim(1).size()); + EXPECT_EQ(param_.n_inputs, engine_instance.input_shapes_size()); + EXPECT_EQ(param_.dims.nbDims, engine_instance.input_shapes(0).dim_size()); + for (int i = 0; i < param_.dims.nbDims; i++) { + EXPECT_EQ(param_.dims.d[i], engine_instance.input_shapes(0).dim(i).size()); + } EXPECT_TRUE(errors::IsOutOfRange(reader->ReadRecord(&offset, &record))); - // Recreate the cache resource. + // Recreate the resource and use the file with the serialized engine to + // initialize the resource. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "InitializeTRTResource") .Input(FakeInput(DT_RESOURCE)) @@ -198,11 +349,47 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {handle}); AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); + + // Check that the resource is registered with the resource manager again and + // the cache of the resource is not empty. EXPECT_TRUE(rm->Lookup(container, resource_name, &resource).ok()); EXPECT_EQ(1, resource->cache_.size()); - resource->Unref(); + if (this->param_.dynamic_shape) { + EXPECT_EQ(3, resource->profiles_.GetNumProfiles()); + EXPECT_EQ(3, resource->cache_.begin()->second->GetNumContexts()); + + if (this->param_.n_inputs == 1) { + // Check if profiles are restored correctly. + std::vector shapes(1); + // We create a shape vector that matches only profile 1. + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{6}, &shapes[0])); + EXPECT_EQ(1, resource->profiles_.GetProfileNumber(shapes)); + } else { + // Check if shape values are restored corretly. + std::vector shapes(2); + // We create a shape vector that matches only profile 2. + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{9, 9}, &shapes[0])); + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{2}, &shapes[1])); + Tensor shape_tensor(DT_INT32, shapes[1]); + std::vector vals{1, 3}; + std::copy_n(vals.data(), vals.size(), + shape_tensor.flat().data()); + // DataVec names are not in used CollectShapeValues, only the order + // matters. + DataVec shape_values{{"one", {}}, {"two", shape_tensor}}; + TF_CHECK_OK(resource->profiles_.CollectShapeValues(shape_values)); + EXPECT_EQ(2, resource->profiles_.GetProfileNumber(shapes)); + } + } + // Check that the resource has multiple references before it is unregistered + // from the resource manager. + EXPECT_FALSE(resource->RefCountIsOne()); - // Destroy the engine cache again. + // Unregister the resource from the resource manager two times, expect that + // the second time produces an error. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "DestroyResourceOp") .Attr("ignore_lookup_error", false) @@ -212,10 +399,14 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {handle}); TF_ASSERT_OK(RunOpKernel()); EXPECT_TRUE(errors::IsNotFound(RunOpKernel())); + + // Check that the resource now has only one reference. Detach the reference + // to the resource to destroy resource. 
+ EXPECT_TRUE(resource->RefCountIsOne()); + resource->Unref(); } } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc index 573172b92e6..2af3164c3e2 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -34,5 +33,4 @@ Returns calibration data for the given resource name } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc index 58cabbee53d..1d494bb44af 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -34,22 +33,23 @@ namespace tensorflow { REGISTER_OP("TRTEngineOp") .Attr("serialized_segment: string") .Attr("segment_func: func = {}") - .Attr("InT: list({int8,float16,float32,int32})") - .Attr("OutT: list({int8,float16,float32,int32})") + .Attr("InT: list({bool,int8,float16,float32,int32,resource})") + .Attr("OutT: list({bool,int8,float16,float32,int32})") + .Attr("input_shapes: list(shape) = []") + .Attr("output_shapes: list(shape) = []") .Attr("max_cached_engines_count: int = 1") + .Attr("max_batch_size: int = 1") .Attr("workspace_size_bytes: int") .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}") .Attr("calibration_data: string = ''") .Attr("use_calibration: bool = true") - .Attr("input_shapes: list(shape) = []") - .Attr("output_shapes: list(shape) = []") .Input("in_tensor: InT") .Output("out_tensor: OutT") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { std::vector output_shapes; TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes)); - for(int i=0; iMakeShapeFromPartialTensorShape( @@ -63,8 +63,9 @@ REGISTER_OP("TRTEngineOp") .Attr("segment_funcdef_name: string = ''") .Attr("cached_engine_batches: list(int) >= 0 = []") .Attr("fixed_input_size: bool = true") - .Attr("static_engine: bool = true"); + .Attr("static_engine: bool = true") + .Attr("profile_strategy: string = ''") + .Attr("use_explicit_precision: bool = false"); } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc index 01911de66ec..3f21a22136e 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc @@ 
-13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -39,6 +38,7 @@ REGISTER_OP("InitializeTRTResource") REGISTER_OP("SerializeTRTResource") .Attr("delete_resource: bool = false") + .Attr("save_gpu_specific_engines: bool = True") .Input("resource_name: string") .Input("filename: string") .SetIsStateful() @@ -46,5 +46,4 @@ REGISTER_OP("SerializeTRTResource") } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc index 563ce724f43..83d5f9b5965 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc @@ -17,8 +17,7 @@ limitations under the License. #include -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -30,5 +29,4 @@ const char* kTfTrtPluginNamespace = "TF"; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h index 46b35a24afc..8976cc6e862 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -30,7 +29,6 @@ namespace tensorrt { extern const char* kTfTrtPluginVersion; extern const char* kTfTrtPluginNamespace; -#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1) // A wrapper class for TensorRT plugin. User application should inherit from // this class to write custom kernels. class TrtPlugin : public nvinfer1::IPluginV2Ext { @@ -51,7 +49,9 @@ class TrtPlugin : public nvinfer1::IPluginV2Ext { namespace_ = plugin_namespace; } - const char* getPluginNamespace() const noexcept override { return namespace_.c_str(); } + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } protected: template @@ -70,7 +70,6 @@ class TrtPlugin : public nvinfer1::IPluginV2Ext { private: std::string namespace_; }; -#endif template class TrtPluginRegistrar { @@ -90,7 +89,6 @@ class TrtPluginRegistrar { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 1b46d54ab5f..7b5860b600e 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -15,54 +15,42 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/segment/segment.h" +#include +#include +#include +#include #include -#include +#include #include -#include +#include +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_util.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { - -namespace { - -void GetLabeledNodes(gtl::FlatSet* node_set, Graph* g) { - std::unordered_set boundary_node_set; - graph_util::GetComputeGraphBoundaryNodes(g, boundary_node_set); - - auto label_node_func = [node_set](Node* n) { - node_set->insert(n->name()); - }; - - std::vector boundary_node_vec; - for (const auto node : boundary_node_set) { - boundary_node_vec.emplace_back(node); - } - ReverseDFSFrom(*g, boundary_node_vec, - std::move(label_node_func), nullptr); -} - -} // namespace - namespace segment { +namespace { using absl::StrAppend; +using absl::StrAppendFormat; using absl::StrCat; +using absl::StrJoin; // A simple graph representation to mirror Graph. This structure // helps saving memory since segmenter modifies the graph in place, preventing @@ -259,14 +247,6 @@ struct SimpleEdgePtrCompare { } }; -struct NodePtrCompare { - bool operator()(const Node* lhs, const Node* rhs) const { - return lhs->name() < rhs->name(); - } -}; - -namespace { - // Copied from TF ReverseDFS, which only works for Graph. void StableDFS(const SimpleGraph& g, bool reverse, const std::vector& start, @@ -282,8 +262,9 @@ void StableDFS(const SimpleGraph& g, bool reverse, stack[i] = Work{start[i], false}; } - auto get_nodes = reverse ? [](const SimpleNode* n) { return n->in_nodes(); } - : [](const SimpleNode* n) { return n->out_nodes(); }; + auto get_nodes = [reverse](const SimpleNode* n) { + return reverse ? n->in_nodes() : n->out_nodes(); + }; std::vector visited(g.num_node_ids(), false); while (!stack.empty()) { Work w = stack.back(); @@ -366,7 +347,236 @@ bool CanContractEdge(const SimpleEdge* edge, }); return !has_cycle; } -} // namespace + +// TODO(bixia): put this to a common utility file. 
+string TensorPropertiesToString(const OpInfo::TensorProperties& prop) { + string s = StrCat(DataTypeString(prop.dtype()), ": "); + StrAppend(&s, "["); + if (prop.shape().unknown_rank()) { + StrAppend(&s, "?"); + } else { + StrAppend(&s, StrJoin(prop.shape().dim(), ",", + [](string* out, const TensorShapeProto_Dim& d) { + StrAppendFormat(out, "%d", d.size()); + })); + } + StrAppend(&s, "]"); + return s; +} + +string TensorPropertiesToString( + const std::vector& properties) { + return StrJoin(properties, "; ", + [](string* out, const OpInfo::TensorProperties& prop) { + StrAppend(out, TensorPropertiesToString(prop)); + }); +} + +// From the given list of input properties, returns the leading shape, which is +// the shape that determines the batch size of the operation. The leading shape +// is selected from the group of input shapes with the highest rank as follows: +// . If all of those shapes have non-negative values for the batch dimension, +// the leading shape is the one with the largest value for the batch +// dimension. +// . If some or all of those shapes have negative values for the batch +// dimension, and the rest of those shapes have 1 for the batch dimension, +// the leading shape is the first of those shapes with a negative value for +// the batch dimension. +// . Otherwise, we can't determine the leading shape for the operation and +// have to exclude the operation from TRT. +// +// Examples: +// case-1: a[1,3,4] + b[2,3,4] => leading shape [2,3,4] +// case-2: a[2,3,4] + b[scalar] => leading shape [2,3,4] +// case-3: a[-1,3,4] + b[1,3,4] => leading shape [-1,3,4] +// case-4: a[-1,3,4] + b[2,3,4] => no leading shape +// +// We have to return "no leading shape" for case-4 to exclude such operation +// from being translated for this reason: +// The actually input for "a" have to be in the shape of [2,3,4] for the +// operation to be valid. On the other hand, if we translate the operation +// to implicit batch mode, it will becomes a[3,4]+b[3,4] which is valid for +// any input shape of "a". +// +// This routine assumes the input program is valid. For example, we shouldn't +// see invalid operation like a[2,3,4] + b[3,3,4]. It also assumes the input +// properties is not empty and all input have known shapes. +// +// TODO(bixia): find a way to share this knowledge with the converter. +// TODO(bixia): investigate the use of symbolic shape analysis to improve +// segmentation, such as by requiring the dynamic dimensions to have the same +// negative value. +absl::optional FindLeadingShape( + absl::Span properties) { + DCHECK(!properties.empty()); + const TensorShapeProto* result; + int max_batch_dim_value; + auto choose_shape_with_higher_rank = [&](const TensorShapeProto* s) { + result = s; + max_batch_dim_value = s->dim_size() < 1 ? 1 : s->dim(0).size(); + }; + + DCHECK(!properties[0].shape().unknown_rank()); + choose_shape_with_higher_rank(&properties[0].shape()); + + for (const OpInfo::TensorProperties& p : properties.subspan(1)) { + DCHECK(!p.shape().unknown_rank()); + if (p.shape().dim_size() < result->dim_size()) continue; + + if (p.shape().dim_size() > result->dim_size()) { + choose_shape_with_higher_rank(&p.shape()); + continue; + } + + // Among the shapes with the same rank, choose the one with a dynamic batch + // size. If no shapes have a dynamic batch size, choose the one with the + // largest size. 
+ if (result->dim_size() < 1) continue; + + if (p.shape().dim(0).size() < 0 || result->dim(0).size() < 0) { + if (p.shape().dim(0).size() < 0 && result->dim(0).size() >= 0) { + result = &p.shape(); + } else { + max_batch_dim_value = + std::max(max_batch_dim_value, p.shape().dim(0).size()); + } + + continue; + } + + if (p.shape().dim(0).size() > result->dim(0).size()) { + result = &p.shape(); + max_batch_dim_value = result->dim(0).size(); + } + } + + if (result->dim_size() > 0 && result->dim(0).size() < 0) { + // dynamic batch size + if (max_batch_dim_value <= 1) { + return result; + } else { + return absl::nullopt; + } + } + + return result; +} + +// Returns the inputs that are relevant to determine the batch size of the +// operation. This routine handles the following cases: +// . Operations that support implicit broadcasting, such as the Mul operation. +// In this case, we need to inspect all the inputs in order to determine the +// batch size of the operation. +// . Special cases. Such as "Conv2DBackpropInput", "Conv3DBackpropInputV2". +// . The batch size of an operation is determined by the first input of the +// operation. +absl::Span GetInputsToDeterminateBatchSize( + const Node* node, const std::vector& all_inputs) { + // TODO(bixia): Find a way to share this knowledge with the converter. + static std::set broadcast_supporting_ops = { + // ops corresponding to ConvertBinary in the converter + "Add", + "AddV2", + "Mul", + "Sub", + "Div", + "FloorDiv", + "RealDiv", + "Minimum", + "Maximum", + "Pow", + // other ops that need GetTrtBroadcastShape to convert + "BiasAdd", + "SquaredDifference", + "BatchMatMul", + "BatchMatMulV2", + }; + const string& op = node->def().op(); + + if (op == "Conv2DBackpropInput" || op == "Conv3DBackpropInputV2") { + DCHECK_EQ(all_inputs.size(), 3); + return absl::MakeSpan(all_inputs).subspan(2, 1); + } + + if (broadcast_supporting_ops.count(op)) { + return absl::MakeSpan(all_inputs); + } + + // This is the common case for the operations that don't support implicit + // broadcasting: the first operand determines its batch size. All other + // cases are handled before reaching here. + return absl::MakeSpan(all_inputs).subspan(0, 1); +} + +// Returns true if we can remove the implicit batch dimension of the +// operation. +// +// In particular, if the input shape has dynamic rank or the input shape rank +// is less than 2, we can't remove the implicit batch dimension and generate +// a new operation for TRT translation.
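To make the leading-shape rules above concrete, here is a small self-contained sketch that applies the same selection logic to plain std::vector<int> shapes (index 0 is the batch dimension, negative means dynamic). It is an illustration only, not the TensorShapeProto-based implementation, and it reproduces case-1 through case-4 from the comment.

#include <algorithm>
#include <optional>
#include <vector>

// Picks the leading shape from a non-empty list of known-rank shapes, or
// returns std::nullopt when no single batch size can represent them all.
std::optional<std::vector<int>> LeadingShapeSketch(
    const std::vector<std::vector<int>>& shapes) {
  const std::vector<int>* result = &shapes[0];
  int max_batch = result->empty() ? 1 : (*result)[0];
  for (const std::vector<int>& s : shapes) {
    if (s.size() < result->size()) continue;  // lower rank never leads
    if (s.size() > result->size()) {
      result = &s;
      max_batch = s.empty() ? 1 : s[0];
      continue;
    }
    if (s.empty()) continue;  // scalars carry no batch information
    if (s[0] < 0 || (*result)[0] < 0) {  // at least one dynamic batch dim
      if (s[0] < 0 && (*result)[0] >= 0) {
        result = &s;
      } else {
        max_batch = std::max(max_batch, s[0]);
      }
      continue;
    }
    if (s[0] > (*result)[0]) {
      result = &s;
      max_batch = s[0];
    }
  }
  // A dynamic leading batch dim is only acceptable when every static batch
  // dim seen at the same rank was 1 (case-3); otherwise give up (case-4).
  if (!result->empty() && (*result)[0] < 0 && max_batch > 1) return std::nullopt;
  return *result;
}
// {1,3,4} + {2,3,4}   -> {2,3,4}    (case-1)
// {2,3,4} + {}        -> {2,3,4}    (case-2, scalar operand)
// {-1,3,4} + {1,3,4}  -> {-1,3,4}   (case-3)
// {-1,3,4} + {2,3,4}  -> nullopt    (case-4)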
+bool OperationCanBeTranslatedToImplicitBatch( + const grappler::GraphProperties* graph_properties, const Node* node) { + VLOG(3) << "process node " << node->name(); + if (node->num_inputs() == 0) return true; + if (!graph_properties || !graph_properties->HasInputProperties(node->name())) + return false; + + VLOG(3) << "input shapes " + << TensorPropertiesToString( + graph_properties->GetInputProperties(node->name())); + + const std::vector& all_input_properties = + graph_properties->GetInputProperties(node->name()); + absl::Span input_properties = + GetInputsToDeterminateBatchSize(node, all_input_properties); + if (absl::c_any_of(input_properties, [](const OpInfo::TensorProperties& p) { + return p.shape().unknown_rank(); + })) { + return false; + } + + absl::optional leading_shape = + FindLeadingShape(input_properties); + return leading_shape.has_value() && leading_shape.value()->dim_size() >= 2; +} + +// Returns true if we can't be sure that the operand with the given properties +// won't have negative values for non-batch dimensions. +// +bool HasDynamicNonBatchDimension(const OpInfo::TensorProperties& prop) { + const TensorShapeProto& shape = prop.shape(); + if (shape.unknown_rank()) return true; + + // Scalar is a well specified shape, and TRT supports implicit broadcasting + // from scalar to other shapes. + if (shape.dim_size() == 0) return false; + for (int i = 1; i < shape.dim_size(); ++i) { + // The value of a dynamic dimension can be other negative values besides + // -1, representing the symbolic group of the dimension. + if (shape.dim(i).size() <= -1) { + return true; + } + } + return false; +} + +// Returns true if we can't be sure that the operation won't have dynamic +// non-batch dimension involved. We only check the shape of the first output +// assuming shape inference already propagates the shapes. +bool OperationHasDynamicNonBatchDimension( + const grappler::GraphProperties* graph_properties, const Node* node) { + VLOG(3) << "process node " << node->name(); + // If the node doesn't have any input or output, not computation is involved. + if (node->num_inputs() == 0 || node->num_outputs() == 0) return false; + + // If the node doesn't have output properties, return true to be conservative. + if (!graph_properties->HasOutputProperties(node->name())) return true; + VLOG(3) << "output shapes " + << TensorPropertiesToString( + graph_properties->GetOutputProperties(node->name())); + return HasDynamicNonBatchDimension( + graph_properties->GetOutputProperties(node->name()).at(0)); +} void ContractEdge(SimpleEdge* edge, SimpleGraph* graph, std::vector* remove_edges) { @@ -426,12 +636,246 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph, } } +// Returns a batch size representation for a segment that only contains the +// given node. +ClusterBatchSize GetClusterBatchSizeForNode( + const grappler::GraphProperties* graph_properties, const Node* node, + bool use_implicit_batch) { + ClusterBatchSize cluster_batch_size; + if (!use_implicit_batch || !node || node->num_inputs() == 0) { + return cluster_batch_size; + } + + const NodeDef& node_def = node->def(); + if (node_def.attr().count(kTftrtOpMaxBatchSizeAttr)) { + cluster_batch_size.SetMaxBatchSize( + node_def.attr().at(kTftrtOpMaxBatchSizeAttr).i()); + } + + // As shape inference cannot provide any useful information about the batch + // size, we keep it as missing. 
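The two dynamic-dimension helpers above reduce to a simple scan: skip dimension 0 (the batch dimension) and flag any other dimension whose size is negative. A compact sketch of that predicate over a plain dimension vector, assuming the rank is already known (the real code additionally treats unknown rank as dynamic):

#include <cstddef>
#include <vector>

// Negative sizes stand for dynamic/symbolic dimensions, as in the code above.
bool HasDynamicNonBatchDimSketch(const std::vector<long long>& dims) {
  if (dims.empty()) return false;  // scalar: nothing beyond the batch dim
  for (std::size_t i = 1; i < dims.size(); ++i) {
    if (dims[i] < 0) return true;
  }
  return false;
}
// {-1, 2, 3}  -> false: only the batch dimension is dynamic.
// {-1, -1, 3} -> true:  dim 1 is dynamic, so the op is excluded unless
//                       allow_dynamic_non_batch_dim is set.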
+ if (!graph_properties || + !graph_properties->HasInputProperties(node->name())) { + VLOG(3) << "doesn't have input property"; + return cluster_batch_size; + } + + const std::vector& input_properties = + graph_properties->GetInputProperties(node->name()); + absl::optional optional_leading_shape = + FindLeadingShape(GetInputsToDeterminateBatchSize(node, input_properties)); + DCHECK(optional_leading_shape.has_value()); + const TensorShapeProto* leading_shape = optional_leading_shape.value(); + DCHECK(!leading_shape->unknown_rank() && leading_shape->dim_size() >= 2); + VLOG(3) << "set batch size as " << leading_shape->dim(0).size(); + return cluster_batch_size.SetBatchSize(leading_shape->dim(0).size()); +} + +void AddSegmentForNode(const grappler::GraphProperties* graph_properties, + std::vector>* segments, + SimpleNode* node, + const DeviceNameUtils::ParsedName& device_name, + bool use_implicit_batch) { + tensorflow::profiler::TraceMe activity( + "AddSegmentForNode", tensorflow::profiler::TraceMeLevel::kInfo); + ClusterProperty property( + GetClusterBatchSizeForNode(graph_properties, + node == nullptr ? nullptr : node->tf_node(), + use_implicit_batch), + device_name); + segments->emplace_back(node, std::move(property)); +} + +} // namespace + +Status ExportNonConversionReportToCSV( + string filename, + std::map>& nonconverted_ops_map, + string sep = "|") { + tensorflow::profiler::TraceMe activity( + "ExportNonConversionReportToCSV", + tensorflow::profiler::TraceMeLevel::kInfo); + std::unique_ptr csv_file; + auto open_status = Env::Default()->NewWritableFile(filename, &csv_file); + + if (!open_status.ok()) { + return errors::Internal("Failed to open output file: `", filename, "`"); + } + + LOG(WARNING) << "TF-TRT Non-Conversion Report saved at: `" << filename << "`"; + + std::ostringstream sstream; + sstream << "OP Name" << sep << "Reason" << sep << "Count" << std::endl; + + for (auto& op_details : nonconverted_ops_map) { + auto op_name = op_details.first; + auto op_data = op_details.second; + + for (auto& reject_data : op_data) { + auto reason = reject_data.first; + auto count = reject_data.second; + sstream << op_name << sep << reason << sep << count << std::endl; + } + } + + auto append_status = csv_file->Append(sstream.str()); + + if (!append_status.ok()) { + return errors::Internal("Error writing to output file `", filename, "`."); + } + + auto close_status = csv_file->Close(); + + if (!close_status.ok()) { + return errors::Internal("Error closing the file `", filename, + "`. The file might be corrupted."); + } + + return Status::OK(); +} + +string GenerateNonConversionReport( + std::map>& nonconverted_ops_map) { + // Fetch whether to print a detailed version of the TF-TRT conversion report. + // TF_TRT_SHOW_DETAILED_REPORT triggers three possible behaviors: + // - If Number >= 1: Print detailed non-conversion report on stdout. + // Usage: TF_TRT_SHOW_DETAILED_REPORT=1 + // - If non empty string: Exports the non-conversion report in CSV format at + // the path defined by the environment variable. + // This will also print the detailed non-conversion + // report on stdout. + // Usage: TF_TRT_SHOW_DETAILED_REPORT=/path/to/file.csv + // - Else: Print normal (undetailed) non-conversion report on + // stdout. 
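ExportNonConversionReportToCSV above walks a nested map of op type -> (rejection reason -> count) and flattens it into separator-delimited rows. The following standalone sketch shows that data layout and the flattening with invented op names and reasons, using plain iostream in place of the TF Env/WritableFile API:

#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  // op type -> (reason it was rejected -> number of occurrences)
  std::map<std::string, std::map<std::string, int>> nonconverted_ops_map;
  nonconverted_ops_map["NonMaxSuppressionV5"]["no converter registered"] = 4;
  nonconverted_ops_map["Reshape"]["dynamic non-batch dimensions not allowed"] = 2;

  const std::string sep = "|";
  std::ostringstream csv;
  csv << "OP Name" << sep << "Reason" << sep << "Count" << '\n';
  for (const auto& op : nonconverted_ops_map) {
    for (const auto& reason : op.second) {
      csv << op.first << sep << reason.first << sep << reason.second << '\n';
    }
  }
  std::cout << csv.str();
  return 0;
}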
+ tensorflow::profiler::TraceMe activity( + "GenerateNonConversionReport", tensorflow::profiler::TraceMeLevel::kInfo); + + string detailed_report_var; + TF_CHECK_OK(ReadStringFromEnvVar("TF_TRT_SHOW_DETAILED_REPORT", + /*default_value=*/"", &detailed_report_var)); + + bool show_detailed_conversion_report = false; + + if (detailed_report_var != "") { + // Checking if `TF_TRT_SHOW_DETAILED_REPORT` env var is a string or a number + if (detailed_report_var.find_first_not_of("-0123456789") != string::npos) { + const Status status = ExportNonConversionReportToCSV( + detailed_report_var, nonconverted_ops_map); + + if (!status.ok()) { + // Log the error in case of issue, however do not stop execution. + LOG(ERROR) << "Problem encountered while generating the TF-TRT " + << "Non-Conversion Report in CSV Format:\n" + << status.error_message(); + } + show_detailed_conversion_report = true; + } else if (std::stoi(detailed_report_var) >= 1) { + show_detailed_conversion_report = true; + } + } + + string unsupported_op_report = + StrCat("\n\n", string(80, '#'), "\n", + "TensorRT unsupported/non-converted OP Report:"); + int total_nonconverted_ops{0}; + + // + using ReasonCounterVector = std::vector>; + // >> + using NotConvertedOPTuple = std::tuple; + + std::vector nonconverted_ops_vec; + + // Populate the vector from the map + for (auto& nonconverted_op_data : nonconverted_ops_map) { + int total_nonconverted_op{0}; + ReasonCounterVector reason_occurances_vect; + + auto op_name = nonconverted_op_data.first; + auto op_data = nonconverted_op_data.second; + + for (auto& notconversion_reason_data : op_data) { + auto reason_count = notconversion_reason_data.second; + total_nonconverted_op += reason_count; + reason_occurances_vect.push_back(notconversion_reason_data); + } + + // Sort in descending number of occurances for the reasons why a given + // TensorFlow OP was not converted. + std::sort(reason_occurances_vect.begin(), reason_occurances_vect.end(), + [](const std::pair& a, + const std::pair& b) -> bool { + return a.second > b.second; + }); + + nonconverted_ops_vec.push_back(std::make_tuple( + op_name, total_nonconverted_op, reason_occurances_vect)); + } + + // Sort the vector by descending OP names. 
+ std::sort(nonconverted_ops_vec.begin(), nonconverted_ops_vec.end(), + [](const NotConvertedOPTuple& a, const NotConvertedOPTuple& b) { + return std::get<1>(a) > std::get<1>(b); + }); + + for (auto& notconverted_op_detail : nonconverted_ops_vec) { + auto& op_name = std::get<0>(notconverted_op_detail); + auto& op_total_nonconverted = std::get<1>(notconverted_op_detail); + total_nonconverted_ops += op_total_nonconverted; + + unsupported_op_report = StrCat(unsupported_op_report, "\n\t- ", op_name, + " -> ", op_total_nonconverted, "x"); + + if (show_detailed_conversion_report) { + auto& nonconverted_ops_details = std::get<2>(notconverted_op_detail); + + for (auto& nonconversion_details : nonconverted_ops_details) { + auto& reason = nonconversion_details.first; + auto& reason_count = nonconversion_details.second; + if (reason_count == 0) { + continue; + } + + unsupported_op_report = StrCat(unsupported_op_report, "\n\t\t- ", + "[Count: ", reason_count, "x] ", reason); + } + unsupported_op_report = StrCat(unsupported_op_report, "\n"); + } + } + + unsupported_op_report = + StrCat(unsupported_op_report, "\n", string(80, '-'), + "\n\t- Total nonconverted OPs: ", total_nonconverted_ops, + "\n\t- Total nonconverted OP Types: ", nonconverted_ops_map.size(), + "\nFor more information see https://docs.nvidia.com/deeplearning", + "/frameworks/tf-trt-user-guide/index.html#supported-ops.", "\n", + string(80, '#'), "\n"); + + return unsupported_op_report; +} + Status SegmentGraph(const Graph* tf_graph, + const grappler::GraphProperties* graph_properties, const std::function& candidate_fn, const std::function& input_candidate_fn, const std::function& output_candidate_fn, - const SegmentOptions& options, - SegmentNodesVector* segments) { + const SegmentOptions& options, SegmentVector* segments) { + tensorflow::profiler::TraceMe activity( + "SegmentGraph", tensorflow::profiler::TraceMeLevel::kInfo); + if (!options.use_implicit_batch && !options.allow_dynamic_non_batch_dim) { + return errors::Internal( + "Explicit batch mode should allow dynamic non-batch dimensions"); + } + + if (options.use_implicit_batch && !options.maximum_batch_size.has_value()) { + return errors::Internal("Implicit batch mode requires maximum_batch_size"); + } + + if (!options.allow_dynamic_non_batch_dim && !graph_properties) { + return errors::Internal( + "Need graph propertities to disallow dynamic non-batch dimensions"); + } + // Steps: // 1. run the segmentation algorithm to find all the segments, which uses // candidate_fn to determine the candidates segment nodes; @@ -442,92 +886,96 @@ Status SegmentGraph(const Graph* tf_graph, // --------------------------------- Step 1 --------------------------------- auto graph = std::unique_ptr(new SimpleGraph(tf_graph)); + + // Fetch the user-provide TF operations denylisted for conversion by TF-TRT. + const absl::flat_hash_set tftrt_op_denylist = [] { + string tftrt_op_denylist_str; + TF_CHECK_OK(ReadStringFromEnvVar("TF_TRT_OP_DENYLIST", /*default_value=*/"", + &tftrt_op_denylist_str)); + absl::flat_hash_set tftrt_op_denylist{}; + for (const auto& x : str_util::Split(tftrt_op_denylist_str, ",")) { + tftrt_op_denylist.insert(x); + } + // Force a rehash of the flat hash set + tftrt_op_denylist.rehash(0); + return tftrt_op_denylist; + }(); + // Use a union-find to collect the nodes that belong to the same // segment. A node value of nullptr indicates that the node is not a candidate // for TRT. 
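The denylist above is built exactly once from a comma-separated environment variable and then only queried. A rough standalone equivalent using only the standard library, with std::getenv and std::unordered_set standing in for ReadStringFromEnvVar and absl::flat_hash_set:

#include <cstdlib>
#include <sstream>
#include <string>
#include <unordered_set>

std::unordered_set<std::string> DenylistFromEnv(const char* var_name) {
  std::unordered_set<std::string> denylist;
  const char* raw = std::getenv(var_name);
  if (raw == nullptr) return denylist;
  std::stringstream ss(raw);
  std::string op;
  while (std::getline(ss, op, ',')) {
    if (!op.empty()) denylist.insert(op);
  }
  return denylist;
}

// Usage mirroring the code above:
//   const auto tftrt_op_denylist = DenylistFromEnv("TF_TRT_OP_DENYLIST");
//   if (tftrt_op_denylist.count(op_type)) { /* report and skip the node */ }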
- std::unordered_set unsupported_ops; - int num_unsupported_ops = 0; - // Getting the nodes blacklisted for conversion - string tftrt_node_blacklist_str; - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_TRT_OP_BLACKLIST", "", &tftrt_node_blacklist_str - )); - - auto tftrt_node_blacklist = gtl::FlatSet{}; - - for (const auto& x : str_util::Split(tftrt_node_blacklist_str, ",")) { - tftrt_node_blacklist.insert(x); - } - - // User defined special subgraphs which can not be convert to trt graph. - // e.g. some sparse lookup subgraphs. - auto labeled_node_blacklist = gtl::FlatSet{}; - GetLabeledNodes(&labeled_node_blacklist, const_cast(tf_graph)); + std::map> nonconverted_ops_map = {}; // Parsing each node of the graph std::vector> node_segments; for (int i = 0; i < graph->num_node_ids(); ++i) { SimpleNode* node = graph->FindNodeId(i); - if (options.exclude_node_list.count(node->name()) != 0) { + + if (!node) { + VLOG(3) << "Node " << i << " doesn't exist in the graph"; + continue; + } + + const string node_op_type{node->tf_node()->type_string()}; + + auto exclude_node = [&](absl::string_view reason) { VLOG(1) << "Not a TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op type: " << node_op_type << "), " << "(Op name: " << node->name() << "), " - << "(Reason: excluded by segmenter option)"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; + << "(Reason: " << reason << ")"; + nonconverted_ops_map[node_op_type][string(reason)]++; node = nullptr; + }; + absl::optional device_name = + GetDeviceParsedName(node->tf_node()); + // GetDeviceParseName capitalizes the device type. + if (!device_name.has_value() || + (device_name->has_type && device_name->type != "GPU")) { + exclude_node("node can't be placed on GPU"); + } else if (options.exclude_node_list.count(node->name()) != 0) { + exclude_node( + "excluded by segmenter option. Most likely an input or " + "output node."); + } else if (options.use_implicit_batch && + !OperationCanBeTranslatedToImplicitBatch(graph_properties, + node->tf_node())) { + exclude_node( + "implicit batch mode requires input shape with at least two " + "dimensions"); + } else if (!options.allow_dynamic_non_batch_dim && + OperationHasDynamicNonBatchDimension(graph_properties, + node->tf_node())) { + exclude_node("dynamic non-batch dimensions not allowed"); } else { const Status status = candidate_fn(node->tf_node()); if (!status.ok()) { - VLOG(1) << "Not a TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " - << "(Op name: " << node->name() << "), " - << "(Reason: " << status << ")"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; - node = nullptr; - } else if (tftrt_node_blacklist.count(node->tf_node()->type_string())) { + exclude_node(status.error_message()); + } else if (tftrt_op_denylist.contains(node->tf_node()->type_string())) { // WARNING verbosity since the user explicitly requests this behavior. 
- LOG(WARNING) << "Blacklisted as TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " - << "(Op name: " << node->name() << "), " - << "(Reason: Blacklisted with the env var TF_TRT_OP_BLACKLIST)"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; - node = nullptr; - } else if (labeled_node_blacklist.count(node->tf_node()->name())) { - LOG(WARNING) << "Blacklisted as TF-TRT candidate, " - << "(Op name: " << node->name() << "), " - << "(Reason: User labeled nodes blacklist)"; - // TODO FIXME : delete - unsupported_ops.emplace(node->tf_node()->name()); - num_unsupported_ops++; - node = nullptr; + LOG_WARNING_WITH_PREFIX + << "Denylisted as TF-TRT candidate, " + << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op name: " << node->name() << ")"; + exclude_node("Denylisted with the env var TF_TRT_OP_DENYLIST"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name(); } } - node_segments.emplace_back(node); + AddSegmentForNode(graph_properties, &node_segments, node, *device_name, + options.use_implicit_batch); } - string msg = StrCat( - "There are ", num_unsupported_ops, " ops of ", unsupported_ops.size(), - " different types in the graph that", " are not converted to TensorRT: "); - for (const auto& elem : unsupported_ops) { - StrAppend(&msg, elem, ", "); - } - LOG(INFO) << msg << "(For more information see " - << "https://docs.nvidia.com/deeplearning" - << "/frameworks/tf-trt-user-guide/index.html#supported-ops)."; + + LOG(WARNING) << GenerateNonConversionReport(nonconverted_ops_map); // The segmentation algorithm below visits nodes in reverse topological order // and attempts to merge nodes along output edges. That means that subgraphs // grow from the output-side of the network towards the inputs. // // In general this is not guaranteed to produce a globally optimal - // segmentation. For exaample, consider graph with node {A, B, C, D} and edges + // segmentation. For example, consider graph with node {A, B, C, D} and edges // {A->B, A->C, B->D, C->D), where A, B, D are trt compatible but C is not, so // in theory we can choose to contract either A, B or B, D but not both, but // here it always choose to contract B, D. @@ -543,18 +991,25 @@ Status SegmentGraph(const Graph* tf_graph, return true; }); for (const SimpleNode* node : order) { - // All output nodes of 'node' have been visited... + // All output nodes of 'node' have been visited. VLOG(3) << "Trying node " << node->name() << " id=" << node->id(); - // 'node' must be a TRT candidate... + // 'node' must be a TRT candidate. if (node_segments[node->id()].Value() == nullptr) { VLOG(3) << "... not a TRT candidate"; continue; } - // Contract output edges to combine 'node' with output - // nodes. Iterate since combining two nodes may unblock other - // combining. + // Contract output edges to combine 'node' with output nodes. Repeat this + // step until no output edges can be further contracted. This is because + // contracting an output edge may unblock new edges for contracting. + ClusterBatchSize expected_batch_size = + node_segments[node->id()].Property().BatchSize(); + DeviceNameUtils::ParsedName expected_device_name = + node_segments[node->id()].Property().DeviceName(); + VLOG(3) << "batch size " << expected_batch_size; while (true) { std::set contract_edges; + // TODO(bixia): consider merging the loop to find the edges and the loop + // to contract the edges. 
for (const SimpleEdge* out_edge : node->out_edges()) { VLOG(3) << "... out node " << out_edge->dst()->name() << " ( " << out_edge->dst()->id() << " <- " << node->id() << " )"; @@ -562,14 +1017,39 @@ Status SegmentGraph(const Graph* tf_graph, VLOG(3) << "... ... Control Edge, Skipping"; continue; } - // Out node must be TRT candidate... - if (node_segments[out_edge->dst()->id()].Value() == nullptr) { + UnionFind* out_cluster = + &node_segments[out_edge->dst()->id()]; + // Out node must be a TRT candidate. + if (out_cluster->Value() == nullptr) { VLOG(3) << "... ... not a TRT candidate"; continue; } + // Out node must have compatible batch size. + ClusterBatchSize out_batch_size = out_cluster->Property().BatchSize(); + ClusterBatchSize merged_batch_size = expected_batch_size; + if (!merged_batch_size.MergeIfCompatible(out_batch_size)) { + VLOG(3) << "... ... incompatible batch sizes " + << expected_batch_size.ToString() << " " + << out_batch_size.ToString(); + continue; + } + + const DeviceNameUtils::ParsedName& out_device_name = + out_cluster->Property().DeviceName(); + absl::optional merged_device_name = + MergeIfCompatible(expected_device_name, out_device_name); + if (!merged_device_name.has_value()) { + VLOG(3) << "... ... incompatible device names " + << expected_device_name << " " << out_device_name; + continue; + } + if (CanContractEdge(out_edge, graph)) { - VLOG(3) << "... ... can contract"; + VLOG(3) << "... ... can contract. new batch size " + << merged_batch_size.ToString(); contract_edges.insert(out_edge); + expected_batch_size = merged_batch_size; + expected_device_name = *merged_device_name; } else { VLOG(3) << "... ... cannot contract, would form cycle"; } @@ -586,7 +1066,8 @@ Status SegmentGraph(const Graph* tf_graph, VLOG(3) << "Merge " << src->name() << " <- " << dst->name() << " (" << src->id() << " <- " << dst->id(); - node_segments[src->id()].Merge(&node_segments[dst->id()]); + TF_RETURN_IF_ERROR( + node_segments[src->id()].Merge(&node_segments[dst->id()])); // Contracting the edge leaves disconnected graph edges. // Remove these from the graph and from 'contract_edges' so we @@ -600,6 +1081,16 @@ Status SegmentGraph(const Graph* tf_graph, graph->RemoveEdge(r); } } + if (expected_batch_size != + node_segments[node->id()].Property().BatchSize()) { + return errors::Internal( + "expected batch size is not the same as the actual batch size"); + } + if (expected_device_name != + node_segments[node->id()].Property().DeviceName()) { + return errors::Internal( + "expected device name is not the same as the actual device name"); + } } } @@ -608,43 +1099,21 @@ Status SegmentGraph(const Graph* tf_graph, // A map from the segment identifier (currently the name of the root node of // the segment tree) to the segment nodes set. - std::map> sg_map; - - // A map from the segment identifier (currently the name of the root node of - // the segment tree) to the device names that the nodes in the segment are - // assigned to. - // - // TODO(aaroey): nodes assigned to different devices should not be merged, - // fix this. - std::unordered_map> device_maps; + std::map sg_map; for (auto& u : node_segments) { if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) { - sg_map[u.ParentValue()->name()].insert(u.Value()->tf_node()); - auto tf_node = u.Value()->tf_node(); - // has_assigned_device_name() is expected to return true - // when called from optimization pass. 
However, since graph - // is converted back and forth between graph and graphdef, - // assigned devices demoted to requested devices. If the graph - // is passed directly to this module, assigned devices will be set. - if (tf_node->has_assigned_device_name()) { - device_maps[u.ParentValue()->name()].insert( - tf_node->assigned_device_name()); - } else if (!tf_node->requested_device().empty()) { - device_maps[u.ParentValue()->name()].insert( - tf_node->requested_device()); - } else { - VLOG(2) << "Node " << tf_node->name() - << " has no device assigned requested device is: " - << tf_node->requested_device(); - } + sg_map[u.ParentValue()->name()].nodes.insert(u.Value()->tf_node()); + } + if ((u.Value() != nullptr) && (u.ParentValue() == u.Value())) { + sg_map[u.Value()->name()].property = u.Property(); } } // --------------------------------- Step 2 --------------------------------- // Remove ineligible input/output nodes. for (auto& itr : sg_map) { - std::set& segment_nodes = itr.second; + std::set& segment_nodes = itr.second.nodes; VLOG(1) << "Segment original size: " << segment_nodes.size(); while (true) { std::deque in_nodes_que, out_nodes_que; @@ -729,10 +1198,12 @@ Status SegmentGraph(const Graph* tf_graph, // --------------------------------- Step 3 --------------------------------- // Convert the segments into the expected return format + std::vector effective_nodes_counts; for (const auto& itr : sg_map) { const string& segment_root = itr.first; // Return format does not require set comparator. - std::set segment_nodes(itr.second.begin(), itr.second.end()); + std::set segment_nodes( + itr.second.nodes.begin(), itr.second.nodes.end()); if (VLOG_IS_ON(1) && !segment_nodes.empty()) { string s; for (auto node : segment_nodes) { @@ -750,36 +1221,89 @@ Status SegmentGraph(const Graph* tf_graph, }); // Don't use segments whose number of effective nodes is small. - if (num_effective_nodes < options.minimum_segment_size) { + if (num_effective_nodes == 0 || + num_effective_nodes < options.minimum_segment_size) { VLOG(1) << "Segment " << segments->size() << " has only " << num_effective_nodes << " effective nodes, dropping"; continue; } + segments->emplace_back(itr.second.property, segment_nodes); + effective_nodes_counts.push_back(num_effective_nodes); + } + + // --------------------------------- Step 4 --------------------------------- + // If the number of segments exceeds max_engines, prune the smallest ones. + + int64 max_trt_engine_ops; + TF_CHECK_OK(ReadInt64FromEnvVar("TF_TRT_MAX_ALLOWED_ENGINES", + /*default_value=*/20, &max_trt_engine_ops)); - const auto& dev_itr = device_maps.find(segment_root); - if (dev_itr == device_maps.end() || dev_itr->second.empty()) { - VLOG(1) << "No device assigned to segment " << segments->size(); - } else if (dev_itr->second.size() > 1) { - string s = StrCat("Segment ", segments->size(), - " has multiple devices attached: "); - for (const auto& dev : dev_itr->second) { - StrAppend(&s, dev, ", "); + if (max_trt_engine_ops <= 0) { + LOG(WARNING) << "The environment variable TF_TRT_MAX_ALLOWED_ENGINES is " + << "<= 0. TF-TRT did not limit the number of TensorRT engines " + << "created."; + + } else { + if (segments->size() > max_trt_engine_ops) { + LOG(WARNING) << "A total of " << segments->size() << " segments with at " + << "least minimum_segment_size=" + << options.minimum_segment_size << " nodes have been found. " + << "TF-TRT will only convert the " << max_trt_engine_ops + << " largest segments. 
You can change this behavior by " + << "modifying the environment variable " + << "TF_TRT_MAX_ALLOWED_ENGINES=" << max_trt_engine_ops; + + // Stable sort of the segment indices according to their effective sizes. + std::vector indices(segments->size()); + std::iota(indices.begin(), indices.end(), 0); + + std::stable_sort(indices.begin(), indices.end(), + [&effective_nodes_counts](int i1, int i2) { + return effective_nodes_counts[i1] > + effective_nodes_counts[i2]; + }); + + // Create a mask of segments to keep. + std::vector mask = std::vector(segments->size(), false); + + for (int i = 0; i < max_trt_engine_ops; i++) { + mask[indices[i]] = true; } - LOG(WARNING) << s; - } - segments->emplace_back(segment_nodes); - } - if (VLOG_IS_ON(1)) { - for (const auto& d : device_maps) { - string s("Segment "); - StrAppend(&s, ": '", d.first, "' "); - for (const auto& dd : d.second) { - StrAppend(&s, dd, ", "); + // Gather the masked elements at the start of the array, in place. + int j = 0; + VLOG(1) << "The following segments have been accepted by TF-TRT:"; + for (int i = 0; i < segments->size(); i++) { + if (mask[i]) { + VLOG(1) << "[*] Segment " << i + << " [node count: " << effective_nodes_counts[i] + << "] accepted. Re-assigned " + << "segment id=" << j; + segments->at(j) = segments->at(i); + j++; + } } - VLOG(1) << "Devices " << s; + + VLOG(1) << "The following segments have been rejected by TF-TRT:"; + for (int i = 0; i < segments->size(); i++) { + if (!mask[i]) { + VLOG(1) << "[*] Segment " << i + << " [node count: " << effective_nodes_counts[i] + << "] rejected."; + } + } + + // Resize the array. + segments->resize(max_trt_engine_ops); + } else { + LOG(WARNING) << "The environment variable TF_TRT_MAX_ALLOWED_ENGINES=" + << max_trt_engine_ops << " has no effect since there are " + << "only " << segments->size() << " TRT Engines with at " + << "least minimum_segment_size=" + << options.minimum_segment_size << " nodes."; } } + return Status::OK(); } @@ -787,5 +1311,4 @@ Status SegmentGraph(const Graph* tf_graph, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index 77c0af223c8..ad41d5eb40f 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -19,30 +19,59 @@ limitations under the License. #include #include +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace segment { -// Vector of segments, each entry contains a set of node pointers. -using SegmentNodesVector = std::vector>; +constexpr char kTftrtOpMaxBatchSizeAttr[] = "_tftrt_op_max_batch_size"; struct SegmentOptions { + // This struct holds per graph segmenting parameters. // Segment must contain at least this many nodes. int minimum_segment_size = 2; + bool use_implicit_batch = true; + // The maximum batch size used to build the engines in the graph, when + // use_implicit_batch is true. 
+ absl::optional maximum_batch_size = absl::nullopt; + // When use_implicit_batch is false or when we are building dynamic engines, + // we allow dynamic non-batch dimensions. + bool allow_dynamic_non_batch_dim = false; + // The name of the device to put the segment on. std::set exclude_node_list; }; +struct NodePtrCompare { + bool operator()(const Node* lhs, const Node* rhs) const { + return lhs->name() < rhs->name(); + } +}; + +struct Segment { + Segment() {} + Segment(const ClusterProperty& property, + const std::set& nodes) + : property(property), nodes(nodes) {} + ClusterProperty property; + std::set nodes; +}; + +// Vector of segments, each entry contains a set of node pointers. +using SegmentVector = std::vector; + // Get the subgraphs of a graph that can be handled by TensorRT. // -// @param graph Graph of the network +// @param tf_graph Graph of the network. +// @graph_properties is the static graph properties. // @param candidate_fn A function that returns OK for a Node* if // that node can be handled by TensorRT. // @param segments Returns the TensorRT segments/subgraphs. Each entry @@ -50,17 +79,16 @@ struct SegmentOptions { // all the NodeDefs in that subgraph. // @return the status. Status SegmentGraph(const Graph* tf_graph, + const grappler::GraphProperties* graph_properties, const std::function& candidate_fn, const std::function& input_candidate_fn, const std::function& output_candidate_fn, - const SegmentOptions& options, - SegmentNodesVector* segments); + const SegmentOptions& options, SegmentVector* segments); } // namespace segment } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index cb038e58126..12f3e7a5742 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -26,8 +26,7 @@ limitations under the License. 
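Step 4 of SegmentGraph above caps the number of engines by stable-sorting segment indices by effective node count, keeping the top TF_TRT_MAX_ALLOWED_ENGINES entries, and compacting them in place so the surviving segments keep their original relative order. The same idea in a self-contained sketch over plain vectors (segment payloads reduced to strings for illustration):

#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

void KeepLargestSegments(std::vector<std::string>* segments,
                         const std::vector<int>& effective_nodes_counts,
                         int max_segments) {
  if (segments->size() <= static_cast<size_t>(max_segments)) return;

  // Indices sorted by descending effective node count; stable sort keeps the
  // original order among equally sized segments.
  std::vector<int> indices(segments->size());
  std::iota(indices.begin(), indices.end(), 0);
  std::stable_sort(indices.begin(), indices.end(),
                   [&](int a, int b) {
                     return effective_nodes_counts[a] > effective_nodes_counts[b];
                   });

  // Mark the top-N segments, then gather them at the front in place.
  std::vector<bool> keep(segments->size(), false);
  for (int i = 0; i < max_segments; ++i) keep[indices[i]] = true;

  int j = 0;
  for (size_t i = 0; i < segments->size(); ++i) {
    if (keep[i]) (*segments)[j++] = (*segments)[i];
  }
  segments->resize(max_segments);
}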
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -42,7 +41,7 @@ class SegmentTest : public ::testing::Test { if (node_names.find(node->name()) != node_names.end()) { return Status::OK(); } - return errors::NotFound(""); + return errors::NotFound("Not a user specified candidate"); }; } @@ -60,24 +59,35 @@ class SegmentTest : public ::testing::Test { }; } - void RunTest(const Graph* graph, const std::set& candidates, + void RunTest(const Graph* graph, + const grappler::GraphProperties* graph_properties, + const std::set& candidates, const std::set& input_candidates, const std::set& output_candidates, const std::vector>& expected_segments) { - SegmentNodesVector segments; - TF_EXPECT_OK(SegmentGraph(graph, MakeCandidateFn(candidates), + SegmentVector segments; + TF_EXPECT_OK(SegmentGraph(graph, graph_properties, + MakeCandidateFn(candidates), MakeInputEdgeCandidateFn(input_candidates), MakeOutputEdgeCandidateFn(output_candidates), - default_options_, &segments)); + segment_options_, &segments)); ValidateSegment(segments, expected_segments); } - void ValidateSegment(const SegmentNodesVector& segments, + void RunTest(const Graph* graph, const std::set& candidates, + const std::set& input_candidates, + const std::set& output_candidates, + const std::vector>& expected_segments) { + RunTest(graph, nullptr, candidates, input_candidates, output_candidates, + expected_segments); + } + + void ValidateSegment(const SegmentVector& segments, const std::vector>& expected_segments) { EXPECT_EQ(expected_segments.size(), segments.size()); for (int i = 0; i < segments.size(); ++i) { std::set segment_node_names; - for (const Node* node : segments[i]) { + for (const Node* node : segments[i].nodes) { segment_node_names.insert(node->name()); } const auto& expected = expected_segments[i]; @@ -93,7 +103,18 @@ class SegmentTest : public ::testing::Test { } } - SegmentOptions default_options_; + void DisableImplicitBatchMode() { + segment_options_.use_implicit_batch = false; + segment_options_.allow_dynamic_non_batch_dim = true; + } + + void EnableImplicitBatchModeForStaticEngine(int maximum_batch_size = 1000) { + segment_options_.use_implicit_batch = true; + segment_options_.maximum_batch_size = maximum_batch_size; + segment_options_.allow_dynamic_non_batch_dim = false; + } + + SegmentOptions segment_options_; }; std::set operator-(const std::set& lhs, const string& rhs) { @@ -107,6 +128,7 @@ TEST_F(SegmentTest, Empty) { Graph g(OpRegistry::Global()); TF_EXPECT_OK(s.ToGraph(&g)); // Expect no segments/subgraphs. 
+ DisableImplicitBatchMode(); RunTest(&g, {}, {}, {}, {}); } @@ -133,6 +155,7 @@ TEST_F(SegmentTest, Simple) { // All Add operations are candidates, and we expect all of them to be // collapsed into a single segment const std::set all_adds = {"add0", "add1", "add2", "add3", "add4"}; + DisableImplicitBatchMode(); RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); // Make add1 not a candidate, and we expect all other Add operations to be @@ -157,6 +180,69 @@ TEST_F(SegmentTest, Simple) { RunTest(&g, all_adds, all_adds, without_add3, {all_adds}); } +TEST_F(SegmentTest, WithDeviceAssignments) { + // feed + // // \\ + // add0 add1 + // | \ / + // | add2 + // | / \\ + // add3 add4 + // \ / + // + Scope s = Scope::NewRootScope(); + auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT); + auto add0 = ops::Add(s.WithOpName("add0"), feed, feed); + auto add1 = ops::Add(s.WithOpName("add1"), feed, feed); + auto add2 = ops::Add(s.WithOpName("add2"), add0, add1); + auto add3 = ops::Add(s.WithOpName("add3"), add0, add2); + auto add4 = ops::Add(s.WithOpName("add4"), add2, add2); + + const std::set all_adds = {"add0", "add1", "add2", "add3", "add4"}; + DisableImplicitBatchMode(); + + { + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); + } + + { + // Assigning add1 to CPU to exclude it from the cluster. + add1.node()->set_assigned_device_name("/device:CPU:0"); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {all_adds - "add1"}); + add1.node()->set_assigned_device_name(""); + } + + { + // Assigning operations add3 and add4 to another GPU to exclude the + // operation from the cluster. + constexpr char kGpu0[] = "/device:GPU:0"; + add0.node()->set_assigned_device_name(kGpu0); + add1.node()->set_assigned_device_name(kGpu0); + add2.node()->set_assigned_device_name(kGpu0); + constexpr char kGpu1[] = "/device:GPU:1"; + add3.node()->set_assigned_device_name(kGpu1); + add4.node()->set_assigned_device_name(kGpu1); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {{"add0", "add1", "add2"}}); + } + + { + // Assigning the operations to two compatibile GPU devices resulting in + // one cluster with all operations. + constexpr char kGpuAny[] = "/device:GPU:*"; + add3.node()->set_assigned_device_name(kGpuAny); + add4.node()->set_assigned_device_name(kGpuAny); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); + } +} + TEST_F(SegmentTest, AvoidCycle) { // feed // // \\ @@ -179,6 +265,7 @@ TEST_F(SegmentTest, AvoidCycle) { // add2 is not a TRT candidate so there should be no segments generated. const std::set without_add2 = {"add0", "add1", "add3", "add4"}; + DisableImplicitBatchMode(); RunTest(&g, without_add2, without_add2, without_add2, {}); } @@ -212,6 +299,7 @@ TEST_F(SegmentTest, Multiple) { "add5", "add6", "add7", "add8"}; // Make add5 not a TRT candidate, and we expect two segments. auto without_add5 = all_adds - "add5"; + DisableImplicitBatchMode(); RunTest(&g, without_add5, without_add5, without_add5, {{"add0", "add1", "add2", "add3"}, {"add6", "add8"}}); @@ -258,6 +346,7 @@ TEST_F(SegmentTest, BigIfElse) { // Make add2 not a TRT candidate, and we expect 2 segments. 
const std::set all_adds = {"add0", "add1", "add2", "add3", "add4", "add5", "add6", "add7"}; + DisableImplicitBatchMode(); RunTest(&g, all_adds - "add2", all_adds, all_adds, {{"add0", "add1"}, {"add3", "add4", "add5", "add6", "add7"}}); } @@ -276,13 +365,229 @@ TEST_F(SegmentTest, IdentityOps) { "identity2", "identity3"}; // Identity ops are not counted as effective ops in the segment, so no segment // will be formed in this case. + DisableImplicitBatchMode(); RunTest(&g, all_identities, all_identities, all_identities, {}); } +// Testing implicit batch mode segmentation: it excludes the add-2 operation +// with a dynamic non-batch dimension. +TEST_F(SegmentTest, ExcludeAddWithDynamicNonBatchDimension) { + Scope s = Scope::NewRootScope(); + auto feed_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2, 3})); + auto feed_1_shape = ops::Placeholder::Shape(PartialTensorShape({-1, -1, 3})); + auto const_val = ops::Const(s, {1.0}, {}); + auto feed_0 = + ops::Placeholder(s.WithOpName("feed-1"), DT_FLOAT, feed_0_shape); + auto feed_1 = + ops::Placeholder(s.WithOpName("feed-2"), DT_FLOAT, feed_1_shape); + auto add_0 = ops::Add(s.WithOpName("add-0"), feed_0, const_val); + auto add_1 = ops::Add(s.WithOpName("add-1"), add_0, feed_0); + auto add_2 = ops::Add(s.WithOpName("add-2"), const_val, feed_1); + + grappler::GrapplerItem item; + item.fetch.push_back("add-2"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"add-0", "add-1", "add-2"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {all_nodes - "add-2"}); +} + +// Testing implicit batch mode segmentation: It excludes the reshape operation +// with a dynamic non-batch output dimension. +// TODO(bixia): hoist the check for reshape should not change batch size from +// the converter to the segmenter and add another test case for excluding +// a reshape without dynamic dimensions involved. 
+TEST_F(SegmentTest, ExcludeReshapeWithDynamicNonBatchDimensionInOutput) { + Scope s = Scope::NewRootScope(); + auto feed_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2, 3})); + auto const_val = ops::Const(s, {1.0}, {}); + auto feed_0 = + ops::Placeholder(s.WithOpName("feed-1"), DT_FLOAT, feed_0_shape); + auto add_0 = ops::Add(s.WithOpName("add-0"), feed_0, const_val); + auto reshape = ops::Reshape(s.WithOpName("reshape"), add_0, Input({6, -1})); + auto add_1 = ops::Add(s.WithOpName("add-1"), reshape, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("add-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"add-0", "reshape", "add-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} + +TEST_F(SegmentTest, RankOneCannotUseImplicitBatch) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({3})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({3})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-scalar"), 1.0f, {}); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, const_val); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} + +TEST_F(SegmentTest, TwoChainsDiffBatchSizes) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({2, 3})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({5, 3})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-scalar"), 1.0f, {}); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, const_val); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + /*expected_segments=*/{{"output-0", "const-scalar"}}); + + // Converter will 
create engines based on the static batch size + EnableImplicitBatchModeForStaticEngine(1); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + /*expected_segments=*/{{"output-0", "const-scalar"}}); +} + +TEST_F(SegmentTest, SameRankImplicitBroadcastingStaticBatchSize) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({2, 3, 1})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({1, 3, 4})); + auto input_2_shape = ops::Placeholder::Shape(TensorShape({2, 3, 4})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto input_2 = + ops::Placeholder(s.WithOpName("input-2"), DT_FLOAT, input_2_shape); + auto multiple = ops::Mul(s.WithOpName("multiple"), input_2, input_2); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, multiple); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, multiple); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"multiple", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {all_nodes}); +} + +TEST_F(SegmentTest, SameRankImplicitBroadcastingDynamicBatchSize) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({1, 2})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-val"), 1.0f, {1, 1}); + auto add_0 = ops::Add(s.WithOpName("add-0"), input_0, const_val); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, add_0); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-val", "add-0", "output-0"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {{"const-val", "add-0", "output-0"}}); +} + +TEST_F(SegmentTest, IncompatibleBatchSizes) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({2, 2})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-val"), 1.0f, {2, 2}); + auto add_0 = ops::Add(s.WithOpName("add-0"), input_0, const_val); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, add_0); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + 
TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-val", "add-0", "output-0"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} } // namespace test } // namespace segment } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.cc b/tensorflow/compiler/tf2tensorrt/segment/union_find.cc new file mode 100644 index 00000000000..29882ed6e60 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.cc @@ -0,0 +1,154 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" + +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace segment { + +namespace { +template +inline bool CheckIfCompatible(const absl::optional& a, + const absl::optional& b) { + if (a.has_value() && b.has_value()) { + return *a == *b; + } + return true; +} + +template +inline bool UnifyValues(absl::optional& a, absl::optional& b) { + if (a.has_value()) { + b = a; + } else { + a = b; + } + return true; +} + +template +inline absl::optional MergeCompatible(const absl::optional& a, + const absl::optional& b) { + DCHECK(CheckIfCompatible(a, b)); + return a.has_value() ? 
a : b; +} + +} // namespace + +ClusterBatchSize::ClusterBatchSize() + : batch_size_(absl::nullopt), max_batch_size_(absl::nullopt) {} + +bool ClusterBatchSize::operator==(const ClusterBatchSize& other) { + return batch_size_ == other.batch_size_ && + max_batch_size_ == other.max_batch_size_; +} + +ClusterBatchSize& ClusterBatchSize::SetBatchSize(int batch_size) { + SetBatchSize(static_cast>(batch_size)); + return *this; +} + +ClusterBatchSize& ClusterBatchSize::SetBatchSize( + const absl::optional& batch_size) { + batch_size_ = MergeCompatible(batch_size_, batch_size); + if (batch_size_.has_value() && batch_size_.value() >= 0) { + SetMaxBatchSize(batch_size_); + } + return *this; +} + +bool ClusterBatchSize::HasBatchSize() const { return batch_size_.has_value(); } + +int ClusterBatchSize::GetBatchSize() const { + DCHECK(HasBatchSize()); + return batch_size_.value(); +} + +ClusterBatchSize& ClusterBatchSize::SetMaxBatchSize(int max_batch_size) { + SetBatchSize(static_cast>(max_batch_size)); + return *this; +} + +ClusterBatchSize& ClusterBatchSize::SetMaxBatchSize( + const absl::optional& max_batch_size) { + max_batch_size_ = MergeCompatible(max_batch_size_, max_batch_size); + return *this; +} + +absl::optional ClusterBatchSize::GetOptionalMaxBatchSize() const { + return max_batch_size_; +} + +bool ClusterBatchSize::MergeIfCompatible(const ClusterBatchSize& other) { + if (!CheckIfCompatible(batch_size_, other.batch_size_) || + !CheckIfCompatible(max_batch_size_, other.max_batch_size_)) { + return false; + } + + SetBatchSize(other.batch_size_); + SetMaxBatchSize(other.max_batch_size_); + return true; +} + +string ClusterBatchSize::ToString() const { + string s; + const auto append_optional_num = [&](const absl::optional& num) { + if (num.has_value()) { + absl::StrAppendFormat(&s, "%d", num.value()); + } else { + absl::StrAppendFormat(&s, "?"); + } + }; + absl::StrAppendFormat(&s, "batch_size="); + append_optional_num(batch_size_); + absl::StrAppendFormat(&s, ", max_batch_size="); + append_optional_num(max_batch_size_); + return s; +} + +ClusterProperty::ClusterProperty(const ClusterBatchSize& batch_size, + const DeviceNameUtils::ParsedName& device_name) + : batch_size_(batch_size), device_name_(device_name) {} + +Status ClusterProperty::Merge(const ClusterProperty& other) { + ClusterBatchSize merged_batch_size(batch_size_); + if (!merged_batch_size.MergeIfCompatible(other.batch_size_)) { + return errors::Internal( + "trying to merge clusters with incompatible batch sizes."); + } + + absl::optional merged_device_name = + MergeIfCompatible(device_name_, other.device_name_); + if (!merged_device_name.has_value()) { + return errors::Internal( + "trying to merge clusters with incompatible device assignment."); + } + + batch_size_ = std::move(merged_batch_size); + device_name_ = std::move(merged_device_name.value()); + return Status::OK(); +} + +} // namespace segment +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index 6458ae692fd..9a2f1e8dd5b 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -16,55 +16,192 @@ limitations under the License. 
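The batch-size bookkeeping in union_find.cc above rests on one small rule for optional values: two values conflict only when both are present and differ, and merging keeps whichever one is known. Restated with std::optional purely for illustration:

#include <optional>

template <typename T>
bool Compatible(const std::optional<T>& a, const std::optional<T>& b) {
  return !a.has_value() || !b.has_value() || *a == *b;
}

template <typename T>
std::optional<T> MergeOptionals(const std::optional<T>& a,
                                const std::optional<T>& b) {
  return a.has_value() ? a : b;  // precondition: Compatible(a, b)
}

// Example: batch sizes 8 and (unknown) merge to 8; 8 and 8 merge to 8;
// 8 and 16 are incompatible, so the two clusters are not unioned.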
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ +#include "absl/types/optional.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/device_name_utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + namespace tensorflow { namespace tensorrt { namespace segment { -// Union-Find data structure. -// Each cluster has an associated value; when merging clusters we can control -// which value becomes the representative of the merged clusters. Values must be -// copyable. -template +// ClusterBatchSize is a data structure to record the batch size we have seen +// for a cluster during segmentation. +// +// With the help of shape inference, all the dynamic batch sizes are converted +// to a negative integer number. +// If the number is -1, then nothing is known about the dynamic batch size. +// Ideally, we should not put nodes with -1 batch size into the same cluster, +// as they will likely have different batch sizes at runtime. However, we +// currently treat -1 as an equivalent class for simple implementation. We may +// need to revise this if it causes performance issues. +// If the number is strictly less than -1, then it represents a equivalent +// class. It is infered that all the nodes with the same equivalent class +// (strictly less than -1) shall have the same batch size at runtime. +// +// When constructing clusters for implicit batch mode, we support both +// dynamic batch sizes and static batch sizes. As all the nodes inside the same +// cluster shall have the same batch size at runtime, we restrict nodes inside a +// cluster to either have the same dynamic batch size equivalent class or the +// same static batch size value. +// +// Besides, all the nodes with an annotated max batch size inside the same +// cluster shall have the same annotated max batch size. (It is allowed if +// part or all the nodes inside the cluster doesn't have annotated max batch +// size). Static batch sizes are treated as max batch size annotations. The +// converter max batch size is used for an OP with a dynamic batch size and no +// annotated max batch size. +// +// cluster: a = a1[1,3] + a1[1,3] +// ClusterBatchSize: batch_size_ = 1 +// max_batch_size_ = 1 +// +// cluster: b = b1[-1,3] + b2[-1, 3] +// ClusterBatchSize: batch_size_ = -1 +// max_batch_size_ = null +// +// cluster: c = c1[-2,3] + c2[-2, 3](max_batch_size=100) +// ClusterBatchSize: batch_size_ = -2 +// max_batch_size_ = 100 +// +// When constructing cluster for explicit batch mode, all ClusterBatchSize is +// irrelevant. +// + +class ClusterBatchSize { + public: + ClusterBatchSize(); + + bool operator==(const ClusterBatchSize& other); + bool operator!=(const ClusterBatchSize& other) { return !(*this == other); } + + // Sets the batch size assuming that the object doesn't have a batch size yet: + // A non-negative input representing a static batch size value. + // A negative input representing a dynamic batch size equivalent class. + ClusterBatchSize& SetBatchSize(int batch_size); + bool HasBatchSize() const; + int GetBatchSize() const; + + // Sets the max batch size assuming that the object doesn't have a max batch + // size yet. + ClusterBatchSize& SetMaxBatchSize(int max_batch_size); + absl::optional GetOptionalMaxBatchSize() const; + + // Merge `other` into the current ClusterBatchSize if the two are not + // conflicting. Two ClusterBatchSizes are conflicting iff they both have a + // value and their values are different. 
+  bool MergeIfCompatible(const ClusterBatchSize& other);
+
+  // Returns a string for the batch size and the annotated max batch size.
+  // For the batch size:
+  //   If the object has a static batch size, return a string representing a
+  //   non-negative integer.
+  //   If the object has a dynamic batch size, return a string representing a
+  //   negative integer as an equivalence class.
+  //   If the object doesn't have a batch size yet, return "?".
+  // For the annotated max batch size:
+  //   If the cluster has an annotated max batch size in at least one of the
+  //   nodes, return a string representing the annotated max batch size.
+  //   Otherwise, return "?".
+  std::string ToString() const;
+
+ private:
+  ClusterBatchSize& SetBatchSize(const absl::optional<int>& batch_size);
+  ClusterBatchSize& SetMaxBatchSize(const absl::optional<int>& max_batch_size);
+
+  absl::optional<int> batch_size_;
+  absl::optional<int> max_batch_size_;
+};
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const ClusterBatchSize& batch_size) {
+  return os << batch_size.ToString();
+}
+
+// Represents the accumulated properties of a cluster during segmentation,
+// including information about batch size and device assignment. Clusters
+// shall have compatible properties in order to be merged together.
+class ClusterProperty {
+ public:
+  ClusterProperty() {}
+  ClusterProperty(const ClusterBatchSize& batch_size,
+                  const DeviceNameUtils::ParsedName& device_name);
+
+  // Returns the batch size of the cluster.
+  const ClusterBatchSize& BatchSize() const { return batch_size_; }
+
+  // Returns the device name of the cluster.
+  const DeviceNameUtils::ParsedName& DeviceName() const {
+    return device_name_;
+  }
+
+  Status Merge(const ClusterProperty& other);
+
+ private:
+  ClusterBatchSize batch_size_;
+  DeviceNameUtils::ParsedName device_name_;
+};
+
+// Represents a disjoint set of copyable values of type T together with an
+// accumulated property of type P for the values in the set. Most of the
+// methods in this class are side-effecting, as they also compress the path
+// from the object to the root of its containing set.
+template <typename T, typename P>
 class UnionFind {
  public:
   UnionFind() : size_(1), parent_(nullptr) {}
-  explicit UnionFind(const T& v) : size_(1), parent_(nullptr), value_(v) {}
+  UnionFind(const T& v, const P& p)
+      : size_(1), parent_(nullptr), value_(v), property_(p) {}
+  UnionFind(const T& v, P&& p)
+      : size_(1), parent_(nullptr), value_(v), property_(p) {}
 
-  // Returns the number of elements in a cluster.
+  // Returns the number of elements in the set and compresses the path from
+  // this object to the root of the set.
   int Size() { return FindRoot()->size_; }
 
-  // Merges this cluster with 'other'. This cluster's value becomes
-  // the value of the merged cluster; the value of 'other' is ignored.
-  void Merge(UnionFind* other);
+  // Returns the accumulated property of all the elements in the set and
+  // compresses the path from this object to the root of the set.
+  const P& Property() { return FindRoot()->property_; }
 
-  // Each cluster has an associated value. Retrieves the value associated
-  // with this cluster.
-  T& ParentValue() { return FindRoot()->value_; }
+  // Merges this set with 'other'. This updates the size_ and property_ of the
+  // set. The size_ and property_ of 'other' become inaccessible, as only the
+  // size_ and property_ of the root of the set are accessible.
+ Status Merge(UnionFind* other); - // Get the original value of this node. - T& Value() { return value_; } + // Retrieves the value for the root of the set. + const T& ParentValue() { return FindRoot()->value_; } + + // Returns the value for the object. + const T& Value() const { return value_; } private: - // Finds the root element of the cluster. Performs path compression. + // Returns the root object for the set and compresses the path from this + // object to the root object. UnionFind* FindRoot(); int size_; UnionFind* parent_; T value_; + P property_; }; -template -void UnionFind::Merge(UnionFind* other) { +template +Status UnionFind::Merge(UnionFind* other) { UnionFind* a = FindRoot(); UnionFind* b = other->FindRoot(); - if (a == b) return; + if (a == b) return Status::OK(); + P merged_property(a->property_); + TF_RETURN_IF_ERROR(merged_property.Merge(b->property_)); b->parent_ = a; a->size_ += b->size_; + a->property_ = std::move(merged_property); + return Status::OK(); } -template -UnionFind* UnionFind::FindRoot() { +template +UnionFind* UnionFind::FindRoot() { if (!parent_) return this; // Path compression: update intermediate nodes to point to the root of the // equivalence class. @@ -76,4 +213,6 @@ UnionFind* UnionFind::FindRoot() { } // namespace tensorrt } // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ diff --git a/tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc b/tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc new file mode 100644 index 00000000000..ca8de76aef9 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc @@ -0,0 +1,95 @@ +// Auto-generated, do not edit. + +extern "C" { + +nvinfer1::IPluginV2* createRPNROIPlugin(int featureStride, int preNmsTop, + int nmsMaxOut, float iouThreshold, + float minBoxSize, float spatialScale, + nvinfer1::DimsHW pooling, + nvinfer1::Weights anchorRatios, + nvinfer1::Weights anchorScales) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(int, int, int, float, float, float, nvinfer1::DimsHW, nvinfer1::Weights, nvinfer1::Weights); + static auto func_ptr = LoadSymbol("createRPNROIPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createRPNROIPlugin"); + return func_ptr(featureStride, preNmsTop, nmsMaxOut, iouThreshold, minBoxSize, spatialScale, pooling, anchorRatios, anchorScales); +} + +nvinfer1::IPluginV2* createNormalizePlugin(const nvinfer1::Weights* scales, + bool acrossSpatial, + bool channelShared, float eps) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(const nvinfer1::Weights *, bool, bool, float); + static auto func_ptr = LoadSymbol("createNormalizePlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createNormalizePlugin"); + return func_ptr(scales, acrossSpatial, channelShared, eps); +} + +nvinfer1::IPluginV2* createPriorBoxPlugin( + nvinfer1::plugin::PriorBoxParameters param) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::PriorBoxParameters); + static auto func_ptr = LoadSymbol("createPriorBoxPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createPriorBoxPlugin"); + return func_ptr(param); +} + +nvinfer1::IPluginV2* createAnchorGeneratorPlugin( + nvinfer1::plugin::GridAnchorParameters* param, int numLayers) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::GridAnchorParameters *, int); + static auto func_ptr = LoadSymbol("createAnchorGeneratorPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createAnchorGeneratorPlugin"); + return func_ptr(param, numLayers); +} + 
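Every wrapper in these auto-generated stub files follows the same pattern: resolve the real TensorRT entry point once, cache the function pointer in a static local, and report a fatal error if the symbol cannot be found. The sketch below illustrates that mechanism in isolation; it uses plain dlopen/dlsym and an illustrative library name rather than the DsoLoader/Env plumbing that the real stub .cc files provide, so none of it should be read as the actual loader implementation.

#include <dlfcn.h>

#include <cstdio>
#include <cstdlib>

// Open the plugin library once; RTLD_LAZY defers binding of other symbols.
void* GetNvInferPluginHandle() {
  static void* handle = dlopen("libnvinfer_plugin.so", RTLD_LAZY);
  return handle;
}

// Resolve a symbol and cast it to the expected function-pointer type.
template <typename FuncPtr>
FuncPtr LoadSymbol(const char* symbol_name) {
  void* handle = GetNvInferPluginHandle();
  void* symbol = handle ? dlsym(handle, symbol_name) : nullptr;
  return reinterpret_cast<FuncPtr>(symbol);
}

// Called by the generated wrappers when resolution fails.
void LogFatalSymbolNotFound(const char* symbol_name) {
  std::fprintf(stderr, "%s symbol not found.\n", symbol_name);
  std::abort();
}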
+nvinfer1::IPluginV2* createNMSPlugin( + nvinfer1::plugin::DetectionOutputParameters param) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::DetectionOutputParameters); + static auto func_ptr = LoadSymbol("createNMSPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createNMSPlugin"); + return func_ptr(param); +} + +nvinfer1::IPluginV2* createLReLUPlugin(float negSlope) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(float); + static auto func_ptr = LoadSymbol("createLReLUPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createLReLUPlugin"); + return func_ptr(negSlope); +} + +nvinfer1::IPluginV2* createReorgPlugin(int stride) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(int); + static auto func_ptr = LoadSymbol("createReorgPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createReorgPlugin"); + return func_ptr(stride); +} + +nvinfer1::IPluginV2* createRegionPlugin( + nvinfer1::plugin::RegionParameters params) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::RegionParameters); + static auto func_ptr = LoadSymbol("createRegionPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createRegionPlugin"); + return func_ptr(params); +} + +nvinfer1::IPluginV2* createClipPlugin(const char* layerName, float clipMin, + float clipMax) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(const char *, float, float); + static auto func_ptr = LoadSymbol("createClipPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createClipPlugin"); + return func_ptr(layerName, clipMin, clipMax); +} + +nvinfer1::IPluginV2* createBatchedNMSPlugin( + nvinfer1::plugin::NMSParameters param) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::NMSParameters); + static auto func_ptr = LoadSymbol("createBatchedNMSPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createBatchedNMSPlugin"); + return func_ptr(param); +} + +bool initLibNvInferPlugins(void* logger, const char* libNamespace) { + using FuncPtr = bool ( *)(void *, const char *); + static auto func_ptr = LoadSymbol("initLibNvInferPlugins"); + if (!func_ptr) LogFatalSymbolNotFound("initLibNvInferPlugins"); + return func_ptr(logger, libNamespace); +} + +} // extern "C" diff --git a/tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc b/tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc new file mode 100644 index 00000000000..ad393f5c39c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc @@ -0,0 +1,47 @@ +// Auto-generated, do not edit. 
+ +extern "C" { + +void* createInferBuilder_INTERNAL(void* logger, int version) { + using FuncPtr = void * (*)(void *, int); + static auto func_ptr = LoadSymbol("createInferBuilder_INTERNAL"); + if (!func_ptr) LogFatalSymbolNotFound("createInferBuilder_INTERNAL"); + return func_ptr(logger, version); +} + +void* createInferRefitter_INTERNAL(void* engine, void* logger, int version) { + using FuncPtr = void * (*)(void *, void *, int); + static auto func_ptr = LoadSymbol("createInferRefitter_INTERNAL"); + if (!func_ptr) LogFatalSymbolNotFound("createInferRefitter_INTERNAL"); + return func_ptr(engine, logger, version); +} + +void* createInferRuntime_INTERNAL(void* logger, int version) { + using FuncPtr = void * (*)(void *, int); + static auto func_ptr = LoadSymbol("createInferRuntime_INTERNAL"); + if (!func_ptr) LogFatalSymbolNotFound("createInferRuntime_INTERNAL"); + return func_ptr(logger, version); +} + +nvinfer1::ILogger* getLogger() { + using FuncPtr = nvinfer1::ILogger * (*)(); + static auto func_ptr = LoadSymbol("getLogger"); + if (!func_ptr) LogFatalSymbolNotFound("getLogger"); + return func_ptr(); +} + +int getInferLibVersion() { + using FuncPtr = int (*)(); + static auto func_ptr = LoadSymbol("getInferLibVersion"); + if (!func_ptr) LogFatalSymbolNotFound("getInferLibVersion"); + return func_ptr(); +} + +nvinfer1::IPluginRegistry* getPluginRegistry() { + using FuncPtr = nvinfer1::IPluginRegistry * (*)(); + static auto func_ptr = LoadSymbol("getPluginRegistry"); + if (!func_ptr) LogFatalSymbolNotFound("getPluginRegistry"); + return func_ptr(); +} + +} // extern "C" diff --git a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc index c884814e009..002406cf9eb 100644 --- a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc +++ b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInferPlugin.h" @@ -51,10 +50,10 @@ void LogFatalSymbolNotFound(const char* symbol_name) { } } // namespace -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_5_1.inc" -#elif IS_TRT_VERSION_GE(5, 0, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_5_0.inc" +#if NV_TENSORRT_MAJOR < 7 +#error TensorRT version earlier than 7 is not supported. +#elif NV_TENSORRT_MAJOR == 7 || NV_TENSORRT_MAJOR == 8 +#include "tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc" #else -#error TensorRT version earlier than 5 is not supported. +#error This version of TensorRT is not supported. #endif diff --git a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc index 2feb785350d..a0a11766cd3 100644 --- a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc +++ b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInfer.h" @@ -51,13 +50,12 @@ void LogFatalSymbolNotFound(const char* symbol_name) { } } // namespace -#if IS_TRT_VERSION_GE(8, 0, 0, 0) +#if NV_TENSORRT_MAJOR < 7 +#error TensorRT version earlier than 7 is not supported. +#elif NV_TENSORRT_MAJOR == 7 +#include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc" +#elif NV_TENSORRT_MAJOR == 8 #include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_8_0.inc" -#elif IS_TRT_VERSION_GE(5, 1, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_5_1.inc" -#elif IS_TRT_VERSION_GE(5, 0, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_5_0.inc" #else -#error TensorRT version earlier than 5 is not supported. +#error This version of TensorRT is not supported. #endif - diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc index 8a18e4eaf70..41e74928f77 100644 --- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc +++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc @@ -12,24 +12,65 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include +#include +#include -#include "tensorflow/core/common_runtime/gpu/gpu_init.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/platform/test.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#include "third_party/tensorrt/NvInfer.h" #include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/test.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferPlugin.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN +#include "third_party/tensorrt/plugin/efficientNMSPlugin/efficientNMSPlugin.h" namespace tensorflow { namespace tensorrt { +std::unique_ptr +MakeNMSPluginCreator(const std::string& plugin_namespace = "tftrt") { + auto pluginCreator = + std::make_unique(); + pluginCreator->setPluginNamespace(plugin_namespace.c_str()); + std::string pluginType = std::string{pluginCreator->getPluginNamespace()} + + "::" + std::string{pluginCreator->getPluginName()} + + " version " + + std::string{pluginCreator->getPluginVersion()}; + VLOG(0) << "Created plugin type " << pluginType; + return pluginCreator; +} + +struct PluginDeleter { + void operator()(nvinfer1::IPluginV2* t); +}; +void PluginDeleter::operator()(nvinfer1::IPluginV2* t) { t->destroy(); } + +std::unique_ptr createPlugin( + const std::string& name, nvinfer1::IPluginCreator* pluginCreator, + const std::vector& pluginFields) { + if (!pluginCreator) { + return nullptr; + } + nvinfer1::PluginFieldCollection fc; + 
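+  // The PluginFieldCollection is only a non-owning view over the
+  // caller-provided creator attributes, so 'pluginFields' must stay alive for
+  // the duration of the createPlugin() call. Passing an empty vector, as the
+  // test below does, leaves the creator with its default parameter values.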
fc.nbFields = pluginFields.size(); + fc.fields = pluginFields.data(); + return std::unique_ptr{ + pluginCreator->createPlugin(name.c_str(), &fc)}; +} +} // namespace tensorrt +} // namespace tensorflow +#endif + +namespace tensorflow { +namespace tensorrt { class ScopedWeights { public: @@ -45,65 +86,125 @@ class ScopedWeights { nvinfer1::Weights w; }; -const char* kInputTensor = "input"; -const char* kOutputTensor = "output"; +class ScopedShapedWeights { + public: + ScopedShapedWeights(nvinfer1::Dims dims, float value) + : dims_(dims), + value_(std::accumulate(dims.d, dims.d + dims.nbDims, 1, + std::multiplies<>()), + value) { + w.type = nvinfer1::DataType::kFLOAT; + w.values = value_.data(); + w.count = value_.size(); + } + + nvinfer1::Dims dims_; + std::vector value_; + nvinfer1::Weights w; +}; + +const char* kInputTensor1 = "input1"; +const char* kInputTensor2 = "input2"; +const char* kOutputTensor1 = "output"; +const char* kOutputTensor2 = "output-nms"; -// Creates a network to compute y=2x+3. +// Creates a network to compute x+y. TrtUniquePtrType CreateSerializedEngine() { Logger& logger = *Logger::GetLogger(); TrtUniquePtrType builder( nvinfer1::createInferBuilder(logger)); - ScopedWeights weights(2.0); - ScopedWeights bias(3.0); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) TrtUniquePtrType network( - builder->createNetworkV2(0L)); -#else - nvinfer1::INetworkDefinition* network = builder->createNetwork(); -#endif + builder->createNetworkV2( + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); // Add the input. - auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT, - nvinfer1::Dims3{1, 1, 1}); - EXPECT_NE(input, nullptr); - // Add the hidden layer. - auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get()); + auto input1 = network->addInput(kInputTensor1, nvinfer1::DataType::kFLOAT, + nvinfer1::Dims4{1, 1, 1, 1}); + auto input2 = network->addInput(kInputTensor2, nvinfer1::DataType::kFLOAT, + nvinfer1::Dims4{1, 1, 1, 1}); + EXPECT_NE(input1, nullptr); + EXPECT_NE(input2, nullptr); + // Add an ILayer layer. + auto layer = network->addElementWise(*input1, *input2, + nvinfer1::ElementWiseOperation::kSUM); EXPECT_NE(layer, nullptr); - // Mark the output. auto output = layer->getOutput(0); - output->setName(kOutputTensor); + output->setName(kOutputTensor1); network->markOutput(*output); - // Build the engine + +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN + // Add an efficient nms plugin. + ScopedShapedWeights boxes_weights(nvinfer1::Dims3(1, 10, 4), 0.0f); + ScopedShapedWeights scores_weights(nvinfer1::Dims3(1, 10, 10), 0.0f); + nvinfer1::IConstantLayer* boxes = + network->addConstant(boxes_weights.dims_, boxes_weights.w); + nvinfer1::IConstantLayer* scores = + network->addConstant(scores_weights.dims_, scores_weights.w); + + std::array nms_inputs = {boxes->getOutput(0), + scores->getOutput(0)}; + auto plugin_creator = MakeNMSPluginCreator("tftrt"); + auto plugin = createPlugin("nms_plugin_instance", plugin_creator.get(), {}); + auto nms = network->addPluginV2(nms_inputs.data(), 2, *plugin); + nms->getOutput(0)->setName(kOutputTensor2); + network->markOutput(*nms->getOutput(0)); +#else + auto sub_layer = network->addElementWise( + *input1, *input2, nvinfer1::ElementWiseOperation::kSUB); + EXPECT_NE(sub_layer, nullptr); + network->markOutput(*sub_layer->getOutput(0)); + sub_layer->getOutput(0)->setName(kOutputTensor2); +#endif + + // Build the engine. 
builder->setMaxBatchSize(1); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) TrtUniquePtrType builderConfig( builder->createBuilderConfig()); - builderConfig->setMaxWorkspaceSize(1 << 10); + builderConfig->setMaxWorkspaceSize(1 << 20); TrtUniquePtrType engine( builder->buildEngineWithConfig(*network, *builderConfig)); -#else - builder->setMaxWorkspaceSize(1 << 10); - auto engine = builder->buildCudaEngine(*network); -#endif EXPECT_NE(engine, nullptr); // Serialize the engine to create a model, then close everything. TrtUniquePtrType model(engine->serialize()); return model; } +template +unsigned GetBindingSizeBytes(const nvinfer1::ICudaEngine& engine, int index, + unsigned batch_size) { + unsigned vol = batch_size; + auto dims = engine.getBindingDimensions(index); + int vecDim = engine.getBindingVectorizedDim(index); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = engine.getBindingComponentsPerElement(index); + // Divide round up. + dims.d[vecDim] = (dims.d[vecDim] + scalarsPerVec - 1 / scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<>()); + return vol * sizeof(T); +} + // Executes the network. -void Execute(nvinfer1::IExecutionContext* context, const float* input, - float* output) { +void Execute(nvinfer1::IExecutionContext* context, const float* input1, + const float* input2, float* output1, float* output2) { const nvinfer1::ICudaEngine& engine = context->getEngine(); // We have two bindings: input and output. - ASSERT_EQ(engine.getNbBindings(), 2); - const int input_index = engine.getBindingIndex(kInputTensor); - const int output_index = engine.getBindingIndex(kOutputTensor); + ASSERT_EQ(engine.getNbBindings(), 4); + const int input_index1 = engine.getBindingIndex(kInputTensor1); + const int input_index2 = engine.getBindingIndex(kInputTensor2); + const int output_index1 = engine.getBindingIndex(kOutputTensor1); + const int output_index2 = engine.getBindingIndex(kOutputTensor2); // Create GPU buffers and a stream - void* buffers[2]; - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float))); - ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float))); + std::vector buffers(engine.getNbBindings()); + for (int i = 0; i < buffers.size(); i++) { + ASSERT_EQ( + 0, cudaMalloc(&buffers[i], GetBindingSizeBytes(engine, i, 1))); + } + cudaStream_t stream; ASSERT_EQ(0, cudaStreamCreate(&stream)); @@ -112,22 +213,35 @@ void Execute(nvinfer1::IExecutionContext* context, const float* input, // Note that since the host buffer was not created as pinned memory, these // async copies are turned into sync copies. So the following synchronization // could be removed. 
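  // If truly asynchronous copies were needed, the host-side buffers could be
  // allocated as pinned memory instead (hypothetical variant, not used by this
  // test):
  //   float* pinned_input = nullptr;
  //   ASSERT_EQ(0, cudaMallocHost(&pinned_input, sizeof(float)));
  //   ... use pinned_input as the source of cudaMemcpyAsync ...
  //   ASSERT_EQ(0, cudaFreeHost(pinned_input));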
- ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float), + ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index1], input1, sizeof(float), + cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index2], input2, sizeof(float), cudaMemcpyHostToDevice, stream)); - context->enqueue(1, buffers, stream, nullptr); - ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float), + context->enqueueV2(buffers.data(), stream, nullptr); + ASSERT_EQ(0, cudaMemcpyAsync(output1, buffers[output_index1], sizeof(float), cudaMemcpyDeviceToHost, stream)); + ASSERT_EQ( + 0, cudaMemcpyAsync(output2, buffers[output_index2], + GetBindingSizeBytes(engine, output_index2, 1), + cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release the stream and the buffers - ASSERT_EQ(0, cudaFree(buffers[input_index])); - ASSERT_EQ(0, cudaFree(buffers[output_index])); + for (int i = 0; i < buffers.size(); i++) { + ASSERT_EQ(0, cudaFree(buffers[i])); + } cudaStreamDestroy(stream); } TEST(TensorrtTest, BasicFunctions) { + // We must register the plugin creator in order to deserialize the plugin. +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN + auto plugin_creator = MakeNMSPluginCreator("tftrt"); + getPluginRegistry()->registerCreator(*plugin_creator, "tftrt"); +#endif + // Handle the case where the test is run on machine with no gpu available. - if (CHECK_NOTNULL(GPUMachineManager())->VisibleDeviceCount() <= 0) { + if (CHECK_NOTNULL(se::GPUMachineManager())->VisibleDeviceCount() <= 0) { LOG(WARNING) << "No gpu device available, probably not being run on a gpu " "machine. Skipping..."; return; @@ -145,14 +259,29 @@ TEST(TensorrtTest, BasicFunctions) { engine->createExecutionContext()); // Execute the network. - float input = 1234; - float output; - Execute(context.get(), &input, &output); - EXPECT_EQ(output, input * 2 + 3); + float input1 = 1234; + float input2 = 567; + + std::vector output1( + GetBindingSizeBytes(*engine, 2, 1) / sizeof(float), 0.0f); + + std::vector output2( + GetBindingSizeBytes(*engine, 3, 1) / sizeof(int32), 0.0f); + + ASSERT_EQ(output1.size(), 1); + ASSERT_EQ(output2.size(), 1); + + Execute(context.get(), &input1, &input2, output1.data(), output2.data()); + EXPECT_EQ(output1[0], input1 + input2); + +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN + EXPECT_EQ(output2[0], 0); +#else + EXPECT_EQ(output2[0], 667); +#endif // TF_TRT_USE_EFFICIENT_NMS_PLUGIN } } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc b/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc new file mode 100644 index 00000000000..a415c5fdd41 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc @@ -0,0 +1,512 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h" + +#include +#include +#include + +#include "absl/strings/str_join.h" +#include "tensorflow/cc/tools/freeze_saved_model.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/clusters/single_machine.h" +#include "tensorflow/core/grappler/clusters/utils.h" +#include "tensorflow/core/grappler/devices.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/grappler_item_builder.h" +#include "tensorflow/core/grappler/optimizers/meta_optimizer.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/public/session.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { + +namespace tensorrt { +namespace { + +// Creates and provisions a new cluster. The caller must call Shutdown before +// the cluster is destroyed. +Status NewCluster(grappler::Cluster** cluster) { + int num_cpu_cores = grappler::GetNumAvailableLogicalCPUCores(); + int num_gpus = grappler::GetNumAvailableGPUs(); + int timeout_s = 60 * 10; + *cluster = new grappler::SingleMachine(timeout_s, num_cpu_cores, num_gpus); + (*cluster)->DisableDetailedStats(true); + (*cluster)->AllowSoftPlacement(true); + (*cluster)->SetNumWarmupSteps(10); + TF_RETURN_IF_ERROR((*cluster)->Provision()); + return Status::OK(); +} + +Status RunGrappler(const MetaGraphDef& meta_graph_def, + const std::vector& input_names, + const std::vector& output_names, + const ConfigProto& config_proto, grappler::Cluster* cluster, + GraphDef* out_graph_def) { + grappler::ItemConfig item_config; + + for (const string& name : input_names) { + item_config.feed_nodes.insert(name); + } + for (const string& name : output_names) { + item_config.fetch_nodes.insert(name); + } + + std::unique_ptr item = + grappler::GrapplerItemFromMetaGraphDef("tf_graph", meta_graph_def, + item_config); + if (!item) { + return tensorflow::errors::Internal( + "Failed to create grappler item from MetaGraphDef."); + } + + tensorflow::DeviceBase* cpu_device = nullptr; + TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer( + std::move(*item), config_proto, cpu_device, cluster, out_graph_def)); + VLOG(2) << "Grappler finished\n"; + return Status::OK(); +} + +Status ImportGraphDefToSession(Session* session, const GraphDef& graph_def, + const string& prefix) { + ImportGraphDefOptions opts; + opts.prefix = prefix; + Graph graph(OpRegistry::Global()); + TF_RETURN_IF_ERROR(ImportGraphDef(opts, graph_def, &graph, nullptr)); + GraphDef new_graph_def; + graph.ToGraphDef(&new_graph_def); + TF_RETURN_IF_ERROR(session->Extend(new_graph_def)); + return Status::OK(); +} + +Status GetTrtRewriterConfig(const TfTrtConversionParams& params, + const GraphDef& frozen_graph_def, + RewriterConfig* opt_config) { + opt_config->set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); + opt_config->set_min_graph_nodes(-1); // do not skip small graphs + + // Turn off remapping. 
+ opt_config->set_remapping(RewriterConfig_Toggle::RewriterConfig_Toggle_OFF); + + // If the graph has QDQ nodes, then we need to disable folding of the + // QDQ with constants. Otherwise, the conversion will not work corectly. + // Ideally, we do this after segmentation and outlining of TRT regions to + // functions, but we currently lack that capability. Disabling QDQ-const + // folding doesn't matter if you don't have QDQ nodes, so we always enable + // this. + opt_config->set_experimental_disable_folding_quantization_emulation( + IS_TRT_VERSION_GE(8, 0, 0, 0)); + + // Initial transformations before TensorRTOptimizer is called + opt_config->add_optimizers("function"); + opt_config->add_optimizers("constfold"); + opt_config->add_optimizers("layout"); + opt_config->add_optimizers("constfold"); + + // Parameters for TensorRTOptimizer + auto trt_optimizer = opt_config->add_custom_optimizers(); + trt_optimizer->set_name("TensorRTOptimizer"); + + auto trt_parameter_map = trt_optimizer->mutable_parameter_map(); + (*trt_parameter_map)["is_dynamic_op"].set_b(true); + (*trt_parameter_map)["minimum_segment_size"].set_i( + params.minimum_segment_size); + string prec_string; + TF_RETURN_IF_ERROR( + TrtPrecisionModeToName(params.precision_mode, &prec_string)); + (*trt_parameter_map)["precision_mode"].set_s(prec_string); + (*trt_parameter_map)["max_batch_size"].set_i(1); + (*trt_parameter_map)["max_workspace_size_bytes"].set_i( + params.max_workspace_size_bytes); + (*trt_parameter_map)["max_cached_engines"].set_i(params.max_cached_engines); + (*trt_parameter_map)["use_calibration"].set_b(params.use_calibration); + (*trt_parameter_map)["profile_strategy"].set_s( + ProfileStrategyToName(params.profile_strategy)); + (*trt_parameter_map)["use_implicit_batch"].set_b(!params.use_dynamic_shape); + (*trt_parameter_map)["_allow_build_at_runtime"].set_b( + params.allow_build_at_runtime); + return Status::OK(); +} + +// Runs TRTOptimizer grappler pass. +Status RunTfTrt(const MetaGraphDef& meta_graph_def, + const std::vector& input_names, + const std::vector& output_names, + const RewriterConfig& rewriter_config, + GraphDef* segmented_graph_def) { + ConfigProto config_proto; + *config_proto.mutable_graph_options()->mutable_rewrite_options() = + rewriter_config; + + VLOG(4) << "Setting up Grappler parameters\n" << config_proto.DebugString(); + std::unique_ptr cluster; + grappler::Cluster* p_cluster; + mutex mu_cluster; // There can be only one provisioned cluster per process. + mutex_lock lock(mu_cluster); + TF_RETURN_IF_ERROR(NewCluster(&p_cluster)); + cluster.reset(p_cluster); + TF_RETURN_IF_ERROR(RunGrappler(meta_graph_def, input_names, output_names, + config_proto, cluster.get(), + segmented_graph_def)); + TF_RETURN_IF_ERROR(cluster->Shutdown()); + return Status::OK(); +} + +// Sets the _profile_generation mode attribute of all TRTEngineOp nodes in the +// graph to mode. 
+Status SetProfileGenerationMode(GraphDef* graph_def, bool mode) { + VLOG(3) << "Setting _profile_generation_mode=" << mode; + std::string op{"TRTEngineOp"}; + for (auto& node : *(graph_def->mutable_node())) { + if (!op.compare(node.op())) { + auto* attr = node.mutable_attr(); + AttrValue profile_generation_mode; + profile_generation_mode.set_b(mode); + (*attr)["_profile_generation_mode"] = profile_generation_mode; + } + } + return Status::OK(); +} + +Status RunSession(Session* session, const std::vector& input_names, + const std::vector& output_names, + const std::vector& input_tensors, + string prefix = "") { + TRT_ENSURE(!input_names.empty()); + TRT_ENSURE(!output_names.empty()); + TRT_ENSURE(!input_tensors.empty()); + + std::vector> input_pairs; + std::vector prefixed_output_names; + auto prefixed_name = [](std::string prefix, std::string name) { + return !prefix.empty() ? absl::StrJoin({prefix, name}, "/") : name; + }; + for (int i = 0; i < input_names.size(); i++) { + input_pairs.push_back( + {prefixed_name(prefix, input_names.at(i)), input_tensors.at(i)}); + } + for (int i = 0; i < output_names.size(); i++) { + prefixed_output_names.push_back(prefixed_name(prefix, output_names.at(i))); + } + std::vector output_tensors; + for (int i = 0; i < output_names.size(); i++) { + output_tensors.push_back({}); + } + VLOG(3) << "TF-TRT Build mode: running inference\n"; + TF_RETURN_IF_ERROR( + session->Run(input_pairs, prefixed_output_names, {}, &output_tensors)); + return Status::OK(); +} + +// Runs the model to create the engines. In dynamic shape mode, before creating +// the engines, we provide shapes to define optimization profiles. +Status Build(GraphDef& segmented_graph_def, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + Session* session, const TfTrtConversionParams params) { + VLOG(2) << "Building the model"; + bool need_collect_profiles = params.use_dynamic_shape && inputs.size() > 1; + if (need_collect_profiles) { + TF_RETURN_IF_ERROR(SetProfileGenerationMode(&segmented_graph_def, true)); + } + TF_RETURN_IF_ERROR(session->Create(segmented_graph_def)); + string prefix = ""; + if (need_collect_profiles) { + for (auto const& input : inputs) { + TF_RETURN_IF_ERROR(RunSession(session, input_names, output_names, input)); + } + prefix = "TrtBuildStep"; + TF_RETURN_IF_ERROR(SetProfileGenerationMode(&segmented_graph_def, false)); + VLOG(3) << "Importing graph with _profile_generation_mode disabled"; + TF_RETURN_IF_ERROR( + ImportGraphDefToSession(session, segmented_graph_def, prefix)); + } + TF_RETURN_IF_ERROR( + RunSession(session, input_names, output_names, *inputs.begin(), prefix)); + return Status::OK(); +} + +// Returns the resource manager associated with the node. +Status GetResourceManager(const NodeDef& node, Session* session, + ResourceMgr** rm) { + const DeviceMgr* device_mgr; + TF_RETURN_IF_ERROR(session->LocalDeviceManager(&device_mgr)); + Device* device; + string device_name = node.device().empty() + ? "/job:localhost/replica:0/task:0/device:GPU:0" + : node.device(); + TF_RETURN_IF_ERROR(device_mgr->LookupDevice(device_name, &device)); + *rm = device->resource_manager(); + return Status::OK(); +} + +// Looks up the cache resurce associated with the TRT node. 
+Status GetEngineCacheResource(const NodeDef& node, Session* session, + TRTEngineCacheResource** resource) { + ResourceMgr* rm; + TF_RETURN_IF_ERROR(GetResourceManager(node, session, &rm)); + + absl::string_view resource_name = node.name(); + size_t last_slash = resource_name.find_last_of('/'); + if (last_slash != absl::string_view::npos) { + resource_name.remove_prefix(last_slash + 1); + } + const std::string container(kTfTrtContainerName); + *resource = nullptr; + TF_RETURN_IF_ERROR( + rm->Lookup(container, std::string(resource_name), resource)); + if (resource == nullptr || (*resource)->cache_.size() == 0) { + return errors::Internal("Engine cache not found for", resource_name); + } + return Status::OK(); +} + +// Looks up the engine from the engine cache, and serializes the engine. +Status ReadSerializedEngine( + const NodeDef& node, Session* session, + TrtUniquePtrType* engine_data) { + TRTEngineCacheResource* resource; + TF_RETURN_IF_ERROR(GetEngineCacheResource(node, session, &resource)); + core::ScopedUnref unref_cache_res(resource); + if (resource->cache_.size() > 1) { + return errors::Internal( + "Multiple engines found, but we can only serialize one"); + } + const std::unique_ptr& engine = + resource->cache_.begin()->second; + if (!engine) { + return errors::Internal("Engine not found for", node.name()); + } + + if (engine->GetCudaEngine()) { + // Serialize the engine. + engine_data->reset(engine->GetCudaEngine()->serialize()); + } else { + LOG(WARNING) << "Engine cache contains nullptr"; + } + + return Status::OK(); +} + +// Saves the TRT engines as attributes of the TRTEngineOp nodes. +Status ConvertToStaticEngine(const GraphDef graph_def, + GraphDef* static_graph_def, Session* session) { + *static_graph_def = graph_def; + VLOG(1) << "Saving TRT engines as static engine"; + std::string op{"TRTEngineOp"}; + for (auto& node : *(static_graph_def->mutable_node())) { + if (!op.compare(node.op())) { + VLOG(2) << "Saving TRT engine for " << node.name() + << ", device: " << node.device(); + TrtUniquePtrType engine_data; + TF_RETURN_IF_ERROR(ReadSerializedEngine(node, session, &engine_data)); + auto* attr = node.mutable_attr(); + AttrValue static_engine; + static_engine.set_b(true); + AttrValue engine_string; + if (engine_data) { + engine_string.set_s(engine_data->data(), engine_data->size()); + } + (*attr)["static_engine"] = static_engine; + (*attr)["serialized_segment"] = engine_string; + } + } + return Status::OK(); +} + +Status ValidateConversionParams(const TfTrtConversionParams& p, int n_inputs) { + if (p.precision_mode == TrtPrecisionMode::INT8 && p.use_calibration) { + return errors::InvalidArgument( + "Calibration not yet implemented through the C++ interface. Please use " + "our Python API for calibration."); + } + if (p.convert_to_static_engine && n_inputs == 0) { + return errors::InvalidArgument( + "TRT Engine needs to be built before we can convert it to static " + "engine. Please provide input data to build the model."); + } + if (!p.convert_to_static_engine && n_inputs >= 0) { + // After the conversion, the session that was used to build the engines + // will be destroyed. If we do not convert the engine to static engine, + // then we loose the engines. + // + // TODO(tfeher): Provide a way to save dynamic engines and remove this + // warning. + LOG(WARNING) + << "Skipping build mode because we cannot save the " + "engines. 
Use convert_to_static_engines=true conversion " + "parameter to enable build mode and save the engines in the graph."; + } + if (!p.allow_build_at_runtime && n_inputs == 0) { + LOG(WARNING) + << "TRT will not be used since allow_build_at_runtime is disabled and " + "no inputs are provided to build during conversion."; + } + return Status::OK(); +} + +// Returns configuration used during the build step session run. +tensorflow::SessionOptions GetSessionConfg() { + // We also need to disable constant folding because we already ran constant + // folding and may have prevented quantization operation folding on purpose. + tensorflow::SessionOptions opts; + auto* rewriter_opts = + opts.config.mutable_graph_options()->mutable_rewrite_options(); + rewriter_opts->set_experimental_disable_folding_quantization_emulation(true); + + // It seems that we need to disable the optimizer entirely to prevent the + // folding. + rewriter_opts->set_disable_meta_optimizer(true); + + // Disable optimizations for static graph to allow calls to Session::Extend. + opts.config.mutable_experimental()->set_disable_optimize_for_static_graph( + true); + return opts; +} + +} // namespace + +::stream_executor::port::StatusOr ConvertAndBuild( + const GraphDef& frozen_graph_def, const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + const TfTrtConversionParams& conv_params) { + TF_RETURN_IF_ERROR(ValidateConversionParams(conv_params, inputs.size())); + MetaGraphDef meta_graph; + *meta_graph.mutable_graph_def() = frozen_graph_def; + + RewriterConfig rewriter_config; + TF_RETURN_IF_ERROR( + GetTrtRewriterConfig(conv_params, frozen_graph_def, &rewriter_config)); + + GraphDef segmented_graph_def; + TF_RETURN_IF_ERROR(RunTfTrt(meta_graph, input_names, output_names, + rewriter_config, &segmented_graph_def)); + + GraphDef output; + + if (!inputs.empty() && conv_params.convert_to_static_engine) { + // The TRTOptimization pass has inserted placeholder TRTEngineOps. Here we + // trigger conversion by inferring the graph. + std::unique_ptr session( + tensorflow::NewSession(GetSessionConfg())); + if (!session) { + return errors::Internal("Failed to create build session"); + } + + TF_RETURN_IF_ERROR(Build(segmented_graph_def, input_names, output_names, + inputs, session.get(), conv_params)); + + TF_RETURN_IF_ERROR( + ConvertToStaticEngine(segmented_graph_def, &output, session.get())); + } else { + output = segmented_graph_def; + } + VLOG(1) << "TF-TRT conversion finished"; + return output; +} + +Status InlineFunctions(const MetaGraphDef& meta_graph_def, + GraphDef* out_graph_def) { + ConfigProto config_proto; + auto opt_config = + config_proto.mutable_graph_options()->mutable_rewrite_options(); + + opt_config->set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); + opt_config->set_min_graph_nodes(-1); // do not skip small graphs + opt_config->add_optimizers("function"); + + TF_RETURN_IF_ERROR(RunGrappler(meta_graph_def, {}, {}, config_proto, nullptr, + out_graph_def)); + + VLOG(2) << "Graph is inlined"; + return Status::OK(); +} + +// Freezes the graph. It is assumed that the functions are inlined and the +// variables are initialized. 
+Status FreezeGraph(SavedModelBundle& bundle, MetaGraphDef* frozen_meta_graph) { + std::unordered_set inputs; + std::unordered_set outputs; + GraphDef frozen_graph_def; + TF_RETURN_IF_ERROR( + FreezeSavedModel(bundle, &frozen_graph_def, &inputs, &outputs)); + + *frozen_meta_graph = bundle.meta_graph_def; + GraphDef* gdef = frozen_meta_graph->mutable_graph_def(); + *gdef = frozen_graph_def; + + VLOG(2) << "Graph frozen"; + return Status::OK(); +} + +// Returns the name of nodes listed in the signature definition. +std::vector GetNodeNames( + const google::protobuf::Map& signature) { + std::vector names; + for (auto const& item : signature) { + absl::string_view name = item.second.name(); + // Remove tensor suffix like ":0". + size_t last_colon = name.find_last_of(':'); + if (last_colon != absl::string_view::npos) { + name.remove_suffix(name.size() - last_colon); + } + names.push_back(std::string(name)); + } + return names; +} + +::stream_executor::port::StatusOr ConvertAndBuild( + SavedModelBundle* bundle, const std::string& signature_key, + const std::vector>& inputs, + const TfTrtConversionParams& conversion_params) { + // Inline the functions. + GraphDef inlined_graph_def; + TF_RETURN_IF_ERROR( + InlineFunctions(bundle->meta_graph_def, &inlined_graph_def)); + + // Replace the graph_def with the inlined graph. Note that bundle->session + // still has the original graph. + *bundle->meta_graph_def.mutable_graph_def() = inlined_graph_def; + + // Freeze variables. + MetaGraphDef frozen_meta_graph; + TF_RETURN_IF_ERROR(FreezeGraph(*bundle, &frozen_meta_graph)); + + // Convert. + auto signature_map = bundle->GetSignatures(); + const tensorflow::SignatureDef& signature = signature_map[signature_key]; + std::vector input_names = GetNodeNames(signature.inputs()); + std::vector output_names = GetNodeNames(signature.outputs()); + return ConvertAndBuild(frozen_meta_graph.graph_def(), input_names, + output_names, inputs, conversion_params); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api.h b/tensorflow/compiler/tf2tensorrt/trt_convert_api.h new file mode 100644 index 00000000000..24240b802c0 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api.h @@ -0,0 +1,129 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ + +#include +#include +#include + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { + +struct SavedModelBundle; + +namespace tensorrt { + +struct TfTrtConversionParams { + // Corresponds 'workspaceSize' parameter of + // nvinfer1::IBuilderConfig::setMaxWorkspaceSize. +#if IS_TRT_VERSION_GE(8, 4, 0, 0) + // Must use `LLONG_MAX - 512` to avoid overflow during casting. + size_t max_workspace_size_bytes = LLONG_MAX - 512; +#else + size_t max_workspace_size_bytes = 1 << 30; // 1,073,741,824 +#endif + + // Minimum precision used by the TRT Engine. + TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; + + // The minimum number of nodes required for a subgraph to be replaced by + // TRTEngineOp. Note that many small TRT subgraphs could be detrimental for + // performance, increasing the minimum segment size can help avoid the + // problem. + int minimum_segment_size = 3; + + // Max number of cached TRT engines for dynamic TRT ops (by default we have + // dynamic TRT ops). + int max_cached_engines = 1; + + // Note that calibration is currently not implemented with the C++ converter. + // This argument is ignored if precision_mode is not INT8. If set to True, the + // implementation will use the user provided inputs to generate calibration + // data. If set to False, quantization nodes will be expected for every tensor + // in the graph (excluding those which will be fused). If a range is missing, + // an error will occur. Please note that accuracy may be negatively affected + // if there is a mismatch between which tensors TRT quantizes and which + // tensors were trained with fake quantization. + bool use_calibration = true; + + // Whether to enable dynamic shape mode for the TRT engines. It is + // recommended to use_dynamic_shape mode to handle dynamic input shape. + // Enabling dynamic shape mode can also improve the conversion rate of graphs + // with static input shape. + bool use_dynamic_shape = true; + + // In dynamic shape mode we create an engine that can handle various input + // shape ranges. We derive the shape optimization profiles for the TRT engines + // in the graph based on user provided input data and profile_strategy. + ProfileStrategy profile_strategy = ProfileStrategy::kRange; + + // Whether to allow bulding TRT engines at runtime. If no TensorRT engine can + // be found in cache that can handle the given inputs during runtime, then a + // new TensorRT engine is built at runtime if allow_build_at_runtime=True, + // otherwise native TF is used. We recommend to set this value false and build + // the engine in advance, to avoid runtime overhead. + bool allow_build_at_runtime = true; + + // Record the TRT engine as an attribute of the TRTEngineOp. This is only + // valid when max_cached_engines == 1. Note: the frozen graph together with + // the serialized engines have to be below 2GiB (protobuf size limit). If + // convert_to_static_engine = false, then the converted graph_def only + // contains placeholder TRTEngineOp nodes. 
+ bool convert_to_static_engine = true; +}; + +/** + * Converts the graph with TF-TRT. + * + * Performs TF-TRT conversion and returns the converted GraphDef. If inputs is + * not empty and convert_to_static_engine is requested, we also build the + * engines and convert the engines to static engines. + * + * Arguments: + * - frozen_graph_def input graph, it is assumed to be frozen + * - input_names names of the input tensors + * - output_names names of the output tensors + * - inputs tensors that we will use as input while building the TRT engines + * - conv_params parameters for the TF-TRT conversion + * + * Returns the converted graph_def. + */ +::stream_executor::port::StatusOr ConvertAndBuild( + const GraphDef& frozen_graph_def, const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + const TfTrtConversionParams& conv_params); + +::stream_executor::port::StatusOr ConvertAndBuild( + SavedModelBundle* bundle, + const std::string& signature_key = "serving_default", + const std::vector>& inputs = {}, + const TfTrtConversionParams& conversion_params = TfTrtConversionParams()); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc b/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc new file mode 100644 index 00000000000..bcfbb2adab7 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc @@ -0,0 +1,358 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h" + +#include "tensorflow/cc/ops/resource_variable_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/state_ops.h" +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace tensorrt { + +struct TestParam { + TfTrtConversionParams conv_params; + std::vector> input_shapes; +}; + +class TrtConverterTest + : public ::testing::TestWithParam> { + protected: + TrtConverterTest() { + param_ = std::get<0>(GetParam()); + use_variable_ = std::get<1>(GetParam()); + use_function_ = std::get<2>(GetParam()); + input_tensors_ = GetInputTensors(); + } + + // Returns the following graph: output = input * [42, 137] + input + GraphDef GetGraphDef(PartialTensorShape input_shape) { + Scope root = Scope::NewRootScope(); + Output c; + c = ops::Const(root.WithOpName("my_const"), {{42.0f, 137.0f}}); + Output v; + if (use_variable_) { + Output v_handle = ops::VarHandleOp(root.WithOpName("my_var"), + DataType::DT_FLOAT, {1, 2}); + v = ops::ReadVariableOp(root.WithOpName("my_var/Read/ReadVariableOp"), + v_handle, DataType::DT_FLOAT); + auto v_init = + ops::AssignVariableOp(root.WithOpName("my_var/init"), v_handle, c); + } else { + v = c; + } + const auto attrs = ops::Placeholder::Shape(input_shape); + auto x = ops::Placeholder(root.WithOpName("input"), DT_FLOAT, attrs); + auto y = ops::Mul(root.WithOpName("my_mul"), x, v); + auto z = ops::Add(root.WithOpName("my_add"), x, y); + auto q = ops::Identity(root.WithOpName("output"), z); + + GraphDef out; + TF_CHECK_OK(root.ToGraphDef(&out)); + return out; + } + + GraphDef GetGraphWithFunction(PartialTensorShape input_shape) { + using ::tensorflow::test::function::GDef; + using ::tensorflow::test::function::NDef; + const Tensor kOne = test::AsScalar(1.0f); + TensorShapeProto value_shape_proto; + kOne.shape().AsProto(&value_shape_proto); + TensorShapeProto input_shape_proto; + input_shape.AsProto(&input_shape_proto); + NodeDef value_node; + if (use_variable_) { + value_node = + NDef("my_value", "Identity", {"my_var:0"}, {{"T", DT_RESOURCE}}); + } else { + value_node = + NDef("my_value", "Identity", {"my_const:0"}, {{"T", DT_FLOAT}}); + } + GraphDef gdef = GDef( + { + NDef("input", "Placeholder", {}, + {{"dtype", DT_FLOAT}, {"shape", input_shape_proto}}), + NDef("my_const", "Const", {}, + {{"dtype", DT_FLOAT}, {"value", kOne}}), + value_node, + NDef("call", "StatefulPartitionedCall", {"input", "my_value"}, + {{"Tin", DataTypeSlice{DT_FLOAT, use_variable_ ? 
DT_RESOURCE + : DT_FLOAT}}, + {"Tout", DataTypeSlice{DT_FLOAT}}, + {"f", FunctionDefHelper::FunctionRef("f", {})}}), + NDef("output", "Identity", {"call:0"}, {{"T", DT_FLOAT}}), + }, + {}); + FunctionDef fdef; + if (use_variable_) { + *gdef.add_node() = + NDef("my_var", "VarHandleOp", {}, + {{"dtype", DT_FLOAT}, {"shape", value_shape_proto}}); + + *gdef.add_node() = NDef("my_var/init", "AssignVariableOp", + {"my_var", "my_const"}, {{"dtype", DT_FLOAT}}); + *gdef.add_node() = NDef("my_var/Read/ReadVariableOp", "ReadVariableOp", + {"my_var"}, {{"dtype", DT_FLOAT}}); + // Define function f(x, v) = x * v + x, where v is a variable. + fdef = FunctionDefHelper::Define( + "f", // Name + {"x: float", "v: resource"}, // Args + {"q: float"}, // Returns + {}, // Attr def + // Nodes + {{{"my_var/Read/ReadVariableOp"}, + "ReadVariableOp", + {"v"}, + {{"dtype", DT_FLOAT}}}, + {{"my_mul"}, + "Mul", + {"x", "my_var/Read/ReadVariableOp"}, + {{"T", DT_FLOAT}}}, + {{"my_add"}, "AddV2", {"x", "my_mul"}, {{"T", DT_FLOAT}}}, + {{"q"}, "Identity", {"my_add"}, {{"T", DT_FLOAT}}}}); + } else { + // Define function f(x, v) = x * v + x, where v is const value. + fdef = FunctionDefHelper::Define( + "f", // Name + {"x: float", "v: float"}, // Args + {"q: float"}, // Returns + {}, // Attr def + // Nodes + {{{"my_mul"}, "Mul", {"x", "v"}, {{"T", DT_FLOAT}}}, + {{"my_add"}, "AddV2", {"x", "my_mul"}, {{"T", DT_FLOAT}}}, + {{"q"}, "Identity", {"my_add"}, {{"T", DT_FLOAT}}}}); + } + *gdef.mutable_library()->add_function() = fdef; + + return gdef; + } + + // Returns the following graph: output = input * [42, 137] + input + MetaGraphDef GetModel() { + PartialTensorShape shape({-1, 2}); + MetaGraphDef out; + if (use_function_) { + *(out.mutable_graph_def()) = GetGraphWithFunction(shape); + } else { + *(out.mutable_graph_def()) = GetGraphDef(shape); + } + VLOG(2) << out.graph_def().DebugString(); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + SignatureDef signature_def; + (*signature_def.mutable_inputs())["input"].set_name("input:0"); + (*signature_def.mutable_inputs())["input"].set_dtype(DT_FLOAT); + *(*signature_def.mutable_inputs())["input"].mutable_tensor_shape() = + shape_proto; + (*signature_def.mutable_outputs())["output"].set_name("output:0"); + (*signature_def.mutable_outputs())["output"].set_dtype(DT_FLOAT); + *(*signature_def.mutable_outputs())["output"].mutable_tensor_shape() = + shape_proto; + (*out.mutable_signature_def())["serving_default"] = signature_def; + + VLOG(2) << signature_def.DebugString(); + return out; + } + + Status GetSavedModelBundle(SavedModelBundle* bundle) { + bundle->meta_graph_def = GetModel(); + Session* session = nullptr; + TF_RETURN_IF_ERROR(NewSession(tensorflow::SessionOptions(), &session)); + TF_RETURN_IF_ERROR(session->Create(bundle->meta_graph_def.graph_def())); + bundle->session.reset(session); + TF_RETURN_IF_ERROR(session->Run(/* inputs */ {}, /*outputs*/ {}, + /*targets*/ {"my_var/init"}, nullptr)); + return Status::OK(); + } + + // Confirms that we have a TRT node with the correct attributes. 
+ void CheckTrtNode(const GraphDef& converted_graph_def) { + int n_trt_ops = 0; + string op_name{"TRTEngineOp"}; + for (const auto& node : converted_graph_def.node()) { + if (!op_name.compare(node.op())) { + n_trt_ops++; + const auto& attr = node.attr(); + EXPECT_EQ(attr.at("static_engine").b(), + param_.conv_params.convert_to_static_engine); + if (param_.conv_params.convert_to_static_engine) { + VLOG(2) << "Found serialized segment with size " + << attr.at("serialized_segment").s().size(); + EXPECT_GT(attr.at("serialized_segment").s().size(), 0); + } + } + } + EXPECT_EQ(n_trt_ops, 1); + } + + // Creates a list of input tensors, they will be used to build the engines. + std::vector> GetInputTensors() { + std::vector> input_tensors; + for (const std::vector& shape : param_.input_shapes) { + Tensor tensor(DT_FLOAT, TensorShape(shape)); + test::FillIota(&tensor, 1.0f); + input_tensors.push_back({tensor}); + } + return input_tensors; + } + + void RunAndCompareResults(Session* session, + const GraphDef& converted_graph_def) { + // Create a session to execute the converted graph. + Session* p_session = nullptr; + TF_EXPECT_OK(NewSession(SessionOptions(), &p_session)); + std::unique_ptr trt_session(p_session); + TF_EXPECT_OK(trt_session->Create(converted_graph_def)); + + // Run models and compare the output. + for (const std::vector& input : input_tensors_) { + std::vector outputs; + TF_EXPECT_OK( + session->Run({{"input", input.at(0)}}, {"output"}, {}, &outputs)); + std::cout << outputs.at(0).DebugString() << std::endl; + + std::vector trt_outputs; + TF_EXPECT_OK(trt_session->Run({{"input", input.at(0)}}, {"output"}, {}, + &trt_outputs)); + std::cout << trt_outputs.at(0).DebugString() << std::endl; + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(trt_outputs.size(), 1); + tensorflow::test::ExpectEqual(outputs[0], trt_outputs[0]); + } + } + + void ConvertAndRunFrozenGraph() { + MetaGraphDef meta_graph_def = GetModel(); + + ::stream_executor::port::StatusOr result = tensorrt::ConvertAndBuild( + meta_graph_def.graph_def(), {"input"}, {"output"}, input_tensors_, + param_.conv_params); + TF_ASSERT_OK(result.status()); + const GraphDef& converted_graph_def = result.value(); + CheckTrtNode(converted_graph_def); + + // Create a session to execute the original graph. + Session* p_session = nullptr; + TF_EXPECT_OK(NewSession(SessionOptions(), &p_session)); + std::unique_ptr session(p_session); + TF_EXPECT_OK(session->Create(meta_graph_def.graph_def())); + + RunAndCompareResults(session.get(), converted_graph_def); + } + + void ConvertAndRunSavedModel() { + SavedModelBundle bundle; + TF_CHECK_OK(GetSavedModelBundle(&bundle)); + + ::stream_executor::port::StatusOr result = tensorrt::ConvertAndBuild( + &bundle, "serving_default", input_tensors_, param_.conv_params); + TF_ASSERT_OK(result.status()); + const GraphDef& converted_graph_def = result.value(); + CheckTrtNode(converted_graph_def); + + RunAndCompareResults(bundle.GetSession(), converted_graph_def); + } + + TestParam param_; + bool use_variable_; + bool use_function_; + std::vector> input_tensors_; +}; + +INSTANTIATE_TEST_CASE_P( + TrtConverterTestInstantiation, TrtConverterTest, + ::testing::Combine( + ::testing::Values( + // Dynamic shape mode test with conver_to_static_engine=true. 
+ TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP32, + 3, // minimum_segment_size + 1, // max_cached_engines + false, // use_calibration + true, // use_dynamic_shape + ProfileStrategy::kOptimal, + true, // allow_build_at_runtime + true // convert_to_static_engine + }, + {{1, 2}, {4, 2}}}, + // Implicit batch mode test with conver_to_static_engine=true. + TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP16, + 3, // minimum_segment_size + 1, // max_cached_engines + false, // use_calibration + false, // use_dynamic_shape + ProfileStrategy::kRange, + true, // allow_build_at_runtime + true // convert_to_static_engine + }, + {{1, 2}}}, + // Dynamic shape mode test convert_to_static_engine=false: we cannot + // save the engines, therefore we do not generate profiles. A single + // engine will be built during runtime, with profile that matches + // the first shape ({1,2}). The second shape will run as native + // segment. + TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP32, + 3, // minimum_segment_size + 1, // max_cached_engines + false, // use_calibration + true, // use_dynamic_shape + ProfileStrategy::kOptimal, + true, // allow_build_at_runtime + false // convert_to_static_engine + }, + {{1, 2}, {4, 2}}}, + // Implicit batch mode test with convert_to_static_engine=false. + // We will have two engines in the cache to handle the two shapes. + TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP16, + 3, // minimum_segment_size + 2, // max_cached_engines + false, // use_calibration + false, // use_dynamic_shape + ProfileStrategy::kRange, + true, // allow_build_at_runtime + false // convert_to_static_engine + }, + {{1, 2}, {4, 2}}}), + ::testing::Values(false, true), // use_variables + ::testing::Values(false, true))); // use_function + +TEST_P(TrtConverterTest, Basic) { + if (use_variable_) { + ConvertAndRunSavedModel(); + } else { + ConvertAndRunFrozenGraph(); + } +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc index c76efd813b1..01fc982c573 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc @@ -15,7 +15,11 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" +#include + #if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInfer.h" #endif @@ -25,19 +29,19 @@ namespace tensorrt { bool IsGoogleTensorRTEnabled() { #if GOOGLE_CUDA && GOOGLE_TENSORRT +#if TF_USE_TENSORRT_STATIC + LOG(INFO) << "TensorRT libraries are statically linked, skip dlopen check"; + return true; +#else // TF_USE_TENSORRT_STATIC auto handle_or = se::internal::DsoLoader::TryDlopenTensorRTLibraries(); if (!handle_or.ok()) { - LOG(WARNING) << "Cannot dlopen some TensorRT libraries. 
If you would like " - "to use Nvidia GPU with TensorRT, please make sure the " - "missing libraries mentioned above are installed properly."; - return false; - } else { - LOG(INFO) << "TensorRT is enabled."; - return true; + LOG_WARNING_WITH_PREFIX << "Could not find TensorRT"; } -#else + return handle_or.ok(); +#endif // TF_USE_TENSORRT_STATIC +#else // GOOGLE_CUDA && GOOGLE_TENSORRT return false; -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT } void GetLinkedTensorRTVersion(int* major, int* minor, int* patch) { @@ -66,5 +70,14 @@ void GetLoadedTensorRTVersion(int* major, int* minor, int* patch) { #endif } +std::vector GetRegisteredOpConverters() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + auto* registry = tensorflow::tensorrt::convert::GetOpConverterRegistry(); + return registry->ListRegisteredOps(); +#else + return {"undef"}; +#endif +} + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h index f52bb6f1bad..60a0d78cee8 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ +#include +#include + namespace tensorflow { namespace tensorrt { @@ -27,6 +30,8 @@ void GetLinkedTensorRTVersion(int* major, int* minor, int* patch); // Return runtime time TensorRT library version information {Maj, Min, Patch}. void GetLoadedTensorRTVersion(int* major, int* minor, int* patch); +std::vector GetRegisteredOpConverters(); + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.i b/tensorflow/compiler/tf2tensorrt/utils/py_utils.i index d6e8eac5836..1784f5a2a00 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.i +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.i @@ -83,4 +83,4 @@ version_struct get_linked_tensorrt_version(); version_struct get_loaded_tensorrt_version(); bool is_tensorrt_enabled(); -%rename("%s") ""; +%rename("%s") ""; \ No newline at end of file diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc new file mode 100644 index 00000000000..ea597383531 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11/stl.h" // from @pybind11 +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" + +std::tuple get_linked_tensorrt_version() { + return tensorflow::tensorrt::GetLinkedTensorRTVersion(); +} + +std::tuple get_loaded_tensorrt_version() { + return tensorflow::tensorrt::GetLoadedTensorRTVersion(); +} + +PYBIND11_MODULE(_pywrap_py_utils, m) { + m.doc() = "_pywrap_py_utils: Various TensorRT utilities"; + m.def("get_linked_tensorrt_version", get_linked_tensorrt_version, + "Return the compile time TensorRT library version as the tuple " + "(Major, Minor, Patch)."); + m.def("get_loaded_tensorrt_version", get_loaded_tensorrt_version, + "Return the runtime time TensorRT library version as the tuple " + "(Major, Minor, Patch)."); + m.def("is_tensorrt_enabled", tensorflow::tensorrt::IsGoogleTensorRTEnabled, + "Returns True if TensorRT is enabled."); + m.def("get_registered_op_converters", + tensorflow::tensorrt::GetRegisteredOpConverters, + "Return a list of registered op converters by operation name"); +} diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index f5b2a27f6ce..91b35a18378 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -17,11 +17,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -52,8 +50,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) { } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -70,11 +67,20 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, // TODO(aaroey): AllocateRaw takes size_t size as input, so it'll produce // unexpected result when TRT tries to allocate more bytes than size_t can // carry. Fix this. - void* mem = allocator_->AllocateRaw(alignment, total_size); + // + // Fail immediately if allocation fails, rather than waiting 10 seconds and + // failing then anyway. + // TensorRT 7 can also switch to a different algorithm for a layer if an + // algorithm uses too much memory. If we don't fail immediately building the + // engine can be *very* slow with TensorRT7 when GPU memory is limited. 
+ AllocationAttributes attributes; + attributes.no_retry_on_failure = true; + void* mem = allocator_->AllocateRaw(alignment, total_size, attributes); if (!mem) return nullptr; void* alloc_mem = mem; QCHECK(Align(alignment, size, mem, total_size)); + mutex_lock lock(mu_); if (mem != alloc_mem) { QCHECK(mem_map_.insert({mem, alloc_mem}).second); } @@ -90,6 +96,7 @@ TRTDeviceAllocator::TRTDeviceAllocator(Allocator* allocator) } void TRTDeviceAllocator::free(void* memory) noexcept { + mutex_lock lock(mu_); VLOG(2) << "Deallocating @ " << memory; // allocated memory adjusted for alignment, restore the original pointer if (memory) { @@ -105,5 +112,4 @@ void TRTDeviceAllocator::free(void* memory) noexcept { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h index 753e2e3f87d..2812aa06457 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h @@ -19,12 +19,11 @@ limitations under the License. #include #include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -33,8 +32,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space); } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -56,19 +54,20 @@ class TRTDeviceAllocator : public TRTBaseAllocator { virtual ~TRTDeviceAllocator() { VLOG(1) << "Destroying allocator attached to " << allocator_->Name(); } - void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) noexcept override; + void* allocate(uint64_t size, uint64_t alignment, + uint32_t flags) noexcept override; void free(void* memory) noexcept override; private: + mutex mu_; Allocator* allocator_; // supporting alignment from allocation request requires a map to free; - std::unordered_map mem_map_; + std::unordered_map mem_map_ TF_GUARDED_BY(mu_); }; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc new file mode 100755 index 00000000000..cab00a036a8 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc @@ -0,0 +1,286 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" + +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +using absl::StrCat; + +ExecutionContext ExecutionContext::Create(nvinfer1::ICudaEngine* cuda_engine) { + bool has_int32_output = false; + for (int i = 0; i < cuda_engine->getNbBindings(); i++) { + if (!cuda_engine->bindingIsInput(i) && + cuda_engine->getBindingDataType(i) == nvinfer1::DataType::kINT32) { + has_int32_output = true; + break; + } + } + if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && has_int32_output) { + // TODO(nvbugs/3390469): Remove this workaround when the bug is fixed. + nvinfer1::IExecutionContext* execution_context = + cuda_engine->createExecutionContext(); + return ExecutionContext(execution_context, true); + } + + nvinfer1::IExecutionContext* execution_context = + cuda_engine->createExecutionContextWithoutDeviceMemory(); + return ExecutionContext(execution_context, false); +} + +Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, + const nvinfer1::IExecutionContext* execution_context, + int binding_index, bool use_implicit_batch, + int batch_size, TensorShape& shape) { + tensorflow::profiler::TraceMe activity( + "getBindingDimensions", tensorflow::profiler::TraceMeLevel::kInfo); + nvinfer1::Dims dims = + use_implicit_batch + ? cuda_engine->getBindingDimensions(binding_index) + : execution_context->getBindingDimensions(binding_index); + if (!use_implicit_batch) { + if (dims.nbDims == -1) { + return errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile."); + } + } + TF_RETURN_IF_ERROR(DimsAdapter(dims).TensorShape( + &shape, + use_implicit_batch ? 
absl::optional(batch_size) : absl::nullopt)); + return Status::OK(); +} + +Status SetupBindings(nvinfer1::ICudaEngine* cuda_engine, const Tensor& tensor, + std::vector& buffers, int binding_index) { + tensorflow::profiler::TraceMe activity( + "SetBindingPointers", tensorflow::profiler::TraceMeLevel::kInfo); + const auto dtype = cuda_engine->getBindingDataType(binding_index); + VLOG(2) << "<<<<<<<<< SetupBindings with dtype = " << (int)dtype; + switch (dtype) { + case nvinfer1::DataType::kFLOAT: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; + case nvinfer1::DataType::kHALF: + buffers[binding_index] = + const_cast(tensor.flat().data()); + break; + case nvinfer1::DataType::kINT8: + return errors::Internal("INT8 inputs are not supported yet!"); + case nvinfer1::DataType::kINT32: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + case nvinfer1::DataType::kBOOL: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + return errors::Internal("FP8 inputs are not supported yet!"); +#endif + default: + return errors::Internal("Unknown TRT data type: ", + static_cast(dtype)); + } + return Status::OK(); +} + +// Sets up bindings. +Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + const int trt_profile_idx, + std::vector& buffers, bool use_implicit_batch, + int num_batch, + const TrtShapeOptimizationProfile& profiles, + OpKernelContext* ctx, const DataVec* input_vec) { + tensorflow::profiler::TraceMe activity( + "SetTrtEngineInputs", tensorflow::profiler::TraceMeLevel::kInfo); + int n_inputs = ctx ? ctx->num_inputs() : (input_vec ? input_vec->size() : 0); + // Setup engine inputs. + for (int i = 0; i < n_inputs; i++) { + const Tensor& input_tensor = ctx ? ctx->input(i) : input_vec->at(i).tensor; + const TensorShape& input_shape = input_tensor.shape(); + + // Skip resource inputs. + if (input_tensor.dtype() == DataType::DT_RESOURCE) { + continue; + } + + const string input_name = + ctx ? StrCat(IONamePrefixes::kInputPHName, i) : input_vec->at(i).name; + int binding_index; + Status status = GetTrtBindingIndex(input_name.c_str(), trt_profile_idx, + cuda_engine, &binding_index); + if (IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TF_RETURN_IF_ERROR(status); + } else if (!status.ok()) { + // Before TRT 8, an input tensor can be pruned if it is not used by the + // network (e.g. only its shape is used, but the shape is already defined + // by the optimization profile by setting min=max). nvbugs/3153064 + VLOG(2) << "Skipping pruned input " << input_name; + continue; + } + + if (use_implicit_batch && ctx) { + // Ensure all inputs have the same batch size + if (num_batch != input_shape.dim_size(0)) { + const string msg = + StrCat("Input data has inconsistent batch size: ", num_batch, + " vs ", input_shape.dim_size(0)); + return errors::NotFound(msg); + } + } + // Set known input dimensions. This is necessary because TRT network + // could be made with dynamic dimensions. 
+ if (!use_implicit_batch) { + TF_RETURN_IF_ERROR(profiles.SetInputShapeBinding( + i, binding_index, cuda_engine, execution_context)); + + if (cuda_engine->isExecutionBinding(binding_index)) { + tensorflow::profiler::TraceMe activity( + "SetTrtEngineInputs::setBindingDimensions", + tensorflow::profiler::TraceMeLevel::kInfo); + auto adap = DimsAdapter::Create(input_shape); + TRT_ENSURE_OK(adap); + nvinfer1::Dims trt_dims = adap.ValueOrDie().AsTrtDims(); + if (execution_context->getBindingDimensions(binding_index) != + trt_dims) { + VLOG(2) << "Setting binding dimensions for idx " << binding_index; + bool ret = + execution_context->setBindingDimensions(binding_index, trt_dims); + if (!ret) { + VLOG(2) << "Error setting engine input " << binding_index << " " + << DebugString(trt_dims); + return errors::Internal( + "Binding dimension does not fit selected profile."); + } + } + } + } + // Setup input bindings. + TF_RETURN_IF_ERROR( + SetupBindings(cuda_engine, input_tensor, buffers, binding_index)); + } + + // Ensure all network dynamic dimensions (if any) are set in execution + // context. + if (!execution_context->allInputDimensionsSpecified()) { + return errors::Internal( + "Failed to set dimensions for all dynamic input tensors"); + } + if (!execution_context->allInputShapesSpecified()) { + return errors::Internal( + "Failed to set dimensions for all shape input tensors."); + } + return Status::OK(); +} + +Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + int trt_profile_idx, std::vector& buffers, + bool use_implicit_batch, int batch_size, + OpKernelContext* ctx, DataVec* outputs) { + tensorflow::profiler::TraceMe activity( + "SetTrtEngineOutputs", tensorflow::profiler::TraceMeLevel::kInfo); + // Either one of ctx or outpus should be specified + int n_outputs = ctx ? ctx->num_outputs() : (outputs ? outputs->size() : 0); + for (int i = 0; i < n_outputs; i++) { + const string output_name = + ctx ? StrCat(IONamePrefixes::kOutputPHName, i) : outputs->at(i).name; + int binding_index; + TF_RETURN_IF_ERROR(GetTrtBindingIndex(output_name.c_str(), trt_profile_idx, + cuda_engine, &binding_index)); + + // Get TRT output shapes for allocating output memory. + TensorShape output_shape; + TF_RETURN_IF_ERROR(GetTrtBindingShape(cuda_engine, execution_context, + binding_index, use_implicit_batch, + batch_size, output_shape)); + + // Allocate output tensor of TRTEngineOp. + Tensor* output_tensor = nullptr; + if (ctx) { + tensorflow::profiler::TraceMe activity( + "AllocateOutput", tensorflow::profiler::TraceMeLevel::kInfo); + TF_RETURN_IF_ERROR(ctx->allocate_output(i, output_shape, &output_tensor)); + } else { + // This path is used for unit tests. The tensor is already allocated. + // Its shape is not necessarily set correctly, we fix that. + VLOG(2) << "Applying shape " << output_shape.DebugString() + << " on output."; + output_tensor = &(outputs->at(i).tensor); + bool status = output_tensor->CopyFrom(*output_tensor, output_shape); + if (!status) { + return errors::Internal( + "Buffer size (", output_tensor->NumElements(), + ") do not match while reshaping output tensors to shape ", + output_shape.DebugString()); + } + } + + // Set up output bindings. 
+ TF_RETURN_IF_ERROR( + SetupBindings(cuda_engine, *output_tensor, buffers, binding_index)); + } + return Status::OK(); +} + +Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, + std::vector& buffers, const cudaStream_t* stream, + bool use_implicit_batch, int batch_size) { + tensorflow::profiler::TraceMe activity( + "TrtEnqueue", tensorflow::profiler::TraceMeLevel::kInfo); + bool ret = false; + if (use_implicit_batch) { + ret = execution_context->enqueue(batch_size, &buffers[0], *stream, nullptr); + VLOG(1) << "Called IExecutionContext::enqueue"; + } else { + ret = execution_context->enqueueV2(&buffers[0], *stream, nullptr); + VLOG(1) << "Called IExecutionContext::enqueueV2"; + } + if (!ret) { + return errors::Internal("Failed to enqueue batch for TRT engine"); + } + // Synchronization will be done by TF. + return Status::OK(); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h new file mode 100644 index 00000000000..1eb1d852374 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ + +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +using ::stream_executor::port::StatusOr; + +// Creates a TensorRT execution context. +ExecutionContext CreateExecutionContext(nvinfer1::ICudaEngine* cuda_engine); + +// Sets input buffers for TRT from a list of input tensors. The input tensors +// are either defined by ctx or by input_vec. +Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + const int trt_profile_idx, + std::vector& buffers, bool use_implicit_batch, + int num_batch, + const TrtShapeOptimizationProfile& profiles, + OpKernelContext* ctx = nullptr, + const DataVec* input_vec = nullptr); + +// Returns the shape of a binding from TensorRT. +// +// The binding is identified by its binding_index. The batch_size argument is +// ignored if use_implicit_batch==false. The shape is returned in the last +// argument. 
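+//
+// A minimal usage sketch (variable names are illustrative only):
+//
+//   TensorShape output_shape;
+//   TF_RETURN_IF_ERROR(GetTrtBindingShape(cuda_engine, execution_context,
+//                                         /*binding_index=*/0,
+//                                         /*use_implicit_batch=*/false,
+//                                         /*batch_size=*/0, output_shape));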
+Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, + const nvinfer1::IExecutionContext* execution_context, + int binding_index, bool use_implicit_batch, + int batch_size, TensorShape& shape); + +// Defines output buffers for TRT. The buffers are allocated by ctx, if ctx is +// not null. Otherwise it is expected that the outputs DataVec is not null, and +// the Tensors in outputs are already allocated. +Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + int trt_profile_idx, std::vector& buffers, + bool use_implicit_batch, int batch_size = 0, + OpKernelContext* ctx = nullptr, + DataVec* outputs = nullptr); + +// Enqueues TensorRT inference job. The batch_size argument is only relevant in +// implicit batch mode. +Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, + std::vector& buffers, const cudaStream_t* stream, + bool use_implicit_batch, int batch_size = 1); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h b/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h new file mode 100644 index 00000000000..05b5cefbf94 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXECUTION_CONTEXT_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXECUTION_CONTEXT_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// A wrapper for the TensorRT execution context which will destroy the TensorRT +// execution context when the object goes out of scope. +class ExecutionContext : public TrtUniquePtrType { + public: + ExecutionContext(nvinfer1::IExecutionContext* context, bool has_memory) + : TrtUniquePtrType(context), + has_device_memory_(has_memory) {} + static ExecutionContext Create(nvinfer1::ICudaEngine* cuda_engine); + + bool HasDeviceMemory() { return has_device_memory_; } + + private: + bool has_device_memory_; +}; + +}; // namespace tensorrt +}; // namespace tensorflow +#endif +#endif diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc new file mode 100644 index 00000000000..319ebff642b --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace tensorrt { + +bool isExperimentalFeatureActivated(string feature_name) { + string envvar_str; + TF_CHECK_OK( + ReadStringFromEnvVar("TF_TRT_EXPERIMENTAL_FEATURES", "", &envvar_str)); + return envvar_str.find(feature_name) != string::npos; +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h new file mode 100644 index 00000000000..1a502c5f7e7 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +bool isExperimentalFeatureActivated(string feature_name); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc index 25a94502675..f9bf0e0e59d 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { @@ -59,7 +58,7 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map& data, VLOG(1) << "Set Batch Waiting finished"; // Sets the batch. 
- for (const auto it : data) { + for (const auto& it : data) { auto devptr = dev_buffers_.find(it.first); if (devptr == dev_buffers_.end()) { LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first @@ -122,7 +121,8 @@ void TRTInt8Calibrator::waitAndSetDone() { } } -const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) noexcept { +const void* TRTInt8Calibrator::readCalibrationCache( + std::size_t& length) noexcept { if (calibration_table_.empty()) return nullptr; length = calibration_table_.size(); return calibration_table_.data(); @@ -147,5 +147,4 @@ TRTInt8Calibrator::~TRTInt8Calibrator() { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h index d7a3df7ac1e..2fa22662521 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h @@ -20,10 +20,10 @@ limitations under the License. #include #include #include + #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -34,12 +34,8 @@ namespace tensorrt { // TRTs pull model for calibration. When TRT implements a means for // a push calibration This class should be updated accordingly -// IInt8EntropyCalibrator2 is prefferred for TRT 5.1+. -#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1) +// IInt8EntropyCalibrator2 is preferred for TRT 5.1+. struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { -#else -struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { -#endif public: // Construct a calibrator for future calibration. TRTInt8Calibrator( @@ -72,7 +68,8 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { // If not null, calibration is skipped. const void* readCalibrationCache(std::size_t& length) noexcept override; - void writeCalibrationCache(const void* ptr, std::size_t length) noexcept override; + void writeCalibrationCache(const void* ptr, + std::size_t length) noexcept override; const string& getCalibrationTableAsString() { return calibration_table_; } @@ -101,6 +98,5 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc index 7e72204604e..69e66038661 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc @@ -15,26 +15,72 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h" #include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace tensorrt { +bool filter_string(string msg) { + // This function checks for known substrings that shall be ignored. 
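+  // A message containing any of the substrings below is dropped before it
+  // reaches TF logging. The filtering can be disabled by adding
+  // "disable_logger_filtering" to TF_TRT_EXPERIMENTAL_FEATURES (see
+  // Logger::log below).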
+ + static const std::vector substr_patterns{ + // Automatic messages generated by TensorRT when combined with + // Automatic Mixed Precision - TensorRT 8.2 + "Missing scale and zero-point for", + "Subnormal FP16 values detected", + "If this is not the desired behavior, please modify the weights", + "had the following issues when converted to FP16", + "Values less than smallest positive FP16 Subnormal value detected.", + // Deprecation Warnings + "The implicit batch dimension mode has been deprecated.", + "The getMaxBatchSize() function should not be used with an engine built", + // Input-Warnings + "[RemoveDeadLayers] Input Tensor input is unused or used only at", + "Unused Input:", + // Data Type Warnings + "Tensor DataType is determined at build time for tensors not marked as", + // Myelin Performance Warning in dynamic shape mode + "Myelin graph with multiple dynamic values may have poor performance", + "(# 0 (SHAPE", + "CUDA lazy loading is not enabled. Enabling it can significantly reduce", + }; + + for (int i = 0; i < substr_patterns.size(); i++) { + std::size_t is_found = msg.find(substr_patterns[i]); + if (is_found != string::npos) { + return true; + } + } + return false; +} + // Use TF logging for TensorRT informations void Logger::log(Severity severity, const char* msg) noexcept { + static const bool filter_messages = []() { + return !isExperimentalFeatureActivated("disable_logger_filtering"); + }(); + + if (filter_messages && filter_string(msg)) return; + + if (!isValidSeverity(severity, msg) || suppressedMsg_ & (1 << (int)severity)) + return; + // Suppress info-level messages switch (severity) { -#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1) case Severity::kVERBOSE: -#endif case Severity::kINFO: { // Mark TRT info messages as debug! VLOG(2) << name_ << " " << msg; break; } case Severity::kWARNING: { - LOG(WARNING) << name_ << " " << msg; + LOG_WARNING_WITH_PREFIX << name_ << " " << msg; break; } case Severity::kERROR: { @@ -45,21 +91,42 @@ void Logger::log(Severity severity, const char* msg) noexcept { LOG(FATAL) << name_ << " " << msg; break; } - // This is useless for now. But would catch it in future if enum changes. It - // is always good to have default case! 
- default: { - LOG(FATAL) << name_ << "Got unknown severity level " << int(severity) - << " from TensorRT: " << msg; - break; - } } } + +void Logger::suppressLoggerMsgs(Severity severity) { + if (isValidSeverity(severity)) { + suppressedMsg_ |= 1 << (int)severity; + } +} + +void Logger::unsuppressLoggerMsgs(Severity severity) { + if (isValidSeverity(severity)) { + suppressedMsg_ &= (-1) ^ (1 << (int)severity); + } +} + +bool Logger::isValidSeverity(Severity severity, const char* msg) noexcept { + switch (severity) { + case Severity::kVERBOSE: + case Severity::kINFO: + case Severity::kWARNING: + case Severity::kERROR: + case Severity::kINTERNAL_ERROR: + return true; + } + return false; +} + +// static Logger* Logger::GetLogger() { static Logger* logger = new Logger("DefaultLogger"); return logger; } + +REGISTER_TENSORRT_LOGGER("DefaultLogger", Logger::GetLogger()); + } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h index a9c1e80668a..8002df53e5c 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h @@ -18,8 +18,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -29,17 +28,23 @@ namespace tensorrt { class Logger : public nvinfer1::ILogger { public: Logger(string name = "DefaultLogger") : name_(name) {} - void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override; + void log(nvinfer1::ILogger::Severity severity, + const char* msg) noexcept override; + void suppressLoggerMsgs(nvinfer1::ILogger::Severity severity); + void unsuppressLoggerMsgs(nvinfer1::ILogger::Severity severity); + void unsuppressAllLoggerMsgs() { suppressedMsg_ = 0; } static Logger* GetLogger(); private: - string name_; + bool isValidSeverity(nvinfer1::ILogger::Severity severity, + const char* msg = nullptr) noexcept; + const string name_; + unsigned int suppressedMsg_ = 0; }; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 5ab6bf1a317..30aff91a76d 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -23,8 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -87,15 +86,58 @@ string TRTEngineCacheResource::DebugString() const { for (const auto& item : cache_) { mutex_lock lock(item.second->mu); oss << TensorShapeUtils::ShapeListString(item.first) << ": " << hex - << "ICudaEngine: " << item.second->cuda_engine.get() << ", " - << "IExecutionContext: " << item.second->execution_context.get() << dec - << endl; + << "ICudaEngine: " << item.second->GetCudaEngine() << ", " + << "IExecutionContext: "; + absl::c_for_each( + item.second->execution_contexts, + [&](const ExecutionContext& ctx) { oss << ctx.get() << ","; }); + oss << dec << endl; } return oss.str(); } +EngineContext* TRTEngineCacheResource::GetEngineContext( + const std::vector& input_shapes) { + EngineContext* engine_context = nullptr; + int64 min_matched_batch_size = kint64max; + for (const auto& pair : cache_) { + const std::vector& cached_input_shapes = pair.first; + // This should not happen, but just for safety. + if (input_shapes.size() != cached_input_shapes.size()) { + LOG(ERROR) << "Input shape list size mismatch" + << ", cached size: " << cached_input_shapes.size() + << " vs. input size: " << input_shapes.size(); + } + if (AreShapesCompatible(input_shapes, cached_input_shapes)) { + const int cached_batch_size = cached_input_shapes[0].dim_size(0); + if (min_matched_batch_size > cached_batch_size) { + min_matched_batch_size = cached_batch_size; + engine_context = pair.second.get(); + } + } + } + return engine_context; +} + +EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) { + if (profiles_.NeedProfiles() && profile_id >= profiles_.GetNumProfiles()) { + LOG(ERROR) << "Out of range: profile_id " << profile_id + << " is larger than number of profiles " + << profiles_.GetNumProfiles(); + return nullptr; + } + if (cache_.size() > 1) { + LOG(ERROR) << "Cache is expected to have at most " + << "1 engine in explicit batch mode where profiles are used."; + return nullptr; + } + if (cache_.size() == 0) { + return nullptr; + } + return cache_.begin()->second.get(); +} + } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 8d603ac4d55..5c4a6c1fdd8 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -22,8 +22,10 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/errors.h" @@ -114,31 +116,75 @@ class LRUCache { } }; -// Define a hash function for vector because it is used as the key -// for the engine cache. 
-struct VectorTensorShapeHasher { - std::size_t operator()(const std::vector& key) const { - return std::hash()(TensorShapeUtils::ShapeListString(key)); - } -}; - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT struct EngineContext { EngineContext() {} // Creates an empty context. - EngineContext( - TrtUniquePtrType&& input_cuda_engine, - TrtUniquePtrType&& input_execution_context) - : cuda_engine(std::move(input_cuda_engine)), - execution_context(std::move(input_execution_context)) {} + EngineContext(TrtUniquePtrType&& cuda_engine, + ExecutionContext&& execution_context) + : cuda_engine_(std::move(cuda_engine)) { + execution_contexts.push_back(std::move(execution_context)); + device_memory_size_ = + cuda_engine_ ? cuda_engine_->getDeviceMemorySize() : 0; + } + EngineContext(TrtUniquePtrType&& cuda_engine, + std::vector&& execution_contexts) + : cuda_engine_(std::move(cuda_engine)), + execution_contexts(std::move(execution_contexts)) { + device_memory_size_ = + cuda_engine_ ? cuda_engine_->getDeviceMemorySize() : 0; + } mutex mu; - TrtUniquePtrType cuda_engine; - TrtUniquePtrType execution_context - GUARDED_BY(mu); -}; + nvinfer1::ICudaEngine* GetCudaEngine() { return cuda_engine_.get(); } + + Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx, + bool* has_device_memory) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu) { + if (idx >= execution_contexts.size()) { + return errors::Internal("Requested engine context with index ", idx, + ", but only ", execution_contexts.size(), + "contexts are present."); + } + *exec_ctx = execution_contexts[idx].get(); + *has_device_memory = execution_contexts[idx].HasDeviceMemory(); + return Status::OK(); + } + + int GetNumContexts() { + mutex_lock lock(mu); + return execution_contexts.size(); + } + + size_t GetDeviceMemorySize() { return device_memory_size_; } + + private: + // Note: declaration has to come before execution_contexts, to ensure proper + // order of destruction. + TrtUniquePtrType cuda_engine_; + + public: + // In explicit batch mode, we maintain a vector of contexts for each engine, + // where each context is created for a specific profile. This is because it is + // either not possible or non-trivial to change the profile of a context for + // the following reasons: + // - To switch profiles (from TRT 7), one must first ensure that all inference + // calls in that context are finished. This would require an additional + // synchronization before we call setOptimizationProfile. To avoid this + // extra sync call, we mantain separate execution context for each profile. + // IExecutionContext object is not thread safe: only one thread should use it + // for inference at a time therefore we need a mutex. More details at + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety + // Additional discussion about execution context management and thread safety + // at https://github.com/tensorflow/tensorflow/issues/36959 + std::vector execution_contexts TF_GUARDED_BY(mu); + + private: + // Until TRT 8.4 ICudaEngine::getDeviceMemorySize() has a non-negligible + // latency. Since its value remains constant, we can cache it. + size_t device_memory_size_; +}; // Contains the context required to build the calibration data. class CalibrationContext { public: @@ -148,7 +194,7 @@ class CalibrationContext { std::unordered_map> device_buffers_; // Temporary staging areas for calibration inputs. 
- std::vector device_tensors_; + std::vector device_tensors_; std::unique_ptr calibrator_; TrtUniquePtrType builder_; @@ -158,8 +204,8 @@ class CalibrationContext { private: mutex mu_; - bool terminated_ GUARDED_BY(mu_) = false; - std::string calibration_table_ GUARDED_BY(mu_); + bool terminated_ TF_GUARDED_BY(mu_) = false; + std::string calibration_table_ TF_GUARDED_BY(mu_); }; ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName; @@ -179,6 +225,16 @@ class TRTEngineCacheResource : public ResourceBase { string DebugString() const override; + // Returns the EngineContext that is compatible with input_shapes. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const std::vector& input_shapes); + + // Returns the EngineContext that is compatible with profile_id. + // This function should be only called in explicit batch mode where + // cache size is expected to be at most one. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const int profile_id); + // Keep device allocator for TRT. std::unique_ptr allocator_; @@ -190,10 +246,14 @@ class TRTEngineCacheResource : public ResourceBase { // TODO(hinsu): Use different calibration context for the available shapes and // attach it to each item of the cache. std::unique_ptr calib_ctx_; + + // This object maintains all the optimization profiles during profile + // generation and engine build. During runtime the list of profiles is used to + // look up a matching profile for the input data. + TrtShapeOptimizationProfile profiles_; }; -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc new file mode 100644 index 00000000000..ab9377057f9 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -0,0 +1,664 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" + +#include +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" + +namespace tensorflow { +namespace tensorrt { + +// Returns a vector of nvinfer1::Dims for a vector of TensorShapes. 
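+// The conversion of each shape is delegated to DimsAdapter; a shape that
+// cannot be converted is treated as a fatal error (TF_CHECK_OK). The template
+// parameter is a TensorShape-like type, e.g. TensorShape.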
+template +std::vector GetDimVec(std::vector shape_vec) { + std::vector dimvec(shape_vec.size()); + absl::c_transform(shape_vec, dimvec.begin(), [](TensorShapeType shape) { + auto adap = DimsAdapter::Create(shape); + TF_CHECK_OK(adap.status()); + return adap.ValueOrDie().AsTrtDims(); + }); + return dimvec; +} + +// In dynamic shape mode the optimization profile dims are only allowed to +// differ from the network input dims where the network input dims have -1 +// values. We enforce this condition by changing prof_dims if necessary. +void EnforceCompatibility(nvinfer1::Dims* prof_dims, + const PartialTensorShape& input_shape) { + for (int i = 0; i < input_shape.dims(); i++) { + if (input_shape.dim_size(i) != -1) { + prof_dims->d[i] = input_shape.dim_size(i); + } + } +} + +void SetImplicitBatchModeCompatibleProfile( + const std::vector& dimvec, std::vector* min, + std::vector* opt, std::vector* max) { + *min = dimvec; + for (auto& dim : *min) { + // Shape value tensors can have -1 value as a wildcard. We do not change + // in that case. + if (dim.d[0] != -1) dim.d[0] = 1; // Set min batch size to 1. + } + *opt = dimvec; + *max = dimvec; +} + +void TrtShapeOptimizationProfile::ImplicitBatchModeCompatibleStrategy( + const std::vector>& collected_shapes) { + for (auto& shape_vec : collected_shapes) { + std::vector min, opt, max; + SetImplicitBatchModeCompatibleProfile(shape_vec, &min, &opt, &max); + VLOG(2) << "Initializing optimization profile config with min=" + << DebugString(min) << ", opt=max=" << DebugString(max); + OptimizationProfileConfig profConfig{min, opt, max}; + profiles_.push_back(std::move(profConfig)); + } +} + +// Applies a binary operation for each dimension of the input shapes. +// x[i].d[k] = op(x[i].d[k], y[i].d[k]), where i enumerates the input tensors, +// and k enumerates the dimensions of the tensors. The BinaryOperation may be +// std::min, std::max etc. 
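+// For example, applying std::min element-wise to dims {2, 8} and {4, 3}
+// yields {2, 3}; RangeStrategy below uses this to grow an element-wise
+// [min, max] envelope over all collected shapes.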
+template +Status ShapeProfileBinaryOp(std::vector* x, + const std::vector& y, + BinaryOperation op) { + if (x->size() != y.size()) + return errors::InvalidArgument( + "Number of input tensors differ during profile creation"); + for (int i = 0; i < x->size(); i++) { + if (x->at(i).nbDims != y[i].nbDims) + return errors::InvalidArgument( + "Number of input dimensions differ during profile creation at dim ", + i, ", values ", x->at(i).nbDims, y[i].nbDims); + for (int j = 0; j < x->at(i).nbDims; j++) { + x->at(i).d[j] = op(x->at(i).d[j], y[i].d[j]); + } + } + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::RangeStrategy( + const std::vector>& collected_shapes) { + if (collected_shapes.empty()) return Status::OK(); + + std::vector min = collected_shapes[0]; + std::vector max = min; + + for (int i = 1; i < collected_shapes.size(); i++) { + TF_RETURN_IF_ERROR( + ShapeProfileBinaryOp(&min, collected_shapes[i], + [](int a, int b) { return std::min(a, b); })); + TF_RETURN_IF_ERROR( + ShapeProfileBinaryOp(&max, collected_shapes[i], + [](int a, int b) { return std::max(a, b); })); + } + VLOG(2) << "Initializing optimization profile config with min=" + << DebugString(min) << ", opt=max=" << DebugString(max); + OptimizationProfileConfig profConfig{min, max, max}; + profiles_.push_back(std::move(profConfig)); + return Status::OK(); +} + +void TrtShapeOptimizationProfile::OptimalStrategy( + const std::vector>& collected_shapes) { + for (auto& shape_vec : collected_shapes) { + std::vector min = shape_vec; + std::vector opt = min; + std::vector max = min; + VLOG(2) << "Initializing optimization profile config with min=opt=max=" + << DebugString(min); + OptimizationProfileConfig profConfig{min, opt, max}; + profiles_.push_back(std::move(profConfig)); + } +} + +// Collects the values of tensors that are ShapeTensorCompatible to. The values +// are stored in the actual_shape_values_ member variable. +Status TrtShapeOptimizationProfile::CollectShapeValues(OpKernelContext* ctx) { + tensorflow::profiler::TraceMe activity( + "TrtShapeOptimizationProfile::CollectShapeValues", + tensorflow::profiler::TraceMeLevel::kInfo); + cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); + actual_shape_values_.resize(ctx->num_inputs()); + if (is_shape_tensor_.empty()) { + is_shape_tensor_.resize(ctx->num_inputs()); + for (int i = 0; i < ctx->num_inputs(); i++) { + is_shape_tensor_[i] = IsTrtShapeTensorCompatible(ctx->input(i)); + } + } + int n_shape_val = 0; + // First copy all the shape value candidates into actual_shape_values_ vector. + for (int i = 0; i < ctx->num_inputs(); i++) { + if (is_shape_tensor_[i]) { + if (ctx->input_dtype(i) != DT_INT32) { + // In case the is_shape_tensor mask was initialized with the input + // shapes only (without knowledge of dtype) then we apply correction. + is_shape_tensor_[i] = false; + continue; + } + if (input_shape_values_.size() > 0 && + input_shape_values_[0][i].nbDims != ctx->input(i).NumElements()) { + // Shape tensor dims should not change. It must be a value tensor. + is_shape_tensor_[i] = false; + continue; + } + // We have to copy the shape values to the host, because TRT's + // ExecutionContext::setInputShapeBinding expects a host pointer. 
+ n_shape_val++; + const Tensor& input = ctx->input(i); + actual_shape_values_[i].nbDims = input.NumElements(); + auto ret = cudaMemcpyAsync( + actual_shape_values_[i].d, input.flat().data(), + input.NumElements() * sizeof(int32), cudaMemcpyDeviceToHost, stream); + if (ret != 0) { + return errors::Internal("Could not copy shape tensor values"); + } + VLOG(2) << "Input " << i << " is (probably) a shape tensor, n_values=" + << input.NumElements(); + } else { + actual_shape_values_[i] = {0, {}}; + } + } + if (n_shape_val > 0) { + // If we have any shape values candidates, then wait until data is copied + // to host. + cudaStreamSynchronize(stream); + } + return Status::OK(); +} + +// Collects the values of tensors that are ShapeTensorCompatible to. To be used +// for unit tests. +Status TrtShapeOptimizationProfile::CollectShapeValues(const DataVec& input) { + actual_shape_values_.resize(input.size()); + for (int i = 0; i < input.size(); i++) { + if (is_shape_tensor_[i]) { + if (!IsTrtShapeTensorCompatible(input[i].tensor)) { + return errors::Internal("Inconsistent shape tensor ", input[i].name, + ", ", i); + } + int n_elements = input[i].tensor.NumElements(); + actual_shape_values_[i].nbDims = n_elements; + // During unit tests, the data is in unified memory + std::copy(input[i].tensor.flat().data(), + input[i].tensor.flat().data() + n_elements, + actual_shape_values_[i].d); + VLOG(2) << "Collected tensor shape values " + << DebugString(actual_shape_values_[i]); + } else { + actual_shape_values_[i] = {0, {}}; + } + } + return Status::OK(); +} + +// Adjusts shape value profile to prevent TRT from removing shape value input +// bindings whose value is redundant (only a single value matches the profile). +// This should be removed once the NVIDIA bug 3153064 is fixed. +void FixShapeValueProfile(OptimizationProfileConfig* prof, + const std::vector& is_shape_tensor) { + int shape_value_offset = is_shape_tensor.size(); + for (int i = 0; i < is_shape_tensor.size(); i++) { + if (is_shape_tensor[i] && + std::equal(prof->min[shape_value_offset + i].d, + prof->min[shape_value_offset + i].d + + prof->min[shape_value_offset + i].nbDims, + prof->max[shape_value_offset + i].d)) { + prof->max[shape_value_offset + i].d[0]++; + VLOG(2) << "Adjusted profile for shape value tensor " << i << " " + << DebugString(prof->max[shape_value_offset + i]); + } else { + VLOG(2) << i << " is not a shape tensor." << is_shape_tensor[i]; + } + } +} + +// Checks whether rhs is already contained in values. +bool AlreadyCollected(const std::vector>& values, + const std::vector& rhs) { + for (auto& lhs : values) { + bool ret = lhs.size() == rhs.size(); + for (int i = 0; ret && i < lhs.size(); i++) { + ret &= lhs[i].nbDims == rhs[i].nbDims; + for (int j = 0; ret && j < lhs[i].nbDims; j++) { + ret &= (lhs[i].d[j] == rhs[i].d[j]); + } + } + if (ret) return true; + } + return false; +} + +void TrtShapeOptimizationProfile::InitProfiles( + const std::vector& input_partial_shapes, + ProfileStrategy strategy) { + strategy_ = strategy; + if (input_shapes_.size() == 0) { + VLOG(1) << "Not creating profiles without input_shapes. " + "You have to enable profile generation mode first (build)."; + return; + } + // Preprocess the vector of input shapes and shape values: + // - Converts TensorShape -> nvinfer::Dims. + // - Concatenates the shape values after the input shapes: + // dimvec = [dim0, dim1,..., shapeval0, shapval1, ...] + // - Ensures that the list is unique. 
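+  // Illustration with hypothetical values: for two inputs where input 0 has
+  // shape [8, 8, 10] and input 1 is a shape tensor of shape [3] holding the
+  // values (2, 4, 6), the collected entry is
+  //   dimvec = [ (8,8,10), (3), (0,{}), (2,4,6) ]
+  // i.e. the dims of both inputs followed by one shape-value entry per input
+  // (empty for inputs that are not shape tensors).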
+  std::vector<std::vector<nvinfer1::Dims>> collected_shapes;
+  for (int i = 0; i < input_shapes_.size(); i++) {
+    auto shape_vec = input_shapes_[i];
+    VLOG(2) << "Initprofiles, processing shape " << i;
+    if (!shape_vec.empty()) {
+      // Correct for values that are mistakenly used as shape values.
+      for (int k = 0; k < input_shape_values_[i].size(); k++) {
+        if (!is_shape_tensor_[k])
+          input_shape_values_[i][k] = nvinfer1::Dims{0, {}};
+      }
+      std::vector<nvinfer1::Dims> dimvec = GetDimVec(shape_vec);
+      dimvec.insert(dimvec.end(), input_shape_values_[i].begin(),
+                    input_shape_values_[i].end());
+      // TODO(tfeher): This condition should not apply for explicit profiles.
+      // In that case consecutive elements in collected_shapes contain the
+      // user-defined values of min, opt and max, and it is valid to have
+      // min = opt and opt = max.
+      if (!AlreadyCollected(collected_shapes, dimvec)) {
+        collected_shapes.push_back(dimvec);
+      }
+    }
+  }
+  switch (strategy_) {
+    case ProfileStrategy::kImplicitBatchModeCompatible:
+      VLOG(1) << "Creating profiles with ImplicitBatchModeCompatible strategy";
+      ImplicitBatchModeCompatibleStrategy(collected_shapes);
+      break;
+    // Treat all other strategies the same as kOptimal for now. Implementing
+    // those is outlined in the dynamic shape support implementation plan.
+    case ProfileStrategy::kRange:
+      VLOG(1) << "Creating profiles with Range strategy";
+      TF_CHECK_OK(RangeStrategy(collected_shapes));
+      break;
+    case ProfileStrategy::kRangeOptimal:
+      VLOG(1) << "Creating profiles with RangeOptimal strategy";
+      OptimalStrategy(collected_shapes);
+      TF_CHECK_OK(RangeStrategy(collected_shapes));
+      break;
+    case ProfileStrategy::kOptimal:
+      VLOG(1) << "Creating profiles with Optimal strategy";
+      OptimalStrategy(collected_shapes);
+      break;
+  }
+  // Define a mask that describes which inputs could be shape tensors. Note
+  // that here we can have false positives. The shape tensor mask will be
+  // updated once the network is constructed.
+  SetShapeTensorMask(input_partial_shapes);
+  if (input_partial_shapes.size() > 0) {
+    for (OptimizationProfileConfig& prof : profiles_) {
+      // TODO: Remove this when the bug is fixed.
+#if !IS_TRT_VERSION_GE(8, 0, 0, 0)
+      FixShapeValueProfile(&prof, is_shape_tensor_);
+#endif
+      for (int i = 0; i < input_partial_shapes.size(); i++) {
+        auto network_input = input_partial_shapes[i];
+        EnforceCompatibility(&prof.min[i], network_input);
+        EnforceCompatibility(&prof.opt[i], network_input);
+        EnforceCompatibility(&prof.max[i], network_input);
+      }
+    }
+  }
+}
+
+void TrtShapeOptimizationProfile::InitCalibProfile(
+    const std::vector<TensorShape>& shapes) {
+  VLOG(1) << "Collected shape(s) " << DebugString(shapes)
+          << " for calibration profile.";
+  auto shape_vec = shapes;
+  if (!shape_vec.empty()) {
+    std::vector<nvinfer1::Dims> dimvec = GetDimVec(shape_vec);
+    dimvec.insert(dimvec.end(), actual_shape_values_.begin(),
+                  actual_shape_values_.end());
+    VLOG(2) << "Initializing calibration optimization profile config with "
+            << "min=opt=max " << DebugString(dimvec);
+
+    OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec};
+    calib_profiles_ = std::move(profConfig);
+  } else {
+    VLOG(2) << "Failed to initialize calibration optimization profile.";
+  }
+}
+
+Status TrtShapeOptimizationProfile::AddProfiles(
+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+    const nvinfer1::INetworkDefinition* network) {
+  // Create optimization profile for calibration if necessary.
+ if (!calib_profiles_.min.empty()) { + VLOG(2) << "Setting up calibration profies"; + auto* calibProfile = builder->createOptimizationProfile(); + Status status = + calib_profiles_.SetDimensions(network, calibProfile, input_mask_); + if (!status.ok()) { + return status; + } + bool result = false; + if (calibProfile->isValid()) { + result = config->setCalibrationProfile(calibProfile); + } else { + VLOG(2) << "Calibration profile is not valid"; + } + if (result) { + VLOG(2) << "Added calibration optimization profile " + << calib_profiles_.DebugString() << " to builder config."; + } else { + VLOG(2) << "FAILED TO ADD PROFILE"; + LOG(ERROR) << "Failed to add calibration optimization profile " + << calib_profiles_.DebugString() + << ". This usually happens when profile is invalid."; + } + } + // Create a vector of optimization profiles. + for (int i = 0; i < profiles_.size(); i++) { + auto* optProfile = builder->createOptimizationProfile(); + Status status = + profiles_[i].SetDimensions(network, optProfile, input_mask_); + if (!status.ok()) { + return status; + } + int idx = -1; + if (optProfile->isValid()) { + idx = config->addOptimizationProfile(optProfile); + } + if (idx >= 0) { + if (i != idx) { + return errors::Internal( + "Profile index of engine config is different from source profile " + "index: ", + i, " != ", idx); + } + VLOG(1) << "Added optimization profile " << profiles_[i].DebugString() + << " with idx " << idx << " to builder config."; + } else { + LOG(ERROR) << "Failed to add optimization profile " + << profiles_[i].DebugString() + << ". This usually happens when profile is invalid."; + } + } + if (!profiles_.empty() && config->getNbOptimizationProfiles() == 0) { + return errors::Internal("Failure in adding an optimization profile."); + } + need_profiles_ = config->getNbOptimizationProfiles() > 0; + // Update the mask that flag shape tensors. The network is known now, + // the mask will be correct. + SetShapeTensorMask(network); + is_pruned_input_.resize(network->getNbInputs()); + absl::c_fill(is_pruned_input_, false); + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::ConfigureBuilder( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network) { + TF_RETURN_IF_ERROR(AddProfiles(builder, config, network)); + return Status::OK(); +} + +// Sets the shape tensor mask from the TRT engine definition. +void TrtShapeOptimizationProfile::SetShapeTensorMask( + const nvinfer1::ICudaEngine* engine, int n_inputs) { + is_shape_tensor_.resize(n_inputs, false); + for (int i = 0; i < n_inputs; i++) { + int binding_index; + Status status = GetTrtBindingIndex(i, 0, engine, &binding_index); + if (!status.ok()) { + continue; + } + is_shape_tensor_[i] = engine->isShapeBinding(binding_index); + if (is_shape_tensor_[i]) { + VLOG(2) << "Found shape tensor at " << i; + } + } + has_shape_tensor_ = + absl::c_any_of(is_shape_tensor_, [](bool b) { return b; }); +} + +// Sets the shape tensor mask using the network definition. 
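+// This is one of three overloads: the variant taking input partial shapes
+// gives a conservative guess before the network exists, this network-based
+// variant refines the mask once the TRT network is built, and the engine-based
+// variant above restores it after engine deserialization.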
+void TrtShapeOptimizationProfile::SetShapeTensorMask( + const nvinfer1::INetworkDefinition* network) { + int n_inputs = network->getNbInputs(); + is_shape_tensor_.resize(n_inputs, false); + for (int i = 0; i < n_inputs; i++) { + const ITensorProxyPtr input = network->getInput(i); + is_shape_tensor_[i] = input->isShapeTensor(); + if (is_shape_tensor_[i]) { + VLOG(2) << "Found shape tensor " << input->getName() << " at " << i; + } + } + has_shape_tensor_ = + absl::c_any_of(is_shape_tensor_, [](bool b) { return b; }); +} + +// Sets the shape tensor mask using the input partial shapes. This only tells +// whether the tensors are shape value compatible, only the final network +// definition or the engine would give concrete answers. +void TrtShapeOptimizationProfile::SetShapeTensorMask( + const std::vector& input_partial_shapes) { + if (is_shape_tensor_.size() == input_partial_shapes.size()) { + // Already initialized, e.g. by TRTEngineOp::ComputeAsync(). + return; + } + is_shape_tensor_.resize(input_partial_shapes.size(), false); + for (int i = 0; i < input_partial_shapes.size(); i++) { + is_shape_tensor_[i] = IsTrtShapeTensorCompatible(input_partial_shapes[i]); + if (is_shape_tensor_[i]) { + VLOG(2) << "Found shape compatible tensor at " << i; + } + } + has_shape_tensor_ = + absl::c_any_of(is_shape_tensor_, [](bool b) { return b; }); +} + +int TrtShapeOptimizationProfile::GetProfileNumber( + const std::vector& shapes) { + tensorflow::profiler::TraceMe activity( + "TrtShapeOptimizationProfile::GetProfileNumber", + tensorflow::profiler::TraceMeLevel::kInfo); + if (!need_profiles_) return 0; + // TODO(tfeher): Return the best profile not just the first compatible. + for (int i = 0; i < profiles_.size(); i++) { + if (profiles_[i].IncludesShapes(shapes, HasShapeTensor(), + actual_shape_values_, is_pruned_input_, + is_shape_tensor_)) { + return i; + } + } + VLOG(1) << "Profile not found for input shapes " << DebugString(shapes); + VLOG(2) << " and shape values " << DebugString(actual_shape_values_); + return -1; +} + +Status TrtShapeOptimizationProfile::CreateExecutionContexts( + nvinfer1::ICudaEngine* engine, + std::vector* exec_contexts) { + int i = 0; + // The following loop runs once if we have static shapes, to create a single + // execution context without profiles. In dynamic mode we create one context + // for each profile and set the corresponding optimization profile. + do { + VLOG(1) << "Creating execution context " << i; + ExecutionContext context = ExecutionContext::Create(engine); + if (i > 0) { + // This condition is needed for two reasons: + // - using static shapes we do not have any profiles so we cannot call + // set optimizationprofiles. + // - The 0th profile is set implicitly for the first execution context + // therefore we do not need to set. + if (!context->setOptimizationProfile(i)) { + return errors::Internal("Could not set TRT optimization profile."); + } + } + exec_contexts->push_back(std::move(context)); + i++; + } while (i < profiles_.size()); + + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::SetInputShapeBinding( + int input_index, int binding_index, nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* exec_context) const { + tensorflow::profiler::TraceMe activity( + "TrtShapeOptimizationProfile::SetInputShapeBinding", + tensorflow::profiler::TraceMeLevel::kInfo); + if (cuda_engine->isShapeBinding(binding_index)) { + // Input shape binding data has to be in host memory. That is the reason + // we can't use input_tensor.flat().data(). 
which contains the same + // values in device memory. Instead, we use data that was copied to host + // by CollectShapeValues. + VLOG(2) << "Setting input shape binding for idx " << binding_index + << ", with values " + << DebugString(actual_shape_values_.at(input_index)); + bool ret = exec_context->setInputShapeBinding( + binding_index, actual_shape_values_.at(input_index).d); + if (!ret) { + return errors::Internal("Could not set input shape binding for idx ", + binding_index); + } + } + return Status::OK(); +} + +// If binding_idx is a shape tensor, then returns the associated min/max/opt +// shape values from prof_idx. +nvinfer1::Dims GetDimsFromShapeVal(int prof_idx, int binding_idx, + nvinfer1::OptProfileSelector selector, + const nvinfer1::ICudaEngine* engine) { + if (engine->isShapeBinding(binding_idx)) { + const int32* shape_val_ptr = + engine->getProfileShapeValues(binding_idx, prof_idx, selector); + if (shape_val_ptr) { + VLOG(2) << "Found shape value in prof " << prof_idx << ", binding " + << binding_idx; + nvinfer1::Dims dims = engine->getBindingDimensions(binding_idx); + // nbDims == 0 represent scalar, -1 represents invalid dim + int n_values = (dims.nbDims == 0) ? 1 : dims.d[0]; + if (n_values > 0) { + dims.nbDims = n_values; + std::copy(shape_val_ptr, shape_val_ptr + n_values, dims.d); + } + return dims; + } + } + return {0, {0}}; +} + +Status TrtShapeOptimizationProfile::SetPrunedMask( + const nvinfer1::ICudaEngine* engine, int n_network_inputs) { + is_pruned_input_.resize(n_network_inputs); + absl::c_fill(is_pruned_input_, false); + for (int j = 0; j < n_network_inputs; j++) { + int binding_idx; + Status status = GetTrtBindingIndex(j, 0, engine, &binding_idx); + if (!status.ok()) { + // Before TRT 8, an input tensor can be pruned (nvbugs/3153064) + // Resource inputs are also unknown by TRT, so we can treat them as + // pruned (the engine includes the variable as weights). + is_pruned_input_[j] = true; + VLOG(2) << "Skipping pruned input " << j; + continue; + } + } + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::RestoreProfiles( + const nvinfer1::ICudaEngine* engine, int n_network_inputs) { + need_profiles_ = false; + if (!engine) { + // We do not need to restore profiles for an empty engine. + return Status::OK(); + } + if (engine->hasImplicitBatchDimension()) { + // Nothing to do, we cannot have profiles in implicit batch mode. 
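+    // (TensorRT optimization profiles are only defined for explicit-batch
+    // networks, so an implicit-batch engine never has any to restore.)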
+ return Status::OK(); + } + int n_profiles = engine->getNbOptimizationProfiles(); + need_profiles_ = n_profiles > 0; + int n_inputs = GetNumberOfEngineInputs(engine); + if (n_inputs > n_network_inputs) { + return errors::Internal("Incorrect number of engine inputs"); + } + VLOG(2) << "Attempting to restore " << n_profiles << " profiles, each with " + << n_inputs << " inputs"; + SetShapeTensorMask(engine, n_network_inputs); + + TF_RETURN_IF_ERROR(SetPrunedMask(engine, n_network_inputs)); + + for (int prof_idx = 0; prof_idx < n_profiles; prof_idx++) { + OptimizationProfileConfig cfg; + + cfg.min.resize(n_network_inputs * 2); + cfg.max.resize(n_network_inputs * 2); + cfg.opt.resize(n_network_inputs * 2); + // restore shape values + for (int j = 0; j < n_network_inputs; j++) { + if (is_pruned_input_[j]) continue; + int binding_idx; + TF_RETURN_IF_ERROR(GetTrtBindingIndex(j, 0, engine, &binding_idx)); + + nvinfer1::Dims min = engine->getProfileDimensions( + binding_idx, prof_idx, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims max = engine->getProfileDimensions( + binding_idx, prof_idx, nvinfer1::OptProfileSelector::kMAX); + nvinfer1::Dims opt = engine->getProfileDimensions( + binding_idx, prof_idx, nvinfer1::OptProfileSelector::kOPT); + cfg.min[j] = min; + cfg.max[j] = max; + cfg.opt[j] = opt; + + cfg.min[j + n_inputs] = GetDimsFromShapeVal( + prof_idx, binding_idx, nvinfer1::OptProfileSelector::kMIN, engine); + cfg.max[j + n_inputs] = GetDimsFromShapeVal( + prof_idx, binding_idx, nvinfer1::OptProfileSelector::kMAX, engine); + cfg.opt[j + n_inputs] = GetDimsFromShapeVal( + prof_idx, binding_idx, nvinfer1::OptProfileSelector::kOPT, engine); + } + VLOG(2) << "Restored profile " << cfg.DebugString(); + profiles_.push_back(std::move(cfg)); + } + return Status::OK(); +} + +int TrtShapeOptimizationProfile::GetNumProfiles() const { + return profiles_.size(); +} + +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h new file mode 100644 index 00000000000..e5af88a1928 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -0,0 +1,351 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// Stores optimization profile parameters (min/opt/max of each input shape). +// +// A TensorRT optimization profile describes the possible min/max values of +// each dynamic input shape along with an optimum value. These values are used +// by the TensorRT builder to select the best kernel for the optimum value among +// those kernels that are valid for all input tensors in the [min, max] range. +struct OptimizationProfileConfig { + // Length of vector == 2*num_inputs to engine. min[0:num_inputs-1] are the min + // input dimensions for execution tensors. If engine has shape input tensors, + // then min[num_inputs + i] store the shape value for input i. For inputs that + // are not shape tensors min = opt = max = {0, {}}. + // + // When the OptimizationProfileConfig is created from the network definition + // (AddProfiles), then each elements of the min, opt, max vectors are defined. + // When the OptimizationProfileConfig object is restored during engine + // deserialization (RestoreProfiles), then some inputs can be pruned + // (see TrtShapeOptimizationProfile::is_pruned_input_). In that case min[i] + // is not defined for pruned inputs (same is true for opt and max). + std::vector min; + std::vector opt; + std::vector max; + + string DebugString() const { + using absl::StrCat; + return StrCat("[min: ", tensorflow::tensorrt::DebugString(min), + ", opt: : ", tensorflow::tensorrt::DebugString(opt), + ", max: ", tensorflow::tensorrt::DebugString(max), "]"); + } + + // Sets the min/opt/max dimensions for profile. + // + // The given min/opt/max dimensions should satisfy the condition + // min <= opt <= max. Additionally TRT requires that the min/opt/max values + // are compatible with the network input. Compatibility is defined the + // following way: let dim be the shape of an input binding and min/opt/max the + // corresponding profile dims. TRT requires that dim.d[k] must be -1 if + // (min.d[k] != dim.d[k] || opt.d[k] != dim.d[k] || max.d[k] != dim.d[k]). 
+ // + // Parameters: + // network - TensorRT network, used to enumerate all the input tensors + // profile - on exit the profile information will be set for each input tensor + // input_mask - 1 for TRT inputs, 0 for TF inputs that are not TRT inputs + Status SetDimensions(const nvinfer1::INetworkDefinition* network, + nvinfer1::IOptimizationProfile* profile, + const std::vector& input_mask) const { + int n_inputs_trt = network->getNbInputs(); + int n_inputs_tf = opt.size() / 2; + /// TODO(lsugy): check that the sum of the mask equals n_inputs. + if (input_mask.size() != n_inputs_tf) { + return errors::Internal("Incorrect input mask size: ", input_mask.size()); + } + int n_mask_true = 0; + for (bool mask_val : input_mask) { + if (mask_val) { + n_mask_true++; + } + } + if (n_mask_true != n_inputs_trt) { + return errors::Internal( + "Number of true elements in input_mask (", n_mask_true, + ") doesn't match expected TRT inputs (", n_inputs_trt, ")"); + } + int j = 0; + for (int i = 0; i < n_inputs_tf; i++) { + if (input_mask[i]) { + const ITensorProxyPtr input = network->getInput(j); + const char* name = input->getName(); + if (input->isShapeTensor()) { + int idx = i + n_inputs_tf; + VLOG(2) << "Setting shape values for " << name << ", " + << ::tensorflow::tensorrt::DebugString(opt[idx]); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, + min[idx].d, min[idx].nbDims); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, + opt[idx].d, opt[idx].nbDims); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, + max[idx].d, max[idx].nbDims); + } + VLOG(2) << "Setting input dimensions for " << name << ", " + << ::tensorflow::tensorrt::DebugString(opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, + min[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, + opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, + max[i]); + + j++; + } + } + return Status::OK(); + } + + // Returns true if profile range completely includes the given shapes. + bool IncludesShapes(const std::vector& shapes, + bool has_shape_tensor, + const std::vector& shape_values, + const std::vector& is_pruned_input, + const std::vector& is_shape_tensor) const { + // min, max, and opt must have the same size which is already verified in + // SetDimensions. + if (min.size() != shapes.size() * 2 || + (has_shape_tensor && min.size() != shape_values.size() * 2)) { + VLOG(2) << "Profile size mismatch min size " << min.size() + << " vs input shapes size " << shapes.size() << " " + << shape_values.size(); + return false; + } + for (int i = 0; i < shapes.size(); i++) { + if (is_pruned_input[i]) { + continue; + } + auto current_shape = shapes[i]; + // min, max, and opt must have the same nbDims, which is already verified + // in SetDimensions. + if (min[i].nbDims != current_shape.dims()) { + return false; + } + // Check if range [min, max] includes current_shape. + for (int dim = 0; dim < current_shape.dims(); dim++) { + if ((min[i].d[dim] > current_shape.dim_size(dim)) || + (max[i].d[dim] < current_shape.dim_size(dim))) { + return false; + } + } + } + // Check shape values. + if (has_shape_tensor) { + int offset = shapes.size(); + for (int i = 0; i < shape_values.size(); i++) { + if (is_pruned_input[i] || !is_shape_tensor[i]) { + continue; + } + auto shape_val = shape_values[i]; + // min, max, and opt must have the same nbDims, which is already + // verified in SetDimensions. 
+ if (min[i + offset].nbDims != shape_val.nbDims) { + return false; + } + // Check if range [min, max] includes shape_val. + for (int dim = 0; dim < shape_val.nbDims; dim++) { + if (min[i + offset].d[dim] > shape_val.d[dim] || + max[i + offset].d[dim] < shape_val.d[dim]) { + return false; + } + } + } + } + return true; + } +}; + +// Manages Optimization profiles during TRT Engine construction. +// +// An optimization profile describes a range of dimensions for each TRT network +// input, and the optimal dimensions that the auto-tuner should use for +// optimization. +// +// This class stores the list of input shapes that were seen during the +// build/profile_generation_mode phase, and using them it creates a set of +// OptimizationProfileConfigs. These configs will be added to IBuilderConfig +// before the engine is created. +class TrtShapeOptimizationProfile { + public: + TrtShapeOptimizationProfile() {} + + // Stores input shape information during profile_generation_mode. + void AddShape(const std::vector& shapes) { + input_shapes_.push_back(shapes); + input_shape_values_.push_back(actual_shape_values_); + VLOG(1) << "Collected shape(s) " << DebugString(shapes) << " for profiles."; + } + + // Stores the input mask. + void SetInputMask(const std::vector& input_mask) { + input_mask_ = input_mask; + } + + // Collects ShapeTensorCompatible tensor values. This is needed both during + // profile_generation_mode and during normal inference calls. + Status CollectShapeValues(OpKernelContext* ctx); + + // Collects ShapeTensorCompatible tensor values, used only for unit tests. + Status CollectShapeValues(const DataVec& input); + + void clear() { profiles_.clear(); } + + // Returns the profile number that should be used to execute the network with + // the given input shapes. Returns -1 if none of cached profiles are + // compatible with the given input shapes. + int GetProfileNumber(const std::vector& shapes); + + // Creates optimization profiles and add them to the builder config. + Status ConfigureBuilder(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); + + // Creates execution contexts for each optimization profile. + Status CreateExecutionContexts(nvinfer1::ICudaEngine* engine, + std::vector* exec_contexts); + + Status SetInputShapeBinding(int input_index, int binding_index, + nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* exec_context) const; + + // Creates optimization profiles profiles_ for the set of concrete input + // shapes collected in input_shapes_. The input_partial_shapes of the network + // is used to ensure that the created optimization profiles are compatible + // with the network. + void InitProfiles(const std::vector& input_partial_shapes, + ProfileStrategy strategy); + + void InitCalibProfile(const std::vector& shapes); + + // Returns number of created profiles. + int GetNumProfiles() const; + + bool HasShape() const { return !input_shapes_.empty(); } + bool NeedProfiles() const { return need_profiles_; } + + // Restores profiles from the engine (used after deserialization). + Status RestoreProfiles(const nvinfer1::ICudaEngine* engine, + int n_network_inputs); + + // Whether the network has any shape tensors. + bool HasShapeTensor() const { return has_shape_tensor_; } + + void SetShapeTensorMask(const nvinfer1::INetworkDefinition* network); + + // Whether the optimization profiles describe input that can be handled with + // a static engine (only 1 profile with min=max). 
+ bool IsStaticCompatible() { + return strategy_ == ProfileStrategy::kOptimal && profiles_.size() == 1 +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + && !HasShapeTensor() +#endif + ; + // TODO(tfeher): remove !HasShapeTensor() condition once the + // FixShapeValueProfile workaround is turned off. + } + + private: + // Set of input shape vetors that we collect during profile_generation_mode. + std::vector> input_shapes_; + + // Input shape values that we collect during profile_generation_mode. If the + // tensor is not compatible with a TRT shape tensor then an empty shape is + // stored. + std::vector> input_shape_values_; + + // Shape values present in the current inference call. + std::vector actual_shape_values_; + + // The optimization profiles generated from input_shapes_. + std::vector profiles_; + + // The optimization profile for calibration. + OptimizationProfileConfig calib_profiles_; + + // A TRTEngineOp can have resource inputs. These are treated as constants: + // their value is read during conversion and stored as weights in the TRT + // engine. This means that resource inputs have no corresponding TRT engine + // input, and we do not need to provide profile information for these. The + // input mask helps to identify the TRT inputs, where we need to define + // optimization profiles. + std::vector input_mask_; + + // Whether the network has any shape tensors. Initially we assume that the + // network might have a shape value input. This will be updated when the + // network is created / engine is deserialized. + bool has_shape_tensor_ = true; + + // Whether the network/engine requires optimization profiles. + bool need_profiles_ = false; + + // Whether an input tensor is a shape tensor. + std::vector is_shape_tensor_; + + // Whether a network input was pruned (only in TRT 7). + std::vector is_pruned_input_; + + // Optimization profile generation strategy. + ProfileStrategy strategy_; + + // Adds optimization profiles to the builder config. + Status AddProfiles(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); + + void SetShapeTensorMask(const nvinfer1::ICudaEngine* engine, int n_inputs); + void SetShapeTensorMask( + const std::vector& input_partial_shapes); + + Status SetPrunedMask(const nvinfer1::ICudaEngine* engine, + int n_network_inputs); + + void ImplicitBatchModeCompatibleStrategy( + const std::vector>& collected_shapes); + void OptimalStrategy( + const std::vector>& collected_shapes); + Status RangeStrategy( + const std::vector>& collected_shapes); +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc new file mode 100644 index 00000000000..87e17a9fc3f --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -0,0 +1,256 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/test.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +std::vector DimVecToShapeVec( + std::vector dimvec, + bool expand_with_empty_shape_values = false) { + std::vector shapevec(dimvec.size()); + for (int i = 0; i < dimvec.size(); i++) { + TensorShape shape; + TF_CHECK_OK( + TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape)); + shapevec[i] = shape; + } + if (expand_with_empty_shape_values) { + shapevec.resize(2 * dimvec.size()); // Append empty shape values + } + return shapevec; +} + +bool DimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min, + const nvinfer1::Dims& max) { + if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) { + return false; + } + for (int i = 0; i < dim.nbDims; i++) { + if (dim.d[i] < min.d[i] || dim.d[i] > max.d[i]) { + return false; + } + } + return true; +} + +bool DimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) { + if (a.nbDims != b.nbDims) { + return false; + } + for (int i = 0; i < a.nbDims; i++) { + if (a.d[i] != b.d[i]) { + return false; + } + } + return true; +} + +class TrtShapeOptimizationProfileTest + : public ::testing::TestWithParam { + protected: + TrtShapeOptimizationProfileTest() { + strategy_ = GetParam(); + builder_ = TrtUniquePtrType( + nvinfer1::createInferBuilder(logger_)); + network_ = TrtUniquePtrType( + builder_->createNetworkV2(flags_)); + builder_config_ = TrtUniquePtrType( + builder_->createBuilderConfig()); + builder_config_->setMaxWorkspaceSize(1 << 10); + } + + // Defines a simple network: output = input1 + input2. + void DefineNetwork(nvinfer1::INetworkDefinition* network, + nvinfer1::Dims3& dims) { + ITensorProxyPtr input1 = + network->addInput("input1", nvinfer1::DataType::kFLOAT, dims); + EXPECT_NE(nullptr, input1->trt_tensor()); + + ITensorProxyPtr input2 = + network->addInput("input2", nvinfer1::DataType::kFLOAT, dims); + EXPECT_NE(nullptr, input2->trt_tensor()); + + auto layer = + network->addElementWise(*input1->trt_tensor(), *input2->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); + EXPECT_NE(nullptr, layer); + // Mark the output. 
+ ITensorProxyPtr output = layer->getOutput(0); + output->setName("output"); + network->markOutput(*output->trt_tensor()); + } + + void CheckProfile(const std::vector& dimvec, + TrtShapeOptimizationProfile* profile, bool has_prof, + bool test_optimality) { + std::vector shape_vec = DimVecToShapeVec(dimvec); + int idx = profile->GetProfileNumber(shape_vec); + ASSERT_EQ(idx >= 0, has_prof); + if (idx < 0) return; + int prof_idx = exec_contexts_[idx]->getOptimizationProfile(); + ASSERT_GE(prof_idx, 0); + for (int j = 0; j < dimvec.size(); j++) { + nvinfer1::Dims min = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims max = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMAX); + nvinfer1::Dims opt = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kOPT); + + // This should always hold. + EXPECT_TRUE(DimsContained(dimvec[j], min, max)); + + if (test_optimality) { + // We shall have selected an optimal strategy. + EXPECT_TRUE(DimsEqual(dimvec[j], opt)); + } + } + } + + Logger& logger_ = *Logger::GetLogger(); + TrtUniquePtrType builder_; + TrtUniquePtrType network_; + TrtUniquePtrType builder_config_; + TrtUniquePtrType engine; + std::vector exec_contexts_; + // The order is important: exec_context_ must be destroyed first, and logger + // at last. + const uint32_t flags_ = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + ProfileStrategy strategy_; +}; + +INSTANTIATE_TEST_CASE_P( + OptProfilesTestInstantiation, TrtShapeOptimizationProfileTest, + ::testing::Values(ProfileStrategy::kRange, ProfileStrategy::kOptimal, + ProfileStrategy::kRangeOptimal, + ProfileStrategy::kImplicitBatchModeCompatible)); + +TEST_P(TrtShapeOptimizationProfileTest, Static) { + // Static mode does not depend on strategies, we test only once. + if (strategy_ != ProfileStrategy::kRange) return; + + // Network with static input shape. + nvinfer1::Dims3 dims(8, 8, 10); + DefineNetwork(network_.get(), dims); + + TrtShapeOptimizationProfile profile; + + // Configure and build engine - should be a no-op. + TF_CHECK_OK(profile.ConfigureBuilder(builder_.get(), builder_config_.get(), + network_.get())); + + engine = TrtUniquePtrType( + builder_->buildEngineWithConfig(*network_, *builder_config_)); + EXPECT_NE(nullptr, engine); + TF_CHECK_OK(profile.CreateExecutionContexts(engine.get(), &exec_contexts_)); + // A single execution context should be created for a graph with static input. + ASSERT_EQ(exec_contexts_.size(), 1); + EXPECT_NE(nullptr, exec_contexts_[0]); + + std::vector dim_vec(2, dims); + std::vector shape_vec = DimVecToShapeVec(dim_vec); + EXPECT_EQ(0, profile.GetProfileNumber(shape_vec)); +} + +TEST_P(TrtShapeOptimizationProfileTest, Dynamic) { + // Network with dynamic input shapes. + nvinfer1::Dims3 dims(-1, -1, 10); + DefineNetwork(network_.get(), dims); + + TrtShapeOptimizationProfile profile; + + // Set the input mask to true (no resource input) + std::vector input_mask(2, true); + profile.SetInputMask(input_mask); + + std::vector> input_profiles{ + {nvinfer1::Dims3(2, 2, 10), nvinfer1::Dims3(2, 2, 10)}, + {nvinfer1::Dims3(3, 3, 10), nvinfer1::Dims3(3, 3, 10)}, + {nvinfer1::Dims3(16, 16, 10), nvinfer1::Dims3(16, 16, 10)}, + }; + + std::vector unseen_shapes{nvinfer1::Dims3(5, 5, 10), + nvinfer1::Dims3(9, 9, 10)}; + + // Simulate a profile collection phase. 
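+  // Note: the DimVecToShapeVec(dim_vec, true) calls below append empty
+  // entries so that each collected shape vector has the 2 * num_inputs layout
+  // (input dims followed by shape-value slots) that OptimizationProfileConfig
+  // expects; none of the inputs here are shape tensors, so those slots stay
+  // empty.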
+ for (auto dim_vec : input_profiles) { + std::vector shape_vec = DimVecToShapeVec(dim_vec, true); + profile.AddShape(shape_vec); + } + std::vector input_partial_shapes; + TF_CHECK_OK(GetNetworkInputShapes(network_.get(), &input_partial_shapes)); + profile.InitProfiles(input_partial_shapes, strategy_); + + // Configure and build engine. + TF_CHECK_OK(profile.ConfigureBuilder(builder_.get(), builder_config_.get(), + network_.get())); + engine = TrtUniquePtrType( + builder_->buildEngineWithConfig(*network_.get(), *builder_config_.get())); + ASSERT_NE(nullptr, engine); + + TF_CHECK_OK(profile.CreateExecutionContexts(engine.get(), &exec_contexts_)); + + int n_profiles_exp; + switch (strategy_) { + case (ProfileStrategy::kImplicitBatchModeCompatible): + case (ProfileStrategy::kOptimal): + n_profiles_exp = input_profiles.size(); + break; + case (ProfileStrategy::kRange): + n_profiles_exp = 1; + break; + case (ProfileStrategy::kRangeOptimal): + n_profiles_exp = 1 + input_profiles.size(); + break; + } + // Each profile has an associated execution context. + EXPECT_EQ(exec_contexts_.size(), n_profiles_exp); + + profile.SetShapeTensorMask(network_.get()); + + EXPECT_EQ(profile.HasShapeTensor(), false); + + // Check if the profiles are assigned correctly. + for (auto dimvec : input_profiles) { + bool test_optimal_prof = strategy_ == ProfileStrategy::kOptimal || + strategy_ == ProfileStrategy::kRangeOptimal; + CheckProfile(dimvec, &profile, true, test_optimal_prof); + } + bool has_prof = (strategy_ == ProfileStrategy::kRange || + strategy_ == ProfileStrategy::kRangeOptimal); + CheckProfile(unseen_shapes, &profile, has_prof, false); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h b/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h index 789c518f600..5eea183fa9a 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h @@ -13,15 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_TENSOR_PROXY_H -#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_TENSOR_PROXY_H +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ -#include +#include #include #include #include #include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/platform/logging.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" @@ -142,28 +143,26 @@ class ITensorProxy { ttype_(TensorType::kSIMPLE) {} bool is_trt_tensor() const { - assert(validate()); - assert(ttype_ == TensorType::kTRT); + CHECK(validate()); return trt_tensor_ != nullptr; } bool is_simple_tensor() const { - assert(validate()); - assert(ttype_ == TensorType::kSIMPLE); + CHECK(validate()); return simple_tensor_ != nullptr; } TensorType ttype() const { return ttype_; } nvinfer1::ITensor* trt_tensor() const { - assert(trt_tensor_ != nullptr); - assert(ttype_ == TensorType::kTRT); + CHECK_NOTNULL(trt_tensor_); + CHECK(ttype_ == TensorType::kTRT); return trt_tensor_; } SimpleITensor* simple_tensor() const { - assert(simple_tensor_ != nullptr); - assert(ttype_ == TensorType::kSIMPLE); + CHECK_NOTNULL(simple_tensor_); + CHECK(ttype_ == TensorType::kSIMPLE); return simple_tensor_.get(); } @@ -174,7 +173,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setName(name); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } const char* getName() const { @@ -184,7 +183,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getName(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setDimensions(nvinfer1::Dims dimensions) { @@ -194,7 +193,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setDimensions(dimensions); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::Dims getDimensions() const { @@ -204,7 +203,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDimensions(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setType(nvinfer1::DataType type) { @@ -214,7 +213,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setType(type); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::DataType getType() const { @@ -224,7 +223,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getType(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isNetworkInput() const { @@ -234,7 +233,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isNetworkInput(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isNetworkOutput() const { @@ -244,7 +243,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isNetworkOutput(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setBroadcastAcrossBatch(bool broadcastAcrossBatch) { @@ -254,7 +253,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setBroadcastAcrossBatch(broadcastAcrossBatch); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } 
bool getBroadcastAcrossBatch() const { @@ -264,7 +263,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getBroadcastAcrossBatch(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::TensorLocation getLocation() const { @@ -274,7 +273,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getLocation(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setLocation(nvinfer1::TensorLocation location) { @@ -284,7 +283,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setLocation(location); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool setDynamicRange(float min, float max) { @@ -294,7 +293,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setDynamicRange(min, max); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool dynamicRangeIsSet() const { @@ -304,7 +303,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->dynamicRangeIsSet(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void resetDynamicRange() { @@ -314,7 +313,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->resetDynamicRange(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } float getDynamicRangeMin() const { switch (ttype_) { @@ -323,7 +322,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDynamicRangeMin(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } float getDynamicRangeMax() const { @@ -333,9 +332,9 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDynamicRangeMax(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } -#if IS_TRT_VERSION_GE(5, 0, 0, 0) && !IS_TRT_VERSION_GE(8, 0, 0, 0) +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) float getDynamicRange() const { switch (ttype_) { case TensorType::kTRT: @@ -343,7 +342,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDynamicRange(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } #endif void setAllowedFormats(nvinfer1::TensorFormats formats) { @@ -353,7 +352,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setAllowedFormats(formats); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::TensorFormats getAllowedFormats() const { @@ -363,7 +362,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getAllowedFormats(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isShapeTensor() const { @@ -373,7 +372,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isShapeTensor(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isExecutionTensor() const { @@ -383,7 +382,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isExecutionTensor(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } private: @@ -412,7 +411,7 @@ class ITensorProxy { class ITensorProxyPtr { public: - ITensorProxyPtr(nullptr_t) : p_(nullptr) {} + ITensorProxyPtr(std::nullptr_t) : p_(nullptr) {} 
ITensorProxyPtr(ITensorProxy* p) : p_(p) {} ITensorProxyPtr(nvinfer1::ITensor* p) : p_(new ITensorProxy(p)) {} ITensorProxyPtr(SimpleITensor* p) : p_(new ITensorProxy(p)) {} @@ -442,6 +441,10 @@ inline bool operator==(const ITensorProxyPtr& p1, const ITensorProxyPtr& p2) { p1->simple_tensor() == p2->simple_tensor())); } +inline bool operator!=(const ITensorProxyPtr& p1, const ITensorProxyPtr& p2) { + return !(p1 == p2); +} + struct ITensorProxyHash { size_t operator()(const ITensorProxyPtr& tensor) const { return reinterpret_cast(tensor.p_.get()); @@ -452,4 +455,4 @@ struct ITensorProxyHash { } // namespace tensorflow #endif // GOOGLE_CUDA && GOOGLE_TENSORRT -#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_TENSOR_PROXY_H +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc new file mode 100644 index 00000000000..82046a2978e --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include + +#include + +namespace tensorflow { + +namespace tensorrt { +namespace convert { + +::testing::Matcher> ArrayFloatNear( + const std::vector& values, float max_abs_error, bool nan_sensitive) { + std::vector<::testing::Matcher> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + if (nan_sensitive) { + matchers.emplace_back(::testing::NanSensitiveFloatNear(v, max_abs_error)); + } else if (max_abs_error == 0) { + matchers.emplace_back(::testing::FloatEq(v)); + } else { + EXPECT_GE(max_abs_error, 0); + matchers.emplace_back(::testing::FloatNear(v, max_abs_error)); + } + } + return ::testing::ElementsAreArray(matchers); +} + +nvinfer1::Dims CreateDims(const std::vector& d) { + nvinfer1::Dims dims; + dims.nbDims = d.size(); + for (int i = 0; i < d.size(); ++i) { + dims.d[i] = d[i]; + } + return dims; +} + +NodeDef MakeNodeDef(const std::string& name, const std::string& op, + const std::vector& inputs, + const std::map attrs) { + NodeDef node_def; + node_def.set_name(name); + node_def.set_op(op); + for (const auto& input : inputs) { + node_def.add_input(input); + } + for (const auto& attr : attrs) { + (*node_def.mutable_attr())[attr.first] = attr.second; + } + return node_def; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h new file mode 100644 index 00000000000..e0b9a0366a5 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h @@ -0,0 +1,183 @@ +/* Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include +#include +#include + +#include +#include +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" +#include "tensorflow/core/framework/node_def.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { +// Creates a node with the given op, inputs, and attributes. +NodeDef MakeNodeDef(const std::string& name, const std::string& op, + const std::vector& inputs, + const std::map attrs = {}); + +// Creates a constant node with the given name and values arranged in the given +// shape. +template +NodeDef MakeConstNodeDef(const std::string& name, const std::vector& vals, + const TensorShape& shape) { + Scope s = Scope::NewRootScope(); + Tensor t = test::AsTensor(vals, shape); + auto const_op = ops::Const(s.WithOpName(name), t); + return const_op.node()->def(); +} + +// Creates a constant node with the given name and values, assuming a 1-D shape. +template +NodeDef MakeConstNodeDef(const std::string& name, const std::vector& vals) { + TensorShape shape; + const std::vector shape_dims = {static_cast(vals.size())}; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(shape_dims, &shape)); + return MakeConstNodeDef(name, vals, shape); +} + +// Creates an nvinfer1::Dims struct from the given vector. +nvinfer1::Dims CreateDims(const std::vector& d); + +// A gmock matcher that check that elements of a float vector match to a given +// tolerance. +::testing::Matcher> ArrayFloatNear( + const std::vector& values, float max_abs_error = 1e-5, + bool nan_sensitive = false); + +// nvinfer1::Dims gMock matchers + +// matches nvinfer1::Dims to initializer list or vector of ints +// Example: EXPECT_THAT(my_dims, DimsAreArray({1, 2, 3})) +MATCHER_P(DimsAreArrayHelper, array_value, + absl::StrFormat("%s [%s]", negation ? 
"are" : "are not", + ::testing::PrintToString(array_value))) { + if (arg.nbDims != array_value.size()) return false; + for (int i = 0; i < arg.nbDims; ++i) { + if (arg.d[i] != array_value[i]) { + return false; + } + } + return true; +} +using DimsAreArray = DimsAreArrayHelperMatcherP>; + +// nvinfer1::INetworkDefinition gMock matchers + +// Checks that layer names are equal to initializer list or vector of strings. +// Example: EXPECT_THAT(my_network, LayerNamesAreArray({"conv1", "conv2"})) +MATCHER_P(LayerNamesAreArrayHelper, array_value, + absl::StrFormat("layer names %s [%s]", negation ? "are" : "are not", + ::testing::PrintToString(array_value))) { + if (array_value.size() != arg->getNbLayers()) return false; + for (int i = 0; i < arg->getNbLayers(); ++i) { + if (arg->getLayer(i)->getName() == nullptr) { + return false; + } + } + return true; +} +using LayerNamesAreArray = + LayerNamesAreArrayHelperMatcherP>; + +// Checks layer names are all non-empty. +MATCHER(LayerNamesNonEmpty, "") { + for (int i = 0; i < arg->getNbLayers(); ++i) { + if (arg->getLayer(i)->getName() == nullptr) { + return false; + } + } + return true; +} + +// TRT_ShapedWeights gMock matchers. + +// Checks that the weight dimensions are values are equal to the given values. +// Example: EXPECT_THAT(my_weights, +// ShapedWeightsHasDimsAndValues({1, 2},{1.0f, 2.0f})) +MATCHER_P2(ShapedWeightsHasDimsAndValuesHelper, dims_vec, expected_values, "") { + DimsAdapter dims(dims_vec); + if (arg.Shape() != dims) { + return false; + } + if (arg.count() != expected_values.size()) { + return false; + } + using T = typename decltype(expected_values)::value_type; + const T* actual_values = arg.template GetPointer(); + for (int i = 0; i < expected_values.size(); ++i) { + if (expected_values[i] != actual_values[i]) { + return false; + } + } + return true; +} + +template +using ShapedWeightsHasDimsAndValues = + ShapedWeightsHasDimsAndValuesHelperMatcherP2, + std::vector>; + +// std::vector convenience utilities. + +// Creates a new vector by casting all values of the given InCType vector to +// OutCType. +template +std::vector CastVector( + const gtl::ArraySlice& vals) { // non-absl ok + std::vector res(vals.size()); + std::transform(vals.begin(), vals.end(), res.begin(), + [](const InCType in_val) -> OutCType { + return static_cast(in_val); + }); + return res; +} + +// Creates a new vector of the given size and fills it with an increasing +// sequence starting from the given start_value using std::iota. +template +std::vector CreateVectorIota(int size, CType start_value = CType(0)) { + std::vector res(size); + std::iota(res.begin(), res.end(), start_value); + return res; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc new file mode 100644 index 00000000000..d5d9fcf99f5 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc @@ -0,0 +1,99 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h"
+
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "third_party/tensorrt/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+using ::testing::AllOf;
+using ::testing::AnyOf;
+using ::testing::Eq;
+using ::testing::Not;
+
+TEST(TrtDimsMatcher, ParameterizedMatchers) {
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), DimsAreArray({1, 2, 3, 4}));
+  // Check empty dims.
+  EXPECT_THAT(nvinfer1::Dims{}, Not(DimsAreArray({1, 2})));
+  std::vector<int> empty_dims;
+  EXPECT_THAT(nvinfer1::Dims{}, DimsAreArray(empty_dims));
+  // Check mismatching values.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Not(DimsAreArray({1, 2, 3, 5})));
+  // Check mismatching number of arguments.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Not(DimsAreArray({1, 2, 5})));
+}
+
+TEST(TrtDimsMatcher, EqualityMatcher) {
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Eq(nvinfer1::Dims4(1, 2, 3, 4)));
+  // Check empty dims.
+  EXPECT_THAT(nvinfer1::Dims{}, Eq(nvinfer1::Dims()));
+  // Check empty Dims is not equal to DimsHW, since their sizes differ.
+  EXPECT_THAT(nvinfer1::Dims{}, Not(Eq(nvinfer1::DimsHW())));
+  // Check mismatching values.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4),
+              Not(Eq(nvinfer1::Dims4(1, 2, 3, 3))));
+  // Check mismatching number of arguments.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Not(Eq(nvinfer1::Dims2(1, 2))));
+}
+
+TEST(INetworkDefinitionMatchers, CorrectlyMatch) {
+  Logger& logger = *Logger::GetLogger();
+  TrtUniquePtrType<nvinfer1::IBuilder> builder(
+      nvinfer1::createInferBuilder(logger));
+  TrtUniquePtrType<nvinfer1::INetworkDefinition> network(
+      builder->createNetworkV2(0L));
+
+  // Empty network checks.
+  EXPECT_THAT(network.get(), AllOf(Not(LayerNamesAreArray({"some layer"})),
+                                   LayerNamesNonEmpty()));
+
+  // Add the input and FC layers.
+  nvinfer1::Weights weights;
+  weights.type = nvinfer1::DataType::kFLOAT;
+  std::array<float, 1> vals;
+  weights.values = vals.data();
+  weights.count = 1;
+  auto input = network->addInput("input-tensor", nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::Dims3{1, 1, 1});
+  ASSERT_NE(input, nullptr);
+
+  const char* fc_layer_name = "my-fc-layer";
+  auto layer = network->addFullyConnected(*input, 1, weights, weights);
+  ASSERT_NE(layer, nullptr);
+  layer->setName(fc_layer_name);
+
+  // Check layer names.
+  EXPECT_THAT(network.get(),
+              AllOf(LayerNamesNonEmpty(), LayerNamesAreArray({fc_layer_name})));
+
+  // Add a layer with a default name and check the layer names again.
+  layer = network->addFullyConnected(*input, 1, weights, weights);
+  EXPECT_THAT(network.get(), AllOf(LayerNamesNonEmpty(),
+                                   Not(LayerNamesAreArray({fc_layer_name}))));
+}
+
+}  // namespace convert
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/core/framework/selective_registration.h b/tensorflow/core/framework/selective_registration.h
index 4b281a04bf6..2b0225da604 100644
--- a/tensorflow/core/framework/selective_registration.h
+++ b/tensorflow/core/framework/selective_registration.h
@@ -55,4 +55,69 @@ static_assert(false, "ops_to_register.h must define SHOULD_REGISTER macros");
 #define SHOULD_REGISTER_OP_KERNEL(clz) true
 #endif
 
+namespace tensorflow {
+
+// An InitOnStartupMarker is 'initialized' on program startup, purely for the
+// side-effects of that initialization - the struct itself is empty. (The type
+// is expected to be used to define globals.)
+//
+// The '<<' operator should be used in initializer expressions to specify what
+// to run on startup. The following values are accepted:
+//   - An InitOnStartupMarker. Example:
+//       InitOnStartupMarker F();
+//       InitOnStartupMarker const kInitF =
+//           InitOnStartupMarker{} << F();
+//   - Something to call, which returns an InitOnStartupMarker. Example:
+//       InitOnStartupMarker const kInit =
+//           InitOnStartupMarker{} << []() { G(); return InitOnStartupMarker{}; };
+//
+// See also: TF_INIT_ON_STARTUP_IF
+struct InitOnStartupMarker {
+  constexpr InitOnStartupMarker operator<<(InitOnStartupMarker) const {
+    return *this;
+  }
+
+  template <typename T>
+  constexpr InitOnStartupMarker operator<<(T&& v) const {
+    return std::forward<T>(v)();
+  }
+};
+
+// Conditional initializer expressions for InitOnStartupMarker:
+//   TF_INIT_ON_STARTUP_IF(cond) << f
+// If 'cond' is true, 'f' is evaluated (and called, if applicable) on startup.
+// Otherwise, 'f' is *not evaluated*. Note that 'cond' is required to be a
+// constant-expression, and so this approximates #ifdef.
+//
+// The implementation uses the ?: operator (!cond prevents evaluation of 'f').
+// The relative precedence of ?: and << is significant; this effectively expands
+// to (see extra parens):
+//   !cond ? InitOnStartupMarker{} : (InitOnStartupMarker{} << f)
+//
+// Note that although forcing 'cond' to be a constant-expression should not
+// affect binary size (i.e. the same optimizations should apply if it 'happens'
+// to be one), it was found to be necessary (for a recent version of clang;
+// perhaps an optimizer bug).
+//
+// The parens are necessary to hide the ',' from the preprocessor; it could
+// otherwise act as a macro argument separator.
+#define TF_INIT_ON_STARTUP_IF(cond)                \
+  (::std::integral_constant<bool, !(cond)>::value) \
+      ? ::tensorflow::InitOnStartupMarker{}        \
+      : ::tensorflow::InitOnStartupMarker {}
+
+// Wrapper for generating unique IDs (for 'anonymous' InitOnStartup definitions)
+// using __COUNTER__. The new ID (__COUNTER__ already expanded) is provided as a
+// macro argument.
+//
+// Usage:
+//   #define M_IMPL(id, a, b) ...
+//   #define M(a, b) TF_NEW_ID_FOR_INIT(M_IMPL, a, b)
+#define TF_NEW_ID_FOR_INIT_2(m, c, ...) m(c, __VA_ARGS__)
+#define TF_NEW_ID_FOR_INIT_1(m, c, ...) TF_NEW_ID_FOR_INIT_2(m, c, __VA_ARGS__)
+#define TF_NEW_ID_FOR_INIT(m, ...) \
+  TF_NEW_ID_FOR_INIT_1(m, __COUNTER__, __VA_ARGS__)
+
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_CORE_FRAMEWORK_SELECTIVE_REGISTRATION_H_
diff --git a/tensorflow/core/profiler/lib/annotated_traceme.h b/tensorflow/core/profiler/lib/annotated_traceme.h
new file mode 100644
index 00000000000..24ab188674f
--- /dev/null
+++ b/tensorflow/core/profiler/lib/annotated_traceme.h
@@ -0,0 +1,59 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_
+#define TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_
+
+#include <utility>
+
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/scoped_annotation.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Combination of TraceMe and ScopedAnnotation which share the same label.
+// Optimizations are done to ensure that label generation is done only once.
+class AnnotatedTraceMe {
+ public:
+  template <typename NameGeneratorT>
+  explicit AnnotatedTraceMe(NameGeneratorT&& name_generator, int level = 1) {
+    DCHECK_GE(level, 1);
+    bool annotation_enabled = ScopedAnnotation::IsEnabled();
+    bool traceme_enabled = TraceMe::Active(level);
+    if (TF_PREDICT_FALSE(annotation_enabled || traceme_enabled)) {
+      string name = std::forward<NameGeneratorT>(name_generator)();
+      if (annotation_enabled) {
+        scoped_annotation_.emplace(absl::string_view(name));
+      }
+      if (TF_PREDICT_TRUE(traceme_enabled)) {
+        trace_me_.emplace([&name] { return std::move(name); }, level);
+      }
+    }
+  }
+
+ private:
+  absl::optional<TraceMe> trace_me_;
+  absl::optional<ScopedAnnotation> scoped_annotation_;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index 25ddd2402a5..5e57fdef4b5 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -74,6 +74,22 @@ class DeviceNameUtils {
              (has_id ? (other.has_id && id == other.id) : !other.has_id);
     }
 
+    bool operator!=(const ParsedName& other) const {
+      return (has_job ? ((other.has_job && job != other.job) || !other.has_job)
+                      : other.has_job) ||
+             (has_replica ? ((other.has_replica && replica != other.replica) ||
+                             !other.has_replica)
+                          : other.has_replica) ||
+             (has_task
+                  ? ((other.has_task && task != other.task) || !other.has_task)
+                  : other.has_task) ||
+             (has_type
+                  ? ((other.has_type && type != other.type) || !other.has_type)
+                  : other.has_type) ||
+             (has_id ? ((other.has_id && id != other.id) || !other.has_id)
+                     : other.has_id);
+    }
+
     bool has_job = false;
     string job;
     bool has_replica = false;
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 5e3b223e695..2b6ae6ca153 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -19,14 +19,26 @@ cc_library(
     strip_include_prefix = "tensorrt/include",
 )
 
+config_setting(
+    name = "use_static_tensorrt",
+    define_values = {"TF_TENSORRT_STATIC": "1"},
+)
+
 cc_library(
     name = "tensorrt",
-    srcs = [":tensorrt_lib"],
+    srcs = select({
+        ":use_static_tensorrt": [":tensorrt_static_lib"],
+        "//conditions:default": [":tensorrt_lib"],
+    }),
     copts = cuda_default_copts(),
-    data = [":tensorrt_lib"],
+    data = select({
+        ":use_static_tensorrt": [],
+        "//conditions:default": [":tensorrt_lib"],
+    }),
     linkstatic = 1,
     deps = [
         ":tensorrt_headers",
+        # TODO(b/174608722): fix this line.
         "@local_config_cuda//cuda",
     ],
 )
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 9316ef864bb..5c59ac6a513 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -16,6 +16,7 @@ load(
 )
 
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_STATIC_PATH = "TF_TENSORRT_STATIC_PATH"
 _TF_TENSORRT_CONFIG_REPO = "TF_TENSORRT_CONFIG_REPO"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
 _TF_NEED_TENSORRT = "TF_NEED_TENSORRT"
@@ -82,6 +83,21 @@ def enable_tensorrt(repository_ctx):
     """Returns whether to build with TensorRT support."""
     return int(repository_ctx.os.environ.get(_TF_NEED_TENSORRT, False))
 
+def get_host_environ(repository_ctx, env):
+    if env in repository_ctx.os.environ:
+        version = repository_ctx.os.environ[env].strip()
+        return version
+    else:
+        return ""
+
+def _get_tensorrt_static_path(repository_ctx):
+    """Returns the path for TensorRT static libraries."""
+    return get_host_environ(repository_ctx, _TF_TENSORRT_STATIC_PATH)
+
+def _get_tensorrt_full_version(repository_ctx):
+    """Returns the full version for TensorRT."""
+    return get_host_environ(repository_ctx, _TF_TENSORRT_VERSION)
+
 def _tensorrt_configure_impl(repository_ctx):
     """Implementation of the tensorrt_configure repository rule."""
     if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
@@ -116,8 +132,11 @@ def _tensorrt_configure_impl(repository_ctx):
         _create_dummy_repository(repository_ctx)
         return
 
-    config = find_cuda_config(repository_ctx, ["tensorrt"])
+    config = find_cuda_config(repository_ctx, ["cuda", "tensorrt"])
+    cuda_version = config["cuda_version"]
+    cuda_library_path = config["cuda_library_dir"] + "/"
     trt_version = config["tensorrt_version"]
+    trt_full_version = _get_tensorrt_full_version(repository_ctx)
     cpu_value = get_cpu_value(repository_ctx)
 
     # Copy the library and header files.
@@ -140,6 +159,33 @@ def _tensorrt_configure_impl(repository_ctx):
         ),
     ]
 
+    tensorrt_static_path = _get_tensorrt_static_path(repository_ctx)
+    if tensorrt_static_path:
+        tensorrt_static_path = tensorrt_static_path + "/"
+        if _at_least_version(trt_full_version, "8.4.1"):
+            raw_static_library_names = _TF_TENSORRT_LIBS
+            nvrtc_ptxjit_static_raw_names = ["nvrtc", "nvrtc-builtins", "nvptxcompiler"]
+            nvrtc_ptxjit_static_names = ["%s_static" % name for name in nvrtc_ptxjit_static_raw_names]
+            nvrtc_ptxjit_static_libraries = [lib_name(lib, cpu_value, trt_version, static = True) for lib in nvrtc_ptxjit_static_names]
+        elif _at_least_version(trt_version, "8"):
+            raw_static_library_names = _TF_TENSORRT_LIBS
+            nvrtc_ptxjit_static_libraries = []
+        else:
+            raw_static_library_names = _TF_TENSORRT_LIBS + ["nvrtc", "myelin_compiler", "myelin_executor", "myelin_pattern_library", "myelin_pattern_runtime"]
+            nvrtc_ptxjit_static_libraries = []
+        static_library_names = ["%s_static" % name for name in raw_static_library_names]
+        static_libraries = [lib_name(lib, cpu_value, trt_version, static = True) for lib in static_library_names]
+        copy_rules = copy_rules + [
+            make_copy_files_rule(
+                repository_ctx,
+                name = "tensorrt_static_lib",
+                srcs = [tensorrt_static_path + library for library in static_libraries] +
+                       [cuda_library_path + library for library in nvrtc_ptxjit_static_libraries],
+                outs = ["tensorrt/lib/" + library for library in static_libraries] +
+                       ["tensorrt/lib/" + library for library in nvrtc_ptxjit_static_libraries],
+            ),
+        ]
+
     # Set up config file.
     _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_true"})
 
@@ -161,6 +207,7 @@ tensorrt_configure = repository_rule(
         _TF_TENSORRT_VERSION,
         _TF_TENSORRT_CONFIG_REPO,
         _TF_NEED_TENSORRT,
+        _TF_TENSORRT_STATIC_PATH,
         "TF_CUDA_PATHS",
     ],
 )
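
For reference, below is a minimal sketch (not part of the patch) of how the new helpers declared in trt_testutils.h are expected to compose inside a converter unit test. The test name, node name, and constant values are illustrative only; the sketch assumes a build with GOOGLE_CUDA && GOOGLE_TENSORRT, matching the guards in the headers above.

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {
namespace {

TEST(TrtTestUtilsSketch, HelpersCompose) {
  // Build a 1-D float Const NodeDef from the increasing sequence {0, 1, 2, 3}.
  const std::vector<float> vals = CreateVectorIota<float>(4);
  NodeDef const_node = MakeConstNodeDef<float>("my_const", vals);
  EXPECT_EQ(const_node.op(), "Const");

  // CastVector converts element types (here float -> int), and CreateDims plus
  // DimsAreArray give a convenient way to build and check nvinfer1::Dims.
  const std::vector<int> as_int = CastVector<int, float>(vals);
  EXPECT_THAT(CreateDims(as_int), DimsAreArray(as_int));

  // ArrayFloatNear compares float vectors element-wise with a tolerance.
  EXPECT_THAT(vals, ArrayFloatNear({0.0f, 1.0f, 2.0f, 3.0f}));
}

}  // namespace
}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow

These helpers are primarily consumed by the converter tests elsewhere in this patch (for example convert_nodes_test.cc), which is why they live in the shared trt_testutils target rather than in an individual test file.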