From 0eaa93ebc238b696f4a6eb41afa0d8250d408e04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B3=8A=E9=9C=86?= Date: Mon, 22 Jan 2024 19:58:38 +0800 Subject: [PATCH] [TensorRT] Upgrade TF-TRT version to TF2's implementation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- tensorflow/compiler/tf2tensorrt/BUILD | 662 +- .../compiler/tf2tensorrt/_pywrap_py_utils.pyi | 19 + .../compiler/tf2tensorrt/common/datavec.h | 38 + .../compiler/tf2tensorrt/common/utils.cc | 242 + .../compiler/tf2tensorrt/common/utils.h | 133 +- .../tf2tensorrt/convert/algorithm_selector.cc | 272 + .../tf2tensorrt/convert/algorithm_selector.h | 121 + .../convert/algorithm_selector_test.cc | 97 + .../tf2tensorrt/convert/convert_graph.cc | 610 +- .../tf2tensorrt/convert/convert_graph.h | 51 +- .../tf2tensorrt/convert/convert_graph_test.cc | 79 +- .../tf2tensorrt/convert/convert_nodes.cc | 6175 +++++---- .../tf2tensorrt/convert/convert_nodes.h | 574 +- .../tf2tensorrt/convert/convert_nodes_test.cc | 11333 ++++++++++------ .../tf2tensorrt/convert/logger_registry.cc | 60 + .../tf2tensorrt/convert/logger_registry.h | 58 + .../convert/logger_registry_test.cc | 34 + .../tf2tensorrt/convert/op_converter.h | 225 + .../convert/op_converter_registry.cc | 158 + .../convert/op_converter_registry.h | 104 + .../convert/op_converter_registry_test.cc | 67 + .../tf2tensorrt/convert/op_converter_test.cc | 123 + .../tf2tensorrt/convert/ops/binary_ops.cc | 235 + .../convert/ops/data_format_vec_permute.cc | 179 + .../tf2tensorrt/convert/ops/fill_ops.cc | 316 + .../tf2tensorrt/convert/ops/layer_utils.h | 736 + .../tf2tensorrt/convert/ops/like_ops.cc | 95 + .../tf2tensorrt/convert/ops/log_softmax.cc | 104 + .../convert/ops/quantization_ops.cc | 426 + .../convert/ops/quantization_ops.h | 76 + .../convert/ops/quantization_ops_test.cc | 619 + .../tf2tensorrt/convert/ops/selectv2.cc | 220 + .../tf2tensorrt/convert/ops/softmax.cc | 81 + .../compiler/tf2tensorrt/convert/ops/tile.cc | 208 + .../tf2tensorrt/convert/ops/unary_ops.cc | 251 + .../tf2tensorrt/convert/ops/variable_ops.cc | 370 + .../tf2tensorrt/convert/timing_cache.cc | 87 + .../tf2tensorrt/convert/timing_cache.h | 70 + .../convert/trt_layout_optimization_pass.cc | 97 + .../convert/trt_layout_optimization_pass.h | 69 + .../convert/trt_optimization_pass.cc | 322 +- .../convert/trt_optimization_pass.h | 57 +- .../tf2tensorrt/convert/trt_parameters.cc | 104 + .../tf2tensorrt/convert/trt_parameters.h | 72 + .../compiler/tf2tensorrt/convert/utils.cc | 261 +- .../compiler/tf2tensorrt/convert/utils.h | 369 +- .../compiler/tf2tensorrt/convert/weights.cc | 216 + .../compiler/tf2tensorrt/convert/weights.h | 295 + .../kernels/get_calibration_data_op.cc | 6 +- .../tf2tensorrt/kernels/trt_engine_op.cc | 1413 +- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 246 +- .../kernels/trt_engine_resource_ops.cc | 144 +- .../kernels/trt_engine_resource_ops_test.cc | 313 +- .../ops/get_calibration_data_op.cc | 6 +- .../compiler/tf2tensorrt/ops/trt_engine_op.cc | 21 +- .../ops/trt_engine_resource_ops.cc | 7 +- .../compiler/tf2tensorrt/plugin/trt_plugin.cc | 6 +- .../compiler/tf2tensorrt/plugin/trt_plugin.h | 12 +- .../compiler/tf2tensorrt/segment/segment.cc | 841 +- .../compiler/tf2tensorrt/segment/segment.h | 46 +- .../tf2tensorrt/segment/segment_test.cc | 329 +- .../tf2tensorrt/segment/union_find.cc | 154 + .../compiler/tf2tensorrt/segment/union_find.h | 181 +- .../tf2tensorrt/stub/NvInferPlugin_7_0.inc | 95 + 
.../compiler/tf2tensorrt/stub/NvInfer_7_0.inc | 47 + .../tf2tensorrt/stub/nvinfer_plugin_stub.cc | 11 +- .../compiler/tf2tensorrt/stub/nvinfer_stub.cc | 14 +- .../compiler/tf2tensorrt/tensorrt_test.cc | 233 +- .../compiler/tf2tensorrt/trt_convert_api.cc | 512 + .../compiler/tf2tensorrt/trt_convert_api.h | 129 + .../tf2tensorrt/trt_convert_api_test.cc | 358 + .../compiler/tf2tensorrt/utils/py_utils.cc | 31 +- .../compiler/tf2tensorrt/utils/py_utils.h | 5 + .../compiler/tf2tensorrt/utils/py_utils.i | 2 +- .../tf2tensorrt/utils/py_utils_wrapper.cc | 46 + .../tf2tensorrt/utils/trt_allocator.cc | 24 +- .../tf2tensorrt/utils/trt_allocator.h | 19 +- .../tf2tensorrt/utils/trt_engine_utils.cc | 286 + .../tf2tensorrt/utils/trt_engine_utils.h | 82 + .../tf2tensorrt/utils/trt_execution_context.h | 43 + .../utils/trt_experimental_features.cc | 35 + .../utils/trt_experimental_features.h | 31 + .../tf2tensorrt/utils/trt_int8_calibrator.cc | 11 +- .../tf2tensorrt/utils/trt_int8_calibrator.h | 16 +- .../compiler/tf2tensorrt/utils/trt_logger.cc | 95 +- .../compiler/tf2tensorrt/utils/trt_logger.h | 17 +- .../tf2tensorrt/utils/trt_lru_cache.cc | 56 +- .../tf2tensorrt/utils/trt_lru_cache.h | 108 +- .../utils/trt_shape_optimization_profiles.cc | 664 + .../utils/trt_shape_optimization_profiles.h | 351 + .../trt_shape_optimization_profiles_test.cc | 256 + .../tf2tensorrt/utils/trt_tensor_proxy.h | 75 +- .../tf2tensorrt/utils/trt_testutils.cc | 76 + .../tf2tensorrt/utils/trt_testutils.h | 183 + .../tf2tensorrt/utils/trt_testutils_test.cc | 99 + .../core/framework/selective_registration.h | 65 + .../core/profiler/lib/annotated_traceme.h | 59 + tensorflow/core/util/device_name_utils.h | 16 + third_party/tensorrt/BUILD.tpl | 16 +- third_party/tensorrt/tensorrt_configure.bzl | 49 +- 100 files changed, 26474 insertions(+), 8636 deletions(-) create mode 100644 tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi create mode 100644 tensorflow/compiler/tf2tensorrt/common/datavec.h create mode 100644 tensorflow/compiler/tf2tensorrt/common/utils.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/logger_registry.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc create mode 100644 
tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/timing_cache.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h create mode 100644 tensorflow/compiler/tf2tensorrt/convert/weights.cc create mode 100644 tensorflow/compiler/tf2tensorrt/convert/weights.h create mode 100644 tensorflow/compiler/tf2tensorrt/segment/union_find.cc create mode 100644 tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc create mode 100644 tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc create mode 100644 tensorflow/compiler/tf2tensorrt/trt_convert_api.cc create mode 100644 tensorflow/compiler/tf2tensorrt/trt_convert_api.h create mode 100644 tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc create mode 100755 tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc create mode 100644 tensorflow/core/profiler/lib/annotated_traceme.h diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 2b9196f874c..4e35a4912d2 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -5,39 +5,45 @@ load( "//tensorflow:tensorflow.bzl", - "tf_cc_test", + "VERSION", "tf_copts", "tf_cuda_library", "tf_custom_op_library_additional_deps", - "tf_gen_op_libs", "tf_gen_op_wrapper_py", - "tf_gpu_kernel_library", + "tf_cuda_cc_test", + "tf_gen_op_libs", + "tf_py_wrap_cc", + "tf_custom_op_py_library", + "pybind_extension", ) -load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") -load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") + load( "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library", ) + +# Platform specific build config +load( + 
"//tensorflow/core/platform:default/build_config_root.bzl", + "if_static", +) + load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") -# Placeholder for Google-internal load statements. package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = ["//visibility:public"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) -exports_files(["LICENSE"]) - cc_library( name = "tensorrt_stub", srcs = if_tensorrt([ "stub/nvinfer_stub.cc", "stub/nvinfer_plugin_stub.cc", ]), - textual_hdrs = glob(["stub/*.inc", "common/utils.h"]), + textual_hdrs = glob(["stub/*.inc"]), deps = if_tensorrt([ "@local_config_tensorrt//:tensorrt_headers", "//tensorflow/core:lib", @@ -48,8 +54,8 @@ cc_library( alias( name = "tensorrt_lib", actual = select({ - "//tensorflow:oss": ":tensorrt_stub", - "//conditions:default": "@local_config_tensorrt//:tensorrt", + "@local_config_tensorrt//:use_static_tensorrt": "@local_config_tensorrt//:tensorrt", + "//conditions:default": ":tensorrt_stub", }), visibility = ["//visibility:private"], ) @@ -57,22 +63,155 @@ alias( tf_cuda_cc_test( name = "tensorrt_test_cc", size = "small", - srcs = ["tensorrt_test.cc"], + srcs = [ + "tensorrt_test.cc", + ], tags = [ + "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":trt_logging", ":utils", - "//tensorflow/core:gpu_init", "//tensorflow/core:lib", + "//tensorflow/core:gpu_init", + "//tensorflow/core:test", + "//tensorflow/core:test_main", "//tensorflow/core:stream_executor", + ] + if_tensorrt([ + ":tensorrt_lib", + ]), +) + +cc_library( + name = "trt_convert_api", + srcs = ["trt_convert_api.cc"], + hdrs = [ + "trt_convert_api.h", + ], + copts = tf_copts(), + deps = [ + ":trt_parameters", + ":trt_resources", + "//tensorflow/cc/tools:freeze_saved_model", + "//tensorflow/core:direct_session", + "//tensorflow/core:framework", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/grappler/clusters:single_machine", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + +filegroup( + name = "headers", + srcs = [ + "trt_convert_api.h", + ], +) + +tf_cuda_cc_test( + name = "trt_convert_api_test", + size = "small", + srcs = ["trt_convert_api_test.cc"], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", + ], + deps = [ + ":common_utils", + ":testutils", + ":trt_conversion", + ":trt_convert_api", + ":trt_logging", + ":trt_op_kernels", + ":trt_resources", + ":utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/cc:scope", + "//tensorflow/core:array_ops_op_lib", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:direct_session", + "//tensorflow/core:framework", + "//tensorflow/core:function_ops_op_lib", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:math_ops_op_lib", + "//tensorflow/core:no_op_op_lib", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:state_ops_op_lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:assign_op", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:partitioned_function_ops", + 
"//tensorflow/core/kernels:resource_variable_ops", + ], +) + +cc_library( + name = "common_utils", + srcs = ["common/utils.cc"], + hdrs = [ + "common/datavec.h", + "common/utils.h", + ], + copts = tf_copts(), + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core/platform:logging", + "//tensorflow/core/profiler/lib:traceme", + ] + if_tensorrt([":tensorrt_lib"]), +) + +cc_library( + name = "testutils", + testonly = 1, + srcs = ["utils/trt_testutils.cc"], + hdrs = [ + "utils/trt_testutils.h", + ], + copts = tf_copts(), + visibility = ["//visibility:private"], + deps = [ + ":trt_conversion", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/framework:tensor_testutil", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_cc_test( + name = "testutils_test", + size = "small", + srcs = ["utils/trt_testutils_test.cc"], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", + ], + deps = [ + ":testutils", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:protobuf", ] + if_tensorrt([ ":tensorrt_lib", - "@local_config_cuda//cuda:cuda_headers", ]), ) @@ -85,26 +224,30 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":common_utils", ":trt_allocator", ":trt_conversion", + ":trt_engine_utils", ":trt_logging", ":trt_plugins", ":trt_resources", ":utils", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@local_config_cuda//cuda:cuda_headers", - "//tensorflow/core:core_cpu_lib_no_ops", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:stream_executor", "//tensorflow/core:stream_executor_headers_lib", + "//tensorflow/core:core_cpu_lib_no_ops", "//tensorflow/core/grappler/costs:graph_properties", - "//tensorflow/stream_executor/lib", - ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), + "//tensorflow/core:stream_executor", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ] + if_tensorrt([ + ":tensorrt_lib", + "@local_config_cuda//cuda:cuda_headers", + ]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -119,13 +262,14 @@ cc_library( ":trt_logging", ":trt_plugins", ":trt_resources", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -138,23 +282,32 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ + ":common_utils", + ":testutils", ":trt_engine_instance_proto_cc", ":trt_engine_resource_op_kernels", ":trt_engine_resource_ops_op_lib", ":trt_logging", ":trt_resources", + ":utils", + "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", 
"//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/framework:fake_input", "//tensorflow/core/kernels:ops_testutil", "//tensorflow/core/kernels:resource_variable_ops", + "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -166,77 +319,115 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ + ":testutils", + ":trt_conversion", ":trt_op_kernels", ":trt_op_libs", ":trt_resources", - ":trt_conversion", - ":utils", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", "//tensorflow/cc:cc_ops", "//tensorflow/cc:function_ops", - "//tensorflow/cc:ops", "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/framework:fake_input", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:ops_testutil", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + "@eigen_archive//:eigen3", ] + if_tensorrt([ "@local_config_cuda//cuda:cuda_headers", ]), ) -tf_gen_op_libs( - op_lib_names = [ - "trt_engine_op", - "get_calibration_data_op", - "trt_engine_resource_ops", - ], -) +tf_gen_op_libs(op_lib_names = [ + "trt_engine_op", + "get_calibration_data_op", + "trt_engine_resource_ops", +]) cc_library( name = "trt_op_libs", deps = [ ":get_calibration_data_op_op_lib", ":trt_engine_op_op_lib", + ":trt_engine_utils", ], ) +tf_cuda_library( + name = "trt_engine_utils", + srcs = [ + "utils/trt_engine_utils.cc", + "utils/trt_shape_optimization_profiles.cc", + ], + hdrs = [ + "utils/trt_engine_utils.h", + "utils/trt_execution_context.h", + "utils/trt_shape_optimization_profiles.h", + ], + deps = [ + ":common_utils", + ":trt_allocator", + ":trt_logging", + ":trt_parameters", + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_headers_lib", + "//tensorflow/core/platform:status", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + tf_cuda_library( name = "trt_logging", srcs = ["utils/trt_logger.cc"], hdrs = ["utils/trt_logger.h"], visibility = ["//visibility:public"], deps = [ + ":common_utils", + ":logger_registry", + ":utils", "//tensorflow/core:lib_proto_parsing", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) tf_gen_op_wrapper_py( name = "trt_ops", deps = [ - ":trt_engine_resource_ops_op_lib", ":trt_op_libs", ], ) -tf_custom_op_py_library( - name = "trt_ops_loader", - srcs_version = "PY2AND3", - deps = [ - ":trt_ops", - ":wrap_py_utils", - "//tensorflow/python:errors", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:platform", - "//tensorflow/python:resources", +tf_cuda_library( + name = "trt_parameters", + srcs = ["convert/trt_parameters.cc"], + hdrs = [ + "convert/trt_parameters.h", ], + copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ] + 
if_tensorrt([":tensorrt_lib"]), ) tf_cuda_library( @@ -248,18 +439,21 @@ tf_cuda_library( hdrs = [ "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", + "utils/trt_shape_optimization_profiles.h", "utils/trt_tensor_proxy.h", ], deps = [ + ":common_utils", ":trt_allocator", + ":trt_engine_utils", ":trt_logging", ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core:graph", "//tensorflow/core:gpu_runtime", + "//tensorflow/core:graph", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core/grappler:op_types", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -274,7 +468,7 @@ tf_cuda_library( ] + if_tensorrt([":tensorrt_lib"]), ) -tf_cc_test( +tf_cuda_cc_test( name = "trt_allocator_test", size = "small", srcs = ["utils/trt_allocator_test.cc"], @@ -289,7 +483,7 @@ tf_cc_test( ], ) -tf_cc_test( +tf_cuda_cc_test( name = "trt_lru_cache_test", size = "small", srcs = ["utils/trt_lru_cache_test.cc"], @@ -304,33 +498,205 @@ tf_cc_test( ], ) +tf_cuda_cc_test( + name = "trt_shape_optimization_profiles_test", + size = "small", + srcs = ["utils/trt_shape_optimization_profiles_test.cc"], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", + ], + deps = [ + ":trt_resources", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cuda_library( + name = "logger_registry", + srcs = ["convert/logger_registry.cc"], + hdrs = [ + "convert/logger_registry.h", + ], + copts = tf_copts(), + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_library( + name = "trt_weights", + srcs = ["convert/weights.cc"], + hdrs = [ + "convert/weights.h", + ], + copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_library( + name = "op_converter", + srcs = [], + hdrs = [ + "convert/op_converter.h", + ], + deps = [ + ":trt_parameters", + ":trt_weights", + ] + if_tensorrt([":tensorrt_lib"]), +) + +# This rule contains static variables for the converter registry. Do not depend +# on it directly; use :op_converter_registry, and link against +# libtensorflow_framework.so for the registry symbols. The library +# libtensorflow_framework.so depends on this target so that users can +# register custom op converters without the need to incorporate Tensorflow into +# their build system. 
+tf_cuda_library( + name = "op_converter_registry_impl", + srcs = ["convert/op_converter_registry.cc"], + hdrs = [ + "convert/op_converter_registry.h", + ], + visibility = ["//tensorflow:__subpackages__"], + deps = [ + ":op_converter", + ":utils", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_library( + name = "op_converter_registry", + hdrs = [ + "convert/op_converter_registry.h", + ], + copts = tf_copts(), + deps = [ + ":op_converter", + ":utils", + "//tensorflow/core:lib", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + ":op_converter_registry_impl", + ], +) + +tf_py_wrap_cc( + name = "wrap_py_utils", + srcs = ["utils/py_utils.i"], + copts = tf_copts(), + deps = [ + ":py_utils", + "//third_party/python_runtime:headers", + ], +) + +tf_custom_op_py_library( + name = "trt_ops_loader", + srcs_version = "PY2AND3", + deps = [ + ":wrap_py_utils", + ":trt_ops", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:resources", + ], +) + +tf_cuda_cc_test( + name = "op_converter_registry_test", + size = "small", + srcs = ["convert/op_converter_registry_test.cc"], + tags = [ + "no_windows", + "nomac", + ], + deps = [ + ":op_converter_registry", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cuda_library( + name = "algorithm_selector", + srcs = [ + "convert/algorithm_selector.cc", + ], + hdrs = [ + "convert/algorithm_selector.h", + ], + deps = [":common_utils"] + if_tensorrt([":tensorrt_lib"]), +) + +tf_cuda_cc_test( + name = "algorithm_selector_test", + srcs = [ + "convert/algorithm_selector_test.cc", + ], + deps = [ + ":algorithm_selector", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ] + if_tensorrt([":tensorrt_lib"]), +) + # Library for the node-level conversion portion of TensorRT operation creation tf_cuda_library( name = "trt_conversion", srcs = [ "convert/convert_graph.cc", "convert/convert_nodes.cc", + "convert/ops/binary_ops.cc", + "convert/ops/data_format_vec_permute.cc", + "convert/ops/fill_ops.cc", + "convert/ops/like_ops.cc", + "convert/ops/log_softmax.cc", + "convert/ops/quantization_ops.cc", + "convert/ops/selectv2.cc", + "convert/ops/softmax.cc", + "convert/ops/tile.cc", + "convert/ops/unary_ops.cc", + "convert/ops/variable_ops.cc", + "convert/timing_cache.cc", "convert/trt_optimization_pass.cc", ], hdrs = [ "convert/convert_graph.h", "convert/convert_nodes.h", + "convert/ops/layer_utils.h", + "convert/ops/quantization_ops.h", + "convert/timing_cache.h", "convert/trt_optimization_pass.h", ], deps = [ + ":algorithm_selector", + ":common_utils", + ":logger_registry", + ":op_converter", + ":op_converter_registry", ":segment", ":trt_allocator", - ":trt_plugins", ":trt_logging", + ":trt_parameters", + ":trt_plugins", ":trt_resources", + ":trt_weights", ":utils", - "@com_google_absl//absl/strings", - "//tensorflow/core/grappler/clusters:cluster", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core/grappler:utils", + "//tensorflow/cc:array_ops", "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//tensorflow/core:gpu_runtime", @@ -338,11 +704,23 @@ tf_cuda_library( "//tensorflow/core:lib", 
"//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:core_cpu", "//tensorflow/core/grappler:devices", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler/optimizers:meta_optimizer", - ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), + "//tensorflow/core/grappler/utils:functions", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/tools/graph_transforms:transform_utils", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), alwayslink = 1, ) @@ -354,18 +732,17 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ + ":testutils", + ":trt_conversion", ":trt_op_kernels", ":trt_op_libs", - ":trt_conversion", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_base", "//tensorflow/core:direct_session", @@ -374,40 +751,104 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/clusters:cluster", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", ] + if_tensorrt([":tensorrt_lib"]), ) tf_cuda_cc_test( name = "convert_nodes_test", size = "medium", - srcs = ["convert/convert_nodes_test.cc"], + srcs = [ + "convert/convert_nodes_test.cc", + "convert/op_converter_test.cc", + ], tags = [ "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ - ":trt_logging", + ":testutils", ":trt_conversion", + ":trt_engine_utils", + ":trt_logging", ":trt_plugins", - "@com_google_googletest//:gtest", + ":utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:identity_op", + "//tensorflow/core/kernels:resource_variable_ops", + "//tensorflow/core/platform:status_matchers", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + ] + if_tensorrt([ + ":tensorrt_lib", + "@local_config_cuda//cuda:cuda_headers", + ]), +) + +tf_cuda_cc_test( + name = "convert_qdq_test", + size = "medium", + srcs = [ + "convert/ops/quantization_ops_test.cc", + ], + tags = [ + "no_cuda_on_cpu_tap", + "no_windows", + "nomac", + "notap", # Fails w/ tensorrt 8.x + ], + deps = [ + ":testutils", + ":trt_conversion", + ":trt_convert_api", + 
":trt_engine_utils", + ":trt_logging", + ":trt_op_kernels", + ":trt_plugins", + ":trt_resources", + ":utils", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/compiler/jit:shape_inference", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensor_testutil", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:nn", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:pooling_ops", + "//tensorflow/core/platform:status_matchers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", ] + if_tensorrt([ ":tensorrt_lib", "@local_config_cuda//cuda:cuda_headers", @@ -415,21 +856,46 @@ tf_cuda_cc_test( ) # Library for the segmenting portion of TensorRT operation creation +cc_library( + name = "union_find", + srcs = ["segment/union_find.cc"], + hdrs = [ + "segment/union_find.h", + ], + copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "segment", srcs = ["segment/segment.cc"], hdrs = [ "segment/segment.h", - "segment/union_find.h", ], copts = tf_copts(), deps = [ + ":common_utils", + ":union_find", + ":utils", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:core_cpu", + "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", "@com_google_protobuf//:protobuf_headers", ], ) @@ -453,22 +919,9 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", ], ) -#tf_gpu_kernel_library( -# name = "plugin_cast", -# srcs = ["plugin/plugin_cast.cu.cc"], -# deps = [ -# ":trt_plugins", -# "//tensorflow/core:framework_lite", -# ] + if_tensorrt([ -# "@local_config_cuda//cuda:cuda_headers", -# "@local_config_tensorrt//:tensorrt", -# ]), -#) - tf_cuda_library( name = "trt_plugins", srcs = ["plugin/trt_plugin.cc"], @@ -481,17 +934,25 @@ tf_cuda_library( cc_library( name = "utils", - srcs = ["convert/utils.cc"], + srcs = [ + "convert/utils.cc", + "utils/trt_experimental_features.cc", + ], hdrs = [ "common/utils.h", "convert/utils.h", + "utils/trt_experimental_features.h", "utils/trt_tensor_proxy.h", ], copts = tf_copts(), deps = [ "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", "//tensorflow/core:lib_proto_parsing", - ], + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ] + if_tensorrt([":tensorrt_lib"]), ) tf_proto_library( @@ -501,23 +962,18 @@ tf_proto_library( protodeps = tf_additional_all_protos(), ) -cc_library( +tf_cuda_library( name = "py_utils", srcs = 
["utils/py_utils.cc"], hdrs = ["utils/py_utils.h"], - copts = tf_copts(), + copts = select({ + "@local_config_tensorrt//:use_static_tensorrt": ["TF_USE_TENSORRT_STATIC=1"], + "//conditions:default": [], + }), deps = if_tensorrt([ + ":common_utils", ":tensorrt_lib", + ":op_converter_registry", "//tensorflow/stream_executor/platform:dso_loader", ]), ) - -tf_py_wrap_cc( - name = "wrap_py_utils", - srcs = ["utils/py_utils.i"], - copts = tf_copts(), - deps = [ - ":py_utils", - "//third_party/python_runtime:headers", - ], -) diff --git a/tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi b/tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi new file mode 100644 index 00000000000..1ef7abbd7d1 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyi @@ -0,0 +1,19 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +def get_linked_tensorrt_version() -> tuple[int,int,int]: ... +def get_loaded_tensorrt_version() -> tuple[int,int,int]: ... +def get_registered_op_converters() -> list[str]: ... +def is_tensorrt_enabled() -> bool: ... diff --git a/tensorflow/compiler/tf2tensorrt/common/datavec.h b/tensorflow/compiler/tf2tensorrt/common/datavec.h new file mode 100644 index 00000000000..eff32f1f521 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/datavec.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_DATAVEC_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_DATAVEC_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace tensorrt { + +// Input/output data format for OpConverterTest::BuildAndRun(). +struct InputOutputData { + size_t TotalBytes() const { return tensor.TotalBytes(); } + string name; + Tensor tensor; +}; + +using DataVec = std::vector; + +} // namespace tensorrt +} // namespace tensorflow +#endif diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc new file mode 100644 index 00000000000..14f0e3d487c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc @@ -0,0 +1,242 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" + +#include + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "absl/base/call_once.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "third_party/tensorrt/NvInferPlugin.h" + +#endif + +namespace tensorflow { +namespace tensorrt { + +std::tuple GetLinkedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetLoadedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + int ver = getInferLibVersion(); + int major = ver / 1000; + ver = ver - major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +namespace tensorflow { +namespace tensorrt { + +Status GetTrtBindingIndex(const char* tensor_name, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index) { + tensorflow::profiler::TraceMe activity( + "GetTrtBindingIndex", tensorflow::profiler::TraceMeLevel::kInfo); + // If the engine has been built for K profiles, the first getNbBindings() / K + // bindings are used by profile number 0, the following getNbBindings() / K + // bindings are used by profile number 1 etc. + // + // GetBindingIndex(tensor_name) returns the binding index for the progile 0. + // We can also consider it as a "binding_index_within_profile". + *binding_index = cuda_engine->getBindingIndex(tensor_name); + if (*binding_index == -1) { + const string msg = absl::StrCat("Input node ", tensor_name, " not found"); + return errors::NotFound(msg); + } + int n_profiles = cuda_engine->getNbOptimizationProfiles(); + // If we have more then one optimization profile, then we need to shift the + // binding index according to the following formula: + // binding_index_within_engine = binding_index_within_profile + + // profile_index * bindings_per_profile + const int bindings_per_profile = cuda_engine->getNbBindings() / n_profiles; + *binding_index = *binding_index + profile_index * bindings_per_profile; + return Status::OK(); +} + +Status GetTrtBindingIndex(int network_input_index, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index) { + const string input_name = + absl::StrCat(IONamePrefixes::kInputPHName, network_input_index); + return GetTrtBindingIndex(input_name.c_str(), profile_index, cuda_engine, + binding_index); +} + +namespace { + +void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { +#if defined(PLATFORM_WINDOWS) + LOG_WARNING_WITH_PREFIX + << "Windows support is provided experimentally. 
No guarantee is made " + "regarding functionality or engineering support. Use at your own " + "risk."; +#endif + LOG(INFO) << "Linked TensorRT version: " + << absl::StrJoin(GetLinkedTensorRTVersion(), "."); + LOG(INFO) << "Loaded TensorRT version: " + << absl::StrJoin(GetLoadedTensorRTVersion(), "."); + + bool plugin_initialized = initLibNvInferPlugins(trt_logger, ""); + if (!plugin_initialized) { + LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " + "fail later."; + } + + int num_trt_plugins = 0; + nvinfer1::IPluginCreator* const* trt_plugin_creator_list = + getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); + if (!trt_plugin_creator_list) { + LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; + } else { + VLOG(1) << "Found the following " << num_trt_plugins + << " TensorRT plugins in registry:"; + for (int i = 0; i < num_trt_plugins; ++i) { + if (!trt_plugin_creator_list[i]) { + LOG_WARNING_WITH_PREFIX + << "TensorRT plugin at index " << i + << " is not accessible (null pointer returned by " + "getPluginCreatorList for this plugin)"; + } else { + VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); + } + } + } +} + +} // namespace + +void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { + static absl::once_flag once; + absl::call_once(once, InitializeTrtPlugins, trt_logger); +} + +} // namespace tensorrt +} // namespace tensorflow + +namespace nvinfer1 { +std::ostream& operator<<(std::ostream& os, + const nvinfer1::TensorFormat& format) { + os << "nvinfer1::TensorFormat::"; + switch (format) { + case nvinfer1::TensorFormat::kLINEAR: + os << "kLINEAR"; + break; + + case nvinfer1::TensorFormat::kCHW2: + os << "kCHW2"; + break; + + case nvinfer1::TensorFormat::kHWC8: + os << "kHWC8"; + break; + + case nvinfer1::TensorFormat::kCHW4: + os << "kCHW4"; + break; + + case nvinfer1::TensorFormat::kCHW16: + os << "kCHW16"; + break; + + case nvinfer1::TensorFormat::kCHW32: + os << "kCHW32"; + break; + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + case nvinfer1::TensorFormat::kDHWC8: + os << "kDHWC8"; + break; + + case nvinfer1::TensorFormat::kCDHW32: + os << "kCDHW32"; + break; + + case nvinfer1::TensorFormat::kHWC: + os << "kHWC"; + break; + + case nvinfer1::TensorFormat::kDLA_LINEAR: + os << "kDLA_LINEAR"; + break; + + case nvinfer1::TensorFormat::kDLA_HWC4: + os << "kDLA_HWC4"; + break; + + case nvinfer1::TensorFormat::kHWC16: + os << "kHWC16"; + break; +#endif + + default: + os << "unknown format"; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const nvinfer1::DataType& v) { + os << "nvinfer1::DataType::"; + switch (v) { + case nvinfer1::DataType::kFLOAT: + os << "kFLOAT"; + break; + case nvinfer1::DataType::kHALF: + os << "kHalf"; + break; +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + os << "kFP8"; + break; +#endif + case nvinfer1::DataType::kINT8: + os << "kINT8"; + break; + case nvinfer1::DataType::kINT32: + os << "kINT32"; + break; + case nvinfer1::DataType::kBOOL: + os << "kBOOL"; + break; +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + os << "kUINT8"; + break; +#endif + } + return os; +} +} // namespace nvinfer1 + +#endif diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h index 4bfce409127..08e837a410c 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.h +++ b/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -16,8 +16,13 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ +#include #include +#include "absl/strings/str_join.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + namespace tensorflow { namespace tensorrt { // Returns the compile time TensorRT library version information @@ -30,31 +35,143 @@ std::tuple GetLoadedTensorRTVersion(); } // namespace tensorrt } // namespace tensorflow -#define IS_TRT_VERSION_GE(major, minor, patch, build) \ - ((NV_TENSORRT_MAJOR > major) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH > patch) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) - #if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" #include "third_party/tensorrt/NvInfer.h" +#define ERROR_LOC __FILE__, ":", __LINE__ + +#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ + return errors::Internal("TFTRT::", __FUNCTION__, "\n", ERROR_LOC, \ + " failed to add TRT layer, at: ", node); + +#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \ + if (ptr == nullptr) { \ + TFTRT_INTERNAL_ERROR_AT_NODE(node); \ + } + +// Use this macro within functions that return a Status or StatusOR to check +// boolean conditions. If the condition fails, it returns an +// errors::Internal message with the file and line number. +#define TRT_ENSURE(x) \ + if (!(x)) { \ + return errors::Internal(ERROR_LOC, " TRT_ENSURE failure"); \ + } + +// Checks that a Status or ::stream_executor::port::StatusOr object does not +// carry an error message. If it does have an error, returns an errors::Internal +// instance containing the error message, along with the file and line number. +// For pointer-containing ::stream_executor::port::StatusOr, use the below +// TRT_ENSURE_PTR_OK macro. +#define TRT_ENSURE_OK(x) \ + if (!x.ok()) { \ + return errors::Internal(ERROR_LOC, " TRT_ENSURE_OK failure:\n ", \ + x.status().ToString()); \ + } + +// Checks that a ::stream_executor::port::StatusOrobject does not carry an +// error, and that the contained T* is non-null. If it does have an error +// status, returns an errors::Internal instance containing the error message, +// along with the file and line number. +#define TRT_ENSURE_PTR_OK(x) \ + TRT_ENSURE_OK(x); \ + if (x.ValueOrDie() == nullptr) { \ + return errors::Internal(ERROR_LOC, " pointer had null value"); \ + } + namespace tensorflow { namespace tensorrt { +#define IS_TRT_VERSION_GE(major, minor, patch, build) \ + ((NV_TENSORRT_MAJOR > major) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ + NV_TENSORRT_PATCH > patch) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ + NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) #define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: " // Initializes the TensorRT plugin registry if this hasn't been done yet. void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger); +class IONamePrefixes { + public: + static constexpr const char* const kInputPHName = "TensorRTInputPH_"; + static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; +}; + +// Gets the binding index of a tensor in an engine. 
+// +// The binding index is looked up using the tensor's name and the profile index. +// Profile index should be set to zero, if we do not have optimization profiles. +Status GetTrtBindingIndex(const char* tensor_name, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index); + +// Gets the binding index of a tensor in an engine. +// +// Same as above, but uses the network input index to identify the tensor. +Status GetTrtBindingIndex(int network_input_idx, int profile_index, + const nvinfer1::ICudaEngine* cuda_engine, + int* binding_index); } // namespace tensorrt } // namespace tensorflow +namespace nvinfer1 { +// Prints nvinfer1::Dims or any drived type to the given ostream. Per GTest +// printing requirements, this must be in the nvinfer1 namespace. +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& v) { + os << "nvinfer1::Dims["; + os << absl::StrJoin(std::vector(v.d, v.d + v.nbDims), ","); + os << "]"; + return os; +} + +// Returns true if any two derived nvinfer1::Dims type structs are equivalent. +inline bool operator==(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { + if (rhs.nbDims != lhs.nbDims) { + return false; + } + for (int i = 0; i < lhs.nbDims; i++) { + if (rhs.d[i] != lhs.d[i]) { + return false; + } + } + return true; +} + +// Returns false if any 2 subclasses of nvinfer1::Dims are equivalent. +inline bool operator!=(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { + return !(rhs == lhs); +} + +// Prints nvinfer1::INetworkDefinition* information to the given ostream. +inline std::ostream& operator<<(std::ostream& os, + nvinfer1::INetworkDefinition* n) { + os << "nvinfer1::INetworkDefinition{\n"; + std::vector layer_idxs(n->getNbLayers()); + std::iota(layer_idxs.begin(), layer_idxs.end(), 0); + os << absl::StrJoin(layer_idxs, "\n ", + [n](std::string* out, const int layer_idx) { + out->append(n->getLayer(layer_idx)->getName()); + }); + os << "}"; + return os; +} + +// Prints the TensorFormat enum name to the stream. +std::ostream& operator<<(std::ostream& os, + const nvinfer1::TensorFormat& format); + +// Prints the DataType enum name to the stream. +std::ostream& operator<<(std::ostream& os, const nvinfer1::DataType& data_type); + +} // namespace nvinfer1 + #endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc new file mode 100644 index 00000000000..82ed2254989 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.cc @@ -0,0 +1,272 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h" + +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/util/env_var.h" +#include "third_party/tensorrt/NvInfer.h" + +// getAlgorithmIOInfo is deprecated in TRT >= 8, replaced by +// getAlgorithmIOInfoByIndex. +#if IS_TRT_VERSION_GE(8, 0, 0, 0) +#define ALGORITHM_IO_INFO_BY_IDX(alg, idx) *(alg).getAlgorithmIOInfoByIndex(idx) +#else +#define ALGORITHM_IO_INFO_BY_IDX(alg, idx) (alg).getAlgorithmIOInfo(idx) +#endif + +namespace nvinfer1 { + +std::ostream& operator<<(std::ostream& os, + const nvinfer1::IAlgorithmContext& ctx) { + os << "AlgorithmContext(name=" << ctx.getName() + << ",nbInputs=" << ctx.getNbInputs() << ",nbOutputs=" << ctx.getNbOutputs() + << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const nvinfer1::IAlgorithm& alg) { + const nvinfer1::IAlgorithmVariant& variant = alg.getAlgorithmVariant(); + os << "Algorithm(" + << "variant.implementation=" << variant.getImplementation() + << ",variant.tactic=" << variant.getTactic() + << ",timingMSec=" << alg.getTimingMSec() + << ",workspaceSize=" << alg.getWorkspaceSize() << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, + const nvinfer1::IAlgorithmIOInfo& info) { + os << "IOTensor(format=" << info.getTensorFormat() + << ",dtype=" << info.getDataType() << ",strides=" << info.getStrides() + << ")"; + return os; +} +} // namespace nvinfer1 + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +bool operator>=(const AlgorithmSelectorImpl::TRTVersion& lhs, + const AlgorithmSelectorImpl::TRTVersion& rhs) { + if (lhs[0] > rhs[0]) return true; + if (lhs[0] == rhs[0] && lhs[1] > rhs[1]) return true; + if (lhs[0] == rhs[0] && lhs[1] == rhs[1] && lhs[2] > rhs[2]) return true; + if (lhs[0] == rhs[0] && lhs[1] == rhs[1] && lhs[2] == rhs[2] && + lhs[3] >= rhs[3]) { + return true; + } + return false; +} + +bool AlgorithmSelectorImpl::IsTrtVersionGE(const TRTVersion& version) const { + return version_ >= version; +} + +bool AlgorithmSelectorImpl::IsShuffleLayer(ImplementationID id) const { + if (IsTrtVersionGE({8, 2, 0, 0})) { + return id == 0x80000000 + 13; + } + if (IsTrtVersionGE({8, 0, 0, 0})) { + return id == 0x80000000 + 14; + } + if (IsTrtVersionGE({7, 2, 0, 0})) { + return id == 0x80000000 + 16; + } + return id == 18; +} + +std::set +AlgorithmSelectorImpl::GetBannedTRT72TuringTactics() { + static const std::set banned_turing_72{ + // turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_medium_nhwc_gelu_tn_v1 + -5927686925093575778, + // turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc_gelu_tn_v1 + -3848538574386518527, + // turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc_gelu_tn_v1 + -959009792490796596}; + return banned_turing_72; +} + +bool AlgorithmSelectorImpl::IsBannedTactic(TacticID id) const { + // Disable problematic FP16-Turing tactics in TensorRT 7.2. 
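The version predicate used throughout algorithm_selector.cc boils down to the lexicographic operator>= defined a few lines above. A minimal standalone sketch of that ordering follows, assuming nothing beyond the standard library; the names are illustrative, not from the patch.

// Editorial sketch, not part of the patch: same ordering as
// operator>=(TRTVersion, TRTVersion) above. IsTrtVersionGE({7, 2, 0, 0}) &&
// !IsTrtVersionGE({8, 0, 0, 0}) therefore matches versions at or above 7.2
// and strictly below 8.0, which is the range the banned-tactic check targets.
#include <array>
#include <cassert>

using Version = std::array<int, 4>;  // {major, minor, patch, build}

bool VersionGE(const Version& lhs, const Version& rhs) {
  for (int i = 0; i < 4; ++i) {
    if (lhs[i] > rhs[i]) return true;
    if (lhs[i] < rhs[i]) return false;
  }
  return true;  // all four components equal
}

int main() {
  assert(VersionGE({7, 2, 1, 0}, {7, 2, 0, 0}));   // inside the 7.2 range
  assert(!VersionGE({7, 2, 1, 0}, {8, 0, 0, 0}));  // but below 8.0 GA
  assert(VersionGE({8, 0, 0, 0}, {8, 0, 0, 0}));   // equality counts as >=
  return 0;
}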
+ if (IsTrtVersionGE({7, 2, 0, 0}) && !IsTrtVersionGE({8, 0, 0, 0})) { + auto banned_turing_72 = GetBannedTRT72TuringTactics(); + return banned_turing_72.find(id) != banned_turing_72.end(); + } + return false; +} + +bool AlgorithmSelectorImpl::AllowShuffleAlgorithm( + TacticID tactic, nvinfer1::DataType input_dtype, + nvinfer1::TensorFormat input_format) const { + if (IsTrtVersionGE({8, 0, 0, 0}) && !IsTrtVersionGE({8, 0, 3, 0})) { + // Reject shuffle node when input format is linear row major INT8 + // format in TensorRT 8.0 GA. + return !(input_format == nvinfer1::TensorFormat::kLINEAR && + input_dtype == nvinfer1::DataType::kINT8); + } + + if (IsTrtVersionGE({7, 2, 0, 0}) && !IsTrtVersionGE({8, 0, 0, 0})) { + // For TRT 7.2, accept shuffle node when input format is not 32-wide + // channel vectorized row major FP32 format + return !(input_format == nvinfer1::TensorFormat::kCHW32 && + input_dtype == nvinfer1::DataType::kFLOAT); + } + return true; +} + +bool AlgorithmSelectorImpl::IsAlgorithmSelectorRequired() const { + // If we are in turing for TensorRT 7.2, we need the selector for shuffle and + // avoiding specfic Turing tactics. + if (IsTrtVersionGE({7, 2, 0, 0}) && !IsTrtVersionGE({8, 0, 0, 0})) { + return true; + } + + // If we are in TensorRT 8.0 GA, we want to reject certain types of shuffles. + if (IsTrtVersionGE({8, 0, 0, 0}) && !IsTrtVersionGE({8, 0, 3, 0})) { + return true; + } + + return false; +} + +namespace { + +string FormatAlgorithmList(const nvinfer1::IAlgorithmContext& ctx, + absl::Span algs) { + return absl::StrFormat( + "%s:\n\t%s", absl::FormatStreamed(ctx), + absl::StrJoin( + algs, "\n\t", + [&ctx](std::string* out, const nvinfer1::IAlgorithm* const alg) { + absl::StrAppendFormat(out, "%s", absl::FormatStreamed(*alg)); + for (int i = 0; i < ctx.getNbInputs() + ctx.getNbOutputs(); i++) { + absl::StrAppendFormat( + out, "\n\t\t%s", + absl::FormatStreamed(ALGORITHM_IO_INFO_BY_IDX(*alg, i))); + } + })); +} + +} // namespace + +TftrtAlgorithmSelector::TftrtAlgorithmSelector() + : fixed_algorithm_idx_(GetFixedAlgorithmID()), + selector_(AlgorithmSelectorImpl::CompileTimeTRTVersion()) {} + +absl::optional TftrtAlgorithmSelector::GetFixedAlgorithmID() { + int64 trt_algorithm_idx = 0; + constexpr auto null_idx = + std::numeric_limits::min(); + Status status = tensorflow::ReadInt64FromEnvVar("TF_TRT_FIXED_ALGORITHM_ID", + /*default_val=*/null_idx, + &trt_algorithm_idx); + if (!status.ok()) { + LOG(ERROR) << status; + return absl::nullopt; + } + if (trt_algorithm_idx != null_idx) { + return std::max(static_cast(trt_algorithm_idx), 0); + } + return absl::nullopt; +} + +bool TftrtAlgorithmSelector::AlgorithmPolicy( + const nvinfer1::IAlgorithmContext& context, + const nvinfer1::IAlgorithm& alg) const { + const nvinfer1::IAlgorithmVariant& variant = alg.getAlgorithmVariant(); + + // Check if this tactic ID is banned. 
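GetFixedAlgorithmID() above implements a debugging override: when TF_TRT_FIXED_ALGORITHM_ID is set, every layer is forced to that algorithm index. Below is a reduced sketch of the same pattern using std::getenv and std::stoll in place of tensorflow::ReadInt64FromEnvVar; only the environment variable name comes from the patch, the rest is illustrative.

// Editorial sketch, not part of the patch: simplified form of
// TftrtAlgorithmSelector::GetFixedAlgorithmID().
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <optional>
#include <string>

std::optional<int64_t> FixedAlgorithmIdFromEnv() {
  const char* raw = std::getenv("TF_TRT_FIXED_ALGORITHM_ID");
  if (raw == nullptr) return std::nullopt;  // unset: use normal selection
  try {
    const long long value = std::stoll(raw);
    return std::max<long long>(value, 0);   // negative values clamp to 0
  } catch (...) {
    return std::nullopt;                    // unparsable: fall back
  }
}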
+ TacticID tactic_id = variant.getTactic(); + if (selector_.IsBannedTactic(tactic_id)) { + return false; + } + + if (selector_.IsShuffleLayer(variant.getImplementation())) { + return selector_.AllowShuffleAlgorithm( + tactic_id, alg.getAlgorithmIOInfo(0).getDataType(), + alg.getAlgorithmIOInfo(0).getTensorFormat()); + } + return true; +} + +int32_t TftrtAlgorithmSelector::selectAlgorithms( + const nvinfer1::IAlgorithmContext& algoContext, + const nvinfer1::IAlgorithm* const* algoChoices, int32_t nbChoices, + int32_t* selection) noexcept { + if (fixed_algorithm_idx_) { + LOG(WARNING) << "Forcing TRT algorithm selection to: ID = " + << *fixed_algorithm_idx_; + selection[0] = std::min(*fixed_algorithm_idx_, nbChoices - 1); + return 1; + } + + int num_selections = 0; + + VLOG(1) << "Algorithm selection choices: " + << FormatAlgorithmList(algoContext, + absl::MakeSpan(algoChoices, nbChoices)); + + for (int i = 0; i < nbChoices; i++) { + const nvinfer1::IAlgorithm& alg = *algoChoices[i]; + + // Check layer-specific issues. + if (!AlgorithmPolicy(algoContext, alg)) { + LOG(WARNING) << absl::StrFormat("Rejecting Algorithm: %s ", + absl::FormatStreamed(alg)); + continue; + } + selection[num_selections++] = i; + } + return num_selections; +} + +// Called by TensorRT to report choices it made. +void TftrtAlgorithmSelector::reportAlgorithms( + const nvinfer1::IAlgorithmContext* const* algoContexts, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbAlgorithms) noexcept { + if (VLOG_IS_ON(1)) { + string selection_msg = "Algorithms selected:\n"; + for (int i = 0; i < nbAlgorithms; i++) { + absl::StrAppend(&selection_msg, + FormatAlgorithmList(*algoContexts[i], + absl::MakeSpan(algoChoices + i, 1))); + } + VLOG(1) << selection_msg; + } +} + +std::unique_ptr MaybeCreateAlgorithmSelector() { + auto selector = std::make_unique(); + + if (selector->IsRequired()) { + return selector; + } + + return nullptr; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h new file mode 100644 index 00000000000..1ce0def0c75 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h @@ -0,0 +1,121 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include +#include +#include + +#include "absl/types/optional.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// Implements core algorithm selection logic in a testable manner. The policy +// implemented depends on the given TRT version. 
We have this class because TRT +// interfaces make it difficult to directly test an IAlgorithmSelector +// implementation. +class AlgorithmSelectorImpl { + public: + using TRTVersion = std::array; + using ImplementationID = int64_t; + using TacticID = int64_t; + + static constexpr TRTVersion CompileTimeTRTVersion() { + return TRTVersion{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH, + NV_TENSORRT_BUILD}; + } + + explicit AlgorithmSelectorImpl( + const TRTVersion& version = CompileTimeTRTVersion()) + : version_(version) {} + + bool IsShuffleLayer(ImplementationID id) const; + + bool IsBannedTactic(TacticID id) const; + + // Returns true if the algorithm implementing the IShuffleLayer is acceptable. + bool AllowShuffleAlgorithm(TacticID tactic, nvinfer1::DataType input_dtype, + nvinfer1::TensorFormat input_format) const; + + bool IsTrtVersionGE(const TRTVersion& version) const; + + // Returns true if we know at compile time that the algorithm selector + // should be required. This is a conservative estimate. + bool IsAlgorithmSelectorRequired() const; + + static std::set GetBannedTRT72TuringTactics(); + + private: + TRTVersion version_; +}; + +// Impelements the TRT IAlgorithmSelector interface. The method +// "selectAlgorithms" selects allowable algorithms for each layer, and +// "reportAlgorithms" summarizes the algorithms selected by TensorRT. +class TftrtAlgorithmSelector : public nvinfer1::IAlgorithmSelector { + private: + using TacticID = AlgorithmSelectorImpl::TacticID; + + // An index we should choose for all algorithms. Used for debugging. + absl::optional fixed_algorithm_idx_; + + AlgorithmSelectorImpl selector_; + + public: + TftrtAlgorithmSelector(); + + // If the environment variable TF_TRT_FIXED_ALGORITHM_ID is empty, this + // function returns nullopt. Otherwise, it returns the specified number. + static absl::optional GetFixedAlgorithmID(); + + // Returns true if the algorithm associated with context is acceptable. + bool AlgorithmPolicy(const nvinfer1::IAlgorithmContext& context, + const nvinfer1::IAlgorithm& alg) const; + + // This function fills the array "selection" with the indices of selected + // algorithm candidates from "algoChoices", each of which is an implementation + // for the kernel described by the given IAlgorithmContext. It should return a + // number in [0, nbChoices] indicating the number of selected indices. If 0 is + // returned, TensorRT will use its default selection mechanism. + int32_t selectAlgorithms(const nvinfer1::IAlgorithmContext& algoContext, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbChoices, + int32_t* selection) noexcept override; + + // Called by TensorRT to report choices it made. + void reportAlgorithms(const nvinfer1::IAlgorithmContext* const* algoContexts, + const nvinfer1::IAlgorithm* const* algoChoices, + int32_t nbAlgorithms) noexcept override; + + bool IsRequired() const { + return selector_.IsAlgorithmSelectorRequired() || + fixed_algorithm_idx_ != absl::nullopt; + } +}; + +// Returns an initialized AlgorithmSelector if an algorithm selector is required +// for the current TRT version. Otherwise, returns nullptr. 
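The declaration of MaybeCreateAlgorithmSelector follows below. As a hedged usage sketch (not taken from this patch; it assumes TensorRT >= 7.2, where IBuilderConfig::setAlgorithmSelector is available, and MaybeAttachSelector is a hypothetical name), the returned selector is attached to the builder configuration and must outlive the engine build, since TensorRT only stores the raw pointer:

#include "third_party/tensorrt/NvInfer.h"

// `selector` is whatever MaybeCreateAlgorithmSelector() returned; passing a
// null pointer leaves TensorRT's default tactic selection in place.
void MaybeAttachSelector(nvinfer1::IBuilderConfig* builder_config,
                         nvinfer1::IAlgorithmSelector* selector) {
  if (selector != nullptr) {
    builder_config->setAlgorithmSelector(selector);
  }
}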
+std::unique_ptr MaybeCreateAlgorithmSelector(); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_ALGORITHM_SELECTOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc new file mode 100644 index 00000000000..12eb1fabc86 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/algorithm_selector_test.cc @@ -0,0 +1,97 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h" + +#include + +#include +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +TEST(TestAlgorithmSelector, TensorRT7_1) { + // Verify that the algorithm selector for TRT 7.1 is not required. + AlgorithmSelectorImpl sel71({7, 1, 3, 4}); + ASSERT_FALSE(sel71.IsAlgorithmSelectorRequired()); +} + +TEST(TestAlgorithmSelector, TensorRT7_2) { + // Verify that the algorithm selector for TRT 7.2 is required. + AlgorithmSelectorImpl sel72({7, 2, 0, 0}); + ASSERT_TRUE(sel72.IsAlgorithmSelectorRequired()); + + // Check that the correct tactics are banned. + auto turing_tactics = AlgorithmSelectorImpl::GetBannedTRT72TuringTactics(); + + for (auto id : turing_tactics) { + EXPECT_TRUE(sel72.IsBannedTactic(id)); + } + + // Check that a bad shuffle format is banned. + EXPECT_FALSE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kFLOAT, + nvinfer1::TensorFormat::kCHW32)); + + // Check that other formats are not banned. + EXPECT_TRUE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kHALF, + nvinfer1::TensorFormat::kCHW32)); + EXPECT_TRUE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT32, + nvinfer1::TensorFormat::kCHW32)); + EXPECT_TRUE(sel72.AllowShuffleAlgorithm(0, nvinfer1::DataType::kFLOAT, + nvinfer1::TensorFormat::kCHW16)); +} + +TEST(TestAlgorithmSelector, TensorRT8_0) { + // Verify that the algorithm selector for TRT 8.0 is required. + AlgorithmSelectorImpl sel80({8, 0, 1, 6}); + ASSERT_TRUE(sel80.IsAlgorithmSelectorRequired()); + + // Check that the turing 7.2 tactics are not banned. + auto turing_tactics = AlgorithmSelectorImpl::GetBannedTRT72TuringTactics(); + for (auto id : turing_tactics) { + EXPECT_FALSE(sel80.IsBannedTactic(id)); + } + + // Check that a bad shuffle format is banned. + EXPECT_FALSE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT8, + nvinfer1::TensorFormat::kLINEAR)); + + // Check that other formats are not banned. 
+ EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kHALF, + nvinfer1::TensorFormat::kLINEAR)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT32, + nvinfer1::TensorFormat::kLINEAR)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kFLOAT, + nvinfer1::TensorFormat::kLINEAR)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT8, + nvinfer1::TensorFormat::kCHW16)); + EXPECT_TRUE(sel80.AllowShuffleAlgorithm(0, nvinfer1::DataType::kINT8, + nvinfer1::TensorFormat::kCHW32)); +} + +TEST(TestAlgorithmSelector, TensorRT8_2) { + // Verify that the algorithm selector for TRT 8.0 is required. + AlgorithmSelectorImpl sel({8, 2, 0, 0}); + ASSERT_FALSE(sel.IsAlgorithmSelectorRequired()); +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 01dcfba9c52..9c9fce4d30a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -25,7 +25,11 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/segment/segment.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" @@ -40,28 +44,30 @@ limitations under the License. 
#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/devices.h" -#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT -#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/tools/graph_transforms/transform_utils.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { namespace convert { + using absl::StrAppend; using absl::StrCat; +using ::tensorflow::tensorrt::segment::ClusterProperty; +using ::tensorflow::tensorrt::segment::NodePtrCompare; +using ::tensorflow::tensorrt::segment::Segment; namespace { @@ -76,7 +82,20 @@ Status BuildNodeMap(const Graph& graph, return Status::OK(); } -} // namespace +EngineInfo::EngineType GetEngineType( + const TRTOptimizationPass::ConversionParams& params) { + return (params.is_dynamic_op || params.use_calibration) + ? EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic; +} + +// Returns true when use_implicit_batch is false or when we are building dynamic +// engine, to allow unknown size for dimensions rather than dimension 0. +bool AllowDynamicNonBatchDimension( + const TRTOptimizationPass::ConversionParams& params) { + return !params.use_implicit_batch || + GetEngineType(params) == EngineInfo::EngineType::TRTDynamic; +} struct EdgePtrCompare { bool operator()(const Edge* lhs, const Edge* rhs) const { @@ -88,30 +107,48 @@ struct EdgePtrCompare { // a device name as one of the conversion parameter so users can control on // which device they want to run the conversion. std::pair GetFirstValidDeviceId() { - for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) { - TfGpuId tf_gpu_id(tf_gpu_id_value); - PlatformGpuId platform_gpu_id; - Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id); + for (int tf_device_id_value = 0; tf_device_id_value < 100; + ++tf_device_id_value) { + TfGpuId tf_device_id(tf_device_id_value); + PlatformGpuId platform_device_id; + Status s = + GpuIdManager::TfToPlatformGpuId(tf_device_id, &platform_device_id); if (s.ok()) { - VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " - << platform_gpu_id.value(); - return std::make_pair(tf_gpu_id, platform_gpu_id); + VLOG(1) << "Found TF GPU " << tf_device_id.value() << " at cuda device " + << platform_device_id.value(); + return std::make_pair(tf_device_id, platform_device_id); } } LOG(ERROR) << "Could not find any TF GPUs"; return std::make_pair(TfGpuId(-1), PlatformGpuId(-1)); } +// Returns false for const nodes (we intend to drop control edges from those). 
+bool ShallKeepControlEdgeFrom(const Node* input_node) { + if (!input_node) { + LOG(ERROR) << "Node pointer is null, this should not happen"; + return false; + } + return input_node->type_string() != "Const"; +} + // Function to get subsegment information structure. Status GetEngineInfo(const Graph* g, const grappler::GraphProperties& graph_properties, - const std::set& segment_nodes, - const std::unordered_map& node_map, + const Segment& segment, const std::vector& reverse_topo_order, EngineInfo* info) { std::vector subgraph_nodes; // Topologically sorted nodes. std::set added_const_nodes; // Used to prevent double insertion. - std::set segment_devices; + + const ClusterProperty& segment_property = segment.property; + const std::set& segment_nodes = segment.nodes; + + // The device assignment accumulated from the compatible device assignments + // for the nodes in the segment. + const DeviceNameUtils::ParsedName segment_device = + segment_property.DeviceName(); + info->max_batch_size = segment_property.BatchSize().GetOptionalMaxBatchSize(); // Map from src_node_name+port to the unique port numbers of the TRT op, where // the src_node_name is the name of the source node of the input/output @@ -124,52 +161,12 @@ Status GetEngineInfo(const Graph* g, ++it) { const Node* node = *it; if (segment_nodes.count(node) == 0) continue; - auto node_device = node->requested_device(); - if (!node_device.empty()) { - // If device is set, it means device placement may have been done before, - // so we need to assign a device for the TRTEngineOp to maintain the - // invariance. - // If the device is CPU in this case, it tries to find the first available - // GPU and use it as the device. - DeviceNameUtils::ParsedName parsed_name; - const bool parse_succeeded = - DeviceNameUtils::ParseFullName(node_device, &parsed_name); - if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { - string msg; - if (!parse_succeeded) { - msg = StrCat("Failed to parse assigned device of node ", node->name(), - ". "); - } else { - msg = StrCat("Node ", node->name(), " was assigned to the CPU. "); - } - VLOG(1) << msg << "Attempting to place on GPU."; - TfGpuId tf_gpu_id; - PlatformGpuId platform_gpu_id; - std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); - if (tf_gpu_id.value() >= 0) { - parsed_name.type = "GPU"; - parsed_name.id = tf_gpu_id.value(); - segment_devices.insert(DeviceNameUtils::FullName( - parsed_name.job, parsed_name.replica, parsed_name.task, - parsed_name.type, parsed_name.id)); - } - } else { - segment_devices.insert(node_device); - } - } else if (node->has_assigned_device_name()) { - // It appears that nodes will not have assigned devices at this point in - // execution. - segment_devices.insert(node->assigned_device_name()); - } else { - VLOG(2) << "Node " << node->name() - << " neither have requested device nor assigned device"; - } subgraph_nodes.push_back(node); const int node_id = node->id(); const string& node_name = node->name(); - // Create input connections. Sort edges first to make determnistic since + // Create input connections. Sort edges first to make deterministic since // in_edges is a set of pointers. std::vector in_edges(node->in_edges().begin(), node->in_edges().end()); @@ -180,7 +177,7 @@ Status GetEngineInfo(const Graph* g, continue; } if (edge->IsControlEdge()) { - if (input_node->type_string() != "Const") { + if (ShallKeepControlEdgeFrom(input_node)) { // Non-Const control input. 
info->connections.emplace_back(input_node->name(), input_node->id(), node_name, node_id, @@ -194,7 +191,7 @@ Status GetEngineInfo(const Graph* g, // If it doesn't have any edges, TF will prune it out. // // Note that the segmenter already ensure that the constant data input - // is valid and suppported by the engine. + // is valid and supported by the engine. if (!added_const_nodes.insert(input_node).second) { // Already added before. continue; @@ -217,7 +214,7 @@ Status GetEngineInfo(const Graph* g, node_id, edge->dst_input(), /*input_edge=*/true, port); } } - // Create output connections. Sort edges first to make determnistic since + // Create output connections. Sort edges first to make deterministic since // out_edges is a set of pointers. std::vector out_edges(node->out_edges().begin(), node->out_edges().end()); @@ -229,9 +226,11 @@ Status GetEngineInfo(const Graph* g, } if (edge->IsControlEdge()) { // Control output. - info->connections.emplace_back(output_node->name(), output_node->id(), - node_name, node_id, - /*input_edge=*/false); + if (ShallKeepControlEdgeFrom(node)) { + info->connections.emplace_back(output_node->name(), output_node->id(), + node_name, node_id, + /*input_edge=*/false); + } } else { // Data output. int port = Graph::kControlSlot - 1; @@ -254,22 +253,35 @@ Status GetEngineInfo(const Graph* g, // Construct the const nodes first. subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(), added_const_nodes.end()); - string scope_name; - TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef( - g, graph_properties, subgraph_nodes, &info->connections, - &info->segment_graph_def, &scope_name)); - info->engine_name = StrCat(scope_name, info->engine_name); + TF_RETURN_IF_ERROR( + ConvertSegmentToGraphDef(g, graph_properties, subgraph_nodes, info)); VLOG(1) << "Converted TensorRT candidate segment '" << info->engine_name << "' to a GraphDef"; - if (segment_devices.size() == 1) { - info->device = *segment_devices.begin(); - } else if (segment_devices.size() > 1) { - LOG(WARNING) << "Detected multiple (" << segment_devices.size() - << ") devices for the segment. Picking first one to continue."; - info->device = *segment_devices.begin(); + if (segment_device.has_type) { + // If the accumulated device assignment for the segment has a device type, + // the segmenter guarantees the device type is GPU. Use the device + // assignment in this case. + if (segment_device.type != "GPU") { + return errors::Internal( + "segment device is not GPU: ", + DeviceNameUtils::ParsedNameToString(segment_device)); + } + info->device = DeviceNameUtils::ParsedNameToString(segment_device); } else { - VLOG(1) << "No device is assigned to the segment. " - << "A device will be assigned during graph execution (inference)."; + TfGpuId tf_device_id; + PlatformGpuId platform_device_id; + std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId(); + if (tf_device_id.value() >= 0) { + DeviceNameUtils::ParsedName parsed_name; + parsed_name.type = "GPU"; + parsed_name.has_type = true; + parsed_name.id = tf_device_id.value(); + parsed_name.has_id = true; + info->device = DeviceNameUtils::ParsedNameToString(parsed_name); + } else { + VLOG(1) << "No device is assigned to the segment. 
A device will be " + "assigned during graph execution (inference)."; + } } return Status::OK(); } @@ -303,7 +315,22 @@ void UpdateToEngineNode(const std::vector& infos, } } } - LOG(FATAL) << "Node " << (**node).name() << " not found in any engine."; + LOG(FATAL) << "Node " << node_name << " not found in any engine."; +} + +tensorflow::TensorShapeProto ComputeTRTNodeIOShape( + std::vector& partial_tensorshape_vect, + std::vector& shape_proto_vect, + const PartialTensorShape& conn_shape, int port_number) { + tensorflow::TensorShapeProto tmp_shape_proto; + conn_shape.AsProto(&tmp_shape_proto); + + if (partial_tensorshape_vect.size() <= port_number) { + shape_proto_vect.resize(port_number + 1); + partial_tensorshape_vect.resize(port_number + 1); + } + + return tmp_shape_proto; } // Function to insert a TRT engine node into the graph. @@ -318,15 +345,16 @@ void UpdateToEngineNode(const std::vector& infos, // one). Connect to the pre-existing engine node instead. // 3. In this way, we ensure the graph is topologically sort-able after each // invocation of CreateTRTNode(). -Status CreateTRTNode(const ConversionParams& params, +Status CreateTRTNode(const TRTOptimizationPass::ConversionParams& params, const std::vector& infos, int pos, - int max_batch_size, Graph* graph, - nvinfer1::IGpuAllocator* alloc, - std::vector* engine_nodes) { + int default_max_batch_size, Graph* graph, + std::vector* engine_nodes, + grappler::Cluster* cluster) { const auto& info = infos.at(pos); std::vector input_shape_protos; std::vector output_shape_protos; std::vector input_shapes; + std::vector output_shapes; std::vector inputs; std::vector input_nodes; std::vector control_input_nodes; @@ -359,36 +387,42 @@ Status CreateTRTNode(const ConversionParams& params, } else { // Data edges if (!conn.is_input_edge) { - // Set the data types of output edge. + // Set the shapes and data types of the output edge. + tensorflow::TensorShapeProto out_shape = ComputeTRTNodeIOShape( + /*partial_tensorshape_vect=*/output_shapes, + /*shape_proto_vect=*/output_shape_protos, + /*conn_shape=*/conn.inside_shape, + /*port_number=*/conn.port_number); + + output_shape_protos.at(conn.port_number) = out_shape; + output_shapes.at(conn.port_number) = conn.inside_shape; + if (out_types.size() <= conn.port_number) { out_types.resize(conn.port_number + 1); } out_types.at(conn.port_number) = conn.connection_type; - if (output_shape_protos.size() <= conn.port_number) { - output_shape_protos.resize(conn.port_number + 1); - } - conn.inside_shape.AsProto(&output_shape_protos.at(conn.port_number)); VLOG(2) << "Collected output shape " << output_shape_protos.at(conn.port_number).DebugString(); } else { - // Set the shapes and data types of input edge. - tensorflow::TensorShapeProto in_shape; - conn.outside_shape.AsProto(&in_shape); - if (input_shapes.size() <= conn.port_number) { - input_shape_protos.resize(conn.port_number + 1); - input_shapes.resize(conn.port_number + 1); - } + // Set the shapes of the input edge. + tensorflow::TensorShapeProto in_shape = ComputeTRTNodeIOShape( + /*partial_tensorshape_vect=*/input_shapes, + /*shape_proto_vect=*/input_shape_protos, + /*conn_shape=*/conn.outside_shape, + /*port_number=*/conn.port_number); + input_shape_protos.at(conn.port_number) = in_shape; input_shapes.at(conn.port_number) = conn.outside_shape; + // Shape must be fully defined (excluding batch dimension) for static // mode. 
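The shape check itself follows below. As a standalone restatement (hypothetical helper name; assumes tensorflow/core/framework/tensor_shape.h), the condition enforced for implicit-batch static engines is simply that every dimension except dimension 0 has a known size:

#include "tensorflow/core/framework/tensor_shape.h"

// Returns true if every non-batch dimension of `shape` is known. Dimension 0
// is the implicit batch dimension and is allowed to stay unknown.
bool NonBatchDimsFullyDefined(const tensorflow::PartialTensorShape& shape) {
  for (int i = 1; i < shape.dims(); ++i) {
    if (shape.dim_size(i) <= 0) return false;  // unknown sizes are reported as -1
  }
  return true;
}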
- if (info.engine_type == EngineInfo::EngineType::TRTStatic) { + if (params.use_implicit_batch && + info.engine_type == EngineInfo::EngineType::TRTStatic) { for (int i = 1; i < conn.outside_shape.dims(); i++) { if (conn.outside_shape.dim_size(i) <= 0) { return errors::Internal( - "Input shapes must be fully defined when in static mode. " - "Please try is_dynamic_op=True (shape was ", - conn.outside_shape.DebugString(), ")"); + "Not fully defined input shape when in static mode which " + "should have been excluded by the segmenter. "); } } } @@ -421,25 +455,17 @@ Status CreateTRTNode(const ConversionParams& params, "Segment has no inputs (possible constfold failure)"); } - const bool calibrate_int8 = - (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration); // Build the engine and get its serialized representation. string segment_string; + + int max_batch_size = info.max_batch_size.has_value() + ? info.max_batch_size.value() + : default_max_batch_size; + if (info.engine_type == EngineInfo::EngineType::TRTStatic) { - // Create static engine for fp32/fp16 mode. - Logger trt_logger; - TrtUniquePtrType engine; - // TODO(sami): What happens if 1st dim is not batch? - TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( - info.segment_graph_def, - calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode, - max_batch_size, info.max_workspace_size_bytes, input_shapes, - &trt_logger, alloc, /*calibrator=*/nullptr, &engine, - info.use_calibration, - /*convert_successfully=*/nullptr)); - TrtUniquePtrType engine_data(engine->serialize()); - segment_string = string(static_cast(engine_data->data()), - engine_data->size()); + TF_RETURN_IF_ERROR(CreateStaticEngine(params, info, max_batch_size, + input_shapes, nullptr, + &segment_string, cluster)); } string prec_string; @@ -461,21 +487,31 @@ Status CreateTRTNode(const ConversionParams& params, NodeDef trt_node; NameAttrList function; function.set_name(StrCat(info.engine_name, "_native_segment")); - Status status = - node_builder - .Attr("input_shapes", input_shape_protos) - .Attr("output_shapes", output_shape_protos) - .Attr("static_engine", - info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_func", function) - .Attr("serialized_segment", segment_string) - .Attr("calibration_data", "") - .Attr("max_cached_engines_count", info.maximum_cached_engines) - .Attr("workspace_size_bytes", info.max_workspace_size_bytes) - .Attr("precision_mode", prec_string) - .Attr("use_calibration", info.use_calibration) - .Attr("OutT", out_types) - .Finalize(&trt_node); + + node_builder.Attr("input_shapes", input_shape_protos) + .Attr("output_shapes", output_shape_protos) + .Attr("static_engine", + info.engine_type == EngineInfo::EngineType::TRTStatic) + .Attr("segment_func", function) + .Attr("serialized_segment", segment_string) + .Attr("calibration_data", "") + .Attr("max_cached_engines_count", info.maximum_cached_engines) + .Attr("workspace_size_bytes", info.max_workspace_size_bytes) + .Attr("max_batch_size", max_batch_size) + .Attr("precision_mode", prec_string) + .Attr("use_calibration", info.use_calibration) + .Attr("_use_implicit_batch", params.use_implicit_batch) + .Attr("use_explicit_precision", params.use_explicit_precision) + .Attr("_allow_build_at_runtime", info.allow_build_at_runtime) + .Attr("OutT", out_types); + + if (!params.use_implicit_batch) { + node_builder.Attr("profile_strategy", + ProfileStrategyToName(params.profile_strategy)); + } + + Status status = node_builder.Finalize(&trt_node); + if (!status.ok()) { LOG(ERROR) << 
"Node construction failed with" << status; return status; @@ -488,10 +524,6 @@ Status CreateTRTNode(const ConversionParams& params, // instead of checking fail. Node* engine_node = graph->AddNode(trt_node, &status); (*engine_nodes)[pos] = engine_node; - if (!status.ok()) { - LOG(ERROR) << "Adding node failed " << status; - return status; - } // Add control input and input edges to the engine node. for (const auto in : control_input_nodes) { VLOG(1) << "Connecting control edge from " << in->name() << " to " @@ -536,6 +568,58 @@ Status CreateTRTNode(const ConversionParams& params, return Status::OK(); } +int64 GetNextGraphSequenceNumber() { + static std::atomic graph_sequence_num; + return graph_sequence_num++; +} + +constexpr char kCastInputTypeAttrName[] = "SrcT"; + +// Transforms node = cast(x, fp32) where datatype(x) != fp16 to: +// castToFp16 = cast(x, fp16) +// node = cast(castToFp16, fp32) +// +Status MaybeRewriteCastToFp32(GraphDef* graph_def, NodeDef* node_def) { + if (node_def->op() != "Cast") { + return Status::OK(); + } + + DataTypeVector input_types; + DataTypeVector output_types; + TF_RETURN_IF_ERROR( + graph_transforms::GetInOutTypes(*node_def, &input_types, &output_types)); + + if (input_types.size() != 1 || output_types.size() != 1) { + return errors::Internal("Bad cast operation"); + } + + if (input_types[0] == DT_HALF || output_types[0] != DT_FLOAT) { + return Status::OK(); + } + + VLOG(2) << "Rewriting cast to FP32 " << node_def->DebugString(); + + NodeDef* castToFp16 = graph_def->add_node(); + for (auto attr_value : node_def->attr()) { + (*castToFp16->mutable_attr())[attr_value.first] = attr_value.second; + } + castToFp16->set_name(node_def->name() + "_split"); + castToFp16->set_op("Cast"); + castToFp16->set_device(node_def->device()); + castToFp16->add_input(node_def->input(0)); + (*castToFp16->mutable_attr())[kCastOutputTypeAttrName].set_type(DT_HALF); + + node_def->set_input(0, castToFp16->name() + ":0"); + (*node_def->mutable_attr())[kCastInputTypeAttrName].set_type(DT_HALF); + + VLOG(2) << castToFp16->DebugString(); + VLOG(2) << node_def->DebugString(); + + return Status::OK(); +} + +} // namespace + Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, Graph* graph, const string& engine_name) { Graph segment_graph(graph->flib_def()); @@ -545,11 +629,6 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, auto segment_func = library.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( segment_graph, StrCat(engine_name, "_native_segment"), segment_func)); - // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on - // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32 - // would be on host if the op generating the tensor has host memory tag set. 
- (*segment_func->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] - .set_b(true); if (VLOG_IS_ON(7)) { VLOG(7) << engine_name << " Function_Def "; VLOG(7) << segment_func->DebugString(); @@ -560,30 +639,30 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, return Status::OK(); } -std::pair GetDeviceAndAllocator(const ConversionParams& params, - const EngineInfo& engine) { +std::pair GetDeviceAndAllocator( + const grappler::Cluster* cluster, const EngineInfo& engine) { int cuda_device_id = -1; Allocator* dev_allocator = nullptr; - if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr || + if (cluster == nullptr || cluster->GetDeviceSet() == nullptr || engine.device.empty()) { // If device is not set, use the first found GPU device for the conversion. - TfGpuId tf_gpu_id; - PlatformGpuId platform_gpu_id; - std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); - cuda_device_id = platform_gpu_id.value(); + TfGpuId tf_device_id; + PlatformGpuId platform_device_id; + std::tie(tf_device_id, platform_device_id) = GetFirstValidDeviceId(); + cuda_device_id = platform_device_id.value(); if (cuda_device_id >= 0) { GPUOptions gpu_options; // If the TF to Cuda gpu id mapping exist, the device and corresponding // allocator must have been initialized already, so the // GetGPUAllocator() call won't create a new allocator. dev_allocator = GPUProcessState::singleton()->GetGPUAllocator( - gpu_options, tf_gpu_id, 1); + gpu_options, tf_device_id, /*total_bytes=*/1); } return std::make_pair(cuda_device_id, dev_allocator); } // Use the device requested by the engine. - auto device_set = params.cluster->GetDeviceSet(); + auto device_set = cluster->GetDeviceSet(); std::vector devices; DeviceNameUtils::ParsedName parsed_name; if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) && @@ -596,7 +675,7 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, StrAppend(&msg, engine.device, "': "); for (auto d : devices) StrAppend(&msg, d->name(), ", "); StrAppend(&msg, ". Will get the allocator from first one."); - LOG(WARNING) << msg; + LOG_WARNING_WITH_PREFIX << msg; } AllocatorAttributes alloc_attr; cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id; @@ -604,92 +683,182 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, VLOG(1) << "Using allocator " << dev_allocator->Name() << " and cuda_device_id " << cuda_device_id; } else { - LOG(WARNING) << "Cluster is set but device '" << engine.device - << "' is not found in the cluster"; + LOG_WARNING_WITH_PREFIX << "Cluster is set but device '" << engine.device + << "' is not found in the cluster"; } return std::make_pair(cuda_device_id, dev_allocator); } -// Entry function from optimization pass. -Status ConvertAfterShapes(const ConversionParams& params) { +Status CreateStaticEngine(const TRTOptimizationPass::ConversionParams& params, + const EngineInfo& info, int max_batch_size, + const std::vector& input_shapes, + TrtShapeOptimizationProfile* profile, + string* segment_string, grappler::Cluster* cluster) { + std::pair device_allocator = + GetDeviceAndAllocator(cluster, info); + int cuda_device_id = 0; + std::unique_ptr trt_allocator; + if (device_allocator.first >= 0) { + cuda_device_id = device_allocator.first; + trt_allocator.reset(new TRTDeviceAllocator(device_allocator.second)); + } else { + // The value in trt_allocator is a nullptr and cudamalloc will be used. + LOG_WARNING_WITH_PREFIX << "Can't identify the cuda device. 
Running on " + "device 0 and use cudamalloc as an allocator"; + } + cudaSetDevice(cuda_device_id); + + auto trt_logger = GetLoggerRegistry()->LookUp(params.trt_logger_name); + const bool calibrate_int8 = + (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration); + + // Create static engines with precision_mode fp32/fp16. + TrtUniquePtrType engine; + TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( + info.segment_graph_def, nullptr, + calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode, + max_batch_size, info.max_workspace_size_bytes, input_shapes, trt_logger, + trt_allocator.get(), /*calibrator=*/nullptr, &engine, + info.use_calibration, params.use_implicit_batch, + /*convert_successfully=*/nullptr, profile, info.engine_name, + /*use_explicit_precision=*/params.use_explicit_precision, cluster)); + TrtUniquePtrType engine_data(engine->serialize()); + *segment_string = string(static_cast(engine_data->data()), + engine_data->size()); + return Status::OK(); +} + +Status ConvertGraph(const TRTOptimizationPass::ConversionParams& params, + grappler::GrapplerItem& grappler_item, + const std::vector& input_output_names, + grappler::Cluster* cluster, GraphDef* output) { // Sanity checks. + TRT_ENSURE(output != nullptr) if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { return errors::InvalidArgument( "Calibration with FP32 or FP16 is not supported."); } + GraphDef& graph_def = grappler_item.graph; + + // When precision_mode is FP16, transform cast(x, fp32) to + // cast(cast(x, fp16), fp32). This creates cast(fp16, f32) that can be + // included in the TRTEngineOp as an TensorRT Identity layer for performance: + // . Avoid cast(fp32, fp16) in the TRT engine implementation for fp16 + // precision. + // . Changing the input to the TRTEngine from fp32 to fp16 may reduce data + // moving from the host to the GPU. + if (params.precision_mode == TrtPrecisionMode::FP16) { + for (int i = 0; i < graph_def.node_size(); i++) { + NodeDef* node_def = graph_def.mutable_node(i); + TF_RETURN_IF_ERROR(MaybeRewriteCastToFp32(&graph_def, node_def)); + } + } + + // Construct a GrapplerItem using the modified graph_def and the input + // grappler_item. + grappler::GraphProperties static_graph_properties(grappler_item); + TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); + // Convert graphdef to graph. 
- FunctionLibraryDefinition flib(OpRegistry::Global(), - params.input_graph_def->library()); + FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library()); Graph graph(flib); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - *params.input_graph_def, &graph)); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph)); // Segment the graph into subgraphs that can be converted to TensorRT segment::SegmentOptions segment_options; // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT) - for (auto node : *(params.output_names)) { + for (const auto& node : input_output_names) { segment_options.exclude_node_list.insert(node); } segment_options.minimum_segment_size = params.minimum_segment_size; - segment::SegmentNodesVector initial_segments; - TrtNodeValidator validator(*params.graph_properties, params.precision_mode, - params.use_calibration); + segment_options.use_implicit_batch = params.use_implicit_batch; + if (segment_options.use_implicit_batch) + segment_options.maximum_batch_size = params.max_batch_size; + segment_options.allow_dynamic_non_batch_dim = + AllowDynamicNonBatchDimension(params); + + segment::SegmentVector initial_segments; + TrtNodeValidator validator(static_graph_properties, params.precision_mode, + params.use_calibration, params.use_implicit_batch, + params.use_explicit_precision); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, + /*tf_graph=*/&graph, + /*graph_properties=*/&static_graph_properties, + /*candidate_fn=*/ std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't // need to check the input edges. - [](const Edge* edge) { return true; }, OutputEdgeValidator(), - segment_options, &initial_segments)); + /*input_candidate_fn=*/[](const Edge* edge) { return true; }, + /*output_candidate_fn=*/OutputEdgeValidator(), + /*options=*/segment_options, + /*segments=*/&initial_segments)); LOG(INFO) << "Number of TensorRT candidate segments: " << initial_segments.size(); // Get the EngineInfo for each segment. 
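The per-segment loop follows below. The engine names it produces combine a zero-padded per-graph sequence number with a zero-padded segment index, e.g. "TRTEngineOp_000_001"; this is also why the updated convert_graph test later in this patch strips the sequence number with the regex "TRTEngineOp_[0-9]+_". A minimal sketch of the format (hypothetical helper name, assuming absl):

#include <string>

#include "absl/strings/str_format.h"

// Mirrors the StrCat/StrFormat calls below: "%0*d" with a width of 3
// zero-pads both the graph sequence number and the segment index.
std::string EngineNameSketch(int graph_sequence_number, int segment_index) {
  return absl::StrFormat("TRTEngineOp_%03d_%03d", graph_sequence_number,
                         segment_index);
}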
std::unordered_map node_map; TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); - float total_num_nodes_in_segments = 0.; std::vector engine_segments; engine_segments.reserve(initial_segments.size()); std::vector reverse_topo_order; GetPostOrder(graph, &reverse_topo_order); - size_t total_engine_bytes_size = 0; - std::vector engine_bytes_size; - segment::SegmentNodesVector converted_segments; + segment::SegmentVector converted_segments; converted_segments.reserve(initial_segments.size()); + string engine_name_prefix = + StrCat("TRTEngineOp_", + absl::StrFormat("%0*d", 3, GetNextGraphSequenceNumber()), "_"); for (size_t t = 0; t < initial_segments.size(); t++) { auto& curr_segment = initial_segments.at(t); EngineInfo curr_engine; - curr_engine.engine_name = StrCat("TRTEngineOp_", t); - Status status = - GetEngineInfo(&graph, *params.graph_properties, curr_segment, node_map, - reverse_topo_order, &curr_engine); + curr_engine.engine_name = + StrCat(engine_name_prefix, absl::StrFormat("%0*d", 3, t)); + + bool int8_no_calib = (!params.use_calibration && + params.precision_mode == TrtPrecisionMode::INT8); + bool has_qdq = false; + if (int8_no_calib) { + has_qdq = absl::c_any_of(reverse_topo_order, IsQuantizeAndDequantizeOp); + } + + Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment, + reverse_topo_order, &curr_engine); if (!status.ok()) { - LOG(WARNING) << "Failed to get engine info for segment " << t << ": " - << status; + LOG_WARNING_WITH_PREFIX << "Failed to get engine info for segment " << t + << ": " << status; continue; } - curr_engine.precision_mode = params.precision_mode; - curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration) - ? EngineInfo::EngineType::TRTDynamic - : EngineInfo::EngineType::TRTStatic); + + curr_engine.engine_type = GetEngineType(params); curr_engine.use_calibration = params.use_calibration; + // Building cuda engines for INT8 without calibration and without dynamic + // range info cause TRT failure. Avoid this situation by setting the + // precision to FP16. + if (int8_no_calib && !has_qdq) { + LOG(WARNING) << "Set engine precision to FP16 due to missing QDQ OP"; + curr_engine.precision_mode = TrtPrecisionMode::FP16; + } else { + curr_engine.precision_mode = params.precision_mode; + } curr_engine.maximum_cached_engines = params.max_cached_engines; + curr_engine.allow_build_at_runtime = params.allow_build_at_runtime; + if (!curr_engine.max_batch_size.has_value()) { + curr_engine.max_batch_size = params.max_batch_size; + } status = RegisterGraphToFunctionLibrary(curr_engine.segment_graph_def, &graph, curr_engine.engine_name); if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef to the library " << t - << ": " << status; + LOG_WARNING_WITH_PREFIX + << "Failed to register segment graphdef to the library " << t << ": " + << status; continue; } - engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); - total_engine_bytes_size += engine_bytes_size.back(); - total_num_nodes_in_segments += curr_segment.size(); engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); @@ -703,56 +872,54 @@ Status ConvertAfterShapes(const ConversionParams& params) { } } - // Create a TRT node for each segment using its EngineInfo. 
- int old_cuda_device = 0; - auto err = cudaGetDevice(&old_cuda_device); - if (err != cudaSuccess) { - LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); + // Save the cuda device since we may need to switch to another cuda device to + // build static engines. + absl::optional old_cuda_device = absl::nullopt; + if (!params.is_dynamic_op) { + int cuda_device_id; + cudaError_t cuda_error = cudaGetDevice(&cuda_device_id); + if (cuda_error != cudaSuccess) { + LOG_WARNING_WITH_PREFIX << "Couldn't get current device: " + << cudaGetErrorString(cuda_error); + } else { + VLOG(1) << "Current cuda device is " << cuda_device_id; + old_cuda_device = cuda_device_id; + } } - VLOG(1) << "Current cuda device is " << old_cuda_device; + + auto restore_cuda_device = gtl::MakeCleanup([old_cuda_device] { + if (old_cuda_device.has_value()) { + cudaSetDevice(old_cuda_device.value()); + } + }); + std::vector engine_nodes; engine_nodes.resize(engine_segments.size()); for (int i = 0; i < engine_segments.size(); ++i) { auto& engine = engine_segments.at(i); - // Partition the workspace size by the average of node ratio and segment - // graphdef size - engine.max_workspace_size_bytes = - params.max_workspace_size_bytes * - (engine_bytes_size.at(i) / total_engine_bytes_size + - converted_segments.at(i).size() / total_num_nodes_in_segments) / - 2.0; + // TODO(b/170762693): implement the heuristic to calculate + // max_workspace_size_bytes. + engine.max_workspace_size_bytes = params.max_workspace_size_bytes; VLOG(1) << "Assigned " << engine.max_workspace_size_bytes << " bytes to " << engine.engine_name; - // The allocator is used to build the engine. The build and the built engine - // will be destroyed after we get the serialized engine string, so it's fine - // to use unique_ptr here. - std::unique_ptr alloc; - auto device_alloc = GetDeviceAndAllocator(params, engine); - int cuda_device_id = 0; - if (device_alloc.first >= 0) { - cuda_device_id = device_alloc.first; - alloc.reset(new TRTDeviceAllocator(device_alloc.second)); - } else { - // Setting allocator as nullptr should get revert to the cudamalloc - LOG(WARNING) << "Can't identify the cuda device. Running on device 0 "; - } - cudaSetDevice(cuda_device_id); auto status = CreateTRTNode(params, engine_segments, i, params.max_batch_size, &graph, - alloc.get(), &engine_nodes); + &engine_nodes, cluster); - string msg = - StrCat("TensorRT node ", engine.engine_name, " added for segment ", i, - " consisting of ", converted_segments.at(i).size(), " nodes"); + string msg = StrCat("segment ", i, " consisting of ", + converted_segments.at(i).nodes.size(), " nodes by ", + engine.engine_name); if (status.ok()) { - LOG(INFO) << msg << " succeeded."; + LOG(INFO) << "Replaced " << msg << "."; } else { // Graph is not modified. - LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF..."; + LOG_WARNING_WITH_PREFIX << "Cannot replace " << msg + << " reason: " << status.error_message() + << " (keeping original segment)."; } if (VLOG_IS_ON(1)) { msg = "Segment consists of nodes: "; - for (const Node* node : converted_segments.at(i)) { + for (const Node* node : converted_segments.at(i).nodes) { StrAppend(&msg, node->name(), ", "); } VLOG(1) << msg; @@ -761,14 +928,12 @@ Status ConvertAfterShapes(const ConversionParams& params) { // If status is ok, we successfully added the node to the graph and can // remove segment ops. Otherwise graph is not modified. 
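Removal of the replaced segment ops continues below. The save-and-restore of the active CUDA device above (cudaGetDevice plus a gtl::MakeCleanup that calls cudaSetDevice) can also be written as a small RAII guard; this is only an illustrative sketch with a hypothetical class name, using the CUDA runtime API directly:

#include <optional>

#include "third_party/gpus/cuda/include/cuda_runtime_api.h"

// Saves the current CUDA device on construction and restores it on
// destruction, but only if the initial query succeeded.
class CudaDeviceGuard {
 public:
  CudaDeviceGuard() {
    int device = -1;
    if (cudaGetDevice(&device) == cudaSuccess) saved_device_ = device;
  }
  ~CudaDeviceGuard() {
    if (saved_device_.has_value()) cudaSetDevice(*saved_device_);
  }

 private:
  std::optional<int> saved_device_;
};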
if (status.ok()) { - for (const Node* node : converted_segments.at(i)) { + for (const Node* node : converted_segments.at(i).nodes) { graph.RemoveNode(const_cast(node)); } } } - cudaSetDevice(old_cuda_device); - graph.ToGraphDef(params.output_graph_def); - VLOG(1) << "Returning from conversion"; + graph.ToGraphDef(output); return Status::OK(); } @@ -776,5 +941,4 @@ Status ConvertAfterShapes(const ConversionParams& params) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 9288829574e..0607fb85346 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,54 +18,53 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/grappler/clusters/cluster.h" -#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -struct ConversionParams { - const GraphDef* input_graph_def = nullptr; - const std::vector* output_names = nullptr; - size_t max_batch_size = 1; - size_t max_workspace_size_bytes = 1 << 30; - GraphDef* output_graph_def = nullptr; - TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; - int minimum_segment_size = 3; - const grappler::GraphProperties* graph_properties = nullptr; - const grappler::Cluster* cluster = nullptr; - // Whether to create engine on conversion or execution time - bool is_dyn_op = false; - // maximum number of cached engines - int max_cached_engines = 1; - bool use_calibration = true; -}; +// These functions are internal implementation functions for the +// TRTOptimizationPass. -// Method to call from optimization pass -Status ConvertAfterShapes(const ConversionParams& params); +// Performs segmentation and conversion on the given Grappler item. This method +// contains the core logic of the TRTOptimizationPass. +Status ConvertGraph(const TRTOptimizationPass::ConversionParams& params, + grappler::GrapplerItem& grappler_item, + const std::vector& input_output_names, + grappler::Cluster* cluster, GraphDef* output); // Helper method for the conversion, expose for testing. -std::pair GetDeviceAndAllocator(const ConversionParams& params, - const EngineInfo& engine); +std::pair GetDeviceAndAllocator( + const grappler::Cluster* cluster, const EngineInfo& engine); // Helper method that registers `segment_graph` as a function to the function // library in `graph`. Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, Graph* graph, const string& engine_name); +// Creates and serializes an ICudaEngine. Used only in is_dynamic_op=false, +// a.k.a. static engine mode. 
+Status CreateStaticEngine(const TRTOptimizationPass::ConversionParams& params, + const EngineInfo& info, int max_batch_size, + const std::vector& input_shapes, + TrtShapeOptimizationProfile* profile, + string* segment_string, grappler::Cluster* cluster); + } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 58fe39b08ba..ba74bd25528 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -15,12 +15,15 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" +#include // NOLINT + #include #include #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -32,24 +35,12 @@ limitations under the License. #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -// TODO(laigd): put this into some test utils file. -void ExpectStatus(Status status, error::Code code = error::OK, - const char* substr = nullptr) { - EXPECT_EQ(code, status.code()) - << status << " vs expected error code \"" << error::Code_Name(code) - << "\" and message \"" << substr << "\""; - if (substr) { - EXPECT_THAT(status.error_message(), ::testing::HasSubstr(substr)) << status; - } -} - class FakeCluster : public grappler::Cluster { public: FakeCluster() : Cluster(0) {} @@ -70,15 +61,15 @@ class FakeCluster : public grappler::Cluster { } private: - const DeviceSet* device_set_; + const DeviceSet* device_set_ = nullptr; }; -TEST(ConvertGraphTest, GetDeviceAndAllocator) { - ConversionParams params; +TEST(GetDeviceAndAllocatorTest, GetDeviceAndAllocator) { + TRTOptimizationPass::ConversionParams params; EngineInfo engine_info; { - // params.cluster is not set, and no gpu device is available. - auto result = GetDeviceAndAllocator(params, engine_info); + // cluster is not set, and no gpu device is available. + auto result = GetDeviceAndAllocator(nullptr, engine_info); EXPECT_EQ(-1, result.first); EXPECT_EQ(nullptr, result.second); } @@ -94,20 +85,19 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { std::unique_ptr session(NewSession(options)); { - // params.cluster is not set, should find and return first gpu id and + // cluster is not set, should find and return first gpu id and // corresponding allocator. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(nullptr, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_0_bfc", result.second->Name()); } FakeCluster cluster; - params.cluster = &cluster; { // params.cluster->GetDeviceSet() returns null, should find and return first // gpu id and corresponding allocator. 
- auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_0_bfc", result.second->Name()); @@ -124,7 +114,7 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { { // engine_info.device is not set, should find and return first gpu id and // corresponding allocator. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_0_bfc", result.second->Name()); @@ -133,7 +123,7 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { engine_info.device = "/GPU:1"; { // Set to use second device. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(0, result.first); EXPECT_NE(nullptr, result.second); EXPECT_EQ("GPU_1_bfc", result.second->Name()); @@ -142,15 +132,16 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { engine_info.device = "/GPU:3"; { // Set to use nonexistent device. - auto result = GetDeviceAndAllocator(params, engine_info); + auto result = GetDeviceAndAllocator(&cluster, engine_info); EXPECT_EQ(-1, result.first); EXPECT_EQ(nullptr, result.second); } } -class ConvertAfterShapesTest : public ::testing::Test { +class ConvertGraphTest : public ::testing::Test { public: - Status RunConvertAfterShape(Scope s, GraphDef* output_graph_def) { + Status RunConvertGraph(Scope s, GraphDef* output_graph_def, + int maximum_batch_size = 1000) { // Create GraphProperties. grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); @@ -158,21 +149,19 @@ class ConvertAfterShapesTest : public ::testing::Test { TF_EXPECT_OK(graph_properties.InferStatically(true)); // Construct ConversionParams. - const std::vector output_names{"output"}; - ConversionParams params; - params.input_graph_def = &item.graph; - params.output_names = &output_names; + const std::vector input_output_names{"output"}; + TRTOptimizationPass::ConversionParams params; + params.max_batch_size = maximum_batch_size; params.max_workspace_size_bytes = 8 << 20; - params.output_graph_def = output_graph_def; params.minimum_segment_size = 1; - params.graph_properties = &graph_properties; params.use_calibration = false; - - return ConvertAfterShapes(params); + params.trt_logger_name = "DefaultLogger"; + return ConvertGraph(params, item, input_output_names, nullptr, + output_graph_def); } }; -TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { +TEST_F(ConvertGraphTest, DirectlyConnectedEngines) { // Create the graph. There will be two TRTEngineOps after the conversion, and // the upstream TRTEngineOp will have two output connections from the same // node:port inside the op to the downstream TRTEngineOp. 
Then, if it adds the @@ -200,17 +189,24 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { ops::Identity(s.WithOpName("output"), add3); GraphDef output_graph_def; - TF_EXPECT_OK(RunConvertAfterShape(s, &output_graph_def)); + TF_EXPECT_OK(RunConvertGraph(s, &output_graph_def)); + auto remove_graph_sequence_number = [](std::string node_name) { + const std::regex pattern("TRTEngineOp_[0-9]+_"); + return std::regex_replace(node_name, pattern, "TRTEngineOp_"); + }; int num_trt_ops = 0; for (const NodeDef& node : output_graph_def.node()) { - if (node.name() == "TRTEngineOp_1") { + std::string node_name = node.name(); + if (node.op() != "TRTEngineOp") continue; + node_name = remove_graph_sequence_number(node_name); + if (node_name == "TRTEngineOp_001") { EXPECT_EQ(1, node.input_size()); EXPECT_EQ("input", node.input(0)); ++num_trt_ops; - } else if (node.name() == "TRTEngineOp_0") { + } else if (node_name == "TRTEngineOp_000") { EXPECT_EQ(2, node.input_size()); - EXPECT_EQ("TRTEngineOp_1", node.input(0)); + EXPECT_EQ("TRTEngineOp_001", remove_graph_sequence_number(node.input(0))); EXPECT_EQ("reshape2", node.input(1)); ++num_trt_ops; } @@ -222,5 +218,4 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 459136d3eef..22799c00888 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include +#include #include #include #include @@ -25,21 +26,37 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" +#include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/algorithm_selector.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" +#include "tensorflow/compiler/tf2tensorrt/convert/timing_cache.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/optimizers/constant_folding.h" +#include "tensorflow/core/grappler/optimizers/generic_layout_optimizer.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -49,11 +66,14 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tensor_coding.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/strided_slice_op.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" #include "third_party/tensorrt/NvInferPlugin.h" @@ -61,33 +81,97 @@ limitations under the License. // would work! #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2) -#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ - do { \ - return errors::Internal("TFTRT::", __FUNCTION__, ":", __LINE__, \ - " failed to add TRT layer, at: ", node); \ - } while (0) +#define TFTRT_CHECK_INPUT_SIZE(size, exp_size, node_def) \ + if ((size) != (exp_size)) { \ + TFTRT_ERROR(errors::InvalidArgument, node_def.op(), " got ", (size), \ + " inputs but expected ", (exp_size)); \ + } -#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \ - do { \ - if (ptr == nullptr) { \ - TFTRT_INTERNAL_ERROR_AT_NODE(node); \ - } \ - } while (0) +// Max kernel volume copied from TRT's limits. 
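+// For an argument x this evaluates to int64_t(pow(100000, x / 2)), e.g.
+// 100000 for x == 2 and roughly 3.16e7 for x == 3.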
+#define MAX_KERNEL_DIMS_PRODUCT(x) (int64_t(std::pow(100000.0F, (x)*0.5F))) namespace tensorflow { namespace tensorrt { namespace convert { -bool IsEngineInput(absl::string_view name) { - return absl::StartsWith(name, IONamePrefixes::kInputPHName); -} -bool IsEngineOutput(absl::string_view name) { - return absl::StartsWith(name, IONamePrefixes::kOutputPHName); -} - using absl::StrAppend; using absl::StrCat; +namespace { + +#define ADD_LAYER(layer_name) \ + case nvinfer1::LayerType::k##layer_name: \ + return #layer_name; + +const char* LayerTypeToString(nvinfer1::LayerType layer_type) { + switch (layer_type) { + ADD_LAYER(CONVOLUTION) + ADD_LAYER(FULLY_CONNECTED) + ADD_LAYER(ACTIVATION) + ADD_LAYER(POOLING) + ADD_LAYER(LRN) + ADD_LAYER(SCALE) + ADD_LAYER(SOFTMAX) + ADD_LAYER(DECONVOLUTION) + ADD_LAYER(CONCATENATION) + ADD_LAYER(ELEMENTWISE) + ADD_LAYER(PLUGIN) + ADD_LAYER(UNARY) + ADD_LAYER(PADDING) + ADD_LAYER(SHUFFLE) + ADD_LAYER(REDUCE) + ADD_LAYER(TOPK) + ADD_LAYER(GATHER) +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + ADD_LAYER(GRID_SAMPLE) +#endif + ADD_LAYER(MATRIX_MULTIPLY) + ADD_LAYER(RAGGED_SOFTMAX) + ADD_LAYER(CONSTANT) + ADD_LAYER(RNN_V2) + ADD_LAYER(IDENTITY) + ADD_LAYER(PLUGIN_V2) + ADD_LAYER(SLICE) + ADD_LAYER(SHAPE) + ADD_LAYER(PARAMETRIC_RELU) + ADD_LAYER(RESIZE) + ADD_LAYER(TRIP_LIMIT) + ADD_LAYER(RECURRENCE) + ADD_LAYER(ITERATOR) + ADD_LAYER(LOOP_OUTPUT) + ADD_LAYER(SELECT) + ADD_LAYER(FILL) +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + ADD_LAYER(QUANTIZE) + ADD_LAYER(DEQUANTIZE) +#endif +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + ADD_LAYER(CONDITION) + ADD_LAYER(CONDITIONAL_INPUT) + ADD_LAYER(CONDITIONAL_OUTPUT) + ADD_LAYER(SCATTER) + ADD_LAYER(EINSUM) + ADD_LAYER(ASSERTION) +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + ADD_LAYER(ONE_HOT) + ADD_LAYER(NON_ZERO) + ADD_LAYER(NMS) +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + ADD_LAYER(REVERSE_SEQUENCE) +#endif +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + // The TRT IRNNv2Layer has been deprecated in favor of the loop API. + ADD_LAYER(RNN) +#endif + default: + return "UNKNOWN_LAYER"; + } +} + +#undef ADD_LAYER + inline Status TfDataTypeToTrt(DataType tf_dtype, nvinfer1::DataType* trt_dtype) { switch (tf_dtype) { @@ -126,6 +210,62 @@ inline Status TrtDataTypeToTf(nvinfer1::DataType trt_dtype, return Status::OK(); } +// Sets the ILayer name in the form of +// /:. +void SetLayerNameHelper(nvinfer1::ILayer* layer, absl::string_view engine_name, + absl::string_view tf_name) { + const char* trt_name = LayerTypeToString(layer->getType()); + layer->setName( + absl::StrCat(engine_name, "/", tf_name, ":", trt_name).c_str()); +} + +// Returns a string in the form of . 
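+// For example, a sub_op_name of "shuffle" with sub_op_instance 2 yields
+// "shuffle_2"; when no instance is given, sub_op_name is returned unchanged.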
+std::string GetLayerNameSuffix(absl::string_view sub_op_name, + absl::optional sub_op_instance) { + std::string op_suffix(sub_op_name); + if (sub_op_instance.has_value()) { + op_suffix = + absl::StrCat(op_suffix, "_", std::to_string(sub_op_instance.value())); + } + return op_suffix; +} + +} // namespace + +bool IsEngineInput(absl::string_view name) { + return absl::StartsWith(name, IONamePrefixes::kInputPHName); +} +bool IsEngineOutput(absl::string_view name) { + return absl::StartsWith(name, IONamePrefixes::kOutputPHName); +} + +void GetOutputProperties(const grappler::GraphProperties& graph_properties, + const Node* node, const int out_port, + PartialTensorShape* shape, DataType* dtype) { + if (graph_properties.HasOutputProperties(node->name())) { + auto output_params = graph_properties.GetOutputProperties(node->name()); + auto out_shape = output_params.at(out_port); + *dtype = out_shape.dtype(); + *shape = out_shape.shape(); + } else { + LOG(INFO) << "Unknown output shape at node: " << node->name(); + *dtype = node->output_type(out_port); + } +} + +void GetInputProperties(const grappler::GraphProperties& graph_properties, + const Node* node, const int in_port, + PartialTensorShape* shape, DataType* dtype) { + if (graph_properties.HasInputProperties(node->name())) { + auto input_params = graph_properties.GetInputProperties(node->name()); + auto in_shape = input_params.at(in_port); + *dtype = in_shape.dtype(); + *shape = in_shape.shape(); + } else { + *dtype = node->input_type(in_port); + } +} + class TFAttrs { public: explicit TFAttrs(const NodeDef& tf_node) { @@ -220,71 +360,53 @@ Status TensorShapeArrayToTrtDims(const Container& shape, nvinfer1::Dims* out, return Status::OK(); } -// TODO(laigd): use this utility function in more places. -Status RemoveBatchDimension(nvinfer1::Dims* dims) { - if (dims->nbDims < 2) { - return errors::InvalidArgument( - "Dropping batch dimension requires dims with rank>=2."); - } - std::copy(dims->d + 1, dims->d + dims->nbDims, dims->d); - dims->nbDims--; - return Status::OK(); -} - -void GetOutputProperties(const grappler::GraphProperties& graph_properties, - const Node* node, const int out_port, - PartialTensorShape* shape, DataType* dtype) { - if (graph_properties.HasOutputProperties(node->name())) { - auto output_params = graph_properties.GetOutputProperties(node->name()); - auto out_shape = output_params.at(out_port); - *dtype = out_shape.dtype(); - *shape = out_shape.shape(); - } else { - LOG(INFO) << "Unknown output shape" << node->name(); - *dtype = node->output_type(out_port); - } -} - -void GetInputProperties(const grappler::GraphProperties& graph_properties, - const Node* node, const int in_port, - PartialTensorShape* shape, DataType* dtype) { - if (graph_properties.HasInputProperties(node->name())) { - auto input_params = graph_properties.GetInputProperties(node->name()); - auto in_shape = input_params.at(in_port); - *dtype = in_shape.dtype(); - *shape = in_shape.shape(); - } else { - *dtype = node->input_type(in_port); - } -} - +// This function checks if a tensor is compatible with TRT. +// +// We check that the shape and datatype are compatible with TensorRT. We also +// return the corresponding trt_dtype, the trt_dims and the batch_size (latter +// is only needed in implicit batch mode). +// +// The return status indicates wether the tensor is compatible. +// +// For implicit batch mode, when validation_only == false, we also check that +// all input dimensions (besides the batch dimension) are known dimensions. 
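+//
+// For example, in implicit batch mode a TF tensor of shape [8, 224, 224, 3]
+// yields trt_dims (224, 224, 3) with batch_size 8, while in explicit batch
+// mode the full shape, including any unknown (-1) dimensions, becomes
+// trt_dims.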
Status ValidateTensorProperties(const string& producer_node_type, const DataType dtype, const PartialTensorShape& shape, + const bool use_implicit_batch, bool validation_only, nvinfer1::DataType* trt_dtype, nvinfer1::Dims* trt_dims, int* batch_size) { // Convert data type. - TF_RETURN_IF_ERROR(TfDataTypeToTrt(dtype, trt_dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, trt_dtype)); // Convert shape. if (shape.dims() < 0) { return errors::InvalidArgument("Input tensor rank is unknown."); } - if (shape.dims() > nvinfer1::Dims::MAX_DIMS + 1) { // +1 for batch dim - return errors::OutOfRange("Input tensor rank is greater than ", - nvinfer1::Dims::MAX_DIMS + 1); + // Add 1 to maximum rank for implicit batch dim. + const int max_rank = nvinfer1::Dims::MAX_DIMS + (use_implicit_batch ? 1 : 0); + if (shape.dims() > max_rank) { + return errors::OutOfRange("Input tensor rank is greater than ", max_rank); } - if (producer_node_type != "Const" && shape.dims() < 1) { + if (use_implicit_batch && (producer_node_type != "Const") && + (shape.dims() < 1)) { return errors::InvalidArgument( "Scalar input tensor is not supported since the first dimension " "is treated as batch dimension by TRT"); } - *trt_dims = TensorShapeToTrtDims(shape, /*ignore_first_dim=*/true); - *batch_size = shape.dim_size(0); + ::stream_executor::port::StatusOr dims = + DimsAdapter::Create(shape, use_implicit_batch); + TRT_ENSURE_OK(dims); + *trt_dims = dims.ValueOrDie().AsTrtDims(); + // Get batch size for tensor if it will not be included the shape. + if (use_implicit_batch) { + *batch_size = shape.dim_size(0); + } // Don't convert empty tensors (dim value of 0). - for (int d = 1; d < shape.dims(); ++d) { + const int first_trt_dim = use_implicit_batch ? 1 : 0; + for (int d = first_trt_dim; d < shape.dims(); ++d) { if (shape.dim_size(d) == 0) { return errors::Unimplemented( "Input tensor with shape ", shape.DebugString(), @@ -293,69 +415,24 @@ Status ValidateTensorProperties(const string& producer_node_type, } if (validation_only) return Status::OK(); - // Following are validations at runtime. - for (int d = 1; d < shape.dims(); ++d) { - if (shape.dim_size(d) < 0) { - return errors::InvalidArgument( - "Input tensor with shape ", shape.DebugString(), - " has an unknown non-batch dimension at dim ", d); + // Following checks are only used during TRT engine creation time. 
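+  // Unknown non-batch dimensions are only rejected here in implicit batch
+  // mode; in explicit batch mode they are allowed and handled later by the
+  // shape optimization profiles.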
+ if (use_implicit_batch) { + for (int d = first_trt_dim; d < shape.dims(); ++d) { + if (shape.dim_size(d) < 0) { + return errors::InvalidArgument( + "Input tensor with shape ", shape.DebugString(), + " has an unknown non-batch dimension at dim ", d); + } } } return Status::OK(); } -string DebugString(const nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return "kFLOAT"; - case nvinfer1::DataType::kHALF: - return "kHALF"; - case nvinfer1::DataType::kINT8: - return "kINT8"; - case nvinfer1::DataType::kINT32: - return "kINT32"; - default: - return "Invalid TRT data type"; - } -} - -string DebugString(const nvinfer1::Dims& dims) { - string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d="); - for (int i = 0; i < dims.nbDims; ++i) { - StrAppend(&out, dims.d[i]); - StrAppend(&out, ","); - } - StrAppend(&out, ")"); - return out; -} - -string DebugString(const nvinfer1::Permutation& permutation, int len) { - string out = "nvinfer1::Permutation("; - for (int i = 0; i < len; ++i) { - StrAppend(&out, permutation.order[i], ","); - } - StrAppend(&out, ")"); - return out; -} - -string DebugString(const ITensorProxyPtr& tensor) { - return StrCat("nvinfer1::ITensor(@", reinterpret_cast(tensor->trt_tensor()), - ", name=", tensor->trt_tensor()->getName(), - ", dtype=", DebugString(tensor->trt_tensor()->getType()), - ", dims=", DebugString(tensor->trt_tensor()->getDimensions()), ")"); -} - -string DebugString(const nvinfer1::ITensor& tensor) { - return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", name=", tensor.getName(), - ", dtype=", DebugString(tensor.getType()), - ", dims=", DebugString(tensor.getDimensions()), ")"); -} - Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, const bool check_feasibility, + const bool use_implicit_batch, nvinfer1::Dims* operand_l_new_dims, nvinfer1::Dims* operand_r_new_dims) { // TensorRT Elementwise op supports broadcast but requires both tensor to be @@ -382,19 +459,24 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, // -> W: 1 1 1 1 3 5 1 // *************************************************************************** if (!operand_l.is_tensor() && !operand_r.is_tensor()) { + // TODO(lsugy): remove this check in dynamic shapes mode. This should work + // if both inputs are weights. 
return errors::InvalidArgument( "Broadcasting requires at least one of the operands be tensors"); } - const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; - auto compute_output_dims = [](const TRT_TensorOrWeights& input, - int broadcast_num_dims, int* output_dims_array, - nvinfer1::Dims* output_dims) { + constexpr int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; + auto compute_output_dims = + [use_implicit_batch](const TRT_TensorOrWeights& input, + int broadcast_num_dims, + std::array* output_dims_array, + nvinfer1::Dims* output_dims) -> Status { const nvinfer1::Dims input_dims = input.GetTrtDims(); - std::fill(output_dims_array, output_dims_array + max_nb_dims, 1); - std::copy(input_dims.d, input_dims.d + input_dims.nbDims, - output_dims_array + broadcast_num_dims - input_dims.nbDims); - if (input.is_tensor()) { + absl::c_fill(*output_dims_array, 1); + absl::c_copy( + DimsAdapter(input_dims), + output_dims_array->begin() + broadcast_num_dims - input_dims.nbDims); + if (use_implicit_batch && input.is_tensor()) { const int true_input_dims = input_dims.nbDims + 1; if (true_input_dims < broadcast_num_dims) { return errors::InvalidArgument( @@ -404,28 +486,44 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, } // Set the batch dimension to -1, since batch size is not supposed to // be broadcasted. - output_dims_array[0] = -1; + (*output_dims_array)[0] = -1; } - // Copy to output dimensions (stripping the batch dimension). - output_dims->nbDims = broadcast_num_dims - 1; - std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims, - output_dims->d); + // Copy to output dimensions + auto offt = use_implicit_batch ? 1 : 0; + output_dims->nbDims = broadcast_num_dims - offt; + absl::c_copy( + absl::MakeSpan(*output_dims_array).subspan(offt, broadcast_num_dims), + output_dims->d); return Status::OK(); }; // Compute the output dimensions. const int broadcast_num_dims = - std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0), - operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 1 : 0)); - int output_l[max_nb_dims], output_r[max_nb_dims]; + std::max(operand_l.GetTrtDims().nbDims + + (use_implicit_batch && operand_l.is_tensor()), + operand_r.GetTrtDims().nbDims + + (use_implicit_batch && operand_r.is_tensor())); + std::array output_l, output_r; TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims, - output_l, operand_l_new_dims)); + &output_l, operand_l_new_dims)); TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims, - output_r, operand_r_new_dims)); + &output_r, operand_r_new_dims)); // Compare broadcast feasibility if (check_feasibility) { for (int i = 0; i < broadcast_num_dims; ++i) { + if (!use_implicit_batch && (output_l[i] == -1 || output_r[i] == -1)) { + // If the condition is true then we are in explicit batch mode and (at + // least) one of the input dimensions are unknown. In other words we + // are in dynamic shape mode. During conversion time we only see -1 for + // the unknown shapes, therefore we cannot decide on the feasibility of + // broadcast over the unknown dimensions. Therefore we just continue for + // the next dimension. In dynamic shape mode TRT can only check the + // feasibility of the broadcast when the actual input dimensions are + // specified by SetTrtEngineInputs and the inference job is launched by + // TrtEnque. 
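+        // For example, dims (-1, 3) vs. (5, 3) pass this check during
+        // conversion; an incompatible runtime shape such as (4, 3) vs. (5, 3)
+        // is only reported when the engine is executed.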
+ continue; + } if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { return errors::InvalidArgument("Infeasible broadcast scheme (", @@ -439,32 +537,141 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, return Status::OK(); } -ITensorProxyPtr Converter::CreateConstantLayer( - const TRT_ShapedWeights& weights, const nvinfer1::Dims& dims) { +// Prepares a dynamic shape tensor for broadcast by adding leading 1 dimensions. +Status DynamicBroadcast(ITensorProxyPtr operand, + const OpConverterParams* params, + ITensorProxyPtr* output, int broadcasted_nbDims, + absl::optional op_instance) { + int operand_nbDims = operand->getDimensions().nbDims; + if (broadcasted_nbDims > operand_nbDims) { + if (params->validation_only) return Status::OK(); + int n_extra_dims = broadcasted_nbDims - operand_nbDims; + VLOG(2) << "Dynamic broadcast adding " << n_extra_dims << " leading 1s"; + TF_RETURN_IF_ERROR(params->converter->DynamicReshape( + /*input=*/operand, + /*slices=*/{std::make_pair(0, operand_nbDims)}, + /*params=*/params, + /*output=*/output, + /*size_for_added_dims*/ {n_extra_dims}, + /*op_instance=*/op_instance)); + } else { + *output = operand; + } + return Status::OK(); +} + +Status BroadcastWeights(std::unique_ptr& p, + const DimsAdapter& broadcasted_dims) { + if (!p->is_weights()) return errors::Internal("Weight input expected"); + if (p->GetTrtDims().nbDims != broadcasted_dims.NumDims()) { + TRT_ShapedWeights weights(p->weights()); + TF_RETURN_IF_ERROR(weights.SetShape(broadcasted_dims)); + p = std::make_unique(weights); + } + return Status::OK(); +} + +Status ApplyBroadcast(std::unique_ptr& operand, + const DimsAdapter& broadcasted_dims, + const OpConverterParams* params, + absl::optional op_instance) { + if (operand->is_weights()) { + TF_RETURN_IF_ERROR(BroadcastWeights(operand, broadcasted_dims)); + } else { + ITensorProxyPtr tensor = nullptr; + auto is_static_shuffle_compatible = [](const auto& dims) { + return absl::c_count(dims, -1) <= 1; + }; + if (is_static_shuffle_compatible(broadcasted_dims)) { + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, *operand, broadcasted_dims, + params->validation_only, &tensor, params->node_def)); + } else { + TF_RETURN_IF_ERROR(DynamicBroadcast( + /*operand=*/operand->tensor(), + /*params=*/params, + /*output=*/&tensor, + /*broadcasted_nbDims*/ broadcasted_dims.NumDims(), + /*op_instance=*/op_instance)); + } + operand = std::make_unique(tensor); + } + return Status::OK(); +} + +// Inserts leading 1 dimensions so that both operands have the same rank. +// Note: In implicit batch mode, weights' shape can include an explicit 1 batch +// dimension. The broadcasted shape might loose this leading batch dim, because +// the broadcasted shape does not include the implicit batch dim. +// TODO(tfeher): Other code blocks that use GetTrtBroadcastShape need to be +// fixed to use this routine to handle dynamic inputs. Eventually, +// GetTrtBroadcastShape should only be used by this routine. 
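+//
+// For example, in explicit batch mode operands with dims (2, 3, 4) and (4)
+// are broadcast by padding the second operand to (1, 1, 4) so that both have
+// rank 3.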
+Status BroadcastTensors(std::unique_ptr& operand_l, + std::unique_ptr& operand_r, + bool check_feasibility, + const OpConverterParams* params) { + nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; + TF_RETURN_IF_ERROR(GetTrtBroadcastShape( + *operand_l, *operand_r, check_feasibility, params->use_implicit_batch, + &broadcasted_dims_l, &broadcasted_dims_r)); + + if (params->validation_only) return Status::OK(); + + TF_RETURN_IF_ERROR(ApplyBroadcast( + /*operand=*/operand_l, + /*broadcasted_dims=*/broadcasted_dims_l, + /*params=*/params, + /*op_instance=*/0)); + + TF_RETURN_IF_ERROR(ApplyBroadcast( + /*operand=*/operand_r, + /*broadcasted_dims=*/broadcasted_dims_r, + /*params=*/params, + /*op_instance=*/1)); + + return Status::OK(); +} + +ITensorProxyPtr Converter::CreateConstantLayer(const TRT_ShapedWeights& weights, + const nvinfer1::Dims& dims) { nvinfer1::Weights trt_weights = weights.GetTrtWeights(); nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights); if (!layer) return nullptr; + SetLayerName(layer, "_tftrt_constant_", + std::to_string(next_constant_layer_id_)); + next_constant_layer_id_++; ITensorProxyPtr trt_tensor = layer->getOutput(0); -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set - // the data type below, it will always be kFLOAT regardless what the data type - // of the weights is. Once NVIDIA fixes this bug, we should remove the data - // type setting logic below and test should still pass. - trt_tensor->setType(trt_weights.type); -#endif return trt_tensor; } -Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value, +// Creates a scalar constant and fills with value. +template +Status CreateScalarConstant( + const OpConverterParams* params, T value, ITensorProxyPtr* tensor, + nvinfer1::DataType trt_type = nvinfer1::DataType::kINT32, + const nvinfer1::Dims& dims = {1, {1}}) { + ::stream_executor::port::StatusOr weights = + params->weight_store->GetTempWeights(trt_type, dims); + TRT_ENSURE_OK(weights); + TF_RETURN_IF_ERROR(weights.ValueOrDie().SetValues(value)); + *tensor = params->converter->CreateConstantLayer(weights.ValueOrDie(), dims); + TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name()); + return Status::OK(); +} + +// Creates a constant with the same rank as dims, where each dimension has +// size = 1. +Status CreateBroadcastableScalarConstant(const OpConverterParams* params, + float value, const nvinfer1::Dims& dims, ITensorProxyPtr* tensor, const char* dtype_attr_name = "T") { - nvinfer1::DataType trt_dtype = - nvinfer1::DataType::kFLOAT; // Default to FP32. - TFAttrs attrs(params->node_def); - if (attrs.count(dtype_attr_name)) { - DataType dtype = attrs.get(dtype_attr_name); - TF_RETURN_IF_ERROR(TfDataTypeToTrt(dtype, &trt_dtype)); + nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT; // Default to FP32. + AttrSlice attrs(params->node_def); + if (attrs.Find(dtype_attr_name) != nullptr) { + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, dtype_attr_name, &dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(dtype, &trt_type)); } // In order to be broadcastable, the number of dims has to match. 
@@ -472,24 +679,29 @@ Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value, for (int i = 0; i < broadcastable_dims.nbDims; i++) { broadcastable_dims.d[i] = 1; } - TRT_ShapedWeights weights = - params->weight_store->GetTempWeights(trt_dtype, broadcastable_dims); - void* raw_ptr = weights.GetValues(); - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - static_cast(raw_ptr)[0] = value; - break; - case nvinfer1::DataType::kHALF: - static_cast(raw_ptr)[0] = Eigen::half(value); - break; - default: - return errors::InvalidArgument("Unsupported data type ", - DebugString(trt_dtype)); + return CreateScalarConstant(params, value, tensor, trt_type, + broadcastable_dims); +} + +// The function concatenates tensors on the first axis. This can be used to +// create a shape tensor from individual dimension sizes. +::stream_executor::port::StatusOr ConcatenateTensors( + const OpConverterParams* params, + const std::vector input_tensors, + absl::optional op_instance = absl::nullopt) { + std::vector trt_input_tensors; + for (const auto& t : input_tensors) { + trt_input_tensors.push_back(t->trt_tensor()); } - *tensor = params->converter->CreateConstantLayer(weights, broadcastable_dims); - TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name()); - params->converter->ProvideQuantizationRange(tensor, value, value); - return Status::OK(); + nvinfer1::IConcatenationLayer* layer = + params->converter->network()->addConcatenation( + static_cast(trt_input_tensors.data()), + input_tensors.size()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.op()); + params->converter->SetLayerName(layer, params->node_def.name(), + "concat_shapes", op_instance); + layer->setAxis(0); + return ITensorProxyPtr(layer->getOutput(0)); } // Convert an axis from TF format to TRT format while validating. TF format @@ -509,27 +721,13 @@ Status ConvertAxis(int tf_axis, int trt_nb_dims, absl::string_view node_name, // Don't allow axis to be the batch dimension. if (use_implicit_batch && tf_axis == 0) { return errors::Unimplemented( - "TensorRT does not allow manipulation of the batch dimension, at ", - node_name); + "TensorRT does not allow manipulation of the batch dimension"); } // Remove batch dimension if it is implicit. *trt_axis = use_implicit_batch ? tf_axis - 1 : tf_axis; return Status::OK(); } -inline bool DimsEqual(const nvinfer1::Dims& dim_l, - const nvinfer1::Dims& dim_r) { - if (dim_l.nbDims != dim_r.nbDims) { - return false; - } - for (int i = 0; i < dim_l.nbDims; i++) { - if (dim_l.d[i] != dim_r.d[i]) { - return false; - } - } - return true; -} - bool AllLengthsEqual(const std::vector>& inputs) { if (inputs.size() == 0) return true; int length = inputs.at(0).size(); @@ -539,69 +737,21 @@ bool AllLengthsEqual(const std::vector>& inputs) { return true; } -inline nvinfer1::Dims GetTrtDimsForTensor(const Tensor& tensor) { - nvinfer1::Dims dims; - dims.nbDims = tensor.dims(); - for (int i = 0; i < dims.nbDims; i++) { - dims.d[i] = tensor.dim_size(i); - } - return dims; -} - -inline bool HasStaticShape(const nvinfer1::Dims& dims) { - if (dims.nbDims < 0) return false; - for (int d = 0; d < dims.nbDims; ++d) { - if (dims.d[d] < 0) return false; - } - return true; -} - -int64_t Prod(const nvinfer1::Dims& dims) { - int64_t count = 1; - for (int d = 0; d < dims.nbDims; ++d) { - count *= dims.d[d]; - } - return count; -} - -// Returns total number of elements in a TensorRT weights dimensions. 
-// Returning 0 means either some dim is 0 or the number of dims is 0 (TensorRT -// doesn't allow scalar weights). -// Note that for TF scalar constant, we always convert to dims [1]. -int64_t TrtWeightDimsNumElements(const nvinfer1::Dims& dims) { - if (dims.nbDims == 0) return 0; - return Prod(dims); -} - -// Returns total number of elements in an ITensor dimension. -// Returns 1 if the number of dims is 0 (the total number is fully determined by -// the batch size). -// Returns -1 if any dimension is known. -int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims) { - if (!HasStaticShape(dims)) return -1; - return Prod(dims); -} - -bool DimsHaveSameSize(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs, - bool is_tensor) { - if (is_tensor) { - return TrtTensorDimsNumElements(lhs) == TrtTensorDimsNumElements(rhs); - } - return TrtWeightDimsNumElements(lhs) == TrtWeightDimsNumElements(rhs); +bool DimsHaveSameSize(const DimsAdapter& lhs, const DimsAdapter& rhs) { + return lhs.Volume() == rhs.Volume(); } // Returns whether both dimensions are fully specified and the total number of // elements equals. -bool AreDimsStaticWithSameSize(const nvinfer1::Dims& lhs, - const nvinfer1::Dims& rhs, bool is_tensor) { - if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false; - return DimsHaveSameSize(lhs, rhs, is_tensor); +bool AreDimsStaticWithSameSize(const DimsAdapter& lhs, const DimsAdapter& rhs) { + if (!lhs.IsStatic() || !rhs.IsStatic()) return false; + return DimsHaveSameSize(lhs, rhs); } -bool AreDimsStaticWithDifferentSize(const nvinfer1::Dims& lhs, - const nvinfer1::Dims& rhs, bool is_tensor) { - if (!HasStaticShape(lhs) || !HasStaticShape(rhs)) return false; - return !DimsHaveSameSize(lhs, rhs, is_tensor); +bool AreDimsStaticWithDifferentSize(const DimsAdapter& lhs, + const DimsAdapter& rhs) { + if (!lhs.IsStatic() || !rhs.IsStatic()) return false; + return !DimsHaveSameSize(lhs, rhs); } static std::vector> CreateSamePadding( @@ -653,6 +803,8 @@ Status VerifyShapesMatch(absl::Span inputs, "Received inputs with inconsistent rank, at ", node_name); } for (size_t j = 0; j < dims_0.nbDims; ++j) { + // Dynamic dimensions will be verified at runtime. 
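+      // For example, dims (-1, 28) and (4, 28) are accepted here; a genuine
+      // mismatch in the first dimension only surfaces when the engine runs.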
+ if (dim_i.d[j] == -1 || dims_0.d[j] == -1) continue; if (dim_i.d[j] != dims_0.d[j] && j != masked_dim) { return errors::InvalidArgument( "Received inputs with inconsistent shape, at ", node_name); @@ -662,115 +814,6 @@ Status VerifyShapesMatch(absl::Span inputs, return Status::OK(); } -TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type) : type_(type) { - shape_.nbDims = 0; -} - -TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type, - nvinfer1::Dims dims, Tensor tensor) - : shape_(dims), type_(type), tensor_(tensor) {} - -TRT_ShapedWeights::TRT_ShapedWeights(const TRT_ShapedWeights& rhs) - : shape_(rhs.shape_), type_(rhs.type_), tensor_(rhs.tensor_) {} - -int64_t TRT_ShapedWeights::count() const { - return TrtWeightDimsNumElements(shape_); -} - -nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const { - return nvinfer1::Weights{type_, GetValues(), count()}; -} - -size_t TRT_ShapedWeights::size_bytes() const { - size_t data_type_size = -1; - switch (type_) { - case nvinfer1::DataType::kFLOAT: - case nvinfer1::DataType::kINT32: - data_type_size = 4; - break; - case nvinfer1::DataType::kHALF: - data_type_size = 2; - break; - case nvinfer1::DataType::kINT8: - data_type_size = 1; - break; - } - return this->count() * data_type_size; -} - -string TRT_ShapedWeights::DebugString() const { - return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", convert::DebugString(type_), - ", values=", reinterpret_cast(GetValues()), ")"); -} - -TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor) - : tensor_proxy_ptr_(tensor), initialized_(true), is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size) - : tensor_proxy_ptr_(tensor), - batch_size_(batch_size), - initialized_(true), - is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor, - int batch_size) - : tensor_proxy_ptr_(tensor), - batch_size_(batch_size), - initialized_(true), - is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, - const nvinfer1::Dims& trt_dims, - int batch_size) - : tensor_proxy_ptr_(new SimpleITensor(trt_dtype, trt_dims)), - batch_size_(batch_size), - initialized_(true), - is_tensor_(true) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights) - : weights_(weights), initialized_(true), is_tensor_(false) {} - -TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs) - : tensor_proxy_ptr_(rhs.tensor_proxy_ptr_), - batch_size_(rhs.batch_size_), - weights_(rhs.weights_), - initialized_(rhs.initialized_), - is_tensor_(rhs.is_tensor_) {} - -void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) { - tensor_proxy_ptr_ = rhs.tensor_proxy_ptr_; - batch_size_ = rhs.batch_size_; - weights_ = rhs.weights_; - initialized_ = rhs.initialized_; - is_tensor_ = rhs.is_tensor_; -} - -ITensorProxyPtr TRT_TensorOrWeights::tensor() const { - CHECK(is_tensor()); - return tensor_proxy_ptr_; -} - -nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { - if (is_tensor()) { - return tensor()->getDimensions(); - } else { - return weights().shape_; - } -} - -string TRT_TensorOrWeights::DebugString() const { - string output = "TRT_TensorOrWeights(type="; - if (is_tensor()) { - StrAppend(&output, "tensor=", convert::DebugString(tensor()), - ", batch_size=", batch_size_); - } else { - StrAppend(&output, "weights=", weights_.DebugString()); - } - StrAppend(&output, ")"); - return output; -} - // Perform 5 dimensional reorder of 
data on CPU // This is done once at convert time and does not affect GPU inference perf // Example: reorder NDHWC (Tensorflow) -> NCDHW (TensorRT) @@ -830,22 +873,21 @@ void Reorder2(const nvinfer1::DimsHW& shape, const T* idata, // TODO(jie): fallback to tensorflow!! void ReorderCKtoKC(const TRT_ShapedWeights& iweights, TRT_ShapedWeights* oweights) { - const int c = iweights.shape_.d[0]; - const int k = iweights.shape_.d[1]; - oweights->shape_.d[0] = k; - oweights->shape_.d[1] = c; + const int c = iweights.Shape().dim(0); + const int k = iweights.Shape().dim(1); + oweights->Shape().dim(0) = k; + oweights->Shape().dim(1) = c; const nvinfer1::DimsHW istrides = {1, k}; const nvinfer1::DimsHW ostrides = {c, 1}; switch (iweights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), ostrides); + Reorder2({k, c}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } case nvinfer1::DataType::kHALF: { - Reorder2({k, c}, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), - ostrides); + Reorder2({k, c}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } default: @@ -860,31 +902,30 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); // K indexes over output channels, C over input channels, and R and S over the // height and width of the convolution - const int r = iweights.shape_.d[0]; - const int s = iweights.shape_.d[1]; + const int r = iweights.Shape().dim(0); + const int s = iweights.Shape().dim(1); // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G - const int c = iweights.shape_.d[2] / num_groups; - const int k = iweights.shape_.d[3] * num_groups; - VLOG(2) << "num_groups: " << num_groups << "c" << iweights.shape_.d[2] - << " then " << c << "k" << iweights.shape_.d[3] << " then " << k - << "r" << iweights.shape_.d[0] << " then " << r << "s" - << iweights.shape_.d[1] << " then " << s; - oweights->shape_.d[0] = k / num_groups; - oweights->shape_.d[1] = c * num_groups; - oweights->shape_.d[2] = r; - oweights->shape_.d[3] = s; + const int c = iweights.Shape().dim(2) / num_groups; + const int k = iweights.Shape().dim(3) * num_groups; + VLOG(2) << "num_groups: " << num_groups << "c" << iweights.Shape().dim(2) + << " then " << c << "k" << iweights.Shape().dim(3) << " then " << k + << "r" << iweights.Shape().dim(0) << " then " << r << "s" + << iweights.Shape().dim(1) << " then " << s; + oweights->Shape().dim(0) = k / num_groups; + oweights->Shape().dim(1) = c * num_groups; + oweights->Shape().dim(2) = r; + oweights->Shape().dim(3) = s; const nvinfer1::Dims4 istrides = {1, k, s * k * c, c * k}; const nvinfer1::Dims4 ostrides = {c * r * s, r * s, s, 1}; switch (iweights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - Reorder4({k, c, r, s}, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), ostrides); + Reorder4({k, c, r, s}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } case nvinfer1::DataType::kHALF: { - Reorder4({k, c, r, s}, - static_cast(iweights.GetValues()), istrides, - static_cast(oweights->GetValues()), ostrides); + Reorder4({k, c, r, s}, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } @@ -909,22 +950,22 @@ void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); // K indexes over 
output channels, C over input channels, and R, S, D over the // height, width, depth - const int d = iweights.shape_.d[0]; - const int r = iweights.shape_.d[1]; - const int s = iweights.shape_.d[2]; + const int d = iweights.Shape().dim(0); + const int r = iweights.Shape().dim(1); + const int s = iweights.Shape().dim(2); // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G - const int c = iweights.shape_.d[3] / num_groups; - const int k = iweights.shape_.d[4] * num_groups; + const int c = iweights.Shape().dim(3) / num_groups; + const int k = iweights.Shape().dim(4) * num_groups; - VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.shape_.d[3] - << " becomes " << c << ", k: " << iweights.shape_.d[4] << " becomes " - << k << ", d: " << d << ", r: " << r << ", s: " << s; + VLOG(2) << "num_groups: " << num_groups << ", c: " << iweights.Shape().dim(3) + << " becomes " << c << ", k: " << iweights.Shape().dim(4) + << " becomes " << k << ", d: " << d << ", r: " << r << ", s: " << s; - oweights->shape_.d[0] = iweights.shape_.d[4]; // k / num_groups; - oweights->shape_.d[1] = iweights.shape_.d[3]; // c * num_groups; - oweights->shape_.d[2] = d; - oweights->shape_.d[3] = r; - oweights->shape_.d[4] = s; + oweights->Shape().dim(0) = iweights.Shape().dim(4); // k / num_groups; + oweights->Shape().dim(1) = iweights.Shape().dim(3); // c * num_groups; + oweights->Shape().dim(2) = d; + oweights->Shape().dim(3) = r; + oweights->Shape().dim(4) = s; nvinfer1::Dims shape = InitDimsN({k, c, d, r, s}); // KCDRS shape (same as output) @@ -939,14 +980,13 @@ void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, switch (iweights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - Reorder5(shape, static_cast(iweights.GetValues()), istrides, - static_cast(oweights->GetValues()), ostrides); + Reorder5(shape, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } case nvinfer1::DataType::kHALF: { - Reorder5(shape, static_cast(iweights.GetValues()), - istrides, static_cast(oweights->GetValues()), - ostrides); + Reorder5(shape, iweights.GetPointer(), istrides, + oweights->GetPointer(), ostrides); break; } default: @@ -955,31 +995,20 @@ void ReorderDRSCKToKCDRS(const TRT_ShapedWeights& iweights, } } -TRT_ShapedWeights TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype, - const nvinfer1::Dims& dims) { - TensorShape shape; - DataType tf_dtype; - // TODO(laigd): make it return a status. - TF_CHECK_OK(TensorShapeUtils::MakeShape(dims.d, dims.nbDims, &shape)); - TF_CHECK_OK(TrtDataTypeToTf(trt_dtype, &tf_dtype)); - // TODO(jie): check weights size_bytes. 
0 means type error - Tensor tensor(tf_dtype, shape); - TRT_ShapedWeights weights(trt_dtype, dims, tensor); - store_.emplace_back(std::move(tensor)); - return weights; -} - OpConverterParams::OpConverterParams( const NodeDef& node_def, const std::vector& inputs, std::vector* outputs, TrtWeightStore* weight_store, - TrtPrecisionMode precision_mode, bool use_calibration) + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision) : node_def(node_def), inputs(inputs), outputs(outputs), validation_only(true), weight_store(weight_store), precision_mode(precision_mode), - use_calibration(use_calibration) {} + use_calibration(use_calibration), + use_implicit_batch(use_implicit_batch), + use_explicit_precision(use_explicit_precision) {} OpConverterParams::OpConverterParams( Converter* converter, const NodeDef& node_def, @@ -992,31 +1021,44 @@ OpConverterParams::OpConverterParams( validation_only(false), weight_store(weight_store), precision_mode(converter->precision_mode()), - use_calibration(converter->use_calibration()) {} - -const std::set* TrtNodeValidator::quantize_ops = new std::set{ - "QuantizeAndDequantizeV2", - "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", - "FakeQuantWithMinMaxArgs", -}; + use_calibration(converter->use_calibration()), + use_implicit_batch(converter->use_implicit_batch()), + use_explicit_precision(converter->UseExplicitPrecision()) {} TrtNodeValidator::TrtNodeValidator( const grappler::GraphProperties& graph_properties, - TrtPrecisionMode precision_mode, bool use_calibration) + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision) : graph_properties_(graph_properties), precision_mode_(precision_mode), - use_calibration_(use_calibration) { - RegisterOpValidators(); + use_calibration_(use_calibration), + use_implicit_batch_(use_implicit_batch), + use_explicit_precision_(use_explicit_precision) {} + +::stream_executor::port::StatusOr TrtNodeValidator::GetValidator( + const std::string& op) { + return GetOpConverterRegistry()->LookUp(op); } Status TrtNodeValidator::ConvertToTensorOrWeights( const NodeDef& node_def, int output_port, TRT_TensorOrWeights* tensor_or_weights) { - if (node_def.op() == "Const") { - if (output_port != 0) { - return errors::InvalidArgument("Const node should only have one output."); + // Treat handles separately. + if (node_def.op() == "VarHandleOp" || node_def.op() == "Placeholder") { + AttrSlice attrs(node_def); + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dtype", &dtype)); + if (dtype == DataType::DT_RESOURCE) { + // The converter doesn't use the input resource at the validation stage + // (it gets the dtype and shape from attributes). A fake resource can be + // used. + ResourceHandle fake_resource; + *tensor_or_weights = TRT_TensorOrWeights(fake_resource); + return Status::OK(); } + } + + if (node_def.op() == "Const" || node_def.op() == "VariableV2") { // The output of the conversion will be used as input to other nodes to // determine whether TRT supports those nodes. If it cannot convert the // Const, it's very likely we cannot treat it as a tensor and make it an @@ -1024,9 +1066,22 @@ Status TrtNodeValidator::ConvertToTensorOrWeights( // treats it as batch size. Also, it's not likely that the converter can // support the op, and performance may suffer even if it can, so we just // simply return error if the conversion fails. 
+ if (output_port != 0) { + return errors::InvalidArgument(node_def.op(), + " node should only have one output."); + } std::vector inputs; return ConvertConstToWeights(node_def, inputs, tensor_or_weights); } + if (node_def.op() == "ReadVariableOp") { + // Similar treatment to Const and VariableV2, but we provide a fake + // resource input to the converter. + const std::vector inputs{ + TRT_TensorOrWeights(ResourceHandle())}; + + // Convert the variable to weights. + return ConvertConstToWeights(node_def, inputs, tensor_or_weights); + } if (!graph_properties_.HasOutputProperties(node_def.name())) { return errors::InvalidArgument("Shape and data type are unknown"); } @@ -1041,8 +1096,8 @@ Status TrtNodeValidator::ConvertToTensorOrWeights( nvinfer1::Dims trt_dims; int batch_size = -1; TF_RETURN_IF_ERROR(ValidateTensorProperties( - node_def.op(), dtype, shape, /*validation_only_=*/true, &trt_dtype, - &trt_dims, &batch_size)); + node_def.op(), dtype, shape, use_implicit_batch_, + /*validation_only_=*/true, &trt_dtype, &trt_dims, &batch_size)); // Adds a fake ITensor. This is fine since op converter operates in // validation-only mode and it won't (and shouldn't) use the tensor to do @@ -1057,11 +1112,12 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { // these ops to the relevant tensors. This happens regardless of the value of // use_calibration. bool is_supported_op = false; - if (quantize_ops->count(op)) { + if (absl::c_find(kQuantizationOpNames, op) != kQuantizationOpNames.end()) { is_supported_op = (precision_mode_ == TrtPrecisionMode::INT8); } else { - is_supported_op = op_validators_.count(op); + is_supported_op = GetValidator(op).ok(); } + if (!is_supported_op) { return errors::Unimplemented("Op type ", op, " is not supported."); } @@ -1072,22 +1128,35 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { std::vector input_edges; TF_RETURN_IF_ERROR(node->input_edges(&input_edges)); for (const Edge* edge : input_edges) { + // Go up the chain of Identity nodes. 
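+    // For example, an input that reaches this node as Const -> Identity ->
+    // Identity is validated against the original Const definition.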
+ Node* src_node = edge->src(); + while (src_node->def().op() == "Identity") { + std::vector input_edges_temp; + TF_RETURN_IF_ERROR(src_node->input_edges(&input_edges_temp)); + src_node = input_edges_temp[0]->src(); + } + const NodeDef& src_def = src_node->def(); + TRT_TensorOrWeights tensor_or_weights; - const NodeDef& src_def = edge->src()->def(); Status status = ConvertToTensorOrWeights(src_def, edge->src_output(), &tensor_or_weights); if (!status.ok()) { + VLOG(2) << "Failed to convert input `" << src_def.name() << "` to a " + << "TRT_TensorOrWeights: " << status.error_message(); + return errors::Internal( - "Failed to convert input ", src_def.name(), - " to a TRT_TensorOrWeights: ", status.error_message()); + "Failed to convert at least one input to a TRT_TensorOrWeights: ", + status.error_message()); } inputs.push_back(tensor_or_weights); } - OpConverter validator = op_validators_[op]; + auto validator = GetValidator(op); + TF_RETURN_IF_ERROR(validator.status()); OpConverterParams params(node->def(), inputs, /*arg_outputs=*/nullptr, - &weight_store_, precision_mode_, use_calibration_); - return validator(¶ms); + &weight_store_, precision_mode_, use_calibration_, + use_implicit_batch_, use_explicit_precision_); + return validator.ValueOrDie()(¶ms); } Status TrtNodeValidator::ConvertConstToWeights( @@ -1096,71 +1165,83 @@ Status TrtNodeValidator::ConvertConstToWeights( TRT_TensorOrWeights* output) { std::vector outputs; OpConverterParams params(const_node_def, inputs, &outputs, &weight_store_, - precision_mode_, use_calibration_); - Status status = op_validators_["Const"](¶ms); - if (status.ok() && output) *output = outputs[0]; + precision_mode_, use_calibration_, + use_implicit_batch_, use_explicit_precision_); + auto const_val = GetValidator(const_node_def.op()); + TF_RETURN_IF_ERROR(const_val.status()); + Status status = const_val.ValueOrDie()(¶ms); + if (status.ok() && (output != nullptr)) { + *output = outputs[0]; + } return status; } -static void InitializeTrtPlugins() { - static mutex plugin_mutex(LINKER_INITIALIZED); - static bool plugin_initialized = false; - static Logger trt_logger; - mutex_lock lock(plugin_mutex); - if (plugin_initialized) return; - - plugin_initialized = initLibNvInferPlugins(&trt_logger, ""); - if (!plugin_initialized) { - LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " - "fail later."; - } - - int num_trt_plugins = 0; - nvinfer1::IPluginCreator* const* trt_plugin_creator_list = - getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); - if (!trt_plugin_creator_list) { - LOG(WARNING) << "Can not find any TensorRT plugins in registry."; - } else { - VLOG(1) << "Found the following " << num_trt_plugins - << " TensorRT plugins in registry:"; - for (int i = 0; i < num_trt_plugins; ++i) { - if (!trt_plugin_creator_list[i]) { - LOG(WARNING) << "TensorRT plugin at index " << i - << " is not accessible (null pointer returned by " - "getPluginCreatorList for this plugin)"; - } else { - VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); - } - } - } -} - -Converter::Converter(nvinfer1::INetworkDefinition* trt_network, - TrtPrecisionMode precision_mode, bool use_calibration) - : trt_network_(trt_network), +// static +::stream_executor::port::StatusOr> Converter::Create( + TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx) { + std::unique_ptr converter = 
absl::WrapUnique(new Converter( + precision_mode, use_calibration, trt_logger, use_implicit_batch, + engine_name, use_explicit_precision, ctx)); + TF_RETURN_IF_ERROR(converter->Init(trt_logger)); + return converter; +} + +Converter::Converter(TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, + const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx) + : ctx_(ctx), precision_mode_(precision_mode), - use_calibration_(use_calibration) { - InitializeTrtPlugins(); - this->RegisterOpConverters(); + use_calibration_(use_calibration), + use_implicit_batch_(use_implicit_batch), + engine_name_(engine_name), + use_explicit_precision_(use_explicit_precision) { + MaybeInitializeTrtPlugins(trt_logger); +} + +Status Converter::Init(nvinfer1::ILogger* trt_logger) { + VLOG(1) << "Creating TensorRT builder"; + trt_builder_.reset(nvinfer1::createInferBuilder(*trt_logger)); + + VLOG(1) << "Creating TensorRT network"; + uint32_t flags = + use_implicit_batch_ + ? 0U + : (1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + if (use_explicit_precision_) { + flags |= + (1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_PRECISION)); + } + trt_network_.reset(trt_builder_->createNetworkV2(flags)); + if (!trt_network_) { + return errors::Internal("Failed to create TensorRT network object"); + } + return Status::OK(); } Status Converter::ConvertNode(const NodeDef& node_def) { - std::vector inputs, outputs; + std::vector inputs; + std::vector outputs; TF_RETURN_IF_ERROR(this->GetInputs(node_def, &inputs)); OpConverterParams params(this, node_def, inputs, &outputs, &weight_store_); const string& op = node_def.op(); - auto itr = op_registry_.find(op); - if (itr == op_registry_.end()) { - return errors::Unimplemented("No converter registered for op: ", op); - } - OpConverter op_converter = itr->second; - TF_RETURN_IF_ERROR(op_converter(¶ms)); + auto op_converter = GetOpConverterRegistry()->LookUp(op); + TF_RETURN_IF_ERROR(op_converter.status()); + TF_RETURN_IF_ERROR(op_converter.ValueOrDie()(¶ms)); for (size_t i = 0; i < outputs.size(); ++i) { TRT_TensorOrWeights& output = outputs[i]; string output_name = node_def.name(); - if (i != 0) absl::StrAppend(&output_name, ":", i); + if (i != 0) { + StrAppend(&output_name, ":", i); + } // We need to check the name before setting it. If the input is one of the // engine input, setting the name here will overwrite engine input // bindings which will cause runtime error. @@ -1182,9 +1263,9 @@ Status Converter::ConvertNode(const NodeDef& node_def) { << output.DebugString(); Status status = AddTensorOrWeights(output_name, output); if (!status.ok()) { - return Status(status.code(), - StrCat("Failed to add output for node ", node_def.name(), - ": ", status.error_message())); + return errors::InvalidArgument( + StrCat("Failed to add output for node: ", node_def.name(), ": ", + status.error_message())); } } return Status::OK(); @@ -1195,10 +1276,13 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, // We verify the batch size only for the input nodes, and rely on individual // op converter to ensure the batch size of the outputs is not changed. // TODO(laigd): we need to test this properties. 
- Status status = MaybeUpdateBatchSize(batch_size); - if (!status.ok()) { - return Status(status.code(), StrCat("Batch size doesn't match for tensor ", - name, ": ", status.error_message())); + Status status; + if (use_implicit_batch_) { + status = MaybeUpdateBatchSize(batch_size); + if (!status.ok()) { + return Status(status.code(), + batch_size_error(name, status.error_message())); + } } ITensorProxyPtr tensor = network()->addInput(name.c_str(), dtype, dims); if (*tensor == nullptr) { @@ -1213,8 +1297,19 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, return Status::OK(); } +Status Converter::AddInputResource(const string& name, + const ResourceHandle& resource) { + Status status = AddTensorOrWeights(name, TRT_TensorOrWeights(resource)); + if (!status.ok()) { + return Status(status.code(), StrCat("Failed to add input resource ", name, + ": ", status.error_message())); + } + return Status::OK(); +} + Status Converter::RenameAndMarkOutputTensors( const std::vector& output_tensors) { + int output_index = 0; for (const auto& output : output_tensors) { TRT_TensorOrWeights tensor_or_weights; TF_RETURN_IF_ERROR( @@ -1240,22 +1335,248 @@ Status Converter::RenameAndMarkOutputTensors( // in ConvertIdentity. if (IsEngineInput(tensor->getName()) || IsEngineOutput(tensor->getName())) { // Using shuffle layer for identity by not setting reshape or transpose. - nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor->trt_tensor()); + nvinfer1::IShuffleLayer* layer = + network()->addShuffle(*tensor->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR( layer, StrCat("Output Copy for ", tensor->getName())); - ITensorProxyPtr output_tensor = layer->getOutput(0); - MarkQuantizationRangesAsInferrable(&tensor, &output_tensor); - tensor = output_tensor; + SetLayerName(layer, tensor->getName(), "shuffle", output_index); + tensor = layer->getOutput(0); } tensor->setName(output.dest_node_name.c_str()); network()->markOutput(*tensor->trt_tensor()); // Set type after marking as output. TRT only supports setType for engine // outputs and inputs (type is inferred otherwise). tensor->setType(output.trt_dtype); + output_index++; VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name << " with data type " << DebugString(output.trt_dtype) << ", which feeds TF node " << output.dest_node_name; } + if (VLOG_IS_ON(2)) { + VLOG(2) << "Created TensorRT network with the following layers:"; + for (int i = 0; i < network()->getNbLayers(); i++) { + auto layer = network()->getLayer(i); + VLOG(2) << " " << layer->getName() << " (" + << "type: " << static_cast(layer->getType()) + << ", precision: " << static_cast(layer->getPrecision()) + << ")"; + } + } + return Status::OK(); +} + +// Returns the value of TF_TRT_ABORT_CUDA_ENGINE_BUILD environment variable. +// This variable can be used to abort CUDA engine construction, therefore it +// provides a way to test and debug the native segment fallback of TF-TRT. 
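+// For example, running with TF_TRT_ABORT_CUDA_ENGINE_BUILD=1 makes
+// BuildCudaEngine return an Aborted status, so execution falls back to the
+// native TF segment.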
+bool AbortCudaEngineBuild() { + bool value; + Status status = ReadBoolFromEnvVar("TF_TRT_ABORT_CUDA_ENGINE_BUILD", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; +} + +Status Converter::BuildCudaEngine( + TrtUniquePtrType* engine, int max_batch_size, + size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, TrtShapeOptimizationProfile* profiles) { + if (AbortCudaEngineBuild()) { + return errors::Aborted( + "Engine creation aborted by TF_TRT_ABORT_CUDA_ENGINE_BUILD variable"); + } + + VLOG(1) << "Configuring TensorRT builder"; + trt_builder_->setMaxBatchSize(max_batch_size); + trt_builder_->setGpuAllocator(allocator); + + // Create a network configuration and use it to build a TRT engine. + TrtUniquePtrType builder_config( + trt_builder_->createBuilderConfig()); + builder_config->setMaxWorkspaceSize(max_workspace_size_bytes); + + // Create the algorithm selector. For TensorRT 7.x, the algorithm selector + // cannot be used when building with INT8 calibration. + std::unique_ptr trt_algorithm_selector{nullptr}; + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + if (!use_calibration_ || precision_mode_ != TrtPrecisionMode::INT8) { + trt_algorithm_selector = MaybeCreateAlgorithmSelector(); + } + } else { + trt_algorithm_selector = MaybeCreateAlgorithmSelector(); + } + + if (trt_algorithm_selector != nullptr) { + builder_config->setAlgorithmSelector(trt_algorithm_selector.get()); + } + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + enum class SparseComputeMode { DISABLED, ENABLED, SIMULATED }; + + static SparseComputeMode sparse_compute_mode = []() { + SparseComputeMode _sparse_compute_mode; + int64 _sparse_mode; + /*TF_TRT_SPARSE_MODE environment variable controls if sparse compute is + enabled. It also allows to simulate the performance benefits of training a + model with sparse compute in mind. + Possible Values: + - 1 [Default]: Sparse compute is enabled if the model was trained with + sparse weights. Otherwise it has no effect. + - < 1: Sparse compute is explicitly disabled regardless on how the model was + trained. + - > 1: Sparse compute is forced. This mode is only to be used for + benchmarking or debugging purpose. This feature artificially introduces a + sparse weight pattern compatible with Sparse TensorCores introduced in + NVIDIA Ampere GPU architecture. As a side effect, it will completely corrupt + the numerical values of the computation. 
Therefore shall only be used to + evaluate the benefit of using sparse computation for inference.*/ + TF_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_TRT_SPARSE_MODE", + /*default_val=*/1, + &_sparse_mode)); + + string sparse_log_msg = "[TF-TRT] Sparse compute capability: "; + if (_sparse_mode == 1) { + sparse_log_msg = StrCat(sparse_log_msg, "enabled."); + _sparse_compute_mode = SparseComputeMode::ENABLED; + } else if (_sparse_mode < 1) { + sparse_log_msg = StrCat(sparse_log_msg, "disabled."); + _sparse_compute_mode = SparseComputeMode::DISABLED; + } else { + sparse_log_msg = StrCat( + sparse_log_msg, "simulated.", + "It shall only be used for sparse computing benchmark and debug."); + _sparse_compute_mode = SparseComputeMode::SIMULATED; + } + LOG(INFO) << sparse_log_msg; + + return _sparse_compute_mode; + }(); + + if (sparse_compute_mode == SparseComputeMode::ENABLED || + sparse_compute_mode == SparseComputeMode::SIMULATED) { + builder_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + } +#endif + + if (tensorflow::tensor_float_32_execution_enabled()) { + builder_config->setFlag(nvinfer1::BuilderFlag::kTF32); + } else { + builder_config->clearFlag(nvinfer1::BuilderFlag::kTF32); + } + + if (precision_mode_ == TrtPrecisionMode::FP16) { + builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (precision_mode_ == TrtPrecisionMode::INT8) { + // FP16 is not available in Explicit Precision mode with TensorRT 7. + if (IS_TRT_VERSION_GE(8, 0, 0, 0) || !use_explicit_precision_) { + builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else { + LOG_WARNING_WITH_PREFIX << "With explicit precision mode, FP16 is not " + "allowed before TensorRT 8. TRT will consider " + "INT8 and FP32 tactics."; + } + builder_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } + if (!use_implicit_batch_ && profiles) { + TF_RETURN_IF_ERROR(profiles->ConfigureBuilder( + trt_builder_.get(), builder_config.get(), network())); + } + if (precision_mode_ == TrtPrecisionMode::INT8) { + builder_config->setInt8Calibrator(use_calibration_ ? calibrator : nullptr); + } + + std::unique_ptr timing_cache = nullptr; + // We only use a timing cache if the algorithm selector is not used. If we + // are using TRT version >= 8.0, then we can try to deserialize an existing + // cache. + if (trt_algorithm_selector == nullptr) { +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + TimingCacheRegistry* registry = GetTimingCacheRegistry(); + + auto cache = registry->LookUp("default_cache", builder_config.get()); + if (!cache.ok()) { + LOG(WARNING) << "failed to create a timing cache: " + << cache.status().error_message(); + } else { + timing_cache = std::move(cache.ValueOrDie()); + builder_config->setTimingCache(*timing_cache, /*ignoreMismatch*/ false); + } +#endif // IS_TRT_VERSION_GE(8, 0, 0, 0) + } else { + // Disabling the timing cache is recommended when using the algorithm + // selector. 
+ builder_config->setFlag(nvinfer1::BuilderFlag::kDISABLE_TIMING_CACHE); + } + + string precision_mode_str; + TF_RETURN_IF_ERROR( + TrtPrecisionModeToName(precision_mode_, &precision_mode_str)); + string trt_network_name = StrCat( + "TF:", TF_VERSION_STRING, ", ", + "TRT:", absl::StrJoin(GetLoadedTensorRTVersion(), "."), "-", + "Precision:", precision_mode_str, ", ", "Calibration:", use_calibration_, + ", ", "Max-Batch-Size:", max_batch_size, ", ", + "Max-Workspace-Size:", max_workspace_size_bytes); + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + trt_network_name = StrCat(trt_network_name, ", Sparse Compute: "); + + switch (sparse_compute_mode) { + case SparseComputeMode::SIMULATED: + trt_network_name = StrCat(trt_network_name, "Simulated"); + break; + case SparseComputeMode::ENABLED: + trt_network_name = StrCat(trt_network_name, "Enabled"); + break; + case SparseComputeMode::DISABLED: + trt_network_name = StrCat(trt_network_name, "Disabled"); + break; + } +#endif + + VLOG(1) << "Setting TensorRT network name to " << trt_network_name; + network()->setName(trt_network_name.c_str()); + + VLOG(1) << "Building TensorRT engine"; + if (VLOG_IS_ON(2)) { + VLOG(2) << "Network inputs"; + int n_inputs = network()->getNbInputs(); + for (int i = 0; i < n_inputs; i++) { + const ITensorProxyPtr input = network()->getInput(i); + if (*input) { + VLOG(2) << " " << i << " " << input->getName(); + } else { + VLOG(2) << "Could not find input " << i; + } + } + } + engine->reset( + trt_builder_->buildEngineWithConfig(*network(), *builder_config)); + if (engine->get() == nullptr) { + return errors::Internal("Failed to build TensorRT engine"); + } + if (VLOG_IS_ON(2)) { + VLOG(2) << "TRT engine created"; + int nbBindings = (*engine)->getNbBindings(); + VLOG(2) << "Number of engine bindings: " << nbBindings; + for (int i = 0; i < nbBindings; i++) { + auto get_location_string = [&engine](int i) { + if ((*engine)->getLocation(i) == nvinfer1::TensorLocation::kDEVICE) + return " on device"; + else + return " on host"; + }; + VLOG(2) << "Binding " << i << " name: " << (*engine)->getBindingName(i) + << get_location_string(i); + } + } + + // Write back the new timing cache results to the registry. + if (timing_cache) { + GetTimingCacheRegistry()->Upsert("default_cache", timing_cache.get()); + } + return Status::OK(); } @@ -1280,7 +1601,9 @@ Status Converter::AddTensorOrWeights(const string& name, // We rely on the individual op converter to understand the semantics of the // TF node, and make sure it doesn't change the batch size nor introduce // intra-element dependency inside the batch. - if (input.is_tensor()) input.set_batch_size(batch_size_); + if (use_implicit_batch_ && input.is_tensor()) { + input.set_batch_size(batch_size_); + } if (trt_tensors_.insert({name, std::move(input)}).second) return Status::OK(); return errors::AlreadyExists("tensor/weights ", name, " already exist."); } @@ -1297,26 +1620,34 @@ Status Converter::GetTensorOrWeights(const string& name, Status Converter::TransposeTensor(ITensorProxyPtr input_tensor, const std::vector& order_with_batch_dim, - ITensorProxyPtr* output_tensor) { + ITensorProxyPtr* output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name) { const auto dims = input_tensor->getDimensions(); - - if (order_with_batch_dim.size() - 1 != size_t(dims.nbDims)) { + const int order_size = use_implicit_batch_ ? 
order_with_batch_dim.size() - 1 + : order_with_batch_dim.size(); + if (order_size != size_t(dims.nbDims)) { return errors::InvalidArgument( "Rank of perm for transpose does not match with that of the input."); } - if (order_with_batch_dim[0] != 0) { + if (use_implicit_batch_ && order_with_batch_dim[0] != 0) { return errors::Unimplemented( "Transpose at batch dimension is not supported."); } - nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor->trt_tensor()); + nvinfer1::IShuffleLayer* layer = + this->network()->addShuffle(*input_tensor->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose"); - ITensorProxyPtr shuffle_tensor = layer->getOutput(0); - MarkQuantizationRangesAsInferrable(&input_tensor, &shuffle_tensor); + SetLayerName(layer, node_def, sub_op_name); nvinfer1::Permutation permutation; - for (int32_t i = 0; i < dims.nbDims; ++i) { - permutation.order[i] = order_with_batch_dim[i + 1] - 1; + if (use_implicit_batch_) { + for (int32_t i = 0; i < dims.nbDims; ++i) { + permutation.order[i] = order_with_batch_dim[i + 1] - 1; + } + } else { + std::copy(order_with_batch_dim.begin(), order_with_batch_dim.end(), + permutation.order); } VLOG(1) << "TransposeTensor permutation: " << DebugString(permutation, dims.nbDims); @@ -1337,21 +1668,21 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, float* out_max) const { switch (weights.TrtDType()) { case nvinfer1::DataType::kFLOAT: { - auto inp = static_cast(weights.GetValues()); + auto inp = weights.GetPointer(); auto result = std::minmax_element(inp, inp + weights.count()); *out_min = *result.first; *out_max = *result.second; break; } case nvinfer1::DataType::kHALF: { - auto inp = static_cast(weights.GetValues()); + auto inp = weights.GetPointer(); auto result = std::minmax_element(inp, inp + weights.count()); - *out_min = Eigen::half_impl::half_to_float(*result.first); - *out_max = Eigen::half_impl::half_to_float(*result.second); + *out_min = static_cast(*result.first); + *out_max = static_cast(*result.second); break; } case nvinfer1::DataType::kINT32: { - auto inp = static_cast(weights.GetValues()); + auto inp = weights.GetPointer(); auto result = std::minmax_element(inp, inp + weights.count()); *out_min = static_cast(*result.first); *out_max = static_cast(*result.second); @@ -1365,84 +1696,106 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, return Status::OK(); } -Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, - const nvinfer1::Dims& dims, - const bool validation_only, - ITensorProxyPtr* tensor) { - const nvinfer1::Dims input_dims = input.GetTrtDims(); - // If one of input_dims and dims doesn't have static shape, it means some of - // the dims are unknown or need to be inferred. And we don't do further checks - // but rely on the caller to not make mistakes. - // Otherwise we do simple check to make sure the total sizes are the same. +// Constructs for the ILayer name as +// __ and callSetLayerNameHelper +// to set the name for the ILayer. +// +// If the operation represented by the ILayer is generated by the converter to +// support the conversion of node_def, callers need to specify a non-empty +// sub_op_name to be appended to the name of node_def to avoid layer name +// conflicts. If the operation is generated multiple times, callers also need +// to specify sub_op_instance to be appended to the name of the layers to avoid +// layer name conflicts. 
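// --- Editorial sketch, not part of the patch ---------------------------------
// The comment above describes how ILayer names are assembled from the engine
// name, the TF node name, and an optional sub-op suffix. GetLayerNameSuffix and
// SetLayerNameHelper are defined elsewhere in this patch; the helper below is
// only a guess at the suffix-building step, shown to make the naming scheme
// concrete, e.g. ("shuffle", 2) -> "shuffle_2" and ("shuffle", nullopt) ->
// "shuffle".
#include <string>
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"

std::string GetLayerNameSuffixSketch(absl::string_view sub_op_name,
                                     absl::optional<int> sub_op_instance) {
  if (!sub_op_instance.has_value()) return std::string(sub_op_name);
  return absl::StrCat(sub_op_name, "_", sub_op_instance.value());
}
// --- End editorial sketch -----------------------------------------------------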
+void Converter::SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name, + absl::optional sub_op_instance, + absl::optional origin_node_name) { + std::string sub_op_suffix = GetLayerNameSuffix(sub_op_name, sub_op_instance); + if (sub_op_suffix.empty()) { + SetLayerNameHelper(layer, engine_name_, node_def.name()); + } else if (origin_node_name.has_value()) { + auto layer_name = absl::StrCat(node_def.name(), "-", + absl::string_view(origin_node_name.value()), + "-", sub_op_suffix); + SetLayerNameHelper(layer, engine_name_, layer_name); + } else { + SetLayerNameHelper(layer, engine_name_, + absl::StrCat(node_def.name(), "-", sub_op_suffix)); + } +} + +// Constructs for the ILayer name as +// __ and callSetLayerNameHelper to +// set the name for the ILayer. +void Converter::SetLayerName(nvinfer1::ILayer* layer, + absl::string_view main_op_name, + absl::string_view sub_op_name, + absl::optional sub_op_instance) { + std::string layer_name_suffix = + GetLayerNameSuffix(sub_op_name, sub_op_instance); + SetLayerNameHelper(layer, engine_name_, + absl::StrCat(main_op_name, "-", layer_name_suffix)); +} + +// Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims' +// (which doesn't contain the batch dimension). +// +// If validation_only is true, it doesn't do the conversion but only do some +// minimum validation for the eligibility of the conversion, and *tensor will +// be set to nullptr. +Status PrepareTensorForShape(Converter* converter, + const TRT_TensorOrWeights& input, + const DimsAdapter& dims, + const bool validation_only, + ITensorProxyPtr* tensor, const NodeDef& node_def, + absl::optional op_instance, + absl::optional origin_node_name) { + DimsAdapter input_dims(input.GetTrtDims()); + // The input shape may have -1s for dynamic shape. The target shape may have + // 0s representing copy over the corresponding input dimensions. It may also + // have at most one -1 representing a dimension value that needs to be + // inferred. If none of those special values present, we verify that the total + // sizes of the input and output shape are the same. + // TODO(tfeher): Verify that the total sizes of the input and output shape are + // the same in the present of 0s but no -1 in the target shape. // If an input is a weight, it is going to become a tensor via // CreateConstantLayer. So we can treat it as a tensor for // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors. - if (AreDimsStaticWithDifferentSize(input_dims, dims, /*is_tensor=*/true)) { + if (dims.Volume() > 0 && AreDimsStaticWithDifferentSize(input_dims, dims)) { return errors::InvalidArgument( - "Incompatible shapes: ", DebugString(input_dims), " vs. ", - DebugString(dims)); + "Incompatible shapes: ", input_dims.DebugString(), " vs. ", + dims.DebugString()); } // ConstantLayer requires static shapes (cannot infer -1). 
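// --- Editorial sketch, not part of the patch ---------------------------------
// The comment at the top of PrepareTensorForShape above allows a target shape
// with 0s (copy the corresponding input dimension) and at most one -1 (inferred
// from the remaining volume). The standalone helper below is only an example of
// that rule for static shapes; the converter itself leaves the actual
// resolution to TensorRT's shuffle layer.
#include <cstdint>
#include <vector>

std::vector<int64_t> ResolveReshapeDimsSketch(const std::vector<int64_t>& input,
                                              std::vector<int64_t> target) {
  int64_t known_volume = 1;
  int infer_index = -1;
  for (int i = 0; i < static_cast<int>(target.size()); ++i) {
    if (target[i] == 0) target[i] = input[i];  // 0 copies the input dimension.
    if (target[i] == -1) {
      infer_index = i;  // At most one -1 may be inferred.
      continue;
    }
    known_volume *= target[i];
  }
  if (infer_index >= 0) {
    int64_t input_volume = 1;
    for (int64_t d : input) input_volume *= d;
    target[infer_index] = input_volume / known_volume;
  }
  return target;
}
// For example, input {4, 6, 8} with target {0, -1, 2} resolves to {4, 24, 2}.
// --- End editorial sketch -----------------------------------------------------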
- if (input.is_weights() && !HasStaticShape(dims)) { + if (input.is_weights() && !dims.IsStatic()) { return errors::InvalidArgument("Shape is not fully defined: ", - DebugString(dims)); + dims.DebugString()); } if (validation_only) { *tensor = nullptr; return Status::OK(); } + TFTRT_RETURN_ERROR_IF_NULLPTR(converter, "converter is nullptr"); if (input.is_tensor()) { - if (DimsEqual(input_dims, dims)) { + if (input_dims == dims) { *tensor = input.tensor(); } else { nvinfer1::IShuffleLayer* layer = - this->network()->addShuffle(*input.tensor()->trt_tensor()); + converter->network()->addShuffle(*input.tensor()->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); - layer->setReshapeDimensions(dims); - ITensorProxyPtr input_tensor = input.tensor(); - ITensorProxyPtr output_tensor = layer->getOutput(0); - this->MarkQuantizationRangesAsInferrable(&input_tensor, - &output_tensor); - *tensor = output_tensor; + converter->SetLayerName(layer, node_def, "shuffle", op_instance, + origin_node_name); + layer->setReshapeDimensions(dims.AsTrtDims()); + *tensor = layer->getOutput(0); } } else { - *tensor = CreateConstantLayer(input.weights(), dims); + *tensor = converter->CreateConstantLayer(input.weights(), dims.AsTrtDims()); TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape"); - if (precision_mode() == TrtPrecisionMode::INT8 && !use_calibration()) { - // If we are in int8 mode and not calibrating, we need to explicitly set a - // quantization range for the output tensor of the IConstantLayer. Here we - // set the range to [min(weights), max(weights)]. - float min_range = 0.0f; - float max_range = 0.0f; - TF_RETURN_IF_ERROR( - GetWeightRange(input.weights(), &min_range, &max_range)); - // Avoid setting range to 0 because TRT will throw an error. If the - // weights are zero then the range doesn't matter: using 127.0f should - // ensure the quantized weight will be exactly zero. 
- if (min_range == 0.0f && max_range == 0.0f) { - min_range = -127.0f; - max_range = 127.0f; - } - ProvideQuantizationRange(tensor, min_range, max_range); - } } return Status::OK(); } -void Converter::MarkQuantizationRangesAsInferrable(ITensorProxyPtr* input, - ITensorProxyPtr* output) { - if ((*input)->is_trt_tensor()) { - quantization_infer_.push_back( - {(*input)->trt_tensor(), (*output)->trt_tensor()}); - quantization_infer_.push_back( - {(*output)->trt_tensor(), (*input)->trt_tensor()}); - } else if ((*input)->is_simple_tensor()) { - quantization_infer_proxy_.push_back({input, output}); - quantization_infer_proxy_.push_back({output, input}); - } -} - void Converter::ProvideQuantizationRange(ITensorProxyPtr* tensor, float min_range, float max_range) { float symmetric_range = std::max(std::abs(min_range), std::abs(max_range)); @@ -1453,48 +1806,10 @@ void Converter::ProvideQuantizationRange(ITensorProxyPtr* tensor, } } -namespace { - -bool IsConvolution(const nvinfer1::ILayer* layer) { - return layer->getType() == nvinfer1::LayerType::kCONVOLUTION; -} - -bool IsScale(const nvinfer1::ILayer* layer) { - return layer->getType() == nvinfer1::LayerType::kSCALE; -} - -bool IsClipOrRelu(const nvinfer1::ILayer* layer) { - if (layer->getType() != nvinfer1::LayerType::kACTIVATION) { - return false; - } - auto activation_type = static_cast(layer) - ->getActivationType(); -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - return activation_type == nvinfer1::ActivationType::kRELU || - activation_type == nvinfer1::ActivationType::kCLIP; -#else - return activation_type == nvinfer1::ActivationType::kRELU; -#endif -} - -bool IsAdd(const nvinfer1::ILayer* layer) { - if (layer->getType() != nvinfer1::LayerType::kELEMENTWISE) { - return false; - } - auto operation = - static_cast(layer)->getOperation(); - return operation == nvinfer1::ElementWiseOperation::kSUM; -} - -} // namespace - void Converter::MaybeApplyQuantizationRanges() { if (precision_mode() != TrtPrecisionMode::INT8) return; - // Infer ranges across marked ops. - PropagateQuantizationRanges(); // Apply ranges. -#if IS_TRT_VERSION_GE(5, 0, 0, 0) for (auto pair : quantization_ranges_) { nvinfer1::ITensor* tensor = pair.first; const float range = pair.second; @@ -1511,206 +1826,6 @@ void Converter::MaybeApplyQuantizationRanges() { // 'range', it should report error. tensor->setDynamicRange(-range, range); } -#endif - - if (use_calibration()) return; -#if !IS_TRT_VERSION_GE(6, 0, 0, 0) - // Attempt to find tensors that are missing ranges, and set the corresponding - // layer's precision to FP16 to avoid Builder::buildCudaEngine() failing. - // This is only needed for TensorRT 5 and before because - // TensorRT6 falls to FP16 internally. - // TensorRT doesn't need ranges for intermediate tensors when layers are fused - // so find fused layers first. - // Get all tensors from network and deduce fused ops. 
- std::map> layer_consumers; - std::map tensor_layer; - std::set all_tensors; - for (int i = 0; i < this->network()->getNbLayers(); i++) { - nvinfer1::ILayer* layer = this->network()->getLayer(i); - layer_consumers[layer] = {}; - for (int j = 0; j < layer->getNbInputs(); j++) { - ITensorProxyPtr input_tensor = layer->getInput(j); - all_tensors.insert(&input_tensor); - } - for (int j = 0; j < layer->getNbOutputs(); j++) { - ITensorProxyPtr output_tensor = layer->getOutput(j); - tensor_layer[&output_tensor] = layer; - all_tensors.insert(&output_tensor); - } - } - for (int i = 0; i < this->network()->getNbLayers(); i++) { - nvinfer1::ILayer* layer = this->network()->getLayer(i); - layer_consumers[layer] = {}; - for (int j = 0; j < layer->getNbInputs(); j++) { - ITensorProxyPtr input_tensor = layer->getInput(j); - auto input_layer = tensor_layer.find(&input_tensor); - if (input_layer != tensor_layer.end()) { - auto consumed_layer = layer_consumers.find(input_layer->second); - if (consumed_layer != layer_consumers.end()) { - consumed_layer->second.push_back(layer); - } - } - all_tensors.insert(&input_tensor); - } - } - // Identify fused tensors. - // Conv+BiasAdd+Add+Activation(Clip or Relu), Conv+BiasAdd+Add, - // Conv+BiasAdd+Activation(Clip or Relu), Conv+BiasAdd, - // Conv+Activation(Clip or Relu) are fused. - std::set fused_tensors; - typedef std::function matcher; - const std::vector>> fused_patterns = { - {"Fused Conv+Bias+Add+Activation", - { - IsConvolution, - IsScale, - IsAdd, - IsClipOrRelu, - }}, - {"Fused Conv+Bias+Add", - { - IsConvolution, - IsScale, - IsAdd, - }}, - {"Fused Conv+Bias+Activation", - { - IsConvolution, - IsScale, - IsClipOrRelu, - }}, - {"Fused Conv+Bias", - { - IsConvolution, - IsScale, - }}, - {"Fused Conv+Activation", - { - IsConvolution, - IsClipOrRelu, - }}, - }; - for (int i = 0; i < this->network()->getNbLayers(); i++) { - for (const auto& pattern : fused_patterns) { - size_t last_matcher = pattern.second.size() - 1; - nvinfer1::ILayer* layer = this->network()->getLayer(i); - // We should skip this layer if its outputs are already marked as fused, - // but all the current patterns start with a convolution and are ordered - // in decreasing pattern length, so that is not necessary (yet). - std::vector fused_candidates; - for (size_t index = 0; index <= last_matcher; ++index) { - if ((!pattern.second[index](layer)) || - (index < last_matcher && layer_consumers[layer].size() != 1)) { - fused_candidates.clear(); - break; - } - if (index < last_matcher) { - fused_candidates.push_back(layer); - layer = layer_consumers[layer].front(); - } - } - if (!fused_candidates.empty()) { - VLOG(1) << pattern.first; - for (const auto& fused_layer : fused_candidates) { - for (int i = 0; i < fused_layer->getNbOutputs(); i++) { - VLOG(1) << " Fused output tensor:" - << fused_layer->getOutput(i)->getName(); - ITensorProxyPtr output_tensor = fused_layer->getOutput(i); - fused_tensors.insert(&output_tensor); - } - } - break; // Don't try other patterns on this layer. - } - } - } - // Find tensors with no ranges that are not fused and force their layers to - // not be quantized. - for (auto tensor : all_tensors) { - if (!quantization_ranges_proxy_.count(tensor) && - fused_tensors.find(tensor) == fused_tensors.end()) { - // Note: there may be some warnings for "(Unnamed ITensor* N)". These - // are tensors which are created internally by TF-TRT. 
The ranges for - // these unnamed ITensors are always inferred from user provided ranges, - // thus there will also be a warning for the range(s) the user missed. - LOG(WARNING) << "Quantization range was not found for " - << (*tensor)->getName() << ". " - << "Setting invalid quantization range."; - // Set the range to something unusable so the engine will fail if it - // tries to actually use the tensor's range. - (*tensor)->setDynamicRange(0, 0); - auto layer = tensor_layer.find(tensor); - // If the tensor is the output of a layer, set the layer's precision - // to fp16 so that it isn't quantized. - // Shuffle doesn't support setting precision. - if (layer != tensor_layer.end() && - layer->second->getType() != nvinfer1::LayerType::kSHUFFLE) { - VLOG(1) << "And setting layer " << layer->second->getName() - << " precision to fp16."; - layer->second->setPrecision(nvinfer1::DataType::kHALF); - } - } - } -#endif -} - -void Converter::PropagateQuantizationRanges() { - // Propagate ranges across edges in quantization_infer_ until no new - // information is added. - // Note: this function modifies quantization_infer_, it might be better to - // modify a copy instead if we for some reason need quantization_infer_ - // later. - bool information_added = true; - while (information_added) { - // Propogate for real tensors. - information_added = false; - for (auto it = quantization_infer_.begin(); - it != quantization_infer_.end();) { - auto input_tensor_range = quantization_ranges_.find(it->first); - auto output_tensor_range = quantization_ranges_.find(it->second); - if (input_tensor_range != quantization_ranges_.end() && - output_tensor_range == quantization_ranges_.end()) { - // Input has range but output doesn't: copy range - // TODO(laigd): consider reporting error if it a different range is - // already set. - quantization_ranges_[it->second] = input_tensor_range->second; - information_added = true; - VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> " - << it->second->getName(); - } - // We can remove edges when the output range is known - if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) { - it = quantization_infer_.erase(it); - } else { - ++it; - } - } - // Propogate for proxy. - information_added = false; - for (auto it = quantization_infer_proxy_.begin(); - it != quantization_infer_proxy_.end();) { - auto input_tensor_range = quantization_ranges_proxy_.find(it->first); - auto output_tensor_range = quantization_ranges_proxy_.find(it->second); - if (input_tensor_range != quantization_ranges_proxy_.end() && - output_tensor_range == quantization_ranges_proxy_.end()) { - // Input has range but output doesn't: copy range - // TODO(laigd): consider reporting error if it a different range is - // already set. 
- quantization_ranges_proxy_[it->second] = input_tensor_range->second; - information_added = true; - VLOG(1) << "Copy quantization range: " << (*it->first)->getName() - << " -> " << (*it->second)->getName(); - std::cout << "Copy quantization range: " << (*it->first)->getName() - << " -> " << (*it->second)->getName(); - } - // We can remove edges when the output range is known - if (quantization_ranges_proxy_.find(it->second) != - quantization_ranges_proxy_.end()) { - it = quantization_infer_proxy_.erase(it); - } else { - ++it; - } - } - } } Status Converter::GetInputs(const NodeDef& node_def, @@ -1755,173 +1870,127 @@ Status Converter::GetInputs(const NodeDef& node_def, } // Checks that the number of inputs match, and enforces that the inputs marked -// as true are constant weights. true means that the input must be a weight, -// while false means the input must be a tensor. In the future, false will mean -// the input can be a tensor or weight. +// as weights are constant. Inputs are allowed to be both weight and tensor. Status CheckInputsWeights( const OpConverterParams& params, - const std::vector>& inputs_is_weight) { + const std::vector>& expected_inputs) { const auto& inputs = params.inputs; const auto& node_def = params.node_def; - if (inputs.size() != inputs_is_weight.size()) { - return errors::InvalidArgument( - node_def.op(), " got ", inputs.size(), " inputs but expected ", - inputs_is_weight.size(), ", at ", node_def.name()); - } + TFTRT_CHECK_INPUT_SIZE(inputs.size(), expected_inputs.size(), node_def); for (int i = 0; i < inputs.size(); i++) { - if (inputs_is_weight[i].second && inputs.at(i).is_tensor()) { - return errors::Unimplemented("The input \"", inputs_is_weight[i].first, + if (expected_inputs[i].second == TrtInputArg::kWeight && + !inputs.at(i).is_weights()) { + return errors::Unimplemented("The input \"", expected_inputs[i].first, "\" for ", node_def.op(), - " must be a constant, at ", node_def.name()); + " must be a constant"); } - // TODO(tmorris): Remove this check and provide a method to automatically + // TODO(tfeher): Remove this check and provide a method to automatically // retrieve an input as a tensor, converting via CreateConstantLayer if it // was originally a weight. We will want a caching mechanism to prevent many // duplicate constants from being created. - if (!inputs_is_weight[i].second && inputs.at(i).is_weights()) { - return errors::Unimplemented("The input \"", inputs_is_weight[i].first, + if (expected_inputs[i].second == TrtInputArg::kTensor && + !inputs.at(i).is_tensor()) { + return errors::Unimplemented("The input \"", expected_inputs[i].first, + "\" for ", node_def.op(), + " must be a tensor"); + } + if (expected_inputs[i].second == TrtInputArg::kResource && + !inputs.at(i).is_resource()) { + return errors::Unimplemented("The input \"", expected_inputs[i].first, "\" for ", node_def.op(), - " must be a tensor, at ", node_def.name()); + " must be a resource handle"); } } return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; - TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, - " not found."); - } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. 
- std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); +// Checks that the number of inputs match, and enforces that the inputs marked +// as true are constant weights. true means that the input must be a weight, +// while false means the input must be a tensor. +Status CheckInputsWeights( + const OpConverterParams& params, + const std::vector>& inputs_is_weight) { + std::vector> expected_inputs; + expected_inputs.reserve(inputs_is_weight.size()); + std::transform( + inputs_is_weight.begin(), inputs_is_weight.end(), + std::back_inserter(expected_inputs), [](std::pair x) { + return std::make_pair( + x.first, x.second ? TrtInputArg::kWeight : TrtInputArg::kTensor); + }); + return CheckInputsWeights(params, expected_inputs); +} + +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const string type_attr_name_in = "") { + string type_attr_name; + if (type_attr_name_in.empty()) { + if (node_def.op() == "ReadVariableOp" || + node_def.op() == "ResourceGather") { + type_attr_name = "dtype"; + } else { + type_attr_name = "T"; } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), - " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + } else { + type_attr_name = type_attr_name_in; } + + AttrSlice attrs(node_def); + if (attrs.Find(type_attr_name) == nullptr) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, + " not found."); + } + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, type_attr_name, tf_type)); return Status::OK(); } -// **************************************************************************** -// Constant folding functions for weights. -// TODO(laigd): we should probably use eigen directly. 
-// ***************************************************************************** -struct LambdaFactory { - enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP }; - OP_CATEGORY op; - - template - std::function unary() { - switch (op) { - case OP_CATEGORY::RSQRT: { - VLOG(2) << "RSQRT GETS DONE"; - return [](T t) -> T { return 1.0 / std::sqrt(t); }; - } - case OP_CATEGORY::NEG: - return [](T t) -> T { return -t; }; - case OP_CATEGORY::RECIP: - return [](T t) -> T { return 1.0 / t; }; - default: - LOG(ERROR) << "Not supported op for unary: " << static_cast(op); - return nullptr; - } +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); } -}; -template <> -std::function LambdaFactory::unary() { - switch (op) { - case OP_CATEGORY::RSQRT: { - VLOG(2) << "RSQRT GETS DONE"; - return [](Eigen::half t) { - return Eigen::half(1.0 / std::sqrt(static_cast(t))); - }; - } - case OP_CATEGORY::NEG: - return [](Eigen::half t) { return -t; }; - case OP_CATEGORY::RECIP: - return [](Eigen::half t) { - return Eigen::half(1.0 / static_cast(t)); - }; - default: - LOG(ERROR) << "Not supported op for unary: " << static_cast(op); - return nullptr; - } + return inputs[pos].GetTfType(tf_type); } -Status UnaryCompute(const TRT_ShapedWeights& iweights, - TRT_ShapedWeights* oweights, LambdaFactory unary_op) { - CHECK(iweights.TrtDType() == oweights->TrtDType()); - switch (iweights.TrtDType()) { - case nvinfer1::DataType::kFLOAT: { - auto inp = static_cast(iweights.GetValues()); - auto oup = static_cast(oweights->GetValues()); - std::transform(inp, inp + iweights.count(), oup, unary_op.unary()); - break; - } - case nvinfer1::DataType::kHALF: { - auto inp = static_cast(iweights.GetValues()); - auto oup = static_cast(oweights->GetValues()); - std::transform(inp, inp + iweights.count(), oup, - unary_op.unary()); - break; - } - default: - return errors::Unimplemented("Data type not supported: ", - DebugString(iweights.TrtDType())); +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = "") { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + const auto error = + convert_not_supported_dtype_msg(allowed_types, tf_type, node_def); + return errors::Unimplemented(error); } return Status::OK(); } -// Before TRT 5.1.3, we have to calculate padding for convolutions ourselves. -Status Conv2DPaddingHelper(OpConverterParams* params, const TFAttrs& attrs, - const nvinfer1::DimsHW& kernel_size, - const nvinfer1::DimsHW& dilation, - const nvinfer1::DimsHW& stride, - const std::vector& input_dims, - ITensorProxyPtr tensor, - std::vector>* padding, - ITensorProxyPtr* padded_tensor) { - if (attrs.get("padding") == "SAME") { - nvinfer1::DimsHW effective_kernel_size = kernel_size; - effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1); - effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1); - *padding = CreateSamePadding(stride, effective_kernel_size, input_dims); - } else { - *padding = {{0, 0}, {0, 0}}; - } - - // Handle asymmetric padding. 
TensorRT 5.1 added support for asymmetric - // padding via setPrePadding and setPostPadding. Due to a bug in 5.1.2, we can - // only use asymmetric padding in convolutions with 5.1.3+. But in 5.1.3, we - // will always use setPaddingMode for simplicity. - if ((*padding)[0].first != (*padding)[0].second || - (*padding)[1].first != (*padding)[1].second) { - auto pad_layer = params->converter->network()->addPadding( - *tensor->trt_tensor(), nvinfer1::DimsHW((*padding)[0].first, (*padding)[1].first), - nvinfer1::DimsHW((*padding)[0].second, (*padding)[1].second)); - TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, params->node_def.name()); - ITensorProxyPtr output_tensor = pad_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, - &output_tensor); - *padding = {{0, 0}, {0, 0}}; - tensor = output_tensor; - } - *padded_tensor = tensor; - return Status::OK(); +namespace { +// Extracts the spatial dimensions from `output_sizes` and returns them as a +// vector of size 2. +std::vector GetSpatialDimsFromOutputSizes( + const TRT_TensorOrWeights& output_sizes, const int h_index, + const int w_index) { + // We use h_index and w_index instead of 1 and 2 because we haven't + // transposed output_sizes along with the input. + const TRT_ShapedWeights& weights = output_sizes.weights(); + const int output_sizes_length = weights.count(); + auto output_sizes_values = weights.GetPointer(); + // The length of output_sizes can be 2 or 4. When the length is 4, + // output_sizes represents . + return {output_sizes_values[output_sizes_length == 4 ? h_index : 0], + output_sizes_values[output_sizes_length == 4 ? w_index : 1]}; } +} // namespace -Status ConvertConv2DHelper(OpConverterParams* params, int group, +Status ConvertConv2DHelper(const OpConverterParams* params, int group, bool is_conv2d_backprop_input) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -1930,56 +1999,99 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, if (is_conv2d_backprop_input) { // In the case when Conv2dBackpropInput is used for conv2d_transpose, these // inputs correspond to: output size, filter, and input. - TF_RETURN_IF_ERROR(CheckInputsWeights( - *params, - {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}})); + // TODO(cbate): refine this check when moving to structured op converter. 
+ if (!params->use_explicit_precision) { + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}})); + } + backprop_output_size = inputs.at(0); tensor = inputs.at(2).tensor(); + bool has_dynamic_hw_shape{false}; + int start_idx{0}; + auto dims = tensor->getDimensions(); + if (params->use_implicit_batch) { + if (dims.nbDims != 3) { + return errors::Internal( + "In implicit batch mode, input nbDims should be 3"); + } + start_idx = 1; + } else { + if (dims.nbDims != 4) { + return errors::Internal( + "In explicit batch mode, input nbDims should be 4"); + } + start_idx = 2; + } + for (int i = start_idx; i < dims.nbDims; ++i) { + if (dims.d[i] < 0) { + has_dynamic_hw_shape = true; + } + } + if (has_dynamic_hw_shape) { + return errors::Unimplemented( + "Conv2dBackpropInput does not support input with unknown spatial " + "shape"); + } } else { - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"input", false}, {"filter", true}})); + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input", false}, {"filter", !params->use_explicit_precision}})); tensor = inputs.at(0).tensor(); } TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TRT_ShapedWeights weights_rsck = inputs.at(1).weights(); - if (weights_rsck.shape_.nbDims != 4) { - return errors::InvalidArgument("Conv2D expects kernel of dimension 4, at " + - node_def.name()); + + if (inputs.at(1).GetTrtDims().nbDims != 4) { + return errors::InvalidArgument("Conv2D expects kernel of dimension 4"); } - TFAttrs attrs(node_def); - auto data_format = attrs.get("data_format"); + + string data_format, padding_type; + std::vector tf_dilations, tf_stride; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dilations", &tf_dilations)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + int c_index = (data_format == "NHWC") ? 3 : 1; int h_index = (data_format == "NHWC") ? 1 : 2; int w_index = (data_format == "NHWC") ? 
2 : 3; - auto tf_dilations = attrs.get>("dilations"); + if (tf_dilations.size() != 4) { return errors::InvalidArgument( - "Convolution dilations field must specify 4 dimensions, at ", - node_def.name()); + "Convolution dilations field must specify 4 dimensions"); } if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { return errors::Unimplemented( - "Dilation rate must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Dilation rate must be 1 for batch and channel dimensions"); } const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]); if (is_conv2d_backprop_input && (dilation.d[0] != 1 || dilation.d[1] != 1)) { return errors::Unimplemented( - "Dilation with Conv2DBackpropInput (conv2d_transpose) is not supported", - ", at ", node_def.name()); + "Dilation with Conv2DBackpropInput (conv2d_transpose) is not" + " supported"); } - const auto tf_stride = attrs.get>("strides"); if (tf_stride.size() != 4) { return errors::InvalidArgument( - "Convolution strides field must specify 4 dimensions, at ", - node_def.name()); + "Convolution strides field must specify 4 dimensions"); } if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { return errors::Unimplemented( - "Stride must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Stride must be 1 for batch and channel dimensions"); + } + // Channel dim must be static for DepthwiseConv2dNative since we use that + // value for num_groups at build time. + if (!params->use_implicit_batch && tensor->getDimensions().d[c_index] == -1) { + return errors::InvalidArgument("Channel dimension must be static"); + } + + if (padding_type != "SAME" && padding_type != "VALID") { + return errors::Unimplemented(padding_type + + " padding type not implemented, " + "only VALID and SAME are supported"); } const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); if (params->validation_only) return Status::OK(); @@ -1987,139 +2099,206 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, // Transpose to NCHW (NCHW is required for IConvLayer). const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } // Dimensions of transposed tensor. const auto tensor_dim = tensor->getDimensions(); + const int c_dim_size = tensor_dim.d[params->use_implicit_batch ? 0 : 1]; // group == 0 signifies that this is a depthwise convolution, so set // num_groups to size of input's channel dim. For a non-depthwise conv, // num_groups will be 1. - const int num_groups = (group == 0) ? tensor_dim.d[0] : group; + const int num_groups = (group == 0) ? c_dim_size : group; // For conv, TF weights are RSCK, and TRT expects KCRS. // For backprop, TF weights are RSKC, and TRT expects CKRS. // Therefore, this reorder will work for both cases. - TRT_ShapedWeights weights = - params->weight_store->GetTempWeights(weights_rsck); - ReorderRSCKToKCRS(weights_rsck, &weights, num_groups); - TRT_ShapedWeights biases(weights.TrtDType()); - const int output_axis = is_conv2d_backprop_input ? 1 : 0; - const int noutput = weights.shape_.d[output_axis] * num_groups; + const int output_axis = is_conv2d_backprop_input ? 
2 : 3; + auto weights_shape = inputs.at(1).GetTrtDims(); + const int noutput = weights_shape.d[output_axis] * num_groups; nvinfer1::DimsHW kernel_size; - kernel_size.h() = weights.shape_.d[2]; - kernel_size.w() = weights.shape_.d[3]; + kernel_size.h() = weights_shape.d[0]; + kernel_size.w() = weights_shape.d[1]; -// Before TRT 5.1.3, we have to calculate padding ourselves. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - std::vector> padding; - std::vector input_dims; - if (is_conv2d_backprop_input) { - // For backprop, calculate padding based on "input_sizes" input, which - // actually corresponds to output size. ("input_sizes" makes sense in the - // context of Conv2DBackpropInput). - // We use h_index and w_index instead of 1 and 2 because we havent - // transposed backprop_output_size along with the input. - auto output_size_weights = - static_cast(backprop_output_size.weights().GetValues()); - input_dims = {output_size_weights[h_index], output_size_weights[w_index]}; + TRT_ShapedWeights weights_rsck; + if (inputs.at(1).is_weights()) { + weights_rsck = inputs.at(1).weights(); } else { - // Use 1 and 2 because tensor_dim has the dimensions of the transposed - // input. - input_dims = {static_cast(tensor_dim.d[1]), - static_cast(tensor_dim.d[2])}; - } - ITensorProxyPtr padded_tensor = nullptr; - TF_RETURN_IF_ERROR(Conv2DPaddingHelper(params, attrs, kernel_size, dilation, - stride, input_dims, tensor, &padding, - &padded_tensor)); - tensor = padded_tensor; -#endif + ::stream_executor::port::StatusOr tmp = + params->weight_store->GetTempWeights(nvinfer1::DataType::kFLOAT, + weights_shape); + TRT_ENSURE_OK(tmp); + weights_rsck = std::move(tmp).ValueOrDie(); + } + + // In explcit precision mode, trace the input back to the constant while also + // verifying that QDQ scale layers are present. 
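// --- Editorial sketch, not part of the patch ---------------------------------
// The explicit-precision path below walks from the filter tensor back through
// its dequantize and quantize producers to the constant that holds the weights,
// via TRTNetworkBuilder::FindProducerOf / UniqueParentOf. Those helpers are
// defined elsewhere in the patch; the function below is only a guess at what a
// producer lookup looks like in terms of the public TensorRT network API.
#include "NvInfer.h"  // Already included by the surrounding file.

nvinfer1::ILayer* FindProducerOfSketch(nvinfer1::INetworkDefinition* network,
                                       nvinfer1::ITensor* tensor) {
  for (int i = 0; i < network->getNbLayers(); ++i) {
    nvinfer1::ILayer* layer = network->getLayer(i);
    for (int j = 0; j < layer->getNbOutputs(); ++j) {
      if (layer->getOutput(j) == tensor) return layer;
    }
  }
  return nullptr;  // No producing layer: the tensor is a network input.
}
// --- End editorial sketch -----------------------------------------------------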
+ if (!inputs.at(1).is_weights()) { + TRT_ENSURE(params->use_explicit_precision); + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + TRT_ENSURE_OK(builder); + auto dequant_layer = builder.ValueOrDie().FindProducerOf( + inputs.at(1).tensor()->trt_tensor()); + TRT_ENSURE_PTR_OK(dequant_layer); + + // TODO(cbate): corresponding TRT layer name check + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(dequant_layer.ValueOrDie()->getType() == + nvinfer1::LayerType::kSCALE); + } + + auto quant_layer = + builder.ValueOrDie().UniqueParentOf(dequant_layer.ValueOrDie(), 0); + TRT_ENSURE_PTR_OK(quant_layer); + + // TODO(cbate): corresponding TRT layer name check + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(quant_layer.ValueOrDie()->getType() == + nvinfer1::LayerType::kSCALE); + } + + auto weights_layer = + builder.ValueOrDie().UniqueParentOf(quant_layer.ValueOrDie(), 0); + TRT_ENSURE_PTR_OK(weights_layer); + TRT_ENSURE(weights_layer.ValueOrDie()->getType() == + nvinfer1::LayerType::kCONSTANT); + auto const_weights_rsck = + reinterpret_cast(weights_layer.ValueOrDie()) + ->getWeights(); + + TRT_ENSURE(weights_rsck.count() == weights_rsck.count()); + const auto* weights_ptr = + static_cast(const_weights_rsck.values); + std::copy_n(weights_ptr, const_weights_rsck.count, + weights_rsck.GetPointer()); + } + + ::stream_executor::port::StatusOr weights = + params->weight_store->GetTempWeights(weights_rsck); + TRT_ENSURE_OK(weights); + ::stream_executor::port::StatusOr biases = + params->weight_store->GetTempWeights(nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{1, {noutput}}); + TRT_ENSURE_OK(biases); + std::fill_n(biases.ValueOrDie().GetPointer(), noutput, 0.0f); + ReorderRSCKToKCRS(weights_rsck, &weights.ValueOrDie(), num_groups); // Add convolution. nvinfer1::ILayer* conv_layer = nullptr; if (is_conv2d_backprop_input) { nvinfer1::IDeconvolutionLayer* layer = params->converter->network()->addDeconvolution( - *tensor->trt_tensor(), noutput, kernel_size, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size, + weights.ValueOrDie().GetTrtWeights(), + biases.ValueOrDie().GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStride(stride); -// TensorRT 5.1.3 added support for padding modes. -#if IS_TRT_VERSION_GE(5, 1, 3, 0) // VALID padding is the default TRT behavior. - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { // SAME_UPPER means that post padding is preferred. layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#else - layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); conv_layer = layer; } else { + const nvinfer1::Weights empty_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; nvinfer1::IConvolutionLayer* layer = params->converter->network()->addConvolution( - *tensor->trt_tensor(), noutput, kernel_size, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size, + params->use_explicit_precision + ? 
empty_weights + : weights.ValueOrDie().GetTrtWeights(), + empty_weights); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStride(stride); -#if IS_TRT_VERSION_GE(5, 1, 3, 0) - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#else - layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); layer->setDilation(dilation); conv_layer = layer; } + + // After creating the conv layer, if we are in explicit precision mode and the + // weights input is a tensor, then we need to override the weights input by + // calling setInput() on the layer. + if (params->use_explicit_precision) { + TRT_ENSURE(inputs.at(1).is_tensor()); + + nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle( + *inputs.at(1).tensor()->trt_tensor()); + layer->setFirstTranspose({3, 2, 0, 1}); + layer->setReshapeDimensions({4, {0, 0, 0, 0}}); + conv_layer->setInput(1, *layer->getOutput(0)); + } + + params->converter->SetLayerName(conv_layer, node_def, "conv"); ITensorProxyPtr output_tensor = conv_layer->getOutput(0); // Add an extra padding for Deconv because TRT doesn't accept the // argument output_shape and thus the TRT output shape could be wrong // in case of strides>1. if (is_conv2d_backprop_input) { - auto tf_output_shape = - static_cast(backprop_output_size.weights().GetValues()); + std::vector output_spatial_dims = + GetSpatialDimsFromOutputSizes(backprop_output_size, h_index, w_index); + const int output_height = output_spatial_dims[0]; + const int output_width = output_spatial_dims[1]; nvinfer1::Dims trt_output_shape = output_tensor->getDimensions(); // What determines the padding size is the difference between the given // input_sizes (tf_output_shape) and TRT computed size. - const int height_diff = tf_output_shape[h_index] - trt_output_shape.d[1]; - const int width_diff = tf_output_shape[w_index] - trt_output_shape.d[2]; + int out_h_idx = params->use_implicit_batch ? 1 : 2; + int out_w_idx = params->use_implicit_batch ? 2 : 3; + const int height_diff = output_height - trt_output_shape.d[out_h_idx]; + const int width_diff = output_width - trt_output_shape.d[out_w_idx]; if ((height_diff < 0) || (width_diff < 0)) { return errors::InvalidArgument( "input_sizes argument of Conv2DBackprop (i.e. output_shape argument " "of conv2d_transpose) ", "is too small for the given out_backprop argument of Conv2DBackprop " "(i.e. input argument of conv2d_transpose). Expect: ", - "(", tf_output_shape[h_index], ", ", tf_output_shape[w_index], - ") >= ", "(", trt_output_shape.d[1], ", ", trt_output_shape.d[2], - ") for op ", node_def.name()); + "(", output_height, ", ", output_width, ") >= ", "(", + trt_output_shape.d[out_h_idx], ", ", trt_output_shape.d[out_w_idx], + ")"); } // Only add a padding layer if padding sizes are larger than 0 if ((height_diff > 0) || (width_diff > 0)) { nvinfer1::DimsHW pre_padding(0, 0); nvinfer1::DimsHW post_padding(height_diff, width_diff); nvinfer1::IPaddingLayer* padding_layer = - params->converter->network()->addPadding(*output_tensor->trt_tensor(), pre_padding, - post_padding); + params->converter->network()->addPadding(*output_tensor->trt_tensor(), + pre_padding, post_padding); output_tensor = padding_layer->getOutput(0); + params->converter->SetLayerName(padding_layer, node_def, "pad"); } } // Restore transpose. 
if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertTranspose(OpConverterParams* params) { +bool AllowInefficientTranspose() { + static bool result = [] { + bool value; + Status status = + ReadBoolFromEnvVar("TF_DEBUG_TRT_ALLOW_INEFFICIENT_TRANSPOSE", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; + }(); + + return result; +} + +Status ConvertTranspose(const OpConverterParams* params) { const auto& inputs = params->inputs; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"x", false}, {"perm", true}})); @@ -2127,143 +2306,208 @@ Status ConvertTranspose(OpConverterParams* params) { *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); // Get the permutation from weights. TRT_ShapedWeights weights = inputs.at(1).weights(); - const int* weights_ptr = static_cast(weights.GetValues()); + const int* weights_ptr = weights.GetPointer(); std::vector perm(weights_ptr, weights_ptr + weights.count()); // Verify the permutation. ITensorProxyPtr input_tensor = inputs.at(0).tensor(); - if (perm.size() - 1 != size_t(input_tensor->getDimensions().nbDims)) { + const int perm_size = + params->use_implicit_batch ? perm.size() - 1 : perm.size(); + if (perm_size != size_t(input_tensor->getDimensions().nbDims)) { return errors::InvalidArgument( "Rank of perm for transpose does not match with that of the input."); } - if (perm[0] != 0) { + if (params->use_implicit_batch && perm[0] != 0) { return errors::Unimplemented( "Transpose at batch dimension is not supported."); } + if (!IS_TRT_VERSION_GE(7, 1, 3, 4)) { + // TensorRT versions before 7.1.3.4 is slow transposing large tensors. + // So check tensor size, and don't convert if it is too large. + constexpr int64_t kMaxEfficientTranspose = 2500000; + int64_t tensor_size = DimsAdapter(input_tensor->getDimensions()).Volume(); + if (!AllowInefficientTranspose() && tensor_size > kMaxEfficientTranspose) { + return errors::Unimplemented(StrCat("Transpose too large:", tensor_size)); + } + } + if (params->validation_only) return Status::OK(); // Start conversion. 
ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(input_tensor, perm, &output_tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + input_tensor, perm, &output_tensor, params->node_def)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertReshape(OpConverterParams* params) { +Status ConvertShape(const OpConverterParams* params) { const auto& inputs = params->inputs; - const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"tensor", false}, {"shape", true}})); - TF_RETURN_IF_ERROR(AllowDataTypes( - *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); - const TRT_TensorOrWeights& input_tensor = inputs.at(0); - TRT_ShapedWeights weights = inputs.at(1).weights(); - if (weights.count() == 0) { - return errors::Unimplemented("Reshape to shape=[] is not supported, at ", - node_def.name()); + CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}})); + if (params->use_implicit_batch) { + return errors::Unimplemented( + "Shape is only supported for explicit batch mode."); } + DimsAdapter input_dims(inputs.at(0).GetTrtDims()); + if (params->validation_only) return Status::OK(); - const int* weights_ptr = static_cast(weights.GetValues()); + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + TRT_ENSURE_OK(builder); + if (input_dims.IsStatic()) { + // Create a const node with the value of the shape. + ::stream_executor::port::StatusOr const_layer = + builder.ValueOrDie().ConstantShape(input_dims); + TRT_ENSURE_PTR_OK(const_layer); + params->outputs->push_back( + TRT_TensorOrWeights(const_layer.ValueOrDie()->getOutput(0))); + return Status::OK(); + } + ::stream_executor::port::StatusOr shape_layer = + builder.ValueOrDie().Shape(inputs.at(0).tensor()->trt_tensor()); + TRT_ENSURE_PTR_OK(shape_layer); + params->converter->SetLayerName(shape_layer.ValueOrDie(), params->node_def, + "shape"); + params->outputs->push_back( + TRT_TensorOrWeights(shape_layer.ValueOrDie()->getOutput(0))); + return Status::OK(); +} - // Check that it doesn't change the batch dimension. This check is - // conservative, for example, when the first dim of the shape is -1 and input - // tensor shape is not fixed, it is still possible that the reshape doesn't - // change the batch dim, but as long as there is a possibility that it could - // change the batch dim, it reject the conversion. 
The parameters are: - // - // * reshape_batch_dim: the value of the first dim of the input shape constant - // * reshape_dims: all other dims of the input shape constant - // * input_batch_dim: the value of the first dim of the input tensor to - // reshape - // * input_dims: all other dims of the input tensor to reshape - // - // The validation logic is: - // - // if input_batch_dim is fixed: - // if reshape_batch_dim == input_batch_dim: - // ok - // elif reshape_batch_dim == -1 (meaning reshape_dims are fixed) and - // input_dims are fixed and - // prod(input_dims) == prod(reshape_dims) - // ok - // else: - // not ok - // elif input_dims are fixed: - // if reshape_dims are fixed and - // prod(input_dims) == prod(reshape_dims): - // ok - // else: - // not ok - // else: - // not ok - // - // Note that the following is ok no matter whether reshape_batch_dim is fixed - // or not: - // - // ``` - // input_batch_dim is not fixed && - // reshape_dims are fixed && - // prod(input_dims) == prod(reshape_dims), - // ``` - // - // because the non-batch dims of the new and old shapes match, and TF runtime - // should make sure the batch dim is not changed. +Status ExpectShapeTensor(const TRT_TensorOrWeights& tensor) { + if (tensor.tensor()->getType() != nvinfer1::DataType::kINT32) { + return errors::InvalidArgument("Expected a shape tensor with INT32 type"); + } + if (tensor.GetTrtDims().nbDims > 1) { + return errors::InvalidArgument("Expected a 0D or 1D shape tensor"); + } + return Status::OK(); +} - const int input_batch_dim = input_tensor.batch_size(); - const int reshape_batch_dim = weights_ptr[0]; - const nvinfer1::Dims input_dims = input_tensor.GetTrtDims(); +// Converts Reshape op if the input has dynamic (unknown) dims. +Status ConvertDynamicReshape(const OpConverterParams* params) { + if (params->use_implicit_batch) { + return errors::InvalidArgument( + "The input \"shape\" for Reshape must be a constant in implicit batch" + " mode."); + } + if (!IS_TRT_VERSION_GE(7, 1, 3, 0)) { + // While officially TRT supports shape value input , there are problems with + // shape input handling that cause networks converted with + // ConvertDynamicReshape fail. Here we conservatively switch off the + // converter before TRT 7.1.3. + return errors::InvalidArgument( + "Non constant shape input tensor for Reshape requires minimum TRT " + "7.1.3"); + } + const auto& inputs = params->inputs; + const TRT_TensorOrWeights& input_tensor = inputs.at(0); - nvinfer1::Dims reshape_dims; - reshape_dims.nbDims = weights.count() - 1; - for (int i = 1; i < weights.count(); i++) { - reshape_dims.d[i - 1] = weights_ptr[i]; + // If the input is a tensor it must be a shape tensor. + TF_RETURN_IF_ERROR(ExpectShapeTensor(inputs.at(1))); + if (inputs.at(1).tensor()->getDimensions().nbDims == 0) { + // Dynamic reshape requires a 1D shape tensor. + return errors::Unimplemented( + "Reshape with dynamic input requires 1D input tensor"); } + if (params->validation_only) return Status::OK(); + nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle( + *input_tensor.tensor()->trt_tensor()); + VLOG(2) << "ConvertReshape setInput (1) " + << DebugString(inputs.at(1).tensor()->getDimensions()); + layer->setInput(1, *inputs.at(1).tensor()->trt_tensor()); + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + +// Converts Reshape in explicit batch mode if the input has static (known) dims. 
+Status ConvertStaticReshapeForExplicitBatchMode( + const OpConverterParams* params, DimsAdapter output_dims, + ITensorProxyPtr* output_tensor) { + return PrepareTensorForShape(params->converter, params->inputs.at(0), + output_dims, params->validation_only, + output_tensor, params->node_def); +} + +// Converts Reshape in implicit batch mode. The input has static (known) dims. +Status ConvertStaticReshapeForImplicitBatchMode( + const OpConverterParams* params, DimsAdapter output_dims, + ITensorProxyPtr* output_tensor) { + const auto& inputs = params->inputs; + const TRT_TensorOrWeights& input_tensor = inputs.at(0); + const int input_batch_dim = input_tensor.batch_size(); + const int64_t output_batch_dim = output_dims.dim(0); + + DimsAdapter input_nonbatch_dims(input_tensor.GetTrtDims()); + DimsAdapter output_nonbatch_dims(output_dims); + TF_RETURN_IF_ERROR(output_nonbatch_dims.RemoveBatchDimension()); - // Check that it doesn't change the batch dimension according to the logic - // mentioned above. + VLOG(1) << "input_batch_dim=" << input_batch_dim + << ", input_nonbatch_dims=" << input_nonbatch_dims.DebugString() + << "\nresult_batch_dim=" << output_batch_dim + << ", result_nonbatch_dims=" << output_nonbatch_dims.DebugString(); + + // Check whether input_batch_dim and output_batch_dim will have the same + // static value. bool reshape_may_change_batch_dim = false; - if (input_batch_dim > 0) { // Batch size is fixed. - if (reshape_batch_dim == -1) { // Other dims of the shape must be fixed. - if (!AreDimsStaticWithSameSize(input_dims, reshape_dims, - /*is_tensor=*/true)) { - reshape_may_change_batch_dim = true; - } - } else if (reshape_batch_dim != input_batch_dim) { - reshape_may_change_batch_dim = true; - } else { - // This means (input_batch_dim>0 && input_batch_dim==reshape_batch_dim), - // and TF runtime should make sure non-batch dims are matched. - } - } else if (!AreDimsStaticWithSameSize(input_dims, reshape_dims, - /*is_tensor=*/true)) { - reshape_may_change_batch_dim = true; + if (input_batch_dim != -1 && output_batch_dim != -1) { + reshape_may_change_batch_dim = (input_batch_dim != output_batch_dim); + } else { + reshape_may_change_batch_dim = + !AreDimsStaticWithSameSize(input_nonbatch_dims, output_nonbatch_dims); } - VLOG(1) << "input_batch_dim=" << input_batch_dim - << ", input_dims=" << DebugString(input_dims) - << "\nreshape_batch_dim=" << reshape_batch_dim - << ", reshape_dims=" << DebugString(reshape_dims); if (reshape_may_change_batch_dim) { - const string msg = StrCat( - "Reshape on batch dimension is not supported, at ", node_def.name(), - ". input_batch_dim=", input_batch_dim, ", ", DebugString(input_dims), - "; reshape_batch_dim=", reshape_batch_dim, ", ", - DebugString(reshape_dims)); - return errors::Unimplemented(msg); + return errors::Unimplemented("Reshape on batch dimension is not supported"); + } + // Perform the conversion. + return PrepareTensorForShape(params->converter, input_tensor, + output_nonbatch_dims, params->validation_only, + output_tensor, params->node_def); +} + +Status ConvertReshape(const OpConverterParams* params) { + const auto& inputs = params->inputs; + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"tensor", TrtInputArg::kTensor}, {"shape", TrtInputArg::kBoth}})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); + if (inputs.at(1).is_tensor()) { + return ConvertDynamicReshape(params); } - // Start conversion. 
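// [Editor's note; illustrative sketch, not part of the patch] The static-shape
// path below delegates to ConvertStaticReshapeForImplicitBatchMode (defined
// above), whose batch-dimension check reduces to the rule sketched here; -1
// denotes an unknown dimension, and the helper name is hypothetical.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// True if the reshape could alter the batch dimension, in which case the
// conversion must be rejected in implicit batch mode.
static bool ReshapeMayChangeBatchDim(int64_t input_batch,
                                     const std::vector<int64_t>& input_rest,
                                     int64_t output_batch,
                                     const std::vector<int64_t>& output_rest) {
  auto is_static = [](const std::vector<int64_t>& dims) {
    for (int64_t d : dims) {
      if (d < 0) return false;
    }
    return true;
  };
  auto volume = [](const std::vector<int64_t>& dims) {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  // If both batch dims are statically known, they simply have to match.
  if (input_batch != -1 && output_batch != -1) {
    return input_batch != output_batch;
  }
  // Otherwise both non-batch volumes must be static and equal, so that the
  // runtime batch dimension is necessarily preserved.
  return !(is_static(input_rest) && is_static(output_rest) &&
           volume(input_rest) == volume(output_rest));
}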
+ // TODO(bixia): we can't use inputs.at(1).weights().ToVector() for two + // reasons: (1) When weights.count()==0, TRT_ShapedWeights::tensor_ dtype is + // not properly set to INT32. (2) I tried a fix for the first problem, I got + // shared pointer related error in convert_nodes_test. We should fix the + // problems and switch to use inputs.at(1).weights().ToVector(), a type + // safe method to access the content of the tensor. + TRT_ShapedWeights weights = inputs.at(1).weights(); + if (weights.count() == 0 && params->use_implicit_batch) { + return errors::Unimplemented("Reshape to shape=[] is not supported"); + } + + DimsAdapter output_shape_dims( + absl::MakeSpan(weights.GetPointer(), weights.count())); ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, reshape_dims, params->validation_only, &output_tensor)); + + if (!params->use_implicit_batch) { + TF_RETURN_IF_ERROR(ConvertStaticReshapeForExplicitBatchMode( + params, output_shape_dims, &output_tensor)); + } else { + TF_RETURN_IF_ERROR(ConvertStaticReshapeForImplicitBatchMode( + params, output_shape_dims, &output_tensor)); + } if (params->validation_only) return Status::OK(); + // Record the conversion result. params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertExpandDims(OpConverterParams* params) { +Status ConvertExpandDims(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( @@ -2277,29 +2521,161 @@ Status ConvertExpandDims(OpConverterParams* params) { // Get axis to expand on. auto axis = inputs.at(1).weights().GetSpan(); if (axis.size() != 1) { - return errors::InvalidArgument("ExpandDims axis must be a scalar, at ", - node_def.name()); + return errors::InvalidArgument("ExpandDims axis must be a scalar"); } // Use rank = nbDims + 1 for ConvertAxis's bounds checking to account for // ExpandDim's ability to add an axis at end of the shape. int trt_axis; TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dims.nbDims + 1, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + params->use_implicit_batch, &trt_axis)); if (params->validation_only) return Status::OK(); - - // ExpandDims: Insert new dim of size 1. - input_dims.insert(input_dims.begin() + trt_axis, 1); - // Reshape tensor. - nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, new_dims, /*validation_only=*/false, &output_tensor)); + + if (!params->use_implicit_batch && !HasStaticShape(input_dims)) { + TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims( + /*input=*/input_tensor.tensor(), + /*dims=*/dims, + /*axis=*/trt_axis, + /*params=*/params, + /*output=*/&output_tensor)); + } else { + // ExpandDims: Insert new dim of size 1. + input_dims.insert(input_dims.begin() + trt_axis, 1); + // Reshape tensor. 
+ DimsAdapter dims(input_dims); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, input_tensor, dims, + /*validation_only=*/false, &output_tensor, params->node_def)); + } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertSqueeze(OpConverterParams* params) { +Status Converter::DynamicReshape(ITensorProxyPtr input, + std::vector> slices, + const OpConverterParams* params, + ITensorProxyPtr* output, + std::vector size_for_added_dims, + absl::optional op_instance) { + *output = nullptr; + // DynamicReshape relies on INetworkDefinition::addShape + if (params->validation_only) { + return errors::Internal( + "DynamicReshape should not be used during validation"); + } + ITensorProxyPtr shape = + network()->addShape(*input->trt_tensor())->getOutput(0); + // Build new shape = shape[:trt_axis] + [1] + shape[trt_axis:] + std::vector concat_inputs; + int max_num_slices = std::max(slices.size(), size_for_added_dims.size()); + int op_instance_value = op_instance.has_value() ? op_instance.value() : 0; + + for (int i = 0; i < max_num_slices; i++) { + ITensorProxyPtr tensor; + // maybe_add_a_dimension(i); + if (i < size_for_added_dims.size() && size_for_added_dims[i] >= 0) { + nvinfer1::Dims dims{1, {1}}; + if (size_for_added_dims[i] > 0) { + dims.d[0] = size_for_added_dims[i]; + } + TF_RETURN_IF_ERROR( + CreateScalarConstant(params, std::min(size_for_added_dims[i], 1), + &tensor, nvinfer1::DataType::kINT32, dims)); + concat_inputs.push_back(tensor); + } + if (i < slices.size()) { + nvinfer1::ISliceLayer* slice_layer = network()->addSlice( + *shape->trt_tensor(), {1, {slices[i].first}}, + {1, {slices[i].second - slices[i].first}}, {1, {1}}); + concat_inputs.push_back(slice_layer->getOutput(0)); + string slice_name = StrCat("slice_", op_instance_value); + SetLayerName(slice_layer, params->node_def, slice_name, + /*op_instance=*/i); + } + } + std::vector trt_concat_inputs; + for (const auto& t : concat_inputs) { + trt_concat_inputs.push_back(t->trt_tensor()); + } + nvinfer1::IConcatenationLayer* concat_layer = network()->addConcatenation( + static_cast(trt_concat_inputs.data()), + concat_inputs.size()); + SetLayerName(concat_layer, params->node_def, "concat", op_instance); + concat_layer->setAxis(0); + ITensorProxyPtr new_shape = concat_layer->getOutput(0); + // Reshape input using new shape + nvinfer1::IShuffleLayer* shuffle = + network()->addShuffle(*input->trt_tensor()); + SetLayerName(shuffle, params->node_def, "shuffle", op_instance); + shuffle->setInput(1, *new_shape->trt_tensor()); + *output = shuffle->getOutput(0); + return Status::OK(); +} + +Status Converter::DynamicExpandDims(ITensorProxyPtr input, + const nvinfer1::Dims& dims, int axis, + const OpConverterParams* params, + ITensorProxyPtr* output, + absl::optional op_instance) { + if (params->validation_only) { + *output = nullptr; + return errors::Internal( + "DynamicExpandDims should not be used during validation"); + } + std::vector> slices; + std::vector extra_dims; + if (axis != 0) { + slices.push_back(std::pair{0, axis}); + extra_dims.push_back(-1); + } + extra_dims.push_back(1); + if (axis != dims.nbDims) { + slices.push_back(std::pair{axis, dims.nbDims}); + } + return DynamicReshape( + /*input=*/input, + /*slices=*/slices, + /*params=*/params, + /*output=*/output, + /*size_for_added_dims=*/extra_dims, + /*op_instance=*/op_instance); +} + +Status Converter::SqueezeTensor(ITensorProxyPtr input, + std::vector* input_dims, + const OpConverterParams* params, + ITensorProxyPtr* 
output, + absl::optional op_instance) { + // If the remaining dimensions of a squeeze operation have dynamic sizes, we + // need to use TRT ops to build the result shape for the squeeze operation. + // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special + // value. + if (!params->use_implicit_batch && !HasStaticShape(*input_dims)) { + std::vector> slices; + for (int i = 0; i < input_dims->size(); i++) { + if (input_dims->at(i) != 0) { + slices.push_back(std::pair(i, i + 1)); + } + } + return DynamicReshape( + /*input=*/input, + /*slices=*/slices, + /*params=*/params, + /*output=*/output, + /*size_for_added_dims=*/{}, + /*op_instance=*/op_instance); + } + // Remove all dims which are equal to 0. + input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0), + input_dims->end()); + // Reshape tensor. + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(input), DimsAdapter(*input_dims), + /*validation_only=*/false, output, params->node_def, op_instance)); + return Status::OK(); +} + +Status ConvertSqueeze(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); @@ -2309,45 +2685,59 @@ Status ConvertSqueeze(OpConverterParams* params) { const TRT_TensorOrWeights& input_tensor = inputs.at(0); const nvinfer1::Dims dims = input_tensor.GetTrtDims(); std::vector input_dims(dims.d, dims.d + dims.nbDims); - // Mark axes to remove by setting them to 0. - TFAttrs attrs(node_def); - auto squeeze_dims = attrs.get>("squeeze_dims"); + std::vector squeeze_dims; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "squeeze_dims", &squeeze_dims)); if (squeeze_dims.empty()) { - return errors::Unimplemented( - "Squeeze is only implemented for explicit dims, at ", node_def.name()); - } - for (int tf_axis : squeeze_dims) { - // Make sure axis is valid. - int trt_axis; - TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); - // Make sure target dimension is size 1. - if (input_dims[trt_axis] != 1) { - return errors::InvalidArgument( - "Dimension ", tf_axis, " with size ", input_dims[trt_axis], - " cannot be squeezed because it must be size 1, at ", - node_def.name()); + if (params->use_implicit_batch || !HasStaticShape(dims)) { + return errors::Unimplemented( + "Squeeze is not implemented for empty squeeze_dims"); + } else { + // explicit batch mode with static input shape we squeeze all singleton + // dimensions + for (int& dim : input_dims) { + if (dim == 1) { + // Mark it for removal by setting it to 0 + dim = 0; + } + } + } + } else { + std::vector trt_axes; + trt_axes.reserve(squeeze_dims.size()); + for (int tf_axis : squeeze_dims) { + // If the axis is valid, then convert it to TRT axis, otherwise abort + // conversion. + int trt_axis; + TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), + params->use_implicit_batch, &trt_axis)); + // Make sure target dimension is size 1 or unknown size (-1) + if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) { + return errors::InvalidArgument( + "Dimension ", tf_axis, " with size ", input_dims[trt_axis], + " cannot be squeezed because it must be size 1"); + } + trt_axes.push_back(trt_axis); + } + // Mark axes to remove by setting them to 0. + for (int axis : trt_axes) { + input_dims[axis] = 0; } - // Mark dim for removal by setting to 0. 
- input_dims[trt_axis] = 0; } if (params->validation_only) return Status::OK(); - // Remove all dims which are equal to 0. - input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0), - input_dims.end()); - // Reshape tensor. - nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, new_dims, /*validation_only=*/false, &output_tensor)); + TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( + /*input=*/input_tensor.tensor(), + /*input_dims=*/&input_dims, + /*params=*/params, + /*output=*/&output_tensor)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } template -Status ConvertStridedSliceHelper(OpConverterParams* params, +Status ConvertStridedSliceHelper(const OpConverterParams* params, const TRT_TensorOrWeights& input, Container begin, Container size, const Container& stride, @@ -2393,9 +2783,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, ITensorProxyPtr tensor = layer->getOutput(0); // Reshape for shrink_axis. if (final_shape) { - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor), *final_shape, /*validation_only=*/false, - &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(tensor), *final_shape, + /*validation_only=*/false, &tensor, params->node_def)); } params->outputs->push_back(TRT_TensorOrWeights(tensor)); return Status::OK(); @@ -2496,8 +2886,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, // Start conversion. nvinfer1::ITensor* tensor = input.tensor(); if (need_reshape) { - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input, reshape_dims, /*validation_only=*/false, &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, input, reshape_dims, /*validation_only=*/false, + &tensor, params->node_def)); } if (need_transpose) { TF_RETURN_IF_ERROR( @@ -2517,9 +2908,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, } // Reshape for shrink_axis. if (final_shape) { - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor), *final_shape, /*validation_only=*/false, - &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(tensor), *final_shape, + /*validation_only=*/false, &tensor, params->node_def)); } else if (need_reshape) { // Restore reshape. 
// Calculate output dimensions @@ -2540,9 +2931,9 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, nvinfer1::Dims new_dims; TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims, /*ignore_first_dim=*/true)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor), new_dims, /*validation_only=*/false, - &tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(tensor), new_dims, + /*validation_only=*/false, &tensor, params->node_def)); } params->outputs->push_back(TRT_TensorOrWeights(tensor)); @@ -2550,7 +2941,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, #endif } -Status ConvertSlice(OpConverterParams* params) { +Status ConvertSlice(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights( @@ -2595,7 +2986,7 @@ Status ConvertSlice(OpConverterParams* params) { return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride); } -Status ConvertStridedSlice(OpConverterParams* params) { +Status ConvertStridedSlice(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights( @@ -2664,7 +3055,7 @@ Status ConvertStridedSlice(OpConverterParams* params) { // If batch dimension is covered by the ellipsis mask, it means it's left // untouched. Otherwise we check whether it modifies the batch dimension here. if (!(ellipsis_mask & 1) || - begin_weights.shape_.nbDims >= input_dims.size()) { + begin_weights.Shape().NumDims() >= input_dims.size()) { // Check that batch dimension is unmodified. We need to use the expanded // begin/end/strides array since the original array may be incorrect when // (ellipsis_mask&1)==1. 
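// [Editor's note; illustrative sketch, not part of the patch] For reference on
// the strided-slice hunks above: ConvertStridedSliceHelper receives per-axis
// begin/size/stride values derived from the TF op and maps them onto TRT's
// ISliceLayer (start/size/stride). The only arithmetic worth calling out is the
// per-axis extent of a strided slice; a minimal sketch assuming positive
// strides, with a hypothetical function name:
#include <cstdint>

// Number of elements produced along one axis when slicing [begin, end) with a
// positive stride, i.e. ceil((end - begin) / stride).
inline int64_t StridedSliceExtent(int64_t begin, int64_t end, int64_t stride) {
  return (end - begin + stride - 1) / stride;
}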
@@ -2708,20 +3099,19 @@ Status ConvertStridedSlice(OpConverterParams* params) { final_shape_dims_ptr); } -Status ConvertConv2D(OpConverterParams* params) { +Status ConvertConv2D(const OpConverterParams* params) { return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/false); } -Status ConvertConv2DDepthwise(OpConverterParams* params) { +Status ConvertConv2DDepthwise(const OpConverterParams* params) { return ConvertConv2DHelper(params, 0, /*is_conv2d_backprop_input=*/false); } -Status ConvertConv2DBackpropInput(OpConverterParams* params) { +Status ConvertConv2DBackpropInput(const OpConverterParams* params) { return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true); } -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertConv3DHelper(OpConverterParams* params, int group, +Status ConvertConv3DHelper(const OpConverterParams* params, int group, bool is_conv3d_backprop_input = false) { const int kNumDims = 5; const auto& inputs = params->inputs; @@ -2744,27 +3134,30 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); const TRT_ShapedWeights weights_drsck = inputs.at(1).weights(); - if (weights_drsck.shape_.nbDims != kNumDims) { - return errors::InvalidArgument("Conv3D expects kernel of dimension 5, at ", - node_def.name()); + if (weights_drsck.Shape().NumDims() != kNumDims) { + return errors::InvalidArgument("Conv3D expects kernel of dimension 5"); } - TFAttrs attrs(node_def); - auto data_format = attrs.get("data_format"); + + string data_format, padding_type; + std::vector tf_dilations, tf_stride; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dilations", &tf_dilations)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + const bool is_ndhwc = (data_format == "NDHWC"); // Or NCDHW 01234 - > 02341 const int d_index = is_ndhwc ? 1 : 2; const int h_index = is_ndhwc ? 2 : 3; const int w_index = is_ndhwc ? 3 : 4; const int c_index = is_ndhwc ? 
4 : 1; - auto tf_dilations = attrs.get>("dilations"); if (tf_dilations.size() != kNumDims) { return errors::InvalidArgument( - "Convolution dilations field must specify 5 dimensions, at ", - node_def.name()); + "Convolution dilations field must specify 5 dimensions"); } if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { return errors::Unimplemented( - "Dilation rate must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Dilation rate must be 1 for batch and channel dimensions"); } const nvinfer1::Dims3 dilation_dhw( @@ -2774,20 +3167,16 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, dilation_dhw.d[2] != 1)) { return errors::Unimplemented( "Dilation with Conv3DBackpropInputV2 (conv3d_transpose) is not " - "supported", - ", at ", node_def.name()); + "supported"); } - const auto tf_stride = attrs.get>("strides"); if (tf_stride.size() != kNumDims) { return errors::InvalidArgument( - "Convolution strides field must specify 5 dimensions, at ", - node_def.name()); + "Convolution strides field must specify 5 dimensions"); } if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { return errors::Unimplemented( - "Stride must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Stride must be 1 for batch and channel dimensions"); } const nvinfer1::Dims3 stride_dhw(tf_stride[d_index], tf_stride[h_index], @@ -2795,24 +3184,24 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, const auto tensor_dim = tensor->getDimensions(); // Asymmetric padding on Deconv not supported for now - if (is_conv3d_backprop_input && attrs.get("padding") == "SAME") { - const int tensor_c_idx = c_index - 1; - const int num_groups = (group == 0) ? tensor_dim.d[tensor_c_idx] : group; - - TRT_ShapedWeights weights = + if (is_conv3d_backprop_input && padding_type == "SAME") { + ::stream_executor::port::StatusOr weights = params->weight_store->GetTempWeights(weights_drsck); - + TRT_ENSURE_OK(weights); nvinfer1::Dims3 effective_kernel_size( - weights.shape_.d[0] + - (weights.shape_.d[0] - 1) * (dilation_dhw.d[0] - 1), // D - weights.shape_.d[1] + - (weights.shape_.d[1] - 1) * (dilation_dhw.d[1] - 1), // R - weights.shape_.d[2] + - (weights.shape_.d[2] - 1) * (dilation_dhw.d[2] - 1) // S + weights.ValueOrDie().Shape().dim(0) + + (weights.ValueOrDie().Shape().dim(0) - 1) * + (dilation_dhw.d[0] - 1), // D + weights.ValueOrDie().Shape().dim(1) + + (weights.ValueOrDie().Shape().dim(1) - 1) * + (dilation_dhw.d[1] - 1), // R + weights.ValueOrDie().Shape().dim(2) + + (weights.ValueOrDie().Shape().dim(2) - 1) * + (dilation_dhw.d[2] - 1) // S ); const auto output_size_weights = - static_cast(backprop_output_size.weights().GetValues()); + backprop_output_size.weights().GetPointer(); const std::vector input_dims = {output_size_weights[d_index], output_size_weights[h_index], output_size_weights[w_index]}; @@ -2825,19 +3214,26 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, padding[2].first != padding[2].second) { return errors::Unimplemented( "Asymmetric padding with Conv3DBackpropInputV2 (conv3d_transpose) is " - "not supported, at ", - node_def.name()); + "not supported"); } } - if (params->validation_only) - return Status::OK(); // Finished validation checks + // Channel dim must be static for Conv3D since we use that value for + // num_groups at build time. + // TODO: Allow conversion if kImplicitBatchModeCompatible||kOptimal is used. + int implicit_batch_offset = params->use_implicit_batch ? 
-1 : 0; + if (tensor->getDimensions().d[c_index + implicit_batch_offset] == -1) { + return errors::InvalidArgument("Channel dimension must be static"); + } + + // Finished validation checks + if (params->validation_only) return Status::OK(); // Transpose to NCDHW (NCDHW is required for IConvLayer). const bool need_transpose = is_ndhwc; if (need_transpose) { - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 4, 1, 2, 3}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); } // group == 0 signifies that this is a depthwise convolution, so set @@ -2848,15 +3244,17 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, // For conv, TF weights are DRSCK, and TRT expects KCDRS. // For backprop, TF weights are DRSKC, and TRT expects KCDRS. // Therefore, this reorder will work for both cases. - TRT_ShapedWeights weights = + ::stream_executor::port::StatusOr weights = params->weight_store->GetTempWeights(weights_drsck); - ReorderDRSCKToKCDRS(weights_drsck, &weights, num_groups); - TRT_ShapedWeights biases(weights.TrtDType()); + TRT_ENSURE_OK(weights); + ReorderDRSCKToKCDRS(weights_drsck, &weights.ValueOrDie(), num_groups); + TRT_ShapedWeights biases(weights.ValueOrDie().TrtDType()); const int output_axis = is_conv3d_backprop_input ? 1 : 0; - const int noutput = weights.shape_.d[output_axis] * num_groups; - nvinfer1::Dims3 kernel_size_drs(weights.shape_.d[2], // D - weights.shape_.d[3], // R - weights.shape_.d[4] // S + const int noutput = + weights.ValueOrDie().Shape().dim(output_axis) * num_groups; + nvinfer1::Dims3 kernel_size_drs(weights.ValueOrDie().Shape().dim(2), // D + weights.ValueOrDie().Shape().dim(3), // R + weights.ValueOrDie().Shape().dim(4) // S ); // Add convolution. @@ -2864,60 +3262,151 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, if (is_conv3d_backprop_input) { nvinfer1::IDeconvolutionLayer* layer = params->converter->network()->addDeconvolutionNd( - *tensor->trt_tensor(), noutput, kernel_size_drs, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size_drs, + weights.ValueOrDie().GetTrtWeights(), biases.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStrideNd(stride_dhw); // change to nd set stride - // TensorRT 5.1.3 added support for padding modes. - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { VLOG(2) << "Using SAME padding"; // SAME_UPPER means that post padding is preferred. 
layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); conv_layer = layer; } else { nvinfer1::IConvolutionLayer* layer = params->converter->network()->addConvolutionNd( - *tensor->trt_tensor(), noutput, kernel_size_drs, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), noutput, kernel_size_drs, + weights.ValueOrDie().GetTrtWeights(), biases.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStrideNd(stride_dhw); - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { VLOG(2) << "Using SAME padding"; layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); layer->setDilationNd(dilation_dhw); conv_layer = layer; } + params->converter->SetLayerName(conv_layer, node_def, "conv"); ITensorProxyPtr output_tensor = conv_layer->getOutput(0); // Restore transpose. if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 4, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertConv3D(OpConverterParams* params) { +Status ConvertConv3D(const OpConverterParams* params) { return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/false); } -Status ConvertConv3DBackpropInputV2(OpConverterParams* params) { +Status ConvertConv3DBackpropInputV2(const OpConverterParams* params) { return ConvertConv3DHelper(params, 1, /*is_conv3d_backprop_input=*/true); } -#endif // #if IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { +Status ConvertPool3D(const OpConverterParams* params) { + const int kNumDims = 5; + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); + TF_RETURN_IF_ERROR( + AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + nvinfer1::PoolingType type; + if (node_def.op() == "MaxPool3D") { + type = nvinfer1::PoolingType::kMAX; + } else if (node_def.op() == "AvgPool3D") { + type = nvinfer1::PoolingType::kAVERAGE; + } else { + return errors::Unimplemented("Unsupported pooling type: ", node_def.op()); + } + + string data_format, padding_type; + std::vector tf_stride, tf_kernel; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &tf_kernel)); + + if ((padding_type != "SAME") && (padding_type != "VALID")) { + return errors::Unimplemented("Unsupported padding type: ", padding_type); + } + + const bool is_ndhwc = (data_format == "NDHWC"); + const int c_index = is_ndhwc ? 4 : 1; + const int d_index = is_ndhwc ? 1 : 2; + const int h_index = is_ndhwc ? 2 : 3; + const int w_index = is_ndhwc ? 
3 : 4; + + if (tf_stride.size() != kNumDims) { + return errors::InvalidArgument( + "Pooling strides field must specify 5 dimensions"); + } + if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { + return errors::Unimplemented( + "stride must be 1 for batch and channel dimensions"); + } + + if (tf_kernel.size() != kNumDims) { + return errors::InvalidArgument( + "Pooling ksize field must specify 5 dimensions"); + } + if (tf_kernel[0] != 1 || tf_kernel[c_index] != 1) { + return errors::Unimplemented( + "ksize must be 1 for batch and channel dimensions"); + } + + const nvinfer1::Dims3 stride(tf_stride[d_index], tf_stride[h_index], + tf_stride[w_index]); + const nvinfer1::Dims3 ksize(tf_kernel[d_index], tf_kernel[h_index], + tf_kernel[w_index]); + + if (!(ksize.nbDims >= 3 && + (ksize.d[0] >= 1 && ksize.d[1] >= 1 && ksize.d[2] >= 1) && + (ksize.d[0] * ksize.d[1] * ksize.d[2] < MAX_KERNEL_DIMS_PRODUCT(3)))) { + return errors::InvalidArgument("Window dimensions are not within bounds"); + } + if (params->validation_only) return Status::OK(); + + ITensorProxyPtr tensor = inputs.at(0).tensor(); + if (data_format == "NDHWC") { + // NDHWC => NCDHW + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); + } + + nvinfer1::IPoolingLayer* layer = params->converter->network()->addPoolingNd( + *tensor->trt_tensor(), type, ksize); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + + layer->setStrideNd(stride); + // VALID padding is the default TRT behavior. + if (padding_type == "SAME") { + // SAME_UPPER means that post padding is preferred. + layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + params->converter->SetLayerName(layer, node_def, "pooling"); + + ITensorProxyPtr output_tensor = layer->getOutput(0); + if (data_format == "NDHWC") { + // NCDHW => NDHWC + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); + } + + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); +} + +Status ConvertFusedConv2DBiasActivation(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -2931,107 +3420,88 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); TRT_ShapedWeights weights = inputs.at(1).weights(); - if (weights.shape_.nbDims != 4) { + if (weights.Shape().NumDims() != 4) { return errors::InvalidArgument( - "FusedConv2DBiasActivation expects kernel of dimension 4, at " + - node_def.name()); + "FusedConv2DBiasActivation expects kernel of dimension 4"); } - TFAttrs attrs(node_def); - auto data_format = attrs.get("data_format"); + + string data_format, filter_format, activation_mode, padding_type; + std::vector tf_dilations, tf_stride; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "filter_format", &filter_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "activation_mode", &activation_mode)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "dilations", &tf_dilations)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + if (data_format != "NHWC" && data_format != "NCHW") { - return errors::InvalidArgument("Unsupported data_format:", data_format, - " at ", node_def.name()); + return 
errors::InvalidArgument("Unsupported data_format:", data_format); } - int c_index = (data_format == "NHWC") ? 3 : 1; int h_index = (data_format == "NHWC") ? 1 : 2; int w_index = (data_format == "NHWC") ? 2 : 3; - auto tf_dilations = attrs.get>("dilations"); + if (tf_dilations.size() != 4) { return errors::InvalidArgument( - "Convolution dilations field must specify 4 dimensions, at ", - node_def.name()); + "Convolution dilations field must specify 4 dimensions"); } if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) { return errors::Unimplemented( - "Dilation rate must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Dilation rate must be 1 for batch and channel dimensions"); } const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]); - const auto tf_stride = attrs.get>("strides"); if (tf_stride.size() != 4) { return errors::InvalidArgument( - "Convolution strides field must specify 4 dimensions, at ", - node_def.name()); + "Convolution strides field must specify 4 dimensions"); } if (tf_stride[0] != 1 || tf_stride[c_index] != 1) { return errors::Unimplemented( - "Stride must be 1 for batch and channel dimensions, at ", - node_def.name()); + "Stride must be 1 for batch and channel dimensions"); } const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); - const auto activation_mode = attrs.get("activation_mode"); auto op_pair = ActivationTypeMap()->find(activation_mode); if (op_pair == ActivationTypeMap()->end() && activation_mode != "None") { - return errors::Unimplemented("Activation mode: ", activation_mode, - " not supported at: ", node_def.name()); + return errors::Unimplemented("Activation mode not supported: ", + activation_mode); } - const auto filter_format = attrs.get("filter_format"); if (filter_format != "HWIO" && filter_format != "OIHW") { - return errors::InvalidArgument("Unsupported filter_format:", filter_format, - " at ", node_def.name()); + return errors::InvalidArgument("Unsupported filter_format:", filter_format); } // Check that there's no side_input or conv_input_scale. TRT_ShapedWeights side_input = inputs.at(3).weights(); if (side_input.count() != 0) { return errors::InvalidArgument( - "FusedConv2DBiasActivation doesn't yet support side_input, at " + - node_def.name()); + "FusedConv2DBiasActivation doesn't yet support side_input"); } TRT_ShapedWeights conv_input_scale = inputs.at(4).weights(); if (conv_input_scale.count() != 1 || conv_input_scale.TrtDType() != nvinfer1::DataType::kFLOAT || conv_input_scale.GetSpan()[0] != 1.0) { return errors::InvalidArgument( - "FusedConv2DBiasActivation doesn't yet support conv_input_scale, at " + - node_def.name()); + "FusedConv2DBiasActivation doesn't yet support conv_input_scale"); } if (params->validation_only) return Status::OK(); // Transpose to NCHW (NCHW is required for IConvLayer). const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } nvinfer1::DimsHW kernel_size; if (filter_format == "OIHW") { - kernel_size.h() = weights.shape_.d[2]; - kernel_size.w() = weights.shape_.d[3]; + kernel_size.h() = weights.Shape().dim(2); + kernel_size.w() = weights.Shape().dim(3); } else { // HWIO. 
DCHECK_EQ(filter_format, "HWIO"); - kernel_size.h() = weights.shape_.d[0]; - kernel_size.w() = weights.shape_.d[1]; + kernel_size.h() = weights.Shape().dim(0); + kernel_size.w() = weights.Shape().dim(1); } -// Before TRT 5.1.3, we have to calculate padding ourselves. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - const auto tensor_dim = tensor->getDimensions(); - std::vector input_dims; - // Use 1 and 2 because tensor_dim has the dimensions of the transposed - // input. - input_dims = {static_cast(tensor_dim.d[1]), - static_cast(tensor_dim.d[2])}; - std::vector> padding; - ITensorProxyPtr padded_tensor = nullptr; - TF_RETURN_IF_ERROR(Conv2DPaddingHelper(params, attrs, kernel_size, dilation, - stride, input_dims, tensor, &padding, - &padded_tensor)); - tensor = padded_tensor; -#endif // Add convolution. TRT_ShapedWeights biases = inputs.at(2).weights(); @@ -3039,28 +3509,25 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { if (filter_format == "OIHW") { // Weights are already in the right order. conv_layer = params->converter->network()->addConvolution( - *tensor->trt_tensor(), weights.shape_.d[0], kernel_size, weights.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), weights.Shape().dim(0), kernel_size, + weights.GetTrtWeights(), biases.GetTrtWeights()); } else { // For conv, TF weights are RSCK, and TRT expects KCRS. - DCHECK_EQ(filter_format, "HWIO"); - TRT_ShapedWeights weights_kcrs = + TRT_ENSURE(filter_format == "HWIO"); + ::stream_executor::port::StatusOr weights_kcrs = params->weight_store->GetTempWeights(weights); - ReorderRSCKToKCRS(weights, &weights_kcrs, 1); + TRT_ENSURE_OK(weights_kcrs); + ReorderRSCKToKCRS(weights, &weights_kcrs.ValueOrDie(), 1); conv_layer = params->converter->network()->addConvolution( - *tensor->trt_tensor(), weights.shape_.d[3], kernel_size, weights_kcrs.GetTrtWeights(), - biases.GetTrtWeights()); + *tensor->trt_tensor(), weights.Shape().dim(3), kernel_size, + weights_kcrs.ValueOrDie().GetTrtWeights(), biases.GetTrtWeights()); } TFTRT_RETURN_ERROR_IF_NULLPTR(conv_layer, node_def.name()); conv_layer->setStride(stride); -#if IS_TRT_VERSION_GE(5, 1, 3, 0) - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#else - conv_layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - conv_layer->setName(node_def.name().c_str()); + params->converter->SetLayerName(conv_layer, node_def, "conv"); conv_layer->setNbGroups(1); conv_layer->setDilation(dilation); ITensorProxyPtr output_tensor = conv_layer->getOutput(0); @@ -3068,182 +3535,94 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { // Add activation if there is one. if (op_pair != ActivationTypeMap()->end()) { nvinfer1::IActivationLayer* activation_layer = - params->converter->network()->addActivation(*output_tensor->trt_tensor(), - op_pair->second); + params->converter->network()->addActivation( + *output_tensor->trt_tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(activation_layer, node_def.name()); + params->converter->SetLayerName(activation_layer, node_def, "activation"); output_tensor = activation_layer->getOutput(0); } // Restore transpose. 
if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertPool(OpConverterParams* params) { +Status ConvertPool(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8}; + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); nvinfer1::PoolingType type; if (node_def.op() == "MaxPool") { type = nvinfer1::PoolingType::kMAX; } else if (node_def.op() == "AvgPool") { type = nvinfer1::PoolingType::kAVERAGE; } else { - return errors::Unimplemented("Unsupported pooling type: ", node_def.op(), - ", at ", node_def.name()); + return errors::Unimplemented("Unsupported pooling type: ", node_def.op()); } - TFAttrs attrs(node_def); - const string padding_type = attrs.get("padding"); + + string data_format, padding_type; + std::vector tf_stride, tf_kernel; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding_type)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &tf_stride)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &tf_kernel)); + if ((padding_type != "SAME") && (padding_type != "VALID")) { - return errors::Unimplemented("Unsupported padding type: ", padding_type, - ", at ", node_def.name()); + return errors::Unimplemented("Unsupported padding type: ", padding_type); } - if (params->validation_only) return Status::OK(); ITensorProxyPtr tensor = inputs.at(0).tensor(); int h_index = 2; int w_index = 3; - const auto data_format = attrs.get("data_format"); if (data_format == "NHWC") { h_index = 1; w_index = 2; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); } - const auto tf_stride = attrs.get>("strides"); const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); - - const auto tf_kernel = attrs.get>("ksize"); const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]); -// Before TRT 5.1.3, we have to calculate padding ourselves. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) - auto tensor_dim = tensor->getDimensions(); - std::vector> padding; - if (padding_type == "SAME") { - // This is NCHW tensor with no batch dimension. - // 1 -> h - // 2 -> w - padding = CreateSamePadding( - stride, ksize, - {static_cast(tensor_dim.d[1]), static_cast(tensor_dim.d[2])}); - } else if (padding_type == "VALID") { - padding = {{0, 0}, {0, 0}}; + if (!((ksize.h() >= 1 && ksize.w() >= 1) && + (ksize.h() * ksize.w() < MAX_KERNEL_DIMS_PRODUCT(2)))) { + return errors::InvalidArgument("Window dimensions are not within bounds"); } -#endif -// TensorRT 5.1 added support for asymmetric padding. Before that, we need an -// extra padding layer. -#if !IS_TRT_VERSION_GE(5, 1, 0, 0) - // Asymmetric padding case. 
- if (padding[0].first != padding[0].second || - padding[1].first != padding[1].second) { - auto pad_layer = params->converter->network()->addPadding( - *tensor->trt_tensor(), nvinfer1::DimsHW(padding[0].first, padding[1].first), - nvinfer1::DimsHW(padding[0].second, padding[1].second)); - TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); - ITensorProxyPtr out_tensor = pad_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &out_tensor); - padding = {{0, 0}, {0, 0}}; - tensor = out_tensor; + + if (params->validation_only) return Status::OK(); + + if (data_format == "NHWC") { + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } -#endif - nvinfer1::IPoolingLayer* layer = - params->converter->network()->addPooling(*tensor->trt_tensor(), type, ksize); + nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling( + *tensor->trt_tensor(), type, ksize); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // TODO(tmorris): Average pooling may not be entirely safe to infer - // quantization range through (at least forwards - backwards should be fine). - // Max pooling is okay. - ITensorProxyPtr out_tensor = layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &out_tensor); layer->setStride(stride); -#if IS_TRT_VERSION_GE(5, 1, 3, 0) // VALID padding is the default TRT behavior. - if (attrs.get("padding") == "SAME") { + if (padding_type == "SAME") { // SAME_UPPER means that post padding is preferred. layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#elif IS_TRT_VERSION_GE(5, 1, 0, 0) - layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); - layer->setPostPadding(nvinfer1::DimsHW{padding[0].second, padding[1].second}); -#else - layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); -#endif - layer->setName(node_def.name().c_str()); + params->converter->SetLayerName(layer, node_def, "pooling"); ITensorProxyPtr output_tensor = layer->getOutput(0); if (data_format == "NHWC") { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertLeakyRelu(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - const float alpha = attrs.get("alpha"); - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - // Use IActivationLayer when available. - if (params->validation_only) return Status::OK(); - - nvinfer1::IActivationLayer* layer = - params->converter->network()->addActivation( - *inputs.at(0).tensor()->trt_tensor(), nvinfer1::ActivationType::kLEAKY_RELU); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setAlpha(alpha); - params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); - return Status::OK(); -#else - // Use elementwise ops when IActivationLayer is not available. 
- if (alpha < 0.0f || alpha > 1.0f) { - return errors::Unimplemented( - "Alpha value for LeakyRelu must be between 0 and 1, at ", - node_def.name()); - } - if (params->validation_only) return Status::OK(); - - ITensorProxyPtr tensor = inputs.at(0).tensor(); - // Create const for alpha. - ITensorProxyPtr const_alpha_tensor = nullptr; - TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant( - params, alpha, tensor->getDimensions(), &const_alpha_tensor)); - // alpha * x - nvinfer1::IElementWiseLayer* mul_layer = - params->converter->network()->addElementWise( - *tensor->trt_tensor(), *const_alpha_tensor->trt_tensor(), nvinfer1::ElementWiseOperation::kPROD); - TFTRT_RETURN_ERROR_IF_NULLPTR(mul_layer, node_def.name()); - // max(x, alpha * x) - nvinfer1::IElementWiseLayer* max_layer = - params->converter->network()->addElementWise( - *tensor->trt_tensor(), *mul_layer->getOutput(0), - nvinfer1::ElementWiseOperation::kMAX); - TFTRT_RETURN_ERROR_IF_NULLPTR(max_layer, node_def.name()); - ITensorProxyPtr max_tensor = max_layer->getOutput(0); - ITensorProxyPtr mul_tensor = mul_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&max_tensor, - &mul_tensor); - - params->outputs->push_back(TRT_TensorOrWeights(max_tensor)); - return Status::OK(); -#endif -} - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) -Status ConvertClipByValue(OpConverterParams* params) { +Status ConvertClipByValue(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; // TODO(tmorris): We can also allow the case where min and max are tensors by @@ -3255,8 +3634,9 @@ Status ConvertClipByValue(OpConverterParams* params) { AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); if (params->validation_only) return Status::OK(); - TFAttrs attrs(node_def); - const DataType dtype = attrs.get("T"); + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "T", &dtype)); + float clip_value_min = 0.0f; float clip_value_max = 0.0f; // TODO(tmorris): Add a templated helper function to get scalar weights of @@ -3265,324 +3645,32 @@ Status ConvertClipByValue(OpConverterParams* params) { clip_value_min = inputs.at(1).weights().GetSpan()[0]; clip_value_max = inputs.at(2).weights().GetSpan()[0]; } else if (dtype == DataType::DT_HALF) { - clip_value_min = Eigen::half_impl::half_to_float( - inputs.at(1).weights().GetSpan()[0]); - clip_value_max = Eigen::half_impl::half_to_float( - inputs.at(2).weights().GetSpan()[0]); + clip_value_min = + static_cast(inputs.at(1).weights().GetSpan()[0]); + clip_value_max = + static_cast(inputs.at(2).weights().GetSpan()[0]); } nvinfer1::IActivationLayer* layer = params->converter->network()->addActivation( - *inputs.at(0).tensor()->trt_tensor(), nvinfer1::ActivationType::kCLIP); + *inputs.at(0).tensor()->trt_tensor(), + nvinfer1::ActivationType::kCLIP); layer->setAlpha(clip_value_min); layer->setBeta(clip_value_max); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); - params->converter->ProvideQuantizationRange(&output_tensor, clip_value_min, - clip_value_max); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} -#endif - -const std::unordered_map* -ActivationTypeMap() { - static auto* const m = - new std::unordered_map({ - {"Relu", nvinfer1::ActivationType::kRELU}, - {"Sigmoid", nvinfer1::ActivationType::kSIGMOID}, - {"Tanh", nvinfer1::ActivationType::kTANH}, -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - {"Elu", 
nvinfer1::ActivationType::kELU}, - {"Selu", nvinfer1::ActivationType::kSELU}, - {"Softsign", nvinfer1::ActivationType::kSOFTSIGN}, - {"Softplus", nvinfer1::ActivationType::kSOFTPLUS}, -#endif - }); - return m; -} - -Status ConvertActivation(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - auto op_pair = ActivationTypeMap()->find(node_def.op()); - if (op_pair == ActivationTypeMap()->end()) { - return errors::Unimplemented("Activation op: ", node_def.op(), - " not supported at: ", node_def.name()); - } - if (params->validation_only) return Status::OK(); - - // Start conversion. - nvinfer1::IActivationLayer* layer = - params->converter->network()->addActivation(*inputs.at(0).tensor()->trt_tensor(), - op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Set parameters. -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - if (node_def.op() == "Elu") { - layer->setAlpha(1.0f); - } else if (node_def.op() == "Selu") { - // From tensorflow/core/kernels/relu_op_functor.h - layer->setAlpha(1.7580993408473768599402175208123f); - layer->setBeta(1.0507009873554804934193349852946f); - } else if (node_def.op() == "Softplus") { - layer->setAlpha(1.0f); - layer->setBeta(1.0f); - } -#endif - ITensorProxyPtr output_tensor = layer->getOutput(0); - // Set quantization range for output when known. - if (node_def.op() == "Sigmoid") { - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 1.0f); - } else if (node_def.op() == "Tanh") { - params->converter->ProvideQuantizationRange(&output_tensor, -1.0f, 1.0f); - } else if (node_def.op() == "Softsign") { - params->converter->ProvideQuantizationRange(&output_tensor, -1.0f, 1.0f); - } - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} - -Status ConvertQuantize(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - if (node_def.op() == "FakeQuantWithMinMaxArgs") { - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - } else if (node_def.op() == "FakeQuantWithMinMaxVars") { - TF_RETURN_IF_ERROR(CheckInputsWeights( - *params, {{"input", false}, {"min", true}, {"max", true}})); - } else if (node_def.op() == "QuantizeAndDequantizeV2") { - TF_RETURN_IF_ERROR(CheckInputsWeights( - *params, {{"input", false}, {"input_min", true}, {"input_max", true}})); - } else if (node_def.op() == "QuantizeAndDequantizeV3") { - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}, - {"input_min", true}, - {"input_max", true}, - {"num_bits", true}})); - } - float min_range = 0.0f; - float max_range = 0.0f; - if (node_def.op() == "FakeQuantWithMinMaxArgs") { - // Get ranges via node attributes. - TFAttrs attrs(node_def); - if (attrs.count("min") == 0 || attrs.count("max") == 0) { - return errors::InvalidArgument("Min or max attribute not found for ", - node_def.op(), " at ", node_def.name()); - } - min_range = attrs.get("min"); - max_range = attrs.get("max"); - } else if (node_def.op() == "FakeQuantWithMinMaxVars" || - node_def.op() == "QuantizeAndDequantizeV2" || - node_def.op() == "QuantizeAndDequantizeV3") { - // Get ranges via inputs. 
- auto get_weights_value = [&inputs](int index) { - auto raw_weights = - static_cast(inputs.at(index).weights().GetValues()); - return raw_weights[0]; - }; - min_range = get_weights_value(1); - max_range = get_weights_value(2); - } else { - return errors::InvalidArgument("Unknown quantization op ", node_def.op(), - ", at ", node_def.name()); - } - if (params->validation_only) return Status::OK(); - - // Store ranges for tensor - ITensorProxyPtr input0 = inputs.at(0).tensor(); - params->converter->ProvideQuantizationRange(&input0, min_range, max_range); - // Sometimes, TRT may not quantize a tensor, either because it chooses to - // execute a higher precision kernel or because of op fusion. In these cases, - // accuracy will suffer if the model was trained to expect quantization at - // that tensor. We should consider adding a clip(tensor, min_range, max_range) - // operation here to ensure that any arbitrarily placed quantize node will - // execute as expected. However, this will negatively affect performance. If - // users train their models in a way which models inference as close as - // possible (i.e. not quantizing in place where fusion will occur), then there - // is no problem with the current implementation. - params->outputs->push_back(inputs.at(0)); - return Status::OK(); -} - -Status ConvertRelu6(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - if (params->validation_only) return Status::OK(); - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - // Use IActivationLayer for TRT >= 5.1 - nvinfer1::IActivationLayer* layer = - params->converter->network()->addActivation( - *inputs.at(0).tensor()->trt_tensor(), nvinfer1::ActivationType::kCLIP); - layer->setAlpha(0.0f); - layer->setBeta(6.0f); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 6.0f); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -#else - // Convert using min(Relu(x), 6) before TRT 5.1 - // Input Tensor - ITensorProxyPtr tensor = inputs.at(0).tensor(); - - // Relu operation i.e. Relu(x) = max(0, x) - nvinfer1::IActivationLayer* relu_layer = - params->converter->network()->addActivation( - *tensor->trt_tensor(), nvinfer1::ActivationType::kRELU); - TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name()); - - // Large range of relu is problematic during quantization in INT8 precision - // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization. - // TRT only uses dynamic ranges in INT8 precision mode, - // and this does not affect the FP32 path. - params->converter->ProvideQuantizationRange(&relu_layer->getOutput(0), 0.0f, - 6.0f); - - // Create a constant layer to store the floating point weight i.e. 6.0f - ITensorProxyPtr const6_tensor = nullptr; - TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant( - params, 6.0f, relu_layer->getOutput(0)->getDimensions(), &const6_tensor)); - - // ElementWise Min Operation - // Min op is a nop for INT8 execution path, as the input tensor - // to this layer will only have values in range [0.f, 6.0f]. 
- nvinfer1::IElementWiseLayer* relu6_layer = - params->converter->network()->addElementWise( - *relu_layer->getOutput(0), *const6_tensor->trt_tensor(), - nvinfer1::ElementWiseOperation::kMIN); - TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name()); - ITensorProxyPtr output_tensor = relu6_layer->getOutput(0); - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 6.0f); - - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -#endif -} - -Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"value", false}, {"bias", true}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - if (params->validation_only) return Status::OK(); - - ITensorProxyPtr tensor = inputs.at(0).tensor(); - const nvinfer1::Dims original_dims = tensor->getDimensions(); - TFAttrs attrs(node_def); - const string data_format = attrs.get("data_format"); - const int channel_index = - (data_format == "NHWC" ? original_dims.nbDims - 1 : 0); - - nvinfer1::Permutation permutation; - if (channel_index != 0) { - // Permute the dimensions so that the channel dimension is the first - // dimension. - for (int i = 0; i < original_dims.nbDims; ++i) { - permutation.order[i] = i; - } - permutation.order[0] = channel_index; - permutation.order[channel_index] = 0; - VLOG(1) << "ConvertBiasAdd permutation: " - << DebugString(permutation, original_dims.nbDims); - } - - // TensorRT addScale requires input to be of rank 3, we need to apply - // transpose as well as reshape. - // TODO(laigd): this doesn't match what the TRT doc says, fix the doc? - if (channel_index != 0 || original_dims.nbDims != 3) { - nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle(*tensor->trt_tensor()); - TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); - ITensorProxyPtr out_tensor = shuffle_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &out_tensor); - - // NOTE(laigd): for some reason we need to apply the reshape - // unconditionally. The default shape has nbDims==-1 and it seems the - // behavior is undefined in some cases. - nvinfer1::Dims reshape_dims; - reshape_dims.nbDims = 3; - // 0 means copying from input; -1 means inferring from the rest. - reshape_dims.d[0] = 0; - reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1; - reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1; - shuffle_layer->setReshapeDimensions(reshape_dims); - - if (channel_index != 0) { - shuffle_layer->setFirstTranspose(permutation); - } - tensor = out_tensor; - } - - TRT_ShapedWeights weights = inputs.at(1).weights(); - nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; - if (weights.shape_.d[0] == 1) { - mode = nvinfer1::ScaleMode::kUNIFORM; - } - - TRT_ShapedWeights empty_weights(weights.TrtDType()); - nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *tensor->trt_tensor(), mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(), - empty_weights.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - - ITensorProxyPtr output_tensor = layer->getOutput(0); - - // Restore transpose & reshape. 
- if (channel_index != 0 || original_dims.nbDims != 3) { - nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle(*output_tensor->trt_tensor()); - TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); - // NOTE: for same reason as mentioned above we need to apply the reshape - // unconditionally. - nvinfer1::Dims reshape_dims = original_dims; - if (channel_index != 0) { - // NOTE: according to NVIDIA dimension types are deprecated, so we don't - // need to copy them back. - reshape_dims.d[channel_index] = original_dims.d[0]; - reshape_dims.d[0] = original_dims.d[channel_index]; - } - shuffle_layer->setReshapeDimensions(reshape_dims); - - if (channel_index != 0) { - shuffle_layer->setSecondTranspose(permutation); - } - ITensorProxyPtr shuffle_tensor = shuffle_layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&output_tensor, - &shuffle_tensor); - output_tensor = shuffle_tensor; - } - - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + params->converter->SetLayerName(layer, node_def, "activation"); + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } -Status ConvertBiasAdd(OpConverterParams* params) { - if (params->precision_mode == TrtPrecisionMode::INT8 && - !params->use_calibration) { - // NOTE(laigd): based on some observation, it seems TensorRT cannot fuse - // IConvolutionLayer and IElementwiseLayer and will require range - // information for the output of Conv2D. Using IScaleLayer will fix the - // problem. - return ConvertBiasAddInt8WithoutCalibration(params); - } +Status ConvertBiasAdd(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - - if (inputs.size() != 2) { - return errors::InvalidArgument( - "BiasAdd expects exactly 2 inputs, but received ", inputs.size()); - } + TFTRT_CHECK_INPUT_SIZE(inputs.size(), 2, node_def); if (inputs[0].is_weights() && inputs[1].is_weights()) { + // TODO(lsugy): don't assume that if all inputs are weights, grappler + // should fold them, because variables are weights. return errors::InvalidArgument( "All inputs are weights, but Grappler is expected to fold them."); } @@ -3590,60 +3678,69 @@ Status ConvertBiasAdd(OpConverterParams* params) { TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - const string& data_format = attrs.get("data_format"); + string data_format; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "data_format", &data_format)); nvinfer1::Dims input_shape = inputs.at(0).GetTrtDims(); nvinfer1::Dims bias_shape = inputs.at(1).GetTrtDims(); - // If the input is NCHW, then we need to unsqueeze the bias such that its last - // dimensions are 1s (and the first dimension is C). + // The bias input arg is a 1-D tensor with length C. If the input is NCHW, + // then we need to unsqueeze the bias such that its shape is [1, C, 1, 1]. if (data_format == "NCHW") { - bias_shape.nbDims = input_shape.nbDims; - std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + if (params->use_implicit_batch) { + // The batch dim is not included in implicit batch mode, so the shape of + // the bias tensor is [C, 1, 1]. + bias_shape.nbDims = input_shape.nbDims; + std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + } else { + // In explicit batch mode we create a tensor with shape [1, C, 1, 1]. 
+ std::vector bias_shape_vec(bias_shape.d, + bias_shape.d + bias_shape.nbDims); + // Insert 1 before for batch dim + bias_shape_vec.insert(bias_shape_vec.begin(), 1); + // Trail with 1s to match input_shape size + bias_shape_vec.insert(bias_shape_vec.end(), + input_shape.nbDims - bias_shape_vec.size(), 1); + DimsAdapter(bias_shape_vec).TrtDims(&bias_shape); + } } else { // Next, broadcast the bias across the input. TF_RETURN_IF_ERROR(GetTrtBroadcastShape(inputs.at(0), inputs.at(1), /*check_feasibility=*/true, + params->use_implicit_batch, &input_shape, &bias_shape)); } // Convert input to a TRT tensor ITensorProxyPtr input_tensor{nullptr}; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), input_shape, params->validation_only, &input_tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, inputs.at(0), DimsAdapter(input_shape), + params->validation_only, &input_tensor, node_def, + /*op_instance=*/0)); // Finally, reshape bias. Since the bias is usually a constant, this will // normally happen at conversion-time. ITensorProxyPtr bias_tensor{nullptr}; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), bias_shape, params->validation_only, &bias_tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, inputs.at(1), DimsAdapter(bias_shape), + params->validation_only, &bias_tensor, node_def, + /*op_instance=*/1)); VLOG(2) << "Bias shape adjusted to " << DebugString(bias_shape); if (params->validation_only) return Status::OK(); nvinfer1::IElementWiseLayer* layer = params->converter->network()->addElementWise( - *input_tensor->trt_tensor(), *bias_tensor->trt_tensor(), nvinfer1::ElementWiseOperation::kSUM); + *input_tensor->trt_tensor(), *bias_tensor->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, "sum"); ITensorProxyPtr output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) { - if (tensor.dims() > 0) { - *dims = GetTrtDimsForTensor(tensor); - } else { - dims->nbDims = 1; - // No dimension provided. Flatten it. - dims->d[0] = tensor.NumElements(); - for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) { - dims->d[i] = 0; - } - } -} - template inline bool IsIntegerInInt32Bounds(const Input& inp) { static_assert(std::is_integral::value, @@ -3691,7 +3788,7 @@ Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, // Verify that the dtype is supported by TensorRT. Otherwise, return an error. nvinfer1::DataType trt_dtype; - TF_RETURN_IF_ERROR(TfDataTypeToTrt(converted_dtype, &trt_dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(converted_dtype, &trt_dtype)); if (tensor.NumElements() == 0) { // Return empty weights. 
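// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] Shape handling on the NCHW
// BiasAdd path above: the 1-D bias of length C is padded with 1s so that it
// broadcasts against the input. In implicit batch mode the batch dim is not
// part of the TRT shape, so the result is [C, 1, 1]; in explicit batch mode a
// leading 1 is inserted as well, giving [1, C, 1, 1]. A standalone sketch of
// that arithmetic (plain C++, illustrative names):
#include <vector>

namespace biasadd_sketch {
// input_rank is the rank of the TRT input tensor (3 for implicit-batch NCHW
// images, 4 in explicit batch mode). Returns the broadcastable bias shape.
inline std::vector<int> ExpandBiasShapeNCHW(int channels, int input_rank,
                                            bool use_implicit_batch) {
  std::vector<int> bias_shape;
  if (!use_implicit_batch) bias_shape.push_back(1);  // leading batch dim
  bias_shape.push_back(channels);                    // C
  while (static_cast<int>(bias_shape.size()) < input_rank)
    bias_shape.push_back(1);                         // trailing 1s
  return bias_shape;
}
}  // namespace biasadd_sketch
// Example: ExpandBiasShapeNCHW(64, 3, true)  -> {64, 1, 1}
//          ExpandBiasShapeNCHW(64, 4, false) -> {1, 64, 1, 1}
// [End of editorial sketch]
// ---------------------------------------------------------------------------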
@@ -3699,21 +3796,26 @@ Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, return Status::OK(); } - nvinfer1::Dims weight_dims; - GetTensorDimsWithProtoShape(tensor, &weight_dims); - *weights = weight_store->GetTempWeights(trt_dtype, weight_dims); + ::stream_executor::port::StatusOr weight_dims = + DimsAdapter::Create(tensor.shape()); + TRT_ENSURE_OK(weight_dims); + + auto tmp = weight_store->GetTempWeights(trt_dtype, + weight_dims.ValueOrDie().AsTrtDims()); + TRT_ENSURE_OK(tmp); + *weights = std::move(tmp).ValueOrDie(); // Copy the tensor directly if the tensor does not require cast to the // supported type. if (converted_dtype == dtype) { - char* dst = static_cast(weights->GetValues()); - memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes()); + std::copy_n(tensor.tensor_data().data(), tensor.TotalBytes(), + weights->GetPointer()); return Status::OK(); } Status status = Status::OK(); // Copy tensor elements after casting them to the converted DataType. - int32* dst = static_cast(weights->GetValues()); + int32* dst = weights->GetPointer(); switch (dtype) { case DT_INT8: status = CopyToTrtInt32Array(tensor, dst); @@ -3747,13 +3849,12 @@ Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, // weights to params->outputs. We did this since TrtNodeValidator needs the // weights as input to other nodes, and use it to determine whether those nodes // are supported by TRT. -Status ConvertConst(OpConverterParams* params) { +Status ConvertConst(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; if (!inputs.empty()) { return errors::InvalidArgument( - "Constant node is expected to have empty input list: ", - node_def.name()); + "Constant node is expected to have empty input list"); } // Create shaped weights as output @@ -3764,8 +3865,9 @@ Status ConvertConst(OpConverterParams* params) { node_def.name()); } - TFAttrs attrs(node_def); - const DataType dtype = attrs.get("dtype"); + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "dtype", &dtype)); + if (dtype != tensor.dtype()) { return errors::InvalidArgument("DataType mismatch between attr (", DataTypeString(dtype), ") and tensor (", @@ -3782,206 +3884,42 @@ Status ConvertConst(OpConverterParams* params) { return Status::OK(); } -Status ConvertIdentity(OpConverterParams* params) { +Status ConvertIdentity(const OpConverterParams* params) { // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT // 5.0, however once we know that it does it would be nice to use that // instead. if (params->validation_only) return Status::OK(); - params->outputs->push_back(params->inputs.at(0)); - return Status::OK(); -} - -const std::unordered_map* -BinaryOperationMap() { - static auto* const m = - new std::unordered_map { - {"Add", nvinfer1::ElementWiseOperation::kSUM}, - {"AddV2", nvinfer1::ElementWiseOperation::kSUM}, - {"Mul", nvinfer1::ElementWiseOperation::kPROD}, - {"Sub", nvinfer1::ElementWiseOperation::kSUB}, - {"Div", nvinfer1::ElementWiseOperation::kDIV}, -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - // This op applies Floor after Div. 
- {"FloorDiv", nvinfer1::ElementWiseOperation::kDIV}, -#endif - {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, - {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, - {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, - {"Pow", nvinfer1::ElementWiseOperation::kPOW}, - }; - return m; -} - -Status ConvertBinary(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - if (inputs.size() != 2) { - return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), - " inputs but expected 2, at ", - node_def.name()); - } - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - - // Constant folding should have been done by TensorFlow - if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { - return errors::Unimplemented( - "Constant folding is falled back to TensorFlow, binary op received " - "both input as constant at: ", - node_def.name()); - } - const TRT_TensorOrWeights& operand_l = inputs.at(0); - const TRT_TensorOrWeights& operand_r = inputs.at(1); - - auto op_pair = BinaryOperationMap()->find(node_def.op()); - if (op_pair == BinaryOperationMap()->end()) { - return errors::Unimplemented("Binary op ", node_def.op(), - " not supported at: ", node_def.name()); - } - - nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; - TF_RETURN_IF_ERROR( - GetTrtBroadcastShape(operand_l, operand_r, /*check_feasibility=*/true, - &broadcasted_dims_l, &broadcasted_dims_r)); - ITensorProxyPtr tensor_l = nullptr; - ITensorProxyPtr tensor_r = nullptr; - // This will also convert constants to tensors, and set quantization ranges. - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - operand_l, broadcasted_dims_l, params->validation_only, &tensor_l)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - operand_r, broadcasted_dims_r, params->validation_only, &tensor_r)); - if (params->validation_only) return Status::OK(); - - // Add ElementWise layer. - nvinfer1::ILayer* layer = params->converter->network()->addElementWise( - *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr trt_tensor = layer->getOutput(0); - -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - if (node_def.op() == "FloorDiv") { - layer = params->converter->network()->addUnary( - *trt_tensor->trt_tensor(), nvinfer1::UnaryOperation::kFLOOR); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - trt_tensor = layer->getOutput(0); - } -#endif - params->outputs->push_back(TRT_TensorOrWeights(trt_tensor)); - return Status::OK(); -} - -Status ConvertRsqrt(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - if (params->validation_only) return Status::OK(); - // TODO(tmorris): params->converter is null during validation. Allow - // precision_mode and use_calibration to be accessed during validation and - // include this check in validation. - // We will need a quantization range for intermediate tensor if not using - // calibration. 
- // - // x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x) - // ^ - // need range here - if (params->converter->precision_mode() == TrtPrecisionMode::INT8 && - !params->converter->use_calibration()) { - return errors::Unimplemented( - "Intermediate quantization range cannot be determined without" - " calibration for Rsqrt, consider replacing with " - "Sqrt -> FakeQuant -> Reciprocal ops, at ", - node_def.name()); + for (int i = 0; i < params->inputs.size(); i++) { + params->outputs->push_back(params->inputs.at(i)); } - // Start conversion. - ITensorProxyPtr tensor = inputs.at(0).tensor(); - // Sqrt - nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary( - *tensor->trt_tensor(), nvinfer1::UnaryOperation::kSQRT); - TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name()); - // Recip - nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary( - *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP); - TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name()); - params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0))); return Status::OK(); } -const std::unordered_map* -UnaryOperationMap() { - static auto* const m = - new std::unordered_map({ - {"Neg", nvinfer1::UnaryOperation::kNEG}, - {"Exp", nvinfer1::UnaryOperation::kEXP}, - {"Log", nvinfer1::UnaryOperation::kLOG}, - {"Sqrt", nvinfer1::UnaryOperation::kSQRT}, - {"Abs", nvinfer1::UnaryOperation::kABS}, - {"Reciprocal", nvinfer1::UnaryOperation::kRECIP}, -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - {"Sin", nvinfer1::UnaryOperation::kSIN}, - {"Cos", nvinfer1::UnaryOperation::kCOS}, - {"Tan", nvinfer1::UnaryOperation::kTAN}, - {"Sinh", nvinfer1::UnaryOperation::kSINH}, - {"Cosh", nvinfer1::UnaryOperation::kCOSH}, - {"Asin", nvinfer1::UnaryOperation::kASIN}, - {"Acos", nvinfer1::UnaryOperation::kACOS}, - {"Atan", nvinfer1::UnaryOperation::kATAN}, - {"Asinh", nvinfer1::UnaryOperation::kASINH}, - {"Acosh", nvinfer1::UnaryOperation::kACOSH}, - {"Atanh", nvinfer1::UnaryOperation::kATANH}, - {"Ceil", nvinfer1::UnaryOperation::kCEIL}, - {"Floor", nvinfer1::UnaryOperation::kFLOOR}, -#endif - }); - return m; -} - -Status ConvertUnary(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - auto op_pair = UnaryOperationMap()->find(node_def.op()); - if (op_pair == UnaryOperationMap()->end()) { - return errors::Unimplemented("Unary op: ", node_def.op(), - " not supported at: ", node_def.name()); - } +// This converter is a debug-only feature designed to allow graph segmentation +// experiments. Its use is being controled by +// `TF_TRT_OP_FAKELIST=OpName1,OpName2,...`. +// See `op_converter_registry.cc` for further details. +// +// This converter is designed as followed: +// - always succeed at graph segmentation time. +// - always fail at TRT Engine build time. +Status ConvertFake(const OpConverterParams* params) { if (params->validation_only) return Status::OK(); - // Start conversion. - ITensorProxyPtr tensor = inputs.at(0).tensor(); - nvinfer1::IUnaryLayer* layer = - params->converter->network()->addUnary(*tensor->trt_tensor(), op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); - - // Set quantization ranges. 
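// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] The table-driven dispatch being
// deleted above (UnaryOperationMap + ConvertUnary) follows a simple pattern: a
// static map from TF op name to a TRT enum value, plus a find-or-fail lookup.
// In this patch that role moves to the per-op converters added under
// convert/ops/ (e.g. unary_ops.cc) and the op_converter_registry. A standalone
// sketch of the lookup pattern itself (plain C++, illustrative enum and names):
#include <string>
#include <unordered_map>

namespace unary_lookup_sketch {
enum class UnaryOp { kNeg, kExp, kLog, kSqrt };  // stand-in for the TRT enum

inline const std::unordered_map<std::string, UnaryOp>& UnaryOpMap() {
  static const auto* m = new std::unordered_map<std::string, UnaryOp>{
      {"Neg", UnaryOp::kNeg},
      {"Exp", UnaryOp::kExp},
      {"Log", UnaryOp::kLog},
      {"Sqrt", UnaryOp::kSqrt},
  };
  return *m;
}

// Returns true and sets *op if the TF op name is supported.
inline bool LookupUnaryOp(const std::string& tf_op, UnaryOp* op) {
  auto it = UnaryOpMap().find(tf_op);
  if (it == UnaryOpMap().end()) return false;
  *op = it->second;
  return true;
}
}  // namespace unary_lookup_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------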
- if (node_def.op() == "Sin" || node_def.op() == "Cos") { - params->converter->ProvideQuantizationRange(&output_tensor, -1.0f, 1.0f); - } else if (node_def.op() == "Asin" || node_def.op() == "Atan") { - params->converter->ProvideQuantizationRange(&output_tensor, -M_PI_2, M_PI_2); - } else if (node_def.op() == "Acos") { - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, M_PI); - } else if (node_def.op() == "Neg" || node_def.op() == "Abs") { - // Neg and Abs will have same range as input since TRT uses symmetric - // quantization. - // TODO(tmorris): Should we infer ranges for Ceil and Floor as well? - params->converter->MarkQuantizationRangesAsInferrable(&tensor, - &output_tensor); - } - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); + return errors::Unimplemented( + "This converter is not valid after graph " + "segmentation. Building an engine using this " + "converter will trigger a native segment " + "fallback."); } -Status ConvertSquare(OpConverterParams* params) { +Status ConvertSquare(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); if (params->validation_only) return Status::OK(); // Constant 2 with same rank as input @@ -3995,40 +3933,45 @@ Status ConvertSquare(OpConverterParams* params) { *inputs.at(0).tensor()->trt_tensor(), *const2_tensor->trt_tensor(), nvinfer1::ElementWiseOperation::kPOW); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertReduce(OpConverterParams* params) { +Status ConvertReduce(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"input", false}, {"axis", true}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); ITensorProxyPtr tensor = inputs.at(0).tensor(); auto tf_axes_list = inputs.at(1).weights().GetSpan(); - TFAttrs attrs(node_def); + DataType idx_dtype{DataType::DT_INT32}; + bool keep_dims{false}; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "Tidx", &idx_dtype)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "keep_dims", &keep_dims)); + // Only expect to handle INT32 as attributes for now - if (attrs.get("Tidx") != DataType::DT_INT32) { + if (idx_dtype != DataType::DT_INT32) { return errors::Unimplemented("Tidx supports only DT_INT32"); } int axes = 0; if (tf_axes_list.size() == 0) { return errors::InvalidArgument( - "TRT cannot support reduce on all (batch) dimensions, at", - node_def.name()); + "TRT cannot support reduce on all (batch) dimensions"); } for (int i = 0; i < tf_axes_list.size(); i++) { int trt_axis; TF_RETURN_IF_ERROR( ConvertAxis(tf_axes_list[i], tensor->getDimensions().nbDims, - node_def.name(), /*use_implicit_batch=*/true, &trt_axis)); + node_def.name(), params->use_implicit_batch, &trt_axis)); axes |= (1 << trt_axis); } @@ -4044,15 +3987,14 @@ Status 
ConvertReduce(OpConverterParams* params) { } else if (node_def.op() == "Mean") { reduce_operation = nvinfer1::ReduceOperation::kAVG; } else { - return errors::Unimplemented("Op not supported ", node_def.op(), ", at ", - node_def.name()); + return errors::Unimplemented("Op not supported ", node_def.op()); } if (params->validation_only) return Status::OK(); - const auto keep_dims = attrs.get("keep_dims"); nvinfer1::ILayer* layer = params->converter->network()->addReduce( *tensor->trt_tensor(), reduce_operation, axes, keep_dims); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); @@ -4062,55 +4004,97 @@ Status ConvertReduce(OpConverterParams* params) { // converted by first expanding input tensors by adding a new dimension of size // one at the specified axis and then concatenating the tensors at the same // axis. -Status ConvertPack(OpConverterParams* params) { +Status ConvertPack(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - TFAttrs attrs(node_def); - const int num_inputs = attrs.get("N"); + int num_inputs{0}; + int64 tf_axis{0}; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "N", &num_inputs)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "axis", &tf_axis)); + if (num_inputs != inputs.size()) { return errors::InvalidArgument( - "Number of inputs for Pack is inconsistent with N attribute, at ", - node_def.name()); - } - - // Validate inputs. Values must be tensors for now. - std::vector> inputs_is_weight; + "Number of inputs for Pack is inconsistent with N attribute"); + } + + // In implicit batch mode we do not allow weight input. An input tensor with + // dims NCHW is represented with dims CHW during conversion time, and N is + // defined only during runtime. A weight is represented with dims NCHW. We + // cannot be sure that the runtime N will agree with the conversion time N, + // therefore we do not convert the pack op if it has both tensor and weight + // inputs. This restriction does not apply in explicit batch mode, in that + // case the input tensors are also represented with full dims that include the + // batch size. + TrtInputArg expected_arg = + params->use_implicit_batch ? TrtInputArg::kTensor : TrtInputArg::kBoth; + + std::vector> inputs_is_weight; + inputs_is_weight.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { - inputs_is_weight.push_back({StrCat("values_", i), false}); + inputs_is_weight.push_back({StrCat("values_", i), expected_arg}); } TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_is_weight)); - // TODO(hinsu): Enable INT32 with TensorRT version 5.1.3 after testing. - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT32}; + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); if (num_inputs > 1) { // Verify that inputs are compatible for concatenation after the expansion. TF_RETURN_IF_ERROR( VerifyShapesMatch(inputs, /*masked_dim=*/-1, node_def.name())); } + // Find the dimension of the inputs. In general inputs can have dynamic shape, + // in that case we have to use DynamicExpandDims to calculate the expanded + // dimensions. To avoid that, we try to find a weight input which is + // guaranteed to have known static shape. 
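// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] Both ConvertReduce and
// ConvertPack above now pass params->use_implicit_batch to ConvertAxis instead
// of hard-coding implicit batch. The intent of that helper, simplified: wrap
// negative TF axes, reject the batch axis in implicit batch mode, and shift
// the index down by one because TRT dims omit the batch dimension there. A
// standalone approximation (plain C++; the real helper reports errors via
// Status rather than a bool):
#include <cassert>

namespace axis_sketch {
// tf_rank includes the batch dim. Returns false for out-of-range axes or for
// the batch axis in implicit batch mode.
inline bool TfAxisToTrtAxis(int tf_axis, int tf_rank, bool use_implicit_batch,
                            int* trt_axis) {
  if (tf_axis < 0) tf_axis += tf_rank;               // wrap negative axes
  if (tf_axis < 0 || tf_axis >= tf_rank) return false;
  if (use_implicit_batch) {
    if (tf_axis == 0) return false;                  // cannot touch batch dim
    *trt_axis = tf_axis - 1;                         // TRT dims drop batch dim
  } else {
    *trt_axis = tf_axis;
  }
  return true;
}
inline void Check() {
  int a = -1;
  assert(TfAxisToTrtAxis(-1, 4, /*use_implicit_batch=*/true, &a) && a == 2);
  assert(!TfAxisToTrtAxis(0, 4, /*use_implicit_batch=*/true, &a));
}
}  // namespace axis_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------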
+ int idx = 0; + for (int i = 1; i < inputs.size(); i++) { + if (HasStaticShape(inputs.at(i).GetTrtDims())) { + idx = i; + } + } + DimsAdapter dims(inputs.at(idx).GetTrtDims()); // Convert axis from the TensorFlow format to TensorRT format. - const nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); - const int64 tf_axis = attrs.get("axis"); - int trt_axis; - TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims + 1, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + int trt_axis{0}; + TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.NumDims() + 1, node_def.name(), + params->use_implicit_batch, &trt_axis)); // Compute expanded dimensions and then reshape input tensors. - std::vector tensor_dims(dims.d, dims.d + dims.nbDims); + std::vector tensor_dims(dims.begin(), dims.end()); tensor_dims.insert(tensor_dims.begin() + trt_axis, 1); - nvinfer1::Dims expanded_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(tensor_dims, &expanded_dims)); std::vector expanded_tensors; - for (const TRT_TensorOrWeights& tensor : inputs) { + + int input_index = 0; + for (const TRT_TensorOrWeights& input : inputs) { ITensorProxyPtr expanded_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - tensor, expanded_dims, params->validation_only, &expanded_tensor)); + if (input.is_tensor() && !params->use_implicit_batch && + !HasStaticShape(dims)) { + if (!params->validation_only) { + TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims( + /*input=*/input.tensor(), + /*dims=*/dims.AsTrtDims(), + /*axis=*/trt_axis, + /*params=*/params, + /*output=*/&expanded_tensor, + /*op_instance=*/input_index)); + } + } else { + TF_RETURN_IF_ERROR(PrepareTensorForShape( + /*converter=*/params->converter, + /*input=*/input, + /*dims=*/DimsAdapter(tensor_dims), + /*validation_only=*/params->validation_only, + /*tensor=*/&expanded_tensor, + /*node_def=*/node_def, + /*op_instance=*/input_index)); + } if (!params->validation_only) { expanded_tensors.push_back(expanded_tensor); } + input_index++; } if (params->validation_only) return Status::OK(); @@ -4130,118 +4114,152 @@ Status ConvertPack(OpConverterParams* params) { static_cast(trt_expanded_tensors.data()), expanded_tensors.size()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, "concat"); // Note that trt_axis stays the same even after expanding tensors at the axis. layer->setAxis(trt_axis); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } -Status ConvertPad(OpConverterParams* params) { +Status ConvertPad(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"tensor", false}, {"paddings", true}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT8})); // Implement tensor binaryOp weight [channel wise] for now; ITensorProxyPtr tensor = inputs.at(0).tensor(); const auto dims = tensor->getDimensions(); // Restore implicit batch dimension - const int nb_dims = dims.nbDims + 1; + const int nb_dims = + params->use_implicit_batch ? dims.nbDims + 1 : dims.nbDims; + // TODO(tfeher): Support nb_dims < 4 by inserting extra dimensions to the + // original input. 
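// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] As the comment at the top of
// this converter says, Pack is lowered as ExpandDims on every input followed
// by a concatenation along the new axis. The shape arithmetic, standalone
// (plain C++, illustrative names):
#include <vector>

namespace pack_sketch {
// Shape of one input after inserting a 1 at trt_axis.
inline std::vector<int> ExpandDims(const std::vector<int>& dims, int trt_axis) {
  std::vector<int> out(dims);
  out.insert(out.begin() + trt_axis, 1);
  return out;
}
// Shape of Pack over num_inputs identically shaped tensors along trt_axis.
inline std::vector<int> PackedShape(const std::vector<int>& dims, int trt_axis,
                                    int num_inputs) {
  std::vector<int> out = ExpandDims(dims, trt_axis);
  out[trt_axis] = num_inputs;  // concatenation grows the new axis
  return out;
}
}  // namespace pack_sketch
// Example: PackedShape({2, 3}, /*trt_axis=*/1, /*num_inputs=*/4) -> {2, 4, 3}.
// [End of editorial sketch]
// ---------------------------------------------------------------------------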
+ if (nb_dims < 4) { + return errors::InvalidArgument("Convertpad requires at least 4D input"); + } TRT_ShapedWeights pads = inputs.at(1).weights(); - TFAttrs attrs(node_def); - // Padding type here is done through TF type - // so I can leverage their EnumToDataType for my cast - auto padding_type = attrs.get("Tpaddings"); // TODO(jie): handle data type conversion for TRT? + DataType padding_dtype{DataType::DT_INT32}; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "Tpaddings", &padding_dtype)); - if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2) { - return errors::InvalidArgument( - "Pad only supports explicit padding on 4 dimensional tensor, at ", - node_def.name()); + if (pads.Shape().dim(0) != nb_dims || pads.Shape().dim(1) != 2) { + return errors::InvalidArgument("Paddings must be a weight with shape ", + "[n, 2], where n is the rank of input ", + "tensor"); } // Only expect to handle INT32 as attributes for now - if (padding_type != DataType::DT_INT32) { + if (padding_dtype != DataType::DT_INT32) { return errors::Unimplemented("Tpaddings supports only DT_INT32"); } - auto pad_data = static_cast(pads.GetValues()); + auto pad_data = pads.GetPointer(); - std::vector pad_index; + std::vector tf_pad_index; for (int i = 0; i < nb_dims; i++) { if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0) { - pad_index.push_back(i); + tf_pad_index.push_back(i); } } // No padding at all, we should exit - if (pad_index.empty()) { + if (tf_pad_index.empty()) { params->outputs->push_back(inputs.at(0)); return Status::OK(); } - // Only supports padding on less than 2 axis GIE-2579 - if (pad_index.size() > 2) { + // TRT pad layer can only support padding on up to 2 dimensions (TRT-2579). + // TODO(tfeher): Use multiple TRT pad layers to support padding on more than 2 + // dimensions. + if (tf_pad_index.size() > 2) { return errors::InvalidArgument( "Padding layer does not support padding on > 2"); } // Padding on batch dimension is not supported - if (pad_index[0] == 0) { + if (params->use_implicit_batch && tf_pad_index[0] == 0) { return errors::InvalidArgument( "Padding layer does not support padding on batch dimension"); } - // Not doing the legit thing here. ignoring padding on dim 1 and 3; - // TODO(jie): implement pad as uff parser - if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3) { - return errors::Unimplemented( - "Padding layer does not support padding on dimension 1 and 3 yet"); - } if (params->validation_only) return Status::OK(); - bool legit_pad = true; + // TRT can only do the padding at the last two dimensions. We transpose the + // input tensor if needed. + bool transposed_pad = false; + std::vector transpose_idx(nb_dims); + std::iota(transpose_idx.begin(), transpose_idx.end(), 0); + + // trt_pad_index denotes the actual idx where the padding is performed by TRT. + std::vector trt_pad_index{nb_dims - 2, nb_dims - 1}; + + // How many zeros are padded at the last two dimensions. 
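// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] TRT's IPaddingLayer only pads
// the last two dimensions, so the index bookkeeping that follows maps the (at
// most two) TF axes that carry padding onto those last two axes, transposing
// the input first when needed. A standalone version of that bookkeeping
// (plain C++, mirroring the converter's logic in simplified form):
#include <numeric>
#include <utility>
#include <vector>

namespace pad_sketch {
struct PadPlan {
  std::vector<int> transpose_idx;         // permutation applied before padding
  int pre[2] = {0, 0}, post[2] = {0, 0};  // H/W padding for IPaddingLayer
  bool needs_transpose = false;
};

// pad_data holds [before, after] pairs per dim; tf_pad_index lists the dims
// (at most two, in ascending order) that actually have nonzero padding.
inline PadPlan MakePadPlan(int nb_dims, const std::vector<int>& pad_data,
                           const std::vector<int>& tf_pad_index) {
  PadPlan plan;
  plan.transpose_idx.resize(nb_dims);
  std::iota(plan.transpose_idx.begin(), plan.transpose_idx.end(), 0);
  std::vector<int> trt_pad_index = {nb_dims - 2, nb_dims - 1};
  std::vector<int> pre_post_index = {0, 1};
  if (tf_pad_index.size() == 1 && tf_pad_index[0] == nb_dims - 1) {
    trt_pad_index[0] = nb_dims - 1;  // pad only the last dim, no swap needed
    pre_post_index[0] = 1;
  }
  if (tf_pad_index.size() == 2 && tf_pad_index[1] == nb_dims - 2) {
    std::swap(trt_pad_index[0], trt_pad_index[1]);  // avoid a second swap
    std::swap(pre_post_index[0], pre_post_index[1]);
  }
  for (size_t i = 0; i < tf_pad_index.size(); ++i) {
    const int tf_index = tf_pad_index[i];
    const int trt_index = trt_pad_index[i];
    const int k = pre_post_index[i];
    plan.pre[k] = pad_data[tf_index * 2];
    plan.post[k] = pad_data[tf_index * 2 + 1];
    if (tf_index != trt_index) {
      plan.needs_transpose = true;
      std::swap(plan.transpose_idx[tf_index], plan.transpose_idx[trt_index]);
    }
  }
  return plan;
}
}  // namespace pad_sketch
// Example (matches the comment below): nb_dims=4, tf_pad_index={1, 2} gives
// trt_pad_index={3, 2} and transpose_idx={0, 3, 2, 1}.
// [End of editorial sketch]
// ---------------------------------------------------------------------------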
nvinfer1::DimsHW pre_padding(0, 0); nvinfer1::DimsHW post_padding(0, 0); - std::vector permuted_pad_index(pad_index); - if (pad_index[0] == 1) { - legit_pad = false; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 2, 1}, &tensor)); - permuted_pad_index[0] = 3; - } - - for (size_t i = 0; i < pad_index.size(); i++) { - int index = pad_index[i]; - if (permuted_pad_index[i] == 2) { - pre_padding.h() = pad_data[index * 2]; - post_padding.h() = pad_data[index * 2 + 1]; - } else if (permuted_pad_index[i] == 3) { - pre_padding.w() = pad_data[index * 2]; - post_padding.w() = pad_data[index * 2 + 1]; + // Dimension to set in the pre_padding and post_padding array. + std::vector trt_pre_post_padding_index{0, 1}; + + // Two special cases where we can avoid permutations. + if (tf_pad_index.size() == 1 && tf_pad_index[0] == nb_dims - 1) { + // Only one dimension needs to be padded. We store its index at + // trt_pad_index[0]. We ignore trt_pad_index[1]. + trt_pad_index[0] = nb_dims - 1; + trt_pre_post_padding_index[0] = 1; + } + if (tf_pad_index.size() == 2 && tf_pad_index[1] == nb_dims - 2) { + // tf_pad_index only has two values that are in ascending order. If + // tf_pad_index[1] is nb_dims-2, then swapping the two values in + // trt_pad_index here makes it possible to only swap one pair of dimensions + // (swap tf_pad_index[0] with nb_dims-1) in the input tensor. Otherwise, we + // would have to swap two pairs of dimensions in the input tensor: + // (tf_pad_index[0] with nb_dims-2) and (tf_pad_index[1], with nb_dims-1). + // Here is an example for a 4D input tensor: + // tf_pad_index = [1, 2] + // trt_pad_index = [3, 2] + // transpose_idx = [0, 3, 2, 1] + std::swap(trt_pad_index[0], trt_pad_index[1]); + std::swap(trt_pre_post_padding_index[0], trt_pre_post_padding_index[1]); + } + + for (int i = 0; i < tf_pad_index.size(); i++) { + const int tf_index = tf_pad_index[i]; + const int trt_index = trt_pad_index[i]; + const int k = trt_pre_post_padding_index[i]; + pre_padding.d[k] = pad_data[tf_index * 2]; + post_padding.d[k] = pad_data[tf_index * 2 + 1]; + if (tf_index != trt_index) { + transposed_pad = true; + std::swap(transpose_idx[tf_index], transpose_idx[trt_index]); } } + if (transposed_pad) { + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + tensor, transpose_idx, &tensor, node_def, "to_pad")); + } + nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding( *tensor->trt_tensor(), pre_padding, post_padding); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_tensor = layer->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&tensor, &output_tensor); - if (!legit_pad) { + if (transposed_pad) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 3, 2, 1}, &output_tensor)); + output_tensor, transpose_idx, &output_tensor, node_def, "from_pad")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertSplitHelper(OpConverterParams* params, +Status ConvertSplitHelper(const OpConverterParams* params, const TRT_TensorOrWeights& input, int tf_axis, int num_splits, bool squeeze_after) { const auto& node_def = params->node_def; @@ -4299,7 +4317,7 @@ Status ConvertSplitHelper(OpConverterParams* params, return Status::OK(); } -Status ConvertSplit(OpConverterParams* params) { +Status ConvertSplit(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = 
params->node_def; TF_RETURN_IF_ERROR( @@ -4317,60 +4335,101 @@ Status ConvertSplit(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(1), tf_axis, num_split, false); } -Status ConvertUnpack(OpConverterParams* params) { +Status ConvertUnpack(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"value", false}})); - TF_RETURN_IF_ERROR(AllowDataTypes(*params, { - DataType::DT_FLOAT, DataType::DT_HALF, -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - DataType::DT_INT32, -#endif - })); + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); // Input must be rank 1 or higher, since we can't unpack on axis 0. if (inputs.at(0).GetTrtDims().nbDims == 0) { return errors::Unimplemented( - "Input \"value\" for Unpack must be rank 2 or greater, at ", - node_def.name()); + "Input \"value\" for Unpack must be rank 2 or greater"); } - TFAttrs attrs(node_def); - const int tf_axis = attrs.get("axis"); - const int num = attrs.get("num"); + + int tf_axis = 0, num = 0; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "axis", &tf_axis)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "num", &num)); return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } -Status ConvertConcat(OpConverterParams* params) { +Status ConvertCast(const OpConverterParams* params) { + auto unsupport_cast_error = [&](string msg) { + return errors::Unimplemented("Cast op is not supported - ", msg); + }; + + if (isExperimentalFeatureActivated("reject_all_fp_cast_ops")) { + LOG(WARNING) << "`TF_TRT_EXPERIMENTAL_FEATURES=reject_all_fp_cast_ops`is " + << "meant as a workaround. If the Cast converter leads to any " + << "performance or accuracy regression, please open an issue " + << "on GitHub."; + return unsupport_cast_error( + "TF_TRT_EXPERIMENTAL_FEATURES=reject_all_fp_cast_ops has been defined"); + } + + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF}; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + + if (allowed_types.find(input_type) == allowed_types.end()) { + return unsupport_cast_error( + StrCat("Allowed input dtypes: [", DataTypeString(DataType::DT_FLOAT), + ", ", DataTypeString(DataType::DT_HALF), + "]. Received: ", DataTypeString(input_type))); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(params->node_def, &output_type, + kCastOutputTypeAttrName)); + + if (allowed_types.find(output_type) == allowed_types.end()) { + return unsupport_cast_error( + StrCat("Allowed output dtypes: [", DataTypeString(DataType::DT_FLOAT), + ", ", DataTypeString(DataType::DT_HALF), + "]. Received: ", DataTypeString(output_type))); + } + + return ConvertIdentity(params); +} + +Status ConvertConcat(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - TFAttrs attrs(node_def); - // Get number of tensor inputs. - const int num_inputs = attrs.get("N"); + + int num_inputs{0}; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "N", &num_inputs)); + if (num_inputs != static_cast(inputs.size()) - 1) { return errors::InvalidArgument( - "Number of inputs for ConcatV2 is inconsistent with N attribute, at ", - node_def.name()); + "Number of inputs for ConcatV2 is inconsistent with N attributes."); } - // Validate inputs. Values must be tensors for now. - std::vector> inputs_is_weight; + // Validate inputs. 
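// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] ConcatV2 carries N value inputs
// followed by a trailing scalar "axis" input, so the validation that follows
// checks that the N attribute matches inputs.size() - 1 and records the
// expected kind for each argument: in implicit batch mode the values must be
// tensors, in explicit batch mode weights are accepted too, and "axis" must
// always be a constant. A standalone sketch of that bookkeeping (plain C++,
// illustrative enum and names):
#include <string>
#include <utility>
#include <vector>

namespace concat_sketch {
enum class Kind { kTensor, kWeight, kBoth };  // stand-in for TrtInputArg

inline bool BuildExpectedInputs(
    int n_attr, int total_inputs, bool use_implicit_batch,
    std::vector<std::pair<std::string, Kind>>* out) {
  if (n_attr != total_inputs - 1) return false;  // N inconsistent with inputs
  const Kind value_kind = use_implicit_batch ? Kind::kTensor : Kind::kBoth;
  out->clear();
  out->reserve(total_inputs);
  for (int i = 0; i < n_attr; ++i) {
    out->push_back({"values_" + std::to_string(i), value_kind});
  }
  out->push_back({"axis", Kind::kWeight});  // axis must be a constant
  return true;
}
}  // namespace concat_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------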
+ std::vector> inputs_kinds; + TrtInputArg expected_input = + params->use_implicit_batch ? TrtInputArg::kTensor : TrtInputArg::kBoth; + + inputs_kinds.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { - inputs_is_weight.push_back({StrCat("values_", i), false}); + inputs_kinds.push_back({StrCat("values_", i), expected_input}); } - inputs_is_weight.push_back({"axis", true}); - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_is_weight)); - // TODO(tmorris): There is a bug with Concat and INT32 in TRT - it is supposed - // to be supported. - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + inputs_kinds.push_back({"axis", TrtInputArg::kWeight}); + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, inputs_kinds)); + + std::set allowed_types{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT32}; + + TF_RETURN_IF_ERROR(AllowDataTypes(*params, allowed_types)); const auto axis = inputs.at(num_inputs).weights().GetSpan(); if (axis.size() != 1) { - return errors::InvalidArgument("Axis for ConcatV2 must be a scalar, at ", - node_def.name()); + return errors::InvalidArgument("Axis for ConcatV2 must be a scalar"); } int trt_axis = 0; const auto dim = inputs.at(0).GetTrtDims(); TF_RETURN_IF_ERROR(ConvertAxis(axis[0], dim.nbDims, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + params->use_implicit_batch, &trt_axis)); // Check that dimensions match on non-concatenate axis. TF_RETURN_IF_ERROR(VerifyShapesMatch( absl::Span(inputs).first(num_inputs), trt_axis, @@ -4379,8 +4438,15 @@ Status ConvertConcat(OpConverterParams* params) { // Gather inputs as tensors std::vector input_tensors; + input_tensors.reserve(num_inputs); + for (int i = 0; i < num_inputs; i++) { - input_tensors.push_back(inputs.at(i).tensor()); + if (inputs.at(i).is_tensor()) { + input_tensors.push_back(inputs.at(i).tensor()); + } else { + input_tensors.push_back(params->converter->CreateConstantLayer( + inputs.at(i).weights(), inputs.at(i).GetTrtDims())); + } } std::vector trt_input_tensors; for (const auto& t : input_tensors) { @@ -4391,12 +4457,13 @@ Status ConvertConcat(OpConverterParams* params) { static_cast(trt_input_tensors.data()), input_tensors.size()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); layer->setAxis(trt_axis); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } -Status ConvertFusedBatchNorm(OpConverterParams* params) { +Status ConvertFusedBatchNorm(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, @@ -4406,41 +4473,54 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { {"variance", true}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - float epsilon = attrs.get("epsilon"); - auto data_format = attrs.get("data_format"); - if (data_format != "NCHW") { - return errors::Unimplemented( - node_def.op(), " only supports data_format=NCHW, at ", node_def.name()); - } - bool is_training = attrs.get("is_training"); + + float epsilon{0.1f}; + string data_format; + bool is_training{false}; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "epsilon", &epsilon)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "is_training", &is_training)); + if (is_training) { // Trying to use 
batchnorm in training mode is a very common problem. // Because the error message will only be printed in VLOG(1) by the // segmenter, we issue a special warning so that users will actually see it. - LOG(WARNING) << node_def.op() << " only supports is_training=false. If you " - << "are using Keras, please call " - << "keras.backend.set_learning_phase(0) before constructing " - << "your model. At " << node_def.name(); + LOG_WARNING_WITH_PREFIX + << node_def.op() << " only supports is_training=false. If you " + << "are using Keras, please call " + << "keras.backend.set_learning_phase(0) before constructing " + << "your model. At " << node_def.name(); return errors::Unimplemented(node_def.op(), - " only supports is_training=false, at ", - node_def.name()); + " only supports is_training=false"); } ITensorProxyPtr tensor = inputs.at(0).tensor(); - + if (!params->use_implicit_batch) { + // This check is to make sure that channel dimension is known during + // conversion. + // + // We check this only in explicit batch mode and reject an op with unknown + // channel dimension during segmentation. In implicit batch mode we have + // known shapes during conversion even though the shapes may not be known + // during segmentation (see the actual argument for input_shapes when + // ConvertGraphDefToEngine is called from TRTEngineOp::BuildEngine). + int channel_dim = (data_format == "NCHW" ? 1 : 3); + if (tensor->getDimensions().d[channel_dim] == -1) { + return errors::InvalidArgument("Channel dimension must be static"); + } + } // Check parameter types auto parameter_type = inputs.at(1).weights().TrtDType(); if ((parameter_type != nvinfer1::DataType::kFLOAT) && (parameter_type != nvinfer1::DataType::kHALF)) { return errors::Unimplemented( - "Only float32 or float16 weight data type is supported, for node ", - node_def.name(), " got ", DebugString(parameter_type)); + "Only float32 or float16 weight data type is supported,", " got ", + DebugString(parameter_type)); } for (int i = 1; i < 5; i++) { if (inputs.at(i).weights().TrtDType() != parameter_type) { return errors::Unimplemented( - "Inconsistent parameter type for batchnorm is not supported, at: " + - node_def.name()); + "Inconsistent parameter type for batchnorm is not supported"); } } @@ -4454,35 +4534,34 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { if (inputs.at(i).weights().count() == nweight) { ptr_shape_weights = &(inputs.at(i).weights()); } else if (inputs.at(i).weights().count() != 1) { - return errors::InvalidArgument( - "Inconsistent batchnorm parameter count, at: " + node_def.name()); + return errors::InvalidArgument("Inconsistent batchnorm parameter count"); } } if (params->validation_only) return Status::OK(); // We could technically have two weights with different shape. 
// that requires two addScale op, arguably less performant - TRT_ShapedWeights combined_scale_weights = + ::stream_executor::port::StatusOr combined_scale_weights = params->weight_store->GetTempWeights(*ptr_shape_weights); - TRT_ShapedWeights combined_offset_weights = + TRT_ENSURE_OK(combined_scale_weights); + ::stream_executor::port::StatusOr combined_offset_weights = params->weight_store->GetTempWeights(*ptr_shape_weights); + TRT_ENSURE_OK(combined_offset_weights); const Eigen::half* cast_vals_array[4]; const float* vals_array[4]; for (int j = 0; j < 4; j++) { - cast_vals_array[j] = - static_cast(inputs.at(j + 1).weights().GetValues()); - vals_array[j] = - static_cast(inputs.at(j + 1).weights().GetValues()); + cast_vals_array[j] = inputs.at(j + 1).weights().GetPointer(); + vals_array[j] = inputs.at(j + 1).weights().GetPointer(); } Eigen::half* cast_combined_scale_vals = - static_cast(combined_scale_weights.GetValues()); + combined_scale_weights.ValueOrDie().GetPointer(); Eigen::half* cast_combined_offset_vals = - static_cast(combined_offset_weights.GetValues()); + combined_offset_weights.ValueOrDie().GetPointer(); float* combined_scale_vals = - static_cast(combined_scale_weights.GetValues()); + combined_scale_weights.ValueOrDie().GetPointer(); float* combined_offset_vals = - static_cast(combined_offset_weights.GetValues()); + combined_offset_weights.ValueOrDie().GetPointer(); for (size_t i = 0; i < nweight; ++i) { float batchnorm_data[4]; @@ -4491,15 +4570,13 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { if (parameter_type == nvinfer1::DataType::kFLOAT) { batchnorm_data[j] = vals_array[j][i]; } else if (parameter_type == nvinfer1::DataType::kHALF) { - batchnorm_data[j] = - Eigen::half_impl::half_to_float(cast_vals_array[j][i]); + batchnorm_data[j] = static_cast(cast_vals_array[j][i]); } } else { if (parameter_type == nvinfer1::DataType::kFLOAT) { batchnorm_data[j] = vals_array[j][0]; } else if (parameter_type == nvinfer1::DataType::kHALF) { - batchnorm_data[j] = - Eigen::half_impl::half_to_float(cast_vals_array[j][0]); + batchnorm_data[j] = static_cast(cast_vals_array[j][0]); } } } @@ -4518,41 +4595,64 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { } } - nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM - : nvinfer1::ScaleMode::kCHANNEL; - nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *tensor->trt_tensor(), mode, combined_offset_weights.GetTrtWeights(), - combined_scale_weights.GetTrtWeights(), - dummy_power_weights.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - ITensorProxyPtr output_tensor = layer->getOutput(0); + ITensorProxyPtr output_tensor; + + if (data_format == "NCHW") { + // IScaleLayer CHANNEL mode requires NCHW format. + nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; + nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( + *tensor->trt_tensor(), mode, + combined_offset_weights.ValueOrDie().GetTrtWeights(), + combined_scale_weights.ValueOrDie().GetTrtWeights(), + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); + output_tensor = layer->getOutput(0); + } + if (data_format == "NHWC") { + // nweight is the number of channels. TensorRT IElementWiseLayer supports + // implicit broadcasting for dimensions of size 1. 
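// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] The loop above gathers the four
// per-channel parameters (scale/gamma, offset/beta, mean, variance) so that
// they can be folded into a single affine transform. The standard folding,
// which is what the combined_scale/combined_offset buffers end up holding, is
//   combined_scale  = gamma / sqrt(variance + epsilon)
//   combined_offset = beta - mean * combined_scale
// so that batchnorm(x) == x * combined_scale + combined_offset. Standalone
// check of that algebra (plain C++):
#include <cassert>
#include <cmath>

namespace batchnorm_sketch {
inline void FoldBatchNorm(float gamma, float beta, float mean, float variance,
                          float epsilon, float* combined_scale,
                          float* combined_offset) {
  *combined_scale = gamma / std::sqrt(variance + epsilon);
  *combined_offset = beta - mean * (*combined_scale);
}
inline void Check() {
  const float gamma = 1.5f, beta = -0.25f, mean = 0.3f, var = 4.0f, eps = 1e-3f;
  float scale = 0.0f, offset = 0.0f;
  FoldBatchNorm(gamma, beta, mean, var, eps, &scale, &offset);
  const float x = 2.0f;
  const float reference = gamma * (x - mean) / std::sqrt(var + eps) + beta;
  assert(std::abs((x * scale + offset) - reference) < 1e-5f);
}
}  // namespace batchnorm_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------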
+ nvinfer1::Dims dims = tensor->getDimensions(); + for (int i = 0; i < dims.nbDims - 1; i++) { + dims.d[i] = 1; + } + dims.d[dims.nbDims - 1] = nweight; + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + TRT_ENSURE_OK(builder); + auto scale_constant_layer = builder.ValueOrDie().WeightsToConstant( + combined_scale_weights.ValueOrDie().GetTrtWeights(), dims); + ITensorProxyPtr scale_constant = + scale_constant_layer.ValueOrDie()->getOutput(0); + auto scale_layer = builder.ValueOrDie().Mul(tensor->trt_tensor(), + scale_constant->trt_tensor()); + auto offset_constant_layer = builder.ValueOrDie().WeightsToConstant( + combined_offset_weights.ValueOrDie().GetTrtWeights(), dims); + ITensorProxyPtr offset_constant = + offset_constant_layer.ValueOrDie()->getOutput(0); + auto offset_layer = builder.ValueOrDie().Add( + scale_layer.ValueOrDie()->getOutput(0), offset_constant->trt_tensor()); + output_tensor = offset_layer.ValueOrDie()->getOutput(0); + } + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertGather(OpConverterParams* params) { +Status ConvertGather(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; // TODO(tmorris): Use CheckInputsWeights by changing bool to enum with an // option for an input to be either tensor or weight. - if (inputs.size() != 3) { - return errors::InvalidArgument("GatherV2 got ", inputs.size(), - " inputs but expected 3, at ", - node_def.name()); - } + TF_RETURN_IF_ERROR( + CheckInputsWeights(*params, {{"params", TrtInputArg::kBoth}, + {"indices", TrtInputArg::kBoth}, + {"axis", TrtInputArg::kWeight}})); + const auto& params_input = inputs.at(0); const auto& indices_input = inputs.at(1); const auto& axis_input = inputs.at(2); - if (!axis_input.is_weights()) { - return errors::Unimplemented( - "The input \"axis\" for GatherV2 must be a constant, at ", - node_def.name()); - } - if (!indices_input.is_tensor()) { - return errors::Unimplemented( - "The input \"indices\" for GatherV2 must be a tensor, at ", - node_def.name()); - } TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}, @@ -4562,280 +4662,407 @@ Status ConvertGather(OpConverterParams* params) { absl::Span axis = axis_input.weights().GetSpan(); if (axis.size() != 1) { - return errors::InvalidArgument("Axis for GatherV2 must be a scalar, at ", - node_def.name()); + return errors::InvalidArgument("Axis for GatherV2 must be a scalar"); } + int trt_axis = 0; - TF_RETURN_IF_ERROR(ConvertAxis(axis[0], params_input.GetTrtDims().nbDims, - node_def.name(), params_input.is_tensor(), - &trt_axis)); - if (params_input.is_weights() && trt_axis != 0) { + TF_RETURN_IF_ERROR(ConvertAxis( + axis[0], params_input.GetTrtDims().nbDims, node_def.name(), + params->use_implicit_batch && params_input.is_tensor(), &trt_axis)); + if (params->use_implicit_batch && params_input.is_weights() && + trt_axis != 0) { return errors::Unimplemented( "The input axis must be zero when params is a weight."); } - if (params_input.is_tensor() && indices_input.batch_size() != 1) { + if (params->use_implicit_batch && + (params_input.is_tensor() == indices_input.is_tensor()) && + (indices_input.batch_size() != 1 || params_input.batch_size() != 1)) { return errors::Unimplemented( - "Indices must have a batch size of 1 when params is a tensor."); + "Params and indices must have a batch size of 1 when params and 
indices" + " are both tensors or both constants."); } + + auto get_rank = [params](const auto& input) { + return input.GetTrtDims().nbDims + + (params->use_implicit_batch && input.is_tensor() ? 1 : 0); + }; // Both input are tensors, and the TF gather result will have rank: // (params.nbDims + 1) + (indices.nbDims + 1) - 1, // where "+ 1" adds the batch dim. If params is a weight, the TRT rank matches // the TF rank so we don't have to add + 1. - const int params_tf_rank = - params_input.GetTrtDims().nbDims + (params_input.is_tensor() ? 1 : 0); - const int indices_tf_rank = indices_input.GetTrtDims().nbDims + 1; + const int params_tf_rank = get_rank(params_input); + const int indices_tf_rank = get_rank(indices_input); const int tf_gather_output_rank = params_tf_rank + indices_tf_rank - 1; - if (tf_gather_output_rank > nvinfer1::Dims::MAX_DIMS + 1) { + if (tf_gather_output_rank > + nvinfer1::Dims::MAX_DIMS + (params->use_implicit_batch ? 1 : 0)) { return errors::InvalidArgument( "Result of gather has dimension greater than ", nvinfer1::Dims::MAX_DIMS + 1); } - if (params->validation_only) return Status::OK(); - // Convert params to tensor is it is a weight. - ITensorProxyPtr params_tensor = nullptr; - if (params_input.is_weights()) { - params_tensor = params->converter->CreateConstantLayer( - params_input.weights(), params_input.GetTrtDims()); - } else { - params_tensor = params_input.tensor(); + int32 batch_dims; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "batch_dims", &batch_dims)); + if (params->use_implicit_batch && batch_dims) { + return errors::InvalidArgument( + "batch_dims must be zero in implicit batch mode"); + } + if (!params->use_implicit_batch && batch_dims > 1) { + return errors::InvalidArgument( + "batch_dims cannot exceed 1 in dynamic shape mode"); } + if (params->validation_only) return Status::OK(); + + // Convert input or indices to tensor if it is a constant. + auto populate_tensor = [params](const auto& input) -> ITensorProxyPtr { + ITensorProxyPtr result_tensor = nullptr; + + if (input.is_weights()) { + result_tensor = params->converter->CreateConstantLayer( + input.weights(), input.GetTrtDims()); + } else { + result_tensor = input.tensor(); + } + + return result_tensor; + }; + + ITensorProxyPtr params_tensor = populate_tensor(params_input); + ITensorProxyPtr indices_tensor = populate_tensor(indices_input); + // Note on how IGatherLayer works: if both the data and indices tensors have // a batch size dimension of size N, it performs: // for batchid in xrange(N): // output[batchid, a0, ..., an, i, ..., j, b0, ..., bn] = ( // data[batchid, a0, ..., an, indices[batchid, i, ..., j] b0, ..., bn]) nvinfer1::IGatherLayer* layer = params->converter->network()->addGather( - *params_tensor->trt_tensor(), *indices_input.tensor()->trt_tensor(), trt_axis); + *params_tensor->trt_tensor(), *indices_tensor->trt_tensor(), trt_axis); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); + layer->setNbElementWiseDims(batch_dims); ITensorProxyPtr output_tensor = layer->getOutput(0); nvinfer1::Dims trt_gather_output_dims = output_tensor->getDimensions(); - // Note for the "- 2": one is for the output batch dim encapsulated by TF-TRT, - // and the other is for the output dimension that is squeezed by IGatherLayer - // because of the implicit batch dim in the indices (see the above note). - const int expected_trt_output_rank = - tf_gather_output_rank - (params_input.is_tensor() ? 
2 : 1); - if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { - return errors::Internal( - "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", - expected_trt_output_rank, - ", actual nbDims: ", trt_gather_output_dims.nbDims); + + if (params->use_implicit_batch) { + // Note for the "- 2": one is for the output batch dim encapsulated by + // TF-TRT, and the other is for the output dimension that is squeezed by + // IGatherLayer because of the implicit batch dim in the indices (see the + // above note). + const int expected_trt_output_rank = tf_gather_output_rank - + (params_input.is_tensor() ? 1 : 0) - + (indices_input.is_tensor() ? 1 : 0); + + if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { + return errors::Internal( + "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", + expected_trt_output_rank, + ", actual nbDims: ", trt_gather_output_dims.nbDims); + } } // Reshape the output so after adding the implicit batch dim it'll match the // output shape of TF GatherV2. - if (params_input.is_tensor()) { + if (params->use_implicit_batch && params_input.is_tensor() && + indices_input.is_tensor()) { for (int i = trt_gather_output_dims.nbDims; i > trt_axis; --i) { trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i - 1]; } trt_gather_output_dims.d[trt_axis] = 1; ++trt_gather_output_dims.nbDims; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(output_tensor), trt_gather_output_dims, - /*validation_only=*/false, &output_tensor)); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(output_tensor), + trt_gather_output_dims, + /*validation_only=*/false, &output_tensor, node_def)); + } + + // When input and indices are both constants, for the supported cases, reshape + // output so that after removing the implicit batch dim it will match the + // output shape of TF GatherV2 op. + if (params->use_implicit_batch && params_input.is_weights() && + indices_input.is_weights()) { + for (int i = trt_axis; i < trt_gather_output_dims.nbDims - 1; ++i) { + trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i + 1]; + } + + // Squeeze the implicit batch dimension out. Note: this works only + // when batch size for both inputs and indices are 1. + --trt_gather_output_dims.nbDims; + + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(output_tensor), + trt_gather_output_dims, + /*validation_only=*/false, &output_tensor, node_def)); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } -Status ConvertFullyConnectedHelper(OpConverterParams* params, - ITensorProxyPtr tensor_a, - TRT_ShapedWeights weights_b, - bool transpose_b, const string& node_name) { - // Reshape input to 3D - this will be a no-op unless using int8 precision. - auto input_dim = tensor_a->getDimensions(); - while (input_dim.nbDims < 3) { - input_dim.d[input_dim.nbDims++] = 1; +// Converts the input matrix multiplication node to a fully connected (FC) layer +// if possible, as the FC layer has more tactics and INT implementations. +// Returns the output ITensor* if the node is converted or nullptr if conversion +// is not possible. An error status indicates internal problems during +// conversion. 
+::stream_executor::port::StatusOr ConvertFullyConnectedImpl( + const OpConverterParams* params, TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, bool transpose_b) { + if (!(!transpose_a && input_a.is_tensor() && input_b.is_weights())) { + VLOG(2) << "Not FC compatible, A must be non transposed tensor, and B " + "must be constant."; + return ITensorProxyPtr(nullptr); + } + + if (!params->use_implicit_batch && input_b.GetTrtDims().nbDims > 2 && + input_b.GetTrtDims().d[0] != 1) { + // Implicit broadcasting, if needed, has already been considered to + // transform the inputs and ensure the two operands have the same rank here. + // If the inputs have rank >= 3, then d[0] is the explicit batch dimension. + // The weight (input_b) must have batch size 1 in implicit batch mode. + VLOG(2) << "Not FC compatible, if B has an explicit batch dimension, then " + "it must be 1."; + return ITensorProxyPtr(nullptr); + } + + nvinfer1::Dims input_dim = input_a.GetTrtDims(); + if (input_dim.d[input_dim.nbDims - 1] == -1) { + VLOG(2) << "Not FC compatible, last dim of A must be static."; + return ITensorProxyPtr(nullptr); + } + + if (input_dim.nbDims + 2 > nvinfer1::Dims::MAX_DIMS) { + VLOG(2) << "Not FC compatible, cannot expand A's shape."; + return ITensorProxyPtr(nullptr); + } + + // Add two trailing 1's because FC layer combines the last three dims. + ITensorProxyPtr tensor_a = nullptr; + + // Initialize the elements of reshap_dim to 0. A value 0 in + // reshape_dim(i) will preserve the i-th dimension value from the shape of + // input_a. Add two trailing dimensions of size 1. + auto reshape_dim = DimsAdapter(input_dim.nbDims, + DimsAdapter::StorageType(input_dim.nbDims, 0)) + .Append(1) + .Append(1); + + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, input_a, reshape_dim, + /*validation_only=*/false, &tensor_a, node_def, /*op_instance=*/0, + /*origin_node_name=*/"FULLY_CONNECTED")); + + VLOG(2) << "New shape of A " << DebugString(tensor_a->getDimensions()); + + TRT_ShapedWeights weights_b = input_b.weights(); + TRT_ShapedWeights weights_2D(weights_b); + if (weights_b.Shape().NumDims() > 2) { + // Combine first nbDims-1 dims into a single dim, e.g. for a 4D tensor we + // transform [N, H, W, C] -> [N*H*W, C]. This is only valid if all batch + // dimensions are 1. + if (std::any_of(weights_b.Shape().begin(), + weights_b.Shape().begin() + weights_b.Shape().NumDims() - 2, + [](int d) { return d != 1; })) { + VLOG(2) << "Not FC compatible, B has a batch dim larger than 1"; + return ITensorProxyPtr(nullptr); + } + int k = weights_b.Shape().dim(weights_b.Shape().NumDims() - 1); + nvinfer1::Dims dims{2, {static_cast(weights_b.count() / k), k}}; + TF_RETURN_IF_ERROR(weights_2D.SetShape(dims)); } - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(tensor_a), input_dim, /*validation_only=*/false, - &tensor_a)); // FC layer will transpose weights, so we need to pre-transpose. 
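// ---------------------------------------------------------------------------
// [Editorial sketch -- not part of this patch] The ReorderCKtoKC call below is
// a plain row-major 2-D transpose: the TF weight is laid out as [C, K]
// (inputs x outputs) while the fully connected layer expects [K, C]. A
// standalone sketch of the reorder (plain C++, illustrative name):
#include <vector>

namespace fc_sketch {
// src is row-major with shape [c, k]; returns row-major data with shape [k, c].
inline std::vector<float> TransposeCKtoKC(const std::vector<float>& src, int c,
                                          int k) {
  std::vector<float> dst(src.size());
  for (int i = 0; i < c; ++i) {
    for (int j = 0; j < k; ++j) {
      dst[j * c + i] = src[i * k + j];
    }
  }
  return dst;
}
}  // namespace fc_sketch
// [End of editorial sketch]
// ---------------------------------------------------------------------------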
- TRT_ShapedWeights weights(weights_b.TrtDType()); + TRT_ShapedWeights weights(weights_2D.TrtDType()); if (!transpose_b) { - weights = params->weight_store->GetTempWeights(weights_b); - ReorderCKtoKC(weights_b, &weights); + auto tmp = params->weight_store->GetTempWeights(weights_2D); + TRT_ENSURE_OK(tmp); + weights = std::move(tmp).ValueOrDie(); + ReorderCKtoKC(weights_2D, &weights); } else { - weights = weights_b; + weights = weights_2D; } TRT_ShapedWeights biases(weights.TrtDType()); - const int noutput = weights.shape_.d[0]; + int k = weights.Shape().dim(weights.Shape().NumDims() - 1); + const int noutput = weights.count() / k; + VLOG(2) << "Using fully connected layer with k=" << k + << ", n_output=" << noutput + << " weights shape: " << weights.Shape().DebugString() + << " to convert " << node_def.op(); nvinfer1::IFullyConnectedLayer* layer = params->converter->network()->addFullyConnected( - *tensor_a->trt_tensor(), noutput, weights.GetTrtWeights(), biases.GetTrtWeights()); + *tensor_a->trt_tensor(), noutput, weights.GetTrtWeights(), + biases.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_tensor = layer->getOutput(0); - // Reshape output to 1D - this will be a no-op unless using int8 precision. + // A fully connected layer produces output with two trailing singleton + // dimensions. We remove these. auto output_dim = output_tensor->getDimensions(); - output_dim.nbDims = 1; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(output_tensor), output_dim, /*validation_only=*/false, - &output_tensor)); - - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} - -Status ConvertMatMulHelper(OpConverterParams* params, - TRT_TensorOrWeights input_a, - TRT_TensorOrWeights input_b, bool transpose_a, - bool transpose_b, string node_name) { - // TODO: ReorderCKtoKC is currently not general enough to transpose weights - // that are not 2D. - if ((transpose_a && input_a.is_weights() && - input_a.GetTrtDims().nbDims != 2) || - (transpose_b && input_b.is_weights() && - input_b.GetTrtDims().nbDims != 2)) { - return errors::InvalidArgument( - "Cannot currently transpose constant input if it is not 2 dimensional"); - } - - // If A is a tensor, we can only transpose if it is at least 3D in TF, - // or TRT will not do the correct transposition. - if (transpose_a && input_a.is_tensor() && input_a.GetTrtDims().nbDims < 2) { - return errors::InvalidArgument( - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions."); + output_dim.nbDims -= 2; + // A zero in output_dim indicates copying the corresponding input dimension + // value during reshape. + std::fill(output_dim.d, output_dim.d + output_dim.nbDims, 0); + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params->converter, TRT_TensorOrWeights(output_tensor), output_dim, + /*validation_only=*/false, &output_tensor, node_def, + /*op_instance=*/1, /*origin_node_name=*/"FULLY_CONNECTED")); + return output_tensor; +} + +::stream_executor::port::StatusOr ConvertMatMulImpl( + const OpConverterParams* params, TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, bool transpose_b) { + if (params->use_implicit_batch) { + // In implicit batch mode we are very limited when can we multiply 2D + // matrices. If input_A is a 2D tensor, then nbDims==1 (implicit batch dim + // not counted). 
If A is not transposed and B is weight, then we can convert + // this treating A as a batch of vectors. This is the only possibility + // to implement MatMul with 2D input in implicit batch mode. + if ((input_a.GetTrtDims().nbDims < 2 && + (transpose_a || !input_b.is_weights())) || + (input_b.GetTrtDims().nbDims < 2)) { + return errors::InvalidArgument( + "MatMul with 2D tensors requires explicit batch mode, or that tensor" + " A is not transposed and B is a constant tensor."); + } } - // If B is a tensor, then it must be at least 3D in TF, - // or TRT won't be able to handle the multiply correctly. - if (input_b.is_tensor() && input_b.GetTrtDims().nbDims < 2) { - return errors::InvalidArgument( - "Second input must either be a constant, or contain at least 2 " - "non-batch dimensions."); - } - if (params->validation_only) return Status::OK(); + if (params->validation_only) return ITensorProxyPtr(nullptr); - // If an FC layer can be used and would be faster, use that instead. - const bool can_use_fc = - !transpose_a && input_a.is_tensor() && input_b.is_weights(); - const bool should_use_fc = can_use_fc && input_a.GetTrtDims().nbDims >= 3 && - input_b.GetTrtDims().nbDims == 2; - // If int8 is specified, FC must be used unless it is not compatible, as MM - // does not support int8 at this time. - if (should_use_fc || (can_use_fc && params->converter->precision_mode() == - TrtPrecisionMode::INT8)) { - return ConvertFullyConnectedHelper( - params, input_a.tensor()->trt_tensor(), input_b.weights(), transpose_b, node_name); + ::stream_executor::port::StatusOr result = + ConvertFullyConnectedImpl(params, input_a, input_b, transpose_a, + transpose_b); + TF_RETURN_IF_ERROR(result.status()); + ITensorProxyPtr output = result.ValueOrDie(); + if (*output) { + // FC conversion was successful, we can return. + return output; } - - const auto get_matrix_op = [](ITensorProxyPtr in, - bool transpose) -> nvinfer1::MatrixOperation { - return (in->getDimensions().nbDims < 2) - ? nvinfer1::MatrixOperation::kVECTOR - : (transpose) ? nvinfer1::MatrixOperation::kTRANSPOSE - : nvinfer1::MatrixOperation::kNONE; - }; - - // If the MatMul operand is a constant, applies transposes at conversion-time - // as necessary. If the operand is a tensor, does nothing. If required - // transposes were applied, sets transpose to false. - const auto prepare_matmul_operand = - [¶ms](TRT_TensorOrWeights operand, - bool* transpose) -> ITensorProxyPtr { + const auto convert_to_itensor = + [¶ms](TRT_TensorOrWeights operand) -> ITensorProxyPtr { if (operand.is_tensor()) { return operand.tensor(); } else { - TRT_ShapedWeights weights(operand.weights().TrtDType()); - if (*transpose) { - weights = params->weight_store->GetTempWeights(operand.weights()); - ReorderCKtoKC(operand.weights(), &weights); - // Weights have been transposed, can set transpose to false - *transpose = false; - } else { - weights = operand.weights(); - } - return params->converter->CreateConstantLayer(weights, weights.shape_); + return params->converter->CreateConstantLayer(operand.weights(), + operand.GetTrtDims()); } }; - ITensorProxyPtr tensor_a = prepare_matmul_operand(input_a, &transpose_a); - ITensorProxyPtr tensor_b = prepare_matmul_operand(input_b, &transpose_b); + ITensorProxyPtr tensor_a = convert_to_itensor(input_a); + ITensorProxyPtr tensor_b = convert_to_itensor(input_b); + + const auto get_matrix_op = [](ITensorProxyPtr in, + bool transpose) -> nvinfer1::MatrixOperation { + return (transpose) ? 
nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + }; + nvinfer1::MatrixOperation op_a, op_b; + // Note: In implicit batch mode kTRANSPOSE and kNONE are only valid if the + // matrix has at least 2 non-batch dimension. In implicit batch mode, if a has + // 1 dim (excluding batch dim), then we can only use kVECTOR, which will treat + // matrix A as a batch of vectors. + op_a = (tensor_a->getDimensions().nbDims < 2) + ? nvinfer1::MatrixOperation::kVECTOR + : get_matrix_op(tensor_a, transpose_a); + // In implicit batch mode, if B has only 1 dims (excluding batch dim) then we + // already reject the case and don't convert. One could consider using the + // kVECTOR flag to express C = MatMul(A, B.T) if A is weight, but the result + // will not have the correct shape: in TRT's implicit batch implementation, + // the result is a batch of vectors D_ji = A_ik * B_jk, where j is the batch + // dimension. In contrast, the TF MatMul op produces C = D.T, and we cannot + // transpose over the batch dimension (implicit batch mode). + op_b = get_matrix_op(tensor_b, transpose_b); nvinfer1::IMatrixMultiplyLayer* layer = params->converter->network()->addMatrixMultiply( - *tensor_a->trt_tensor(), get_matrix_op(tensor_a, transpose_a), *tensor_b->trt_tensor(), - get_matrix_op(tensor_b, transpose_b)); + *tensor_a->trt_tensor(), op_a, *tensor_b->trt_tensor(), op_b); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); - ITensorProxyPtr output_tensor = layer->getOutput(0); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + const auto& node_def = params->node_def; + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); + return ITensorProxyPtr(layer->getOutput(0)); +} + +Status ConvertMatMulHelper(const OpConverterParams* params, + TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, + bool transpose_b) { + ::stream_executor::port::StatusOr result = + ConvertMatMulImpl(params, input_a, input_b, transpose_a, transpose_b); + TF_RETURN_IF_ERROR(result.status()); + if (!params->validation_only) { + params->outputs->push_back(TRT_TensorOrWeights(result.ValueOrDie())); + } return Status::OK(); } // inputs are both two dimensional (ops::MatMul) -Status ConvertMatMul(OpConverterParams* params) { +Status ConvertMatMul(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - if (inputs.size() != 2) { - return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), - " inputs but expected 2, at ", - node_def.name()); - } + TFTRT_CHECK_INPUT_SIZE(inputs.size(), 2, node_def); + TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - bool transpose_a = attrs.get("transpose_a"); - bool transpose_b = attrs.get("transpose_b"); + bool transpose_a = false, transpose_b = false; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "transpose_a", &transpose_a)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "transpose_b", &transpose_b)); return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1), transpose_a, - transpose_b, node_def.name()); + transpose_b); } -Status ConvertBatchMatMul(OpConverterParams* params) { +Status ConvertBatchMatMul(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - if (inputs.size() != 2) { - return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), - " inputs but expected 2, at ", - 
node_def.name()); - } - // TODO(tmorris): Enable once false is updated to mean either tensor or weight - // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", - // false}})); + TFTRT_CHECK_INPUT_SIZE(inputs.size(), 2, node_def); + + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, {{"x", TrtInputArg::kBoth}, {"y", TrtInputArg::kBoth}})); + // TODO(tfeher): Consider adding INT8 type because FC layer can support it. TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { + // TODO(lsugy): don't assume that if all inputs are weights, grappler + // should fold them, because variables are weights. return errors::InvalidArgument( "All inputs are weights, but Grappler is expected to fold them."); } - if (inputs.at(0).is_tensor() && inputs.at(1).is_tensor() && - inputs.at(0).GetTrtDims().nbDims != inputs.at(1).GetTrtDims().nbDims) { - return errors::Unimplemented( - "Inputs must have the same rank if they are both tensors."); - } - TFAttrs attrs(node_def); - const bool transpose_a = attrs.get("adj_x"); - const bool transpose_b = attrs.get("adj_y"); + bool transpose_a = false, transpose_b = false; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "adj_x", &transpose_a)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "adj_y", &transpose_b)); - // There is no way to batch constants in TRT. Example: - // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 - // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 - // It is not possible to treat the weight input as a batched [3, 6] tensor. + // In case input_l is weight, check whether input_l has implicit batch mode + // compatible batch dim. const auto check_weight_is_not_batched = [](const TRT_TensorOrWeights& input_l, const TRT_TensorOrWeights& input_r) { - // If input_l is a weight, then input_r must be a tensor because - // otherwise the op would be handled by Grappler. + // There is no way to batch constants in TRT using implicit batch mode. + // Example: + // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 + // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 + // It is not possible to treat the weight input as a batched [3, 6] + // tensor. Batched weight tensors must have batch dim = 1 (after the + // broadcast). if (input_l.is_weights() && input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && input_l.GetTrtDims().d[0] != 1) { return errors::Unimplemented( - "TensorRT does not support batched constants."); + "TensorRT does not support batched constants in implicit batch " + "mode."); } return Status::OK(); }; - TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); - TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); + if (params->use_implicit_batch) { + TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); + TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); + } // Broadcast inputs. We don't check feasibility since the dimensions in a // MatMul don't need to match. For example, consider a valid set of inputs @@ -4843,71 +5070,38 @@ Status ConvertBatchMatMul(OpConverterParams* params) { // input 0: [N, T, C] // input 1: [1, C, K] // Since C != K and T != C, check feasiblity would fail. 
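To make the no-feasibility-check choice concrete, here is a minimal standalone sketch of the rank-alignment step performed by the broadcast call that follows (the helper name and main() below are illustrative, not the patch's BroadcastTensors API): the lower-rank operand is left-padded with 1's so both MatMul operands have equal rank, and per-dimension compatibility is deliberately not verified because MatMul contracts dimensions that need not match.

// Illustrative sketch only; mirrors /*check_feasibility=*/false above.
#include <algorithm>
#include <cstdio>
#include <vector>

static void AlignRanks(std::vector<int>& a, std::vector<int>& b) {
  // Left-pad the shorter shape with 1's; no per-dimension feasibility check.
  const size_t rank = std::max(a.size(), b.size());
  a.insert(a.begin(), rank - a.size(), 1);
  b.insert(b.begin(), rank - b.size(), 1);
}

int main() {
  std::vector<int> lhs = {8, 50, 64};  // [N, T, C]
  std::vector<int> rhs = {64, 32};     // [C, K]
  AlignRanks(lhs, rhs);                // lhs: [8, 50, 64], rhs: [1, 64, 32]
  for (int d : rhs) std::printf("%d ", d);  // prints: 1 64 32
  std::printf("\n");
  return 0;
}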
- nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; - TF_RETURN_IF_ERROR(GetTrtBroadcastShape( - inputs.at(0), inputs.at(1), /*check_feasibility=*/false, - &broadcasted_dims_l, &broadcasted_dims_r)); - ITensorProxyPtr tensor_l = nullptr; - ITensorProxyPtr tensor_r = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); - if (params->validation_only) return Status::OK(); - - return ConvertMatMulHelper(params, TRT_TensorOrWeights(tensor_l), - TRT_TensorOrWeights(tensor_r), transpose_a, - transpose_b, node_def.name()); -} - -Status ConvertSoftmax(OpConverterParams* params) { - const auto& inputs = params->inputs; - const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"logits", false}})); - TF_RETURN_IF_ERROR( - AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - ITensorProxyPtr tensor = inputs.at(0).tensor(); + auto input_l = std::make_unique(inputs.at(0)); + auto input_r = std::make_unique(inputs.at(1)); + TF_RETURN_IF_ERROR(BroadcastTensors(input_l, input_r, + /*check_feasibility=*/false, params)); - const int num_trt_dims = tensor->getDimensions().nbDims; - if (num_trt_dims == 0) { - return errors::InvalidArgument( - "TensorRT Softmax cannot apply on batch dimension, at", - node_def.name()); - } if (params->validation_only) return Status::OK(); - nvinfer1::ISoftMaxLayer* layer = - params->converter->network()->addSoftMax(*tensor->trt_tensor()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Tensorflow SoftMax assumes applying softmax on the last dimension. - layer->setAxes(1 << (num_trt_dims - 1)); - - ITensorProxyPtr output_tensor = layer->getOutput(0); - // Quantization range for SoftMax is always (0, 1) - params->converter->ProvideQuantizationRange(&output_tensor, 0.0f, 1.0f); - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); + return ConvertMatMulHelper(params, *input_l, *input_r, transpose_a, + transpose_b); } -Status ConvertArgMinMax(OpConverterParams* params) { +Status ConvertArgMinMax(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"input", false}, {"dimension", true}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - // INT64 outputs are not supported by TRT. 
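For reference, the axis remapping that ConvertAxis performs for ConvertArgMinMax just below can be sketched standalone as follows (assumed semantics with simplified error handling; the function name and return type here are illustrative): negative TF axes are normalized, the batch axis is rejected in implicit batch mode, and the implicit batch dimension is dropped from the index.

// Illustrative sketch of the TF-axis -> TRT-axis mapping (assumed behavior).
#include <cstdio>
#include <optional>

static std::optional<int> RemapAxis(int tf_axis, int trt_nb_dims,
                                    bool use_implicit_batch) {
  const int tf_nb_dims = trt_nb_dims + (use_implicit_batch ? 1 : 0);
  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) return std::nullopt;
  if (tf_axis < 0) tf_axis += tf_nb_dims;                       // e.g. -1 -> last dim
  if (use_implicit_batch && tf_axis == 0) return std::nullopt;  // batch dim not allowed
  return use_implicit_batch ? tf_axis - 1 : tf_axis;
}

int main() {
  // TF tensor [N, H, W, C] in implicit batch mode: TRT sees rank 3.
  std::printf("%d\n", *RemapAxis(-1, /*trt_nb_dims=*/3, true));   // prints 2
  std::printf("%d\n", *RemapAxis(3, /*trt_nb_dims=*/4, false));   // prints 3
  return 0;
}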
- TFAttrs attrs(node_def); - DataType output_dtype = attrs.get("output_type"); + + DataType output_dtype{DataType::DT_INT32}; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "output_type", &output_dtype)); + if (output_dtype != DataType::DT_INT32) { return errors::Unimplemented("Output type ", DataTypeString(output_dtype), - " is not supported, at ", node_def.name()); + " is not supported"); } int tf_axis = inputs.at(1).weights().GetSpan()[0]; int trt_axis; nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(), - /*use_implicit_batch=*/true, &trt_axis)); + params->use_implicit_batch, &trt_axis)); nvinfer1::TopKOperation topk_op; if (node_def.op() == "ArgMin") { topk_op = nvinfer1::TopKOperation::kMIN; @@ -4916,6 +5110,18 @@ Status ConvertArgMinMax(OpConverterParams* params) { } else { return errors::InvalidArgument("Unsupported ArgMin/Max operation"); } + +#if !IS_TRT_VERSION_GE(7, 0, 0, 11) + const nvinfer1::Dims trt_dims = params->inputs.at(0).GetTrtDims(); + if (trt_dims.nbDims >= 4) { + string trt_dim_str = DebugString(trt_dims); + + return errors::Unimplemented(node_def.op(), "op is not able to support", + " tensors with 4+ dimensions (excluding batch", + " size). Received: ", trt_dim_str); + } +#endif + if (params->validation_only) return Status::OK(); // Use TopK with k = 1. Only indices output is needed (output 1). @@ -4923,51 +5129,63 @@ Status ConvertArgMinMax(OpConverterParams* params) { nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK( *inputs.at(0).tensor()->trt_tensor(), topk_op, 1, reduce_axes); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, "topk"); ITensorProxyPtr output_indices_tensor = layer->getOutput(1); // Squeeze on axis. - std::vector size(dims.d, dims.d + dims.nbDims); - size.erase(size.begin() + trt_axis); - nvinfer1::Dims new_dims; - TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(size, &new_dims)); + std::vector input_dims(dims.d, dims.d + dims.nbDims); + input_dims[trt_axis] = 0; ITensorProxyPtr output_tensor = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(output_indices_tensor), new_dims, - /*validation_only=*/false, &output_tensor)); - + TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( + /*input=*/output_indices_tensor, + /*input_dims=*/&input_dims, + /*params=*/params, + /*output=*/&output_tensor)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); } -Status ConvertTopK(OpConverterParams* params) { +Status ConvertTopK(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( CheckInputsWeights(*params, {{"input", false}, {"k", true}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + bool sorted{false}; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "sorted", &sorted)); + + if (!sorted) { + // TensorRT only supports sorted output. Although TensorFlow API + // doesn't specify the order of output elements in case sorted=false, + // but it's safer to not convert because the output of TensorRT might + // be different with TensorFlow which can cause confusion. 
+ return errors::InvalidArgument("Only sorted=True is supported"); + } + ITensorProxyPtr tensor = inputs.at(0).tensor(); const int num_dims = tensor->getDimensions().nbDims; if (num_dims == 0) { return errors::InvalidArgument( - "TensorRT TopK cannot apply on batch dimension, at", node_def.name()); + "TensorRT TopK cannot apply on batch dimension"); } TRT_ShapedWeights k_w = inputs.at(1).weights(); if (k_w.count() != 1) { - return errors::InvalidArgument("k value of TopK should be a scalar, at", - node_def.name()); + return errors::InvalidArgument("k value of TopK should be a scalar"); } // Note that ITopKLayer always have sorted outputs, so we don't need to handle // the 'sorted' attribute of the node. if (params->validation_only) return Status::OK(); const nvinfer1::TopKOperation op = nvinfer1::TopKOperation::kMAX; - const int k = *(static_cast(k_w.GetValues())); + const int k = *(k_w.GetPointer()); const uint32_t reduce_axes = 1 << (num_dims - 1); - nvinfer1::ITopKLayer* layer = - params->converter->network()->addTopK(*tensor->trt_tensor(), op, k, reduce_axes); + nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK( + *tensor->trt_tensor(), op, k, reduce_axes); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); ITensorProxyPtr output_value_tensor = layer->getOutput(0); ITensorProxyPtr output_indices_tensor = layer->getOutput(1); @@ -4976,40 +5194,176 @@ Status ConvertTopK(OpConverterParams* params) { return Status::OK(); } -Status ConvertDepthSpaceShuffle(OpConverterParams* params) { +::stream_executor::port::StatusOr> +CalcDepthSpaceDynamicShape(const OpConverterParams* params, int block_size, + string data_format) { + // Instead we use a shape layer and shape arithmetic to calculate the reshape + // dimensions. + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + + const int channels_axis = data_format == "NCHW" ? 1 : 3; + const int h_axis = data_format == "NCHW" ? 2 : 1; + const int w_axis = data_format == "NCHW" ? 3 : 2; + + // Get shapes. + ITensorProxyPtr shape = params->converter->network() + ->addShape(*inputs.at(0).tensor()->trt_tensor()) + ->getOutput(0); + ITensorProxyPtr batch_size = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr num_channels = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {channels_axis}}, {1, {1}}, + {1, {1}}) + ->getOutput(0); + ITensorProxyPtr h = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {h_axis}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr w = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {w_axis}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr r; + TF_RETURN_IF_ERROR(CreateScalarConstant(params, block_size, &r)); + ITensorProxyPtr r_squared; + TF_RETURN_IF_ERROR( + CreateScalarConstant(params, block_size * block_size, &r_squared)); + // Get shuffle parameters. + std::vector first_shuffle_tensors(6, nullptr); + std::vector second_shuffle_tensors(4, nullptr); + if (node_def.op() == "DepthToSpace") { + // First Reshape [N, C, H, W] - > [N, r, r, C/(r*r), H, W]. 
+ first_shuffle_tensors[0] = batch_size; + first_shuffle_tensors[1] = r; + first_shuffle_tensors[2] = r; + first_shuffle_tensors[3] = + params->converter->network() + ->addElementWise(*num_channels->trt_tensor(), + *r_squared->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + first_shuffle_tensors[4] = h; + first_shuffle_tensors[5] = w; + // Second Reshape [N, C/(r*r), H, r, W, r] -> [N, C/(r*r), H * r, W * r]. + second_shuffle_tensors[0] = batch_size; + second_shuffle_tensors[1] = + params->converter->network() + ->addElementWise(*num_channels->trt_tensor(), + *r_squared->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + second_shuffle_tensors[2] = + params->converter->network() + ->addElementWise(*h->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + second_shuffle_tensors[3] = + params->converter->network() + ->addElementWise(*w->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + } else if (node_def.op() == "SpaceToDepth") { + // First Reshape [N, C, H, W] -> [N, C, H/r, r, W/r, r]. + first_shuffle_tensors[0] = batch_size; + first_shuffle_tensors[1] = num_channels; + first_shuffle_tensors[2] = + params->converter->network() + ->addElementWise(*h->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + first_shuffle_tensors[3] = r; + first_shuffle_tensors[4] = + params->converter->network() + ->addElementWise(*w->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + first_shuffle_tensors[5] = r; + + // Second Reshape [N, r, r, C, H/r, W/r] -> [N, C*r*r, H/r, W/r]. + second_shuffle_tensors[0] = batch_size; + second_shuffle_tensors[1] = + params->converter->network() + ->addElementWise(*num_channels->trt_tensor(), + *r_squared->trt_tensor(), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + second_shuffle_tensors[2] = + params->converter->network() + ->addElementWise(*h->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + second_shuffle_tensors[3] = + params->converter->network() + ->addElementWise(*w->trt_tensor(), *r->trt_tensor(), + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + } + + ::stream_executor::port::StatusOr result = + ConcatenateTensors(params, first_shuffle_tensors, 0); + TF_RETURN_IF_ERROR(result.status()); + ITensorProxyPtr first_shuffle_shape = result.ValueOrDie(); + + result = ConcatenateTensors(params, second_shuffle_tensors, 1); + TF_RETURN_IF_ERROR(result.status()); + ITensorProxyPtr second_shuffle_shape = result.ValueOrDie(); + + return std::make_pair(first_shuffle_shape, second_shuffle_shape); +} + +Status ConvertDepthSpaceShuffle(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}})); TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); - TFAttrs attrs(node_def); - const int block_size = attrs.get("block_size"); + + string data_format; + int block_size; + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "block_size", &block_size)); + if (block_size < 2) { - return errors::InvalidArgument("Block size must be 2 or greater, at ", - node_def.name()); + return errors::InvalidArgument("Block size must be 2 or greater"); } - const string 
data_format = attrs.get("data_format"); + if (data_format != "NCHW" && data_format != "NHWC") { return errors::Unimplemented("Data format ", data_format, - " is not supported, at ", node_def.name()); + " is not supported"); } + int idx_offset = params->use_implicit_batch ? 0 : 1; nvinfer1::Dims dims = inputs.at(0).GetTrtDims(); - if (dims.nbDims != 3) { + const int required_rank = 3 + idx_offset; + if (dims.nbDims != required_rank) { return errors::InvalidArgument("The input to ", node_def.op(), - " must be rank 4, at ", node_def.name()); - } - const int num_channels = data_format == "NCHW" ? dims.d[0] : dims.d[2]; - const int h = data_format == "NCHW" ? dims.d[1] : dims.d[0]; - const int w = data_format == "NCHW" ? dims.d[2] : dims.d[1]; + " must be rank 4"); + } + const int num_channels = + data_format == "NCHW" ? dims.d[0 + idx_offset] : dims.d[2 + idx_offset]; + const int h = + data_format == "NCHW" ? dims.d[1 + idx_offset] : dims.d[0 + idx_offset]; + const int w = + data_format == "NCHW" ? dims.d[2 + idx_offset] : dims.d[1 + idx_offset]; // Get shuffle parameters. nvinfer1::Dims first_shuffle_shape; nvinfer1::Permutation transpose_perm; nvinfer1::Dims second_shuffle_shape; + + // We define all the shuffle and transpose dimensions assuming implicit batch + // mode. Afterwards we will update them to explicit batch mode if needed. + // Additionally, an NCHW layout is assumed, and this assumption is corrected + // afterwards with an initial transpose op. TODO(tfeher): Get rid of the + // layout_transpose ops by defining shuffle shape specifically for NCHW and + // NHCW. if (node_def.op() == "DepthToSpace") { - if (num_channels % (block_size * block_size) != 0) { + if (num_channels != -1 && num_channels % (block_size * block_size) != 0) { return errors::InvalidArgument( - "Number of channels must be divisible by block_size*block_size, at ", - node_def.name()); + "Number of channels must be divisible by block_size*block_size"); } // First Reshape [C, H, W] - > [r, r, C/(r*r), H, W] first_shuffle_shape = { @@ -5021,12 +5375,13 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { // Second Reshape [C/(r*r), H, r, W, r] -> [C/(r*r), H * r, W * r] second_shuffle_shape = nvinfer1::Dims3(num_channels / (block_size * block_size), - h * block_size, w * block_size); - } else if (node_def.op() == "SpaceToDepth") { - if (h % block_size != 0 || w % block_size != 0) { + h * block_size, w * block_size); + } else { + if (node_def.op() != "SpaceToDepth") + return errors::InvalidArgument("Incorrect op type ", node_def.op()); + if ((h != -1 && h % block_size != 0) || (w != -1 && w % block_size != 0)) { return errors::InvalidArgument( - "Width and height must be divisible by block_size, at ", - node_def.name()); + "Width and height must be divisible by block_size"); } // First Reshape [C, H, W] -> [C, H/r, r, W/r, r] first_shuffle_shape = {/*nbDims=*/5, @@ -5041,34 +5396,90 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { if (params->validation_only) return Status::OK(); nvinfer1::IShuffleLayer* first_shuffle = - params->converter->network()->addShuffle(*inputs.at(0).tensor()->trt_tensor()); + params->converter->network()->addShuffle( + *inputs.at(0).tensor()->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(first_shuffle, node_def.name()); + params->converter->SetLayerName(first_shuffle, node_def, "shuffle", + /*op_instance=*/0); + + ITensorProxyPtr second_shuffle_shape_tensor; + + if (HasStaticShape(inputs.at(0).GetTrtDims())) { + // Adjust a reshape constructed at implicit batch mode 
for explicit batch + // mode. In particular, we need to insert the batch dimension size to the + // beginning of all the dimension sizes. Example: reshape {20,10,30} for + // implicit batch mode becomes reshape {N,20,10,30} for explicit batch mode. + auto adjust_reshape = [](int N, nvinfer1::Dims dims, + bool use_implicit_batch) { + if (use_implicit_batch) return dims; + for (int i = dims.nbDims; i > 0; i--) { + dims.d[i] = dims.d[i - 1]; + } + dims.d[0] = N; + dims.nbDims++; + return dims; + }; + + first_shuffle_shape = adjust_reshape(dims.d[0], first_shuffle_shape, + params->use_implicit_batch); + second_shuffle_shape = adjust_reshape(dims.d[0], second_shuffle_shape, + params->use_implicit_batch); + + first_shuffle->setReshapeDimensions(first_shuffle_shape); + } else { + ::stream_executor::port::StatusOr< + std::pair> + result = CalcDepthSpaceDynamicShape(params, block_size, data_format); + TF_RETURN_IF_ERROR(result.status()); + first_shuffle->setInput(1, *result.ValueOrDie().first->trt_tensor()); + second_shuffle_shape_tensor = result.ValueOrDie().second; + } + + // Adjust a transpose constructed assuming implicit batch mode for explicit + // batch mode. In particular, we need to add the batch dimension to d0 and + // add 1 to all the dimension id in the transpose. Example: permutation + // for implicit batch mode becomes permutation {0,3,2,1} for explicit batch + // mode. + auto adjust_perm = [](int n, nvinfer1::Permutation perm, + bool use_implicit_batch) { + if (use_implicit_batch) return perm; + for (int i = n; i > 0; i--) { + perm.order[i] = perm.order[i - 1] + 1; + } + perm.order[0] = 0; + return perm; + }; + transpose_perm = adjust_perm(5, transpose_perm, params->use_implicit_batch); + if (data_format == "NHWC") { - first_shuffle->setFirstTranspose({2, 0, 1}); + nvinfer1::Permutation layout_transpose = + adjust_perm(3, {2, 0, 1}, params->use_implicit_batch); + first_shuffle->setFirstTranspose(layout_transpose); } - first_shuffle->setReshapeDimensions(first_shuffle_shape); first_shuffle->setSecondTranspose(transpose_perm); nvinfer1::IShuffleLayer* second_shuffle = params->converter->network()->addShuffle(*first_shuffle->getOutput(0)); TFTRT_RETURN_ERROR_IF_NULLPTR(second_shuffle, node_def.name()); - second_shuffle->setReshapeDimensions(second_shuffle_shape); + params->converter->SetLayerName(second_shuffle, node_def, "shuffle", + /*op_instance=*/1); + + if (HasStaticShape(inputs.at(0).GetTrtDims())) { + second_shuffle->setReshapeDimensions(second_shuffle_shape); + } else { + second_shuffle->setInput(1, *second_shuffle_shape_tensor->trt_tensor()); + } if (data_format == "NHWC") { - second_shuffle->setSecondTranspose({1, 2, 0}); + nvinfer1::Permutation layout_transpose = + adjust_perm(3, {1, 2, 0}, params->use_implicit_batch); + second_shuffle->setSecondTranspose(layout_transpose); } - ITensorProxyPtr input_tensor = inputs.at(0).tensor(); - ITensorProxyPtr first_shuffle_tensor = first_shuffle->getOutput(0); - ITensorProxyPtr second_shuffle_tensor = second_shuffle->getOutput(0); - params->converter->MarkQuantizationRangesAsInferrable(&input_tensor, - &first_shuffle_tensor); - params->converter->MarkQuantizationRangesAsInferrable(&first_shuffle_tensor, - &second_shuffle_tensor); - params->outputs->push_back(TRT_TensorOrWeights(second_shuffle_tensor)); + params->outputs->push_back(TRT_TensorOrWeights(second_shuffle->getOutput(0))); return Status::OK(); } -Status ConvertSquaredDifference(OpConverterParams* params) { +Status ConvertSquaredDifference(const OpConverterParams* params) { 
TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", false}})); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); @@ -5078,231 +5489,314 @@ Status ConvertSquaredDifference(OpConverterParams* params) { nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; TF_RETURN_IF_ERROR(GetTrtBroadcastShape( inputs.at(0), inputs.at(1), /*check_feasibility=*/true, - &broadcasted_dims_l, &broadcasted_dims_r)); + params->use_implicit_batch, &broadcasted_dims_l, &broadcasted_dims_r)); ITensorProxyPtr tensor_l = nullptr; ITensorProxyPtr tensor_r = nullptr; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); + TF_RETURN_IF_ERROR( + PrepareTensorForShape(params->converter, inputs.at(0), broadcasted_dims_l, + params->validation_only, &tensor_l, node_def)); + TF_RETURN_IF_ERROR( + PrepareTensorForShape(params->converter, inputs.at(1), broadcasted_dims_r, + params->validation_only, &tensor_r, node_def)); if (params->validation_only) return Status::OK(); // Subtract x - y. nvinfer1::IElementWiseLayer* sub = params->converter->network()->addElementWise( - *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), nvinfer1::ElementWiseOperation::kSUB); + *tensor_l->trt_tensor(), *tensor_r->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUB); TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name()); + params->converter->SetLayerName(sub, node_def, "sub"); + // Multiply (x - y) * (x - y). nvinfer1::IElementWiseLayer* mul = params->converter->network()->addElementWise( *sub->getOutput(0), *sub->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); TFTRT_RETURN_ERROR_IF_NULLPTR(mul, node_def.name()); + params->converter->SetLayerName(mul, node_def, "mul"); params->outputs->push_back(TRT_TensorOrWeights(mul->getOutput(0))); return Status::OK(); } -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -Status ConvertCombinedNMS(OpConverterParams* params) { - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"boxes", false}, - {"scores", false}, - {"max_output_size_per_class", true}, - {"max_total_size", true}, - {"iou_threshold", true}, - {"score_threshold", true}})); +#if IS_TRT_VERSION_GE(7, 1, 3, 0) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) +Status ConvertCombinedNMS(const OpConverterParams* params) { + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, {{"boxes", TrtInputArg::kTensor}, + {"scores", TrtInputArg::kTensor}, + {"max_output_size_per_class", TrtInputArg::kWeight}, + {"max_total_size", TrtInputArg::kWeight}, + {"iou_threshold", TrtInputArg::kWeight}, + {"score_threshold", TrtInputArg::kWeight}})); const auto& inputs = params->inputs; const auto& node_def = params->node_def; + const auto& node_name = node_def.name(); - ITensorProxyPtr boxes_tensor = inputs.at(0).tensor(); - ITensorProxyPtr scores_tensor = inputs.at(1).tensor(); - TRT_ShapedWeights output_size_per_class = inputs.at(2).weights(); - TRT_ShapedWeights total_size = inputs.at(3).weights(); - TRT_ShapedWeights iou_threshold = inputs.at(4).weights(); - TRT_ShapedWeights score_threshold = inputs.at(5).weights(); - - // Validate tensors and weights (also set some of the needed plugin fields) + const ITensorProxyPtr boxes_tensor = inputs.at(0).tensor(); + const ITensorProxyPtr scores_tensor = inputs.at(1).tensor(); const auto boxes_dims = boxes_tensor->getDimensions(); const auto scores_dims = 
scores_tensor->getDimensions(); - if (boxes_dims.nbDims != 3) { + +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + const auto flag = true; + const auto* plugin_name = "NMS TRT Plugin "; + const auto* pluginName = "EfficientNMS_TFTRT_TRT"; +#else // IS_TRT_VERSION_GE(7, 1, 3, 0) + const auto flag = false; + const auto* plugin_name = "TensorRT BatchedNMS Plugin "; + const auto* pluginName = "BatchedNMS_TRT"; + + auto AllowNmsTopkOverride = []() { + static bool result = [] { + bool value; + const Status status = ReadBoolFromEnvVar("TF_TRT_ALLOW_NMS_TOPK_OVERRIDE", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; + }(); + return result; + }; +#endif + + if (params->use_implicit_batch == flag) { + if (flag) { + return errors::Unimplemented( + convert_not_supported_implicit(node_def.op(), node_name)); + } else { + if (!HasStaticShape(boxes_dims) || !HasStaticShape(scores_dims)) { + return errors::Unimplemented(plugin_name, + "requires input with static shape"); + } + } + } + + const auto& output_size_per_class = inputs.at(2).weights(); + const auto& total_size = inputs.at(3).weights(); + const auto& iou_threshold = inputs.at(4).weights(); + const auto& score_threshold = inputs.at(5).weights(); + + const int offset = params->use_implicit_batch ? 0 : 1; + if (boxes_dims.nbDims != 3 + offset) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin input boxes must be 3-D excluding batch ", - node_def.name()); + plugin_name, "input boxes must be 4-D including batch, at ", node_name); } - const int num_classes = scores_dims.d[1]; - bool box_check = boxes_dims.d[1] == 1 || boxes_dims.d[1] == num_classes; + + AttrSlice attrs(node_def); + bool clip_boxes = false, pad_per_class = false; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "clip_boxes", &clip_boxes)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "pad_per_class", &pad_per_class)); + + const int class_idx = 1 + offset; + const int num_classes = scores_dims.d[class_idx]; + const bool box_check = + boxes_dims.d[class_idx] == 1 || boxes_dims.d[class_idx] == num_classes; if (!box_check) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin third dimension of boxes must be either 1 " - "or num_classes ", - node_def.name()); + plugin_name, + "third dimension of boxes must be either 1" + "or match the num_classes dimension of scores, at ", + node_name); } - if (output_size_per_class.shape_.nbDims != 1) { + + if (output_size_per_class.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin max_output_size_per_class must be 0-D ", - node_def.name()); + plugin_name, "max_output_size_per_class must be scalar, at ", + node_name); } - int max_size_per_class = - *(static_cast(output_size_per_class.GetValues())); + + const int max_size_per_class = *(output_size_per_class.GetPointer()); if (max_size_per_class <= 0) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin max_output_size_per_class should be > 0", - node_def.name()); + plugin_name, "max_output_size_per_class should be > 0, at ", node_name); } - if (total_size.shape_.nbDims != 1) { + + if (total_size.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin max_total_size must be 0-D ", - node_def.name()); + plugin_name, "max_total_size must be scalar, at ", node_name); } - int max_total_size = *(static_cast(total_size.GetValues())); + + int max_total_size = *(total_size.GetPointer()); if (max_total_size <= 0) { return errors::InvalidArgument( - "TensorRT 
BatchedNMS Plugin max_total_size should be > 0", - node_def.name()); + plugin_name, "max_total_size should be > 0, at ", node_name); } - if (iou_threshold.shape_.nbDims != 1) { + + if (iou_threshold.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin iou_threshold must be 0-D ", - node_def.name()); + plugin_name, "iou_threshold must be scalar, at ", node_name); } - float iou_thresh = *(static_cast(iou_threshold.GetValues())); + + const auto iou_thresh = *(iou_threshold.GetPointer()); if (iou_thresh < 0.0 || iou_thresh > 1.0) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin iou_threshold must be in [0, 1]", - node_def.name()); + plugin_name, "iou_threshold must be in [0, 1], at", node_name); } - if (score_threshold.shape_.nbDims != 1) { + + if (score_threshold.count() != 1) { return errors::InvalidArgument( - "TensorRT BatchedNMS Plugin score_threshold must be 0-D ", - node_def.name()); + plugin_name, "score_threshold must be scalar, at ", node_name); + } + +#if !IS_TRT_VERSION_GE(8, 2, 1, 6) && !defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + // TRT op is_normalized=False treats input coordinates as pixels and + // calculates width/height as (max - min + 1). + // + // TF op CombinedNonMaxSuppression doesn't care about the normalization and + // calculates width/height as (max-min). + // + // We set is_normalized = true to be consistent with TF IOU calculaton. + const bool is_normalized = true; + const int backgrnd_id = -1; + const bool share_location = (boxes_dims.d[class_idx] == 1); + int keep_top_k = + pad_per_class ? std::min(max_size_per_class * num_classes, max_total_size) + : max_total_size; + + // According to the batchedNMS plugin description we need to set top_k so that + // keep_top_k <= top_k + // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin + // Before the NMS step, TRT selects top_k candidate from each class and + // discards the rest. The NMS step is performed only among the top_k + // candidates. To be strictly compatible with the TF op, we need that top_k is + // greater equal to num_boxes. + const int num_boxes = boxes_dims.d[offset]; + int top_k = std::max(num_boxes, keep_top_k); + // TRT has a limitation: top_k <=4096. + if (top_k > 4096) { + if (AllowNmsTopkOverride()) { + top_k = 4096; + keep_top_k = std::min(top_k, keep_top_k); + } else { + return errors::InvalidArgument( + "TRT NMS plugin allow top_k<=4096, where top_k = max(num_boxes, " + "max_total_size). You can override this by setting " + "TF_TRT_ALLOW_NMS_TOPK_OVERRIDE=1 environment variable, but this can " + "result in a loss of accuracy."); + } } +#endif if (params->validation_only) return Status::OK(); - // TF op CombinedNonMaxSuppression doesn't have the option of - // not normalizing coordinates. 
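The keepTopK/topK arithmetic for the BatchedNMS plugin path above can be illustrated with a small standalone sketch (names are illustrative; the TF_TRT_ALLOW_NMS_TOPK_OVERRIDE environment variable is modeled here as a plain boolean): keepTopK follows pad_per_class, topK must cover all candidate boxes, and the plugin's topK <= 4096 limit is enforced, either by rejecting the conversion or by clamping when the override is allowed.

// Illustrative sketch of the keepTopK/topK selection above.
#include <algorithm>
#include <cstdio>

struct TopKConfig { int top_k; int keep_top_k; bool ok; };

static TopKConfig SelectTopK(int num_boxes, int num_classes,
                             int max_size_per_class, int max_total_size,
                             bool pad_per_class, bool allow_override) {
  int keep_top_k =
      pad_per_class ? std::min(max_size_per_class * num_classes, max_total_size)
                    : max_total_size;
  int top_k = std::max(num_boxes, keep_top_k);
  if (top_k > 4096) {
    if (!allow_override) return {top_k, keep_top_k, false};  // reject conversion
    top_k = 4096;                                            // clamp, may lose accuracy
    keep_top_k = std::min(top_k, keep_top_k);
  }
  return {top_k, keep_top_k, true};
}

int main() {
  TopKConfig c = SelectTopK(/*num_boxes=*/8192, /*num_classes=*/90,
                            /*max_size_per_class=*/100, /*max_total_size=*/200,
                            /*pad_per_class=*/false, /*allow_override=*/true);
  std::printf("top_k=%d keep_top_k=%d ok=%d\n", c.top_k, c.keep_top_k, c.ok);
  return 0;
}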
- const bool is_normalized = true; - // Set plugin fields and the field collection - TFAttrs attrs(node_def); - bool share_location = (boxes_dims.d[1] == 1); - const bool pad_per_class = attrs.get("pad_per_class"); - const int top_k = boxes_dims.d[0]; - int keep_top_k = 0; - if (pad_per_class) { - keep_top_k = std::min(max_size_per_class * num_classes, max_total_size); - } else { - keep_top_k = max_total_size; - } - float score_thresh = *(static_cast(score_threshold.GetValues())); - const int background_id = -1; - nvinfer1::PluginField fields[8] = { - nvinfer1::PluginField{"shareLocation", &share_location, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"backgroundLabelId", &background_id, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"numClasses", &num_classes, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"topK", &top_k, nvinfer1::PluginFieldType::kINT32, - 1}, - nvinfer1::PluginField{"keepTopK", &keep_top_k, - nvinfer1::PluginFieldType::kINT32, 1}, - nvinfer1::PluginField{"scoreThreshold", &score_thresh, - nvinfer1::PluginFieldType::kFLOAT32, 1}, - nvinfer1::PluginField{"iouThreshold", &iou_thresh, - nvinfer1::PluginFieldType::kFLOAT32, 1}, - nvinfer1::PluginField{"isNormalized", &is_normalized, - nvinfer1::PluginFieldType::kINT32, 1}, + // Create plugin. + float score_thresh = *(score_threshold.GetPointer()); + nvinfer1::PluginField fields[] = { +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + {"max_output_size_per_class", &max_size_per_class, + nvinfer1::PluginFieldType::kINT32, 1}, + {"max_total_size", &max_total_size, nvinfer1::PluginFieldType::kINT32, 1}, + {"iou_threshold", &iou_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"score_threshold", &score_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"pad_per_class", &pad_per_class, nvinfer1::PluginFieldType::kINT32, 1}, + {"clip_boxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, +#else // IS_TRT_VERSION_GE(7, 1, 3, 0) + {"shareLocation", &share_location, nvinfer1::PluginFieldType::kINT32, 1}, + {"backgroundLabelId", &backgrnd_id, nvinfer1::PluginFieldType::kINT32, 1}, + {"numClasses", &num_classes, nvinfer1::PluginFieldType::kINT32, 1}, + {"topK", &top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"scoreThreshold", &score_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"iouThreshold", &iou_thresh, nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"isNormalized", &is_normalized, nvinfer1::PluginFieldType::kINT32, 1}, + {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, +#endif }; - nvinfer1::PluginFieldCollection fc{8, fields}; - // Get plugin creator - auto creator = - getPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1", ""); - TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_def.name()); + nvinfer1::PluginFieldCollection fc{sizeof(fields) / sizeof(fields[0]), + fields}; + + // Get plugin creator. + auto creator = getPluginRegistry()->getPluginCreator(pluginName, "1", ""); + TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_name); - // Create plugin TrtUniquePtrType plugin( - creator->createPlugin(node_def.name().c_str(), &fc)); - TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_def.name()); + creator->createPlugin(node_name.c_str(), &fc)); + TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_name); - // Set plugin inputs + // Set plugin inputs. 
std::vector trt_plugin_inputs; trt_plugin_inputs.push_back(boxes_tensor->trt_tensor()); trt_plugin_inputs.push_back(scores_tensor->trt_tensor()); - // Add plugin to network + // Add plugin to network. nvinfer1::IPluginV2Layer* layer = params->converter->network()->addPluginV2( &trt_plugin_inputs[0], static_cast(trt_plugin_inputs.size()), *plugin); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); + params->converter->SetLayerName(layer, node_def, "plugin"); - // Set plugin outputs - nvinfer1::ITensor* output_nmsed_boxes = layer->getOutput(1); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - // TRT6 fixes (removes) the extra last dimension in CombinedNMS outputs + // Set plugin outputs. + const ITensorProxyPtr output_detection_boxes = layer->getOutput(1); + const ITensorProxyPtr output_detection_scores = layer->getOutput(2); ITensorProxyPtr output_num_detections = layer->getOutput(0); - ITensorProxyPtr output_nmsed_scores = layer->getOutput(2); - ITensorProxyPtr output_nmsed_classes = layer->getOutput(3); -#else - ITensorProxyPtr output_num_detections = nullptr; - ITensorProxyPtr output_nmsed_scores = nullptr; - ITensorProxyPtr output_nmsed_classes = nullptr; - - auto shrink_last_dim = [params](nvinfer1::ITensor* in_tensor, - nvinfer1::ITensor** out_tensor) { - nvinfer1::Dims dims = in_tensor->getDimensions(); - if (dims.d[dims.nbDims - 1] != 1) { - return errors::Internal("Expect last dims to be 1, for tensor ", - DebugString(*in_tensor)); - } - --dims.nbDims; - TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - TRT_TensorOrWeights(in_tensor), dims, - /*validation_only=*/false, out_tensor)); - return Status::OK(); - }; - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(2), &output_nmsed_scores)); - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(3), &output_nmsed_classes)); - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(0), &output_num_detections)); -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) + ITensorProxyPtr output_detection_classes = layer->getOutput(3); + +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + // Cast the classes output from int32 to float32. + nvinfer1::IIdentityLayer* layer_detection_classes = + params->converter->network()->addIdentity( + *output_detection_classes->trt_tensor()); + layer_detection_classes->setOutputType(0, nvinfer1::DataType::kFLOAT); + output_detection_classes = layer_detection_classes->getOutput(0); + + // The plugin produces a [N, 1] tensor for the num output, squeeze it to [N] + std::vector input_dims{output_num_detections->getDimensions().d[0], 0}; + TF_RETURN_IF_ERROR(params->converter->SqueezeTensor( + /*input=*/output_num_detections, + /*input_dims=*/&input_dims, + /*params=*/params, + /*output=*/&output_num_detections)); +#endif - params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_boxes)); - params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_scores)); - params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_classes)); + // Final outputs. 
+ params->outputs->push_back(TRT_TensorOrWeights(output_detection_boxes)); + params->outputs->push_back(TRT_TensorOrWeights(output_detection_scores)); + params->outputs->push_back(TRT_TensorOrWeights(output_detection_classes)); params->outputs->push_back(TRT_TensorOrWeights(output_num_detections)); - return Status::OK(); } -#endif // CombinedNonMaxSuppression +#endif -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertResize(OpConverterParams* params) { +Status ConvertResize(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - TF_RETURN_IF_ERROR( - CheckInputsWeights(*params, {{"input", false}, {"size", true}})); + TF_RETURN_IF_ERROR(CheckInputsWeights( + *params, + {{"input", TrtInputArg::kTensor}, {"size", TrtInputArg::kBoth}})); TF_RETURN_IF_ERROR(AllowDataTypes( *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); - // Get input tensor. Transpose it from NHWC to NCHW. - ITensorProxyPtr tensor = inputs.at(0).tensor(); - TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, params->node_def.name()); + // Get input tensor. + ITensorProxyPtr inputs_tensor = inputs.at(0).tensor(); + TFTRT_RETURN_ERROR_IF_NULLPTR(inputs_tensor, params->node_def.name()); - // Get output size. It must constain two values i.e. [H_out, W_out] - TRT_ShapedWeights weights = inputs.at(1).weights(); - if (weights.count() != 2) { - return errors::Unimplemented("Resize to shape=[] is not supported, at ", - node_def.name()); + // Check output size. It must constain two values i.e. [H_out, W_out] + const bool const_output_size = inputs.at(1).is_weights(); + if (const_output_size) { + // Output size is given as a constant. + if (inputs.at(1).weights().count() != 2) { + return errors::Unimplemented("Resize requires 2D values for the size"); + } + } else { + // Output size is given as a tensor, possibly as the result of shape + // calculation ops in the graph. + if (params->use_implicit_batch) { + return errors::Unimplemented( + "Resize requires constant size in implicit batch mode"); + } + TF_RETURN_IF_ERROR(ExpectShapeTensor(inputs.at(1))); + if (inputs.at(1).tensor()->getDimensions().d[0] != 2) { + return errors::Unimplemented("Resize requires 2D values for the size"); + } } - const int* weights_ptr = static_cast(weights.GetValues()); // Verify and consume node attributes. - TFAttrs attrs(node_def); - bool align_corners = attrs.get("align_corners"); + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), "align_corners", &align_corners)); TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); @@ -5319,67 +5813,123 @@ Status ConvertResize(OpConverterParams* params) { } else if (node_def.op() == "ResizeNearestNeighbor") { resize_mode = nvinfer1::ResizeMode::kNEAREST; } else { - return errors::Unimplemented(node_def.op(), " is not yet implemented at ", - node_def.name()); + return errors::Unimplemented(node_def.op(), " is not yet implemented"); } // return after validation if only validation is requested. if (params->validation_only) return Status::OK(); // Transpose tensor from NHWC to NCHW format. - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, {0, 3, 1, 2}, &tensor)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + inputs_tensor, {0, 3, 1, 2}, &inputs_tensor, node_def, "to_NCHW")); + + // Calculate the output shape as static dimensions or a shape tensor: + // Given input shape [N, C, H, W] and output size [H_out, W_out], + // output shape equals [N, C, H_out, W_out]. 
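A minimal standalone sketch of the static-shape branch described in the comment above and implemented just below (types here are illustrative): copy the input's NCHW dims and overwrite the last two entries with the requested [H_out, W_out]. The dynamic branch builds the same shape at runtime from Shape/Slice/Concat layers instead.

// Illustrative sketch of the static resize output-shape computation.
#include <array>
#include <cstdio>

static std::array<int, 4> ResizeOutputShape(const std::array<int, 4>& nchw,
                                            int h_out, int w_out) {
  std::array<int, 4> out = nchw;  // keep N and C
  out[2] = h_out;
  out[3] = w_out;
  return out;
}

int main() {
  const std::array<int, 4> in = {1, 3, 224, 224};
  const std::array<int, 4> out = ResizeOutputShape(in, 512, 512);
  std::printf("[%d, %d, %d, %d]\n", out[0], out[1], out[2], out[3]);  // [1, 3, 512, 512]
  return 0;
}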
+ nvinfer1::Dims output_shape_dims; + ITensorProxyPtr output_shape_tensor; + const bool static_output_shape = + HasStaticShape(inputs_tensor->getDimensions()) && const_output_size; + if (static_output_shape) { + // If the output shape can be fully determined at build time, calculate it + // as a set of dimensions. + output_shape_dims.nbDims = inputs_tensor->getDimensions().nbDims; + for (int i = 0; i < output_shape_dims.nbDims; ++i) { + output_shape_dims.d[i] = inputs_tensor->getDimensions().d[i]; + } + const int* weights_ptr = inputs.at(1).weights().GetPointer(); + output_shape_dims.d[output_shape_dims.nbDims - 2] = weights_ptr[0]; + output_shape_dims.d[output_shape_dims.nbDims - 1] = weights_ptr[1]; + } else { + // Otherwise, build the output shape as a shape tensor that will be computed + // at run time. + // The batch size and num of channels will be copied from the input shape. + ITensorProxyPtr shape = params->converter->network() + ->addShape(*inputs_tensor->trt_tensor()) + ->getOutput(0); + ITensorProxyPtr batch_size = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + ITensorProxyPtr num_channels = + params->converter->network() + ->addSlice(*shape->trt_tensor(), {1, {1}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + + // The height and width will be obtained from the requested output size. + ITensorProxyPtr height, width; + if (const_output_size) { + // If the output size is constant, the height and width dimensions can be + // created as constants from the size values. + const int* weights_ptr = inputs.at(1).weights().GetPointer(); + TF_RETURN_IF_ERROR(CreateScalarConstant(params, weights_ptr[0], &height)); + TF_RETURN_IF_ERROR(CreateScalarConstant(params, weights_ptr[1], &width)); + } else { + // Otherwise, the size is a tensor which can be sliced, and each element + // used directly as the output height and width dimensions. + ITensorProxyPtr size = inputs.at(1).tensor(); + height = params->converter->network() + ->addSlice(*size->trt_tensor(), {1, {0}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + width = params->converter->network() + ->addSlice(*size->trt_tensor(), {1, {1}}, {1, {1}}, {1, {1}}) + ->getOutput(0); + } - // Calculate output dimensions. - // Given input dimensions [N, C, H, W] and output size [H_out, W_out], - // output dimensions equals [N, C, H_out, W_out] - nvinfer1::Dims output_dimensions; - output_dimensions.nbDims = tensor->getDimensions().nbDims; - for (int i = 0; i < output_dimensions.nbDims; ++i) { - output_dimensions.d[i] = tensor->getDimensions().d[i]; + ::stream_executor::port::StatusOr result = + ConcatenateTensors(params, {batch_size, num_channels, height, width}, + 0); + TF_RETURN_IF_ERROR(result.status()); + output_shape_tensor = result.ValueOrDie(); } - output_dimensions.d[output_dimensions.nbDims - 2] = weights_ptr[0]; - output_dimensions.d[output_dimensions.nbDims - 1] = weights_ptr[1]; // Add resize layer. nvinfer1::IResizeLayer* layer = - params->converter->network()->addResize(*tensor->trt_tensor()); + params->converter->network()->addResize(*inputs_tensor->trt_tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def); // Set layer parameters. layer->setResizeMode(resize_mode); - layer->setOutputDimensions(output_dimensions); layer->setAlignCorners(align_corners); + // Set output shape. 
+ if (static_output_shape) { + // If the shapes are fully known at build time, pass the static output shape + // to the resize layer as expected output dimensions. + layer->setOutputDimensions(output_shape_dims); + } else { + // Otherwise, pass the output shape tensor to the resize layer as an input. + layer->setInput(1, *output_shape_tensor->trt_tensor()); + } + // Get output tensor. Transpose it from NCHW to NHWC. ITensorProxyPtr output = layer->getOutput(0); - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(output, {0, 2, 3, 1}, &output)); + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + output, {0, 2, 3, 1}, &output, node_def, "to_NHWC")); params->outputs->push_back(TRT_TensorOrWeights(output)); // Success return Status::OK(); } // ConvertResize -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) -Status ConvertAddN(OpConverterParams* params) { +Status ConvertAddN(const OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); - TFAttrs attrs(node_def); - const int num_inputs = attrs.get("N"); + + int num_inputs; + TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(node_def), "N", &num_inputs)); + if (num_inputs < 2) { - return errors::InvalidArgument("AddN requires at least two inputs, at ", - node_def.name()); - } - if (inputs.size() != num_inputs) { - return errors::InvalidArgument("Got ", inputs.size(), - " inputs but expected ", num_inputs, ", at ", - node_def.name()); + return errors::InvalidArgument("AddN requires at least two inputs"); } + + TFTRT_CHECK_INPUT_SIZE(inputs.size(), num_inputs, node_def); + for (const auto& input : inputs) { - if (!input.is_tensor() && input.weights().shape_.d[0] != 1) { + if (!input.is_tensor() && input.weights().Shape().dim(0) != 1) { return errors::InvalidArgument( "Weights input to AddN is required to have batch dimension 1."); } @@ -5388,195 +5938,165 @@ Status ConvertAddN(OpConverterParams* params) { // AddN doesn't support broadcast. 
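  // The loop below lowers AddN as a left-associated chain of IElementWiseLayer
  // kSUM operations, i.e. ((in0 + in1) + in2) + ..., which is why every input
  // (tensor or constant weight) must already have an identical shape.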
std::vector tensor_inputs; + tensor_inputs.reserve(inputs.size()); for (const auto& input : inputs) { if (input.is_tensor()) { tensor_inputs.push_back(input.tensor()); } else { - auto dims = input.weights().shape_; - TF_RETURN_IF_ERROR(RemoveBatchDimension(&dims)); - tensor_inputs.push_back( - params->converter->CreateConstantLayer(input.weights(), dims)); + auto dims = input.weights().Shape(); + if (params->use_implicit_batch) { + TF_RETURN_IF_ERROR(dims.RemoveBatchDimension()); + } + tensor_inputs.push_back(params->converter->CreateConstantLayer( + input.weights(), dims.AsTrtDims())); } } ITensorProxyPtr lhs = tensor_inputs[0]; for (int i = 1; i < num_inputs; ++i) { ITensorProxyPtr rhs = tensor_inputs[i]; nvinfer1::ILayer* layer = params->converter->network()->addElementWise( - *lhs->trt_tensor(), *rhs->trt_tensor(), nvinfer1::ElementWiseOperation::kSUM); + *lhs->trt_tensor(), *rhs->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->converter->SetLayerName(layer, node_def, std::to_string(i)); lhs = layer->getOutput(0); } params->outputs->push_back(TRT_TensorOrWeights(lhs)); return Status::OK(); } -static void RegisterValidatableOpConverters( - std::unordered_map* registration) { - (*registration)["BiasAdd"] = ConvertBiasAdd; -#if IS_TRT_VERSION_GE(5, 1, 2, 0) - (*registration)["ClipByValue"] = ConvertClipByValue; -#endif - -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -// TODO: @mconley @jdekhtiar - Removed when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS - (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; -#endif //TF2TENSORRT_BYPASS_NMS_RESIZE_OPS +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertBiasAdd, "BiasAdd"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertClipByValue, "ClipByValue"); +#if IS_TRT_VERSION_GE(7, 1, 3, 0) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertCombinedNMS, + "CombinedNonMaxSuppression"); #endif - (*registration)["AddN"] = ConvertAddN; - (*registration)["ConcatV2"] = ConvertConcat; - (*registration)["Const"] = ConvertConst; - (*registration)["Conv2D"] = ConvertConv2D; - (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput; - (*registration)["DepthToSpace"] = ConvertDepthSpaceShuffle; - (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise; - (*registration)["ExpandDims"] = ConvertExpandDims; - (*registration)["FusedConv2DBiasActivation"] = - ConvertFusedConv2DBiasActivation; - (*registration)["GatherV2"] = ConvertGather; - (*registration)["LeakyRelu"] = ConvertLeakyRelu; - (*registration)["MatMul"] = ConvertMatMul; - (*registration)["Pack"] = ConvertPack; - (*registration)["Pad"] = ConvertPad; - (*registration)["Relu6"] = ConvertRelu6; - (*registration)["Reshape"] = ConvertReshape; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - (*registration)["Conv3D"] = ConvertConv3D; - (*registration)["Conv3DBackpropInputV2"] = ConvertConv3DBackpropInputV2; -// TODO: @mconley @jdekhtiar - Removed when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS - for (auto resize_mode : {"ResizeBilinear", "ResizeNearestNeighbor"}) { - (*registration)[resize_mode] = ConvertResize; - } -#endif // TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -#endif - (*registration)["Rsqrt"] = ConvertRsqrt; - (*registration)["Slice"] = ConvertSlice; - (*registration)["Softmax"] = ConvertSoftmax; - (*registration)["SpaceToDepth"] = ConvertDepthSpaceShuffle; - (*registration)["Split"] = ConvertSplit; - (*registration)["Square"] = ConvertSquare; - (*registration)["SquaredDifference"] = 
ConvertSquaredDifference; - (*registration)["Squeeze"] = ConvertSqueeze; - (*registration)["StridedSlice"] = ConvertStridedSlice; - (*registration)["TopKV2"] = ConvertTopK; - (*registration)["Transpose"] = ConvertTranspose; - (*registration)["Unpack"] = ConvertUnpack; - - for (auto quantization_op_type : - {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) { - (*registration)[quantization_op_type] = ConvertQuantize; - } - for (const auto& binary_op_pair : *BinaryOperationMap()) { - (*registration)[binary_op_pair.first] = ConvertBinary; - } - for (const auto& activation_op_pair : *ActivationTypeMap()) { - (*registration)[activation_op_pair.first] = ConvertActivation; - } - for (auto pool_op_type : {"AvgPool", "MaxPool"}) { - (*registration)[pool_op_type] = ConvertPool; - } - for (auto normalization_op_type : - {"FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3"}) { - (*registration)[normalization_op_type] = ConvertFusedBatchNorm; - } - for (auto unary_op_pair : *UnaryOperationMap()) { - (*registration)[unary_op_pair.first] = ConvertUnary; - } - for (auto reduce_op_type : {"Sum", "Prod", "Max", "Min", "Mean"}) { - (*registration)[reduce_op_type] = ConvertReduce; +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertAddN, "AddN"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertCast, "Cast"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConcat, "ConcatV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConst, "Const"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv2D, "Conv2D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv2DBackpropInput, + "Conv2DBackpropInput"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertDepthSpaceShuffle, "DepthToSpace"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv2DDepthwise, + "DepthwiseConv2dNative"); + +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertExpandDims, "ExpandDims"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertFusedConv2DBiasActivation, + "FusedConv2DBiasActivation"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertGather, "GatherV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertMatMul, "MatMul"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPack, "Pack"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPad, "Pad"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertReshape, "Reshape"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv3D, "Conv3D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertConv3DBackpropInputV2, + "Conv3DBackpropInputV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertResize, "ResizeBilinear"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertResize, "ResizeNearestNeighbor"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPool3D, "AvgPool3D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPool3D, "MaxPool3D"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertShape, "Shape"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSlice, "Slice"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertDepthSpaceShuffle, "SpaceToDepth"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSplit, "Split"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSquare, "Square"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSquaredDifference, + "SquaredDifference"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertSqueeze, "Squeeze"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertStridedSlice, "StridedSlice"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertTopK, "TopKV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertTranspose, "Transpose"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertUnpack, "Unpack"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertPool, {"MaxPool", "AvgPool"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertFusedBatchNorm, + 
{"FusedBatchNorm", "FusedBatchNormV2", + "FusedBatchNormV3"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertReduce, + {"Sum", "Prod", "Max", "Min", "Mean"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertArgMinMax, {"ArgMin", "ArgMax"}); +// The following are no-ops during inference and will not be mapped to any +// TRT layer. +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertIdentity, + {"Identity", "IdentityN", "Snapshot", + "StopGradient", "_CopyFromHostToGpu"}); +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertBatchMatMul, + {"BatchMatMul", "BatchMatMulV2"}); +// Debug converter only accessible via `TF_TRT_OP_FAKELIST=OpName1,OpName2,...` +REGISTER_DEFAULT_TRT_OP_CONVERTER(ConvertFake, "FakeOp"); + +static Status SetDeviceInfoInNodes(GraphDef* graph_def, const string& device) { + for (auto& node : *(graph_def->mutable_node())) { + *node.mutable_device() = device; } - for (auto arg_minmax_type : {"ArgMin", "ArgMax"}) { - (*registration)[arg_minmax_type] = ConvertArgMinMax; - } - // The following are no-ops during inference and will not be mapped to any TRT - // layer. - for (auto identity_op_type : {"Identity", "Snapshot", "StopGradient"}) { - (*registration)[identity_op_type] = ConvertIdentity; - } - for (auto batch_matmul_type : {"BatchMatMul", "BatchMatMulV2"}) { - (*registration)[batch_matmul_type] = ConvertBatchMatMul; - } -} - -void TrtNodeValidator::RegisterOpValidators() { - RegisterValidatableOpConverters(&op_validators_); -} - -void Converter::RegisterOpConverters() { - RegisterValidatableOpConverters(&op_registry_); + return Status::OK(); } Status ConvertGraphDefToEngine( - const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size, - size_t max_workspace_size_bytes, - const std::vector& input_shapes, Logger* logger, - nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, + const GraphDef& gdef, OpKernelContext* ctx, TrtPrecisionMode precision_mode, + int max_batch_size, size_t max_workspace_size_bytes, + const std::vector& input_shapes, + nvinfer1::ILogger* trt_logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool use_calibration, - bool* convert_successfully) { + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles, absl::string_view engine_name, + bool use_explicit_precision, tensorflow::grappler::Cluster* cluster, + const string& device) { engine->reset(); if (convert_successfully) *convert_successfully = false; - // Create the builder. - TrtUniquePtrType builder( - nvinfer1::createInferBuilder(*logger)); - builder->setMaxBatchSize(max_batch_size); - builder->setGpuAllocator(allocator); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config( - builder->createBuilderConfig()); - builder_config->setMaxWorkspaceSize(max_workspace_size_bytes); - if (precision_mode == TrtPrecisionMode::FP16) { - builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (precision_mode == TrtPrecisionMode::INT8) { - builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); - builder_config->setFlag(nvinfer1::BuilderFlag::kINT8); - if (use_calibration) { - builder_config->setInt8Calibrator(calibrator); - } else { - builder_config->setInt8Calibrator(nullptr); - } - } - const uint32_t flags = 0U; // Implicit Batch Mode - // Create the network. 
- auto trt_network = - TrtUniquePtrType(builder->createNetworkV2(flags)); -#else // IS_TRT_VERSION_GE(6, 0, 0, 0) - builder->setMaxWorkspaceSize(max_workspace_size_bytes); - if (precision_mode == TrtPrecisionMode::FP16) { - builder->setFp16Mode(true); - } else if (precision_mode == TrtPrecisionMode::INT8) { - // Setting FP16 mode as well allows TRT to also consider FP16 kernels and - // use them in situations where they are faster than INT8 or where INT8 is - // not supported for a given layer. - builder->setFp16Mode(true); - builder->setInt8Mode(true); - if (use_calibration) { - builder->setInt8Calibrator(calibrator); - } else { - builder->setInt8Calibrator(nullptr); + // Creating converter, TensorRT builder and network + auto statusor = Converter::Create(precision_mode, use_calibration, trt_logger, + use_implicit_batch, engine_name, + use_explicit_precision, ctx); + + TF_RETURN_IF_ERROR(statusor.status()); + std::unique_ptr converter = std::move(statusor.ValueOrDie()); + + GraphDef graph = gdef; + if (cluster != nullptr) { + bool apply_layout_optim; + Status status = + ReadBoolFromEnvVar("TF_TRT_ENABLE_LAYOUT_OPTIMIZER", + /*default_value=*/true, &apply_layout_optim); + if (!status.ok()) { + LOG(ERROR) << status; } - } - // Create the network. - auto trt_network = - TrtUniquePtrType(builder->createNetwork()); -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) + if (apply_layout_optim) { + tensorflow::grappler::GrapplerItem grappler_item; + grappler_item.graph = gdef; - if (!trt_network) { - return errors::Internal("Failed to create TensorRT network object"); - } + // Add device information to each node in the graphdef for successful + // execution of the layout optimizer + TF_RETURN_IF_ERROR(SetDeviceInfoInNodes(&grappler_item.graph, device)); + + // TensorRT API requires the input for convolution to be in NCHW. 
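  // Assumed behavior of the flag read above: setting the environment variable
  // TF_TRT_ENABLE_LAYOUT_OPTIMIZER=0 (or "false") skips this NCHW rewrite and
  // the constant folding pass that follows; it defaults to enabled.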
+ tensorflow::grappler::GenericLayoutOptimizer layout_optimizer("NCHW"); + TF_RETURN_IF_ERROR( + layout_optimizer.Optimize(cluster, grappler_item, &graph)); + + grappler_item.graph = graph; + + tensorflow::grappler::ConstantFolding const_optimizer(nullptr); + TF_RETURN_IF_ERROR( + const_optimizer.Optimize(cluster, grappler_item, &graph)); - // Build the network - if (VLOG_IS_ON(1)) { - string mode_str; - TF_RETURN_IF_ERROR(TrtPrecisionModeToName(precision_mode, &mode_str)); - VLOG(1) << "Starting engine conversion, precision mode: " << mode_str; + // The optimizers may break the topological order + // so we need these steps to restore it + Graph g(OpRegistry::Global()); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(GraphConstructorOptions(), graph, &g)); + g.ToGraphDef(&graph); + } } - Converter converter(trt_network.get(), precision_mode, use_calibration); + VLOG(1) << "Starting to convert TensorFlow ops to TensorRT layers"; std::vector output_tensors; + int num_layers = converter->network()->getNbLayers(); + absl::flat_hash_set layer_names; // Graph nodes are already topologically sorted during construction - for (const auto& node_def : gdef.node()) { + for (const auto& node_def : graph.node()) { const string& node_name = node_def.name(); VLOG(2) << "Converting node " << node_name << ", op=" << node_def.op(); if (IsEngineInput(node_name)) { @@ -5600,27 +6120,45 @@ Status ConvertGraphDefToEngine( "Node ", node_name, " with is neither Placeholder nor Arg, instead ", node_def.op()); } - nvinfer1::DataType trt_dtype; - nvinfer1::Dims trt_dims; - int batch_size = -1; - auto shape = input_shapes.at(slot_number); - auto status = ValidateTensorProperties( - node_def.op(), node_def.attr().at(type_key).type(), shape, - /*validation_only=*/false, &trt_dtype, &trt_dims, &batch_size); - if (!status.ok()) { - const string error_message = - StrCat("Validation failed for ", node_name, " and input slot ", - slot_number, ": ", status.error_message()); - LOG(WARNING) << error_message; - return Status(status.code(), error_message); + DataType tf_dtype = node_def.attr().at(type_key).type(); + if (tf_dtype == DT_RESOURCE) { + VLOG(2) << "Adding engine input resource " << node_name; + if (ctx == nullptr) { + return errors::InvalidArgument( + "Variable resource type conversion requires a valid ctx"); + } + + if (ctx->input(slot_number).NumElements() == 0) { + return errors::InvalidArgument("Resource input ", node_name, + " is empty."); + } + TF_RETURN_IF_ERROR(converter->AddInputResource( + node_name, ctx->input(slot_number).flat()(0))); + } else { + nvinfer1::DataType trt_dtype; + nvinfer1::Dims trt_dims; + int batch_size = -1; + const auto shape = input_shapes.at(slot_number); + const auto status = ValidateTensorProperties( + node_def.op(), node_def.attr().at(type_key).type(), shape, + use_implicit_batch, /*validation_only=*/false, &trt_dtype, + &trt_dims, &batch_size); + if (!status.ok()) { + const string error_message = + StrCat("Validation failed for ", node_name, " and input slot ", + slot_number, ": ", status.error_message()); + LOG_WARNING_WITH_PREFIX << error_message; + return Status(status.code(), error_message); + } + VLOG(2) << "Adding engine input tensor " << node_name << " with shape " + << DebugString(trt_dims); + // TODO(laigd): the conversion should always happen at runtime where all + // the shapes are known, and we can provide a mode to generate the + // engines offline, by calling sess.run() and cache/serialize the + // engines. 
+ TF_RETURN_IF_ERROR(converter->AddInputTensor(node_name, trt_dtype, + trt_dims, batch_size)); } - VLOG(2) << "Adding engine input tensor " << node_name << " with shape " - << DebugString(trt_dims); - // TODO(laigd): the conversion should always happen at runtime where all - // the shapes are known, and we can provide a mode to generate the - // engines offline, by calling sess.run() and cache/serialize the engines. - TF_RETURN_IF_ERROR( - converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size)); } else if (IsEngineOutput(node_name)) { int32 slot_number = -1; if (node_def.op() == "Identity") { @@ -5640,36 +6178,59 @@ Status ConvertGraphDefToEngine( node_def.op()); } // Get output type that TensorFlow expects - TFAttrs attrs(node_def); - DataType tf_dtype = attrs.get("T"); + string out_type_key; + if (node_def.op() == "ReadVariableOp" || + node_def.op() == "ResourceGather") { + out_type_key = "dtype"; + } else { + out_type_key = "T"; + } + DataType tf_dtype; + TF_RETURN_IF_ERROR( + GetNodeAttr(AttrSlice(node_def), out_type_key, &tf_dtype)); nvinfer1::DataType trt_dtype; - TF_RETURN_IF_ERROR(TfDataTypeToTrt(tf_dtype, &trt_dtype)); + TF_RETURN_IF_ERROR(TfTypeToTrtType(tf_dtype, &trt_dtype)); if (output_tensors.size() <= slot_number) { output_tensors.resize(slot_number + 1); } output_tensors.at(slot_number) = {node_def.input(0), node_name, trt_dtype}; } else { - TF_RETURN_IF_ERROR(converter.ConvertNode(node_def)); + TF_RETURN_IF_ERROR(converter->ConvertNode(node_def)); + } + + // To support TF-TRT profiling, we ensure each ILayer has a non-empty name. + // BuildCudaEngine returns an error if there is any ILayer name collision. + // We want to report the error here before BuildCudaEngine in a more + // meaningful way. + int new_num_layers = converter->network()->getNbLayers(); + for (int i = num_layers; i < new_num_layers; i++) { + auto layer = converter->network()->getLayer(i); + if (layer->getName() == nullptr || + !layer_names.insert(layer->getName()).second) { + std::string error_message = absl::StrCat( + "Converting node ", node_name, ", op=", node_def.op(), + layer->getName() ? " creates a layer with name collision" + : " creates a layer without a name"); + LOG_WARNING_WITH_PREFIX << error_message; + return errors::Internal(error_message); + } } + num_layers = new_num_layers; } - TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_tensors)); + TF_RETURN_IF_ERROR(converter->RenameAndMarkOutputTensors(output_tensors)); if (convert_successfully) *convert_successfully = true; // Apply user provided quantization ranges to tensors - converter.MaybeApplyQuantizationRanges(); + if (!use_explicit_precision) { + converter->MaybeApplyQuantizationRanges(); + } // Build the engine. 
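  // BuildCudaEngine (called below) is expected to take over the builder-side
  // setup that the removed code handled inline here: workspace size, FP16/INT8
  // precision flags, the INT8 calibrator, and the shape optimization profiles.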
- VLOG(1) << "Starting engine creation"; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - engine->reset( - builder->buildEngineWithConfig(*converter.network(), *builder_config)); -#else - engine->reset(builder->buildCudaEngine(*converter.network())); -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) - if (engine->get() == nullptr) { - return errors::Internal("Failed to build TensorRT engine"); - } + TF_RETURN_IF_ERROR(converter->BuildCudaEngine( + engine, max_batch_size, max_workspace_size_bytes, allocator, calibrator, + profiles)); + VLOG(1) << "Finished conversion"; return Status::OK(); } @@ -5677,12 +6238,21 @@ Status ConvertGraphDefToEngine( Status ConvertSegmentToGraphDef( const Graph* graph, const grappler::GraphProperties& graph_properties, const std::vector& subgraph_nodes, // In topological order - std::vector* connections, GraphDef* segment_def, - string* scope_name) { + EngineInfo* engine_info) { + tensorflow::profiler::TraceMe activity( + "ConvertSegmentToGraphDef", tensorflow::profiler::TraceMeLevel::kInfo); + std::vector* connections = &engine_info->connections; + GraphDef* segment_def = &engine_info->segment_graph_def; std::set marker_nodes; // Update connection shapes/data types and add corresponding input/output // nodes in the segment graphdef. for (size_t i = 0; i < connections->size(); ++i) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Constructing TRTEngine IO: ", i + 1, "/", + connections->size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); auto& connection = connections->at(i); if (connection.is_control_edge()) continue; auto outside_node = graph->FindNodeId(connection.outside_id); @@ -5757,7 +6327,14 @@ Status ConvertSegmentToGraphDef( std::unordered_map old_to_new_id_map; // Copy internal nodes to new graphdef string local_scope = subgraph_nodes.front()->name(); + int i = 0; for (const Node* node : subgraph_nodes) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Copy Node to Subgraph: ", ++i, "/", + subgraph_nodes.size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); local_scope = GetCommonNameScope(local_scope, node->name()); old_to_new_id_map[node->id()] = segment_def->node_size(); auto snode = segment_def->add_node(); @@ -5766,6 +6343,13 @@ Status ConvertSegmentToGraphDef( } // Update the inputs of the new input nodes to point to placeholder nodes. for (int i = 0; i < connections->size(); ++i) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Updating Subgraph Input: ", i + 1, "/", + connections->size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); + auto& connection = connections->at(i); if (connection.is_control_edge() || !connection.is_input_edge) continue; auto snode = @@ -5777,13 +6361,26 @@ Status ConvertSegmentToGraphDef( << arg_name; snode->set_input(connection.inside_port, arg_name); } + std::set subgraph_node_names; - for (const Node* node : subgraph_nodes) { - subgraph_node_names.insert(node->name()); + { + tensorflow::profiler::TraceMe activity( + "Constructing subgraph_node_names set: ", + tensorflow::profiler::TraceMeLevel::kInfo); + + for (const Node* node : subgraph_nodes) { + subgraph_node_names.insert(node->name()); + } } // Remove control inputs that are not inside the segment. 
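  // For example, a control input such as "^outside_node" whose source is not
  // in subgraph_node_names is dropped by the loop below; data edges crossing
  // the segment boundary were already rewritten above to read from the _Arg
  // placeholder inputs.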
for (int i = 0; i < segment_def->node_size(); ++i) { + tensorflow::profiler::TraceMe activity( + [&] { + return StrCat("Removing outside to subgraph control inputs: ", i + 1, + "/", segment_def->node_size()); + }, + tensorflow::profiler::TraceMeLevel::kInfo); auto snode = segment_def->mutable_node(i); const int input_size = snode->input_size(); int input_idx = 0; @@ -5798,12 +6395,8 @@ Status ConvertSegmentToGraphDef( << " from subgraph."; ++input_idx; continue; - } else { - return errors::InvalidArgument( - "Found non control input outside the segment that is not an " - "engine connection to ", - snode->name(), ": ", input.first); } + /// TODO(lsugy): throw error when it's not a resource input. } if (actual_input_idx != input_idx) { snode->set_input(actual_input_idx, snode->input(input_idx)); @@ -5815,7 +6408,6 @@ Status ConvertSegmentToGraphDef( snode->mutable_input()->RemoveLast(); } } - *scope_name = local_scope; return Status::OK(); } @@ -5829,9 +6421,48 @@ bool OutputEdgeValidator::operator()(const Edge* out_edge) const { return true; } +ITensorProxyPtr TRT_TensorOrWeights::as_tensor( + const OpConverterParams* params) { + if (is_tensor()) { + return tensor(); + } else { + return params->converter->CreateConstantLayer(weights(), GetTrtDims()); + } +} + +std::string unexpected_type_error_msg(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, + const NodeDef& node_def, int idx) { + return "The '" + node_def.input(idx) + "' parameter of " + node_def.op() + + " operation in " + node_def.name() + " is expected to be of type " + + DebugString(type_expected) + " type, got " + + DebugString(type_being_checked) + "."; +} + +string batch_size_error(absl::string_view name, absl::string_view comment) { + return StrCat("Batch size doesn't match for tensor '", name, "' : ", comment); +} + +Status check_type(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, const NodeDef& node_def, + int idx) { + if (type_being_checked == type_expected) return Status::OK(); + + return errors::InvalidArgument(unexpected_type_error_msg( + type_being_checked, type_expected, node_def, idx)); +} + +std::string convert_not_supported_implicit(const std::string& pOpName, + const std::string& pNodeName, + const char* pOpType) { + const auto oper = pOpType ? absl::StrCat(pOpType, " ") : string(""); + return absl::StrCat("Convertion for ", oper, "op: '", pOpName, + "' is not supported in implicit batch mode, at ", + pNodeName); +} + } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 192c8f1e6d0..02c7148b842 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -22,35 +22,28 @@ limitations under the License. 
#include #include +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" -// TODO: @mconley @jdekhtiar - Removed when fixed -#define TF2TENSORRT_BYPASS_NMS_RESIZE_OPS - namespace tensorflow { namespace tensorrt { namespace convert { - -#define IS_TRT_VERSION_GE(major, minor, patch, build) \ - ((NV_TENSORRT_MAJOR > major) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH > patch) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) +using ::stream_executor::port::StatusOr; struct EngineConnection { // Constructs a non-control edge. @@ -101,8 +94,13 @@ struct EngineInfo { EngineInfo() : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), + max_batch_size(absl::nullopt), + maximum_cached_engines(0), precision_mode(TrtPrecisionMode::FP32), - use_calibration(true) {} + use_calibration(true), + + allow_build_at_runtime(true), + use_explicit_precision(false) {} string engine_name; string device; @@ -116,45 +114,55 @@ struct EngineInfo { enum class EngineType { TRTStatic = 0, TRTDynamic = 1 }; EngineType engine_type; int64 max_workspace_size_bytes; + absl::optional max_batch_size; int maximum_cached_engines; TrtPrecisionMode precision_mode; bool use_calibration; + bool allow_build_at_runtime; + bool use_explicit_precision; }; -// Constructs a graphdef from the segment in the given graph. Adds _Arg -// nodes for input edges (InputPH_*) and _Retval nodes for output edges -// (OutputPH_*). This function needs to be called before TensorRT nodes -// inserted in order to correctly get sizes from the original graph. +// Constructs a graphdef from the segment in the given graph and stores it to +// the engine_info. Adds _Arg nodes for input edges (InputPH_*) and _Retval +// nodes for output edges (OutputPH_*). Maintains the topological order of the +// non-input/output nodes in the graphdef. This function needs to be called +// before TensorRT layers are created because it prepares the original graph +// for TensorRT conversion. // // - subgraph_node_names: the node names of the subgraph. // - subgraph_node_ids: the node ids of the subgraph, must be sorted in // topological order. -// - segment_def: the output GraphDef, whose non-input/output nodedefs will be -// sorted in topological order. -// - scope_name: the name of the scope where the TRTEngineOp will be placed. +// - engine_info: a data structure that records the information about the +// engine containing the subgraph. // // TODO(aaroey): add tests to validate these properties. 
Status ConvertSegmentToGraphDef( const Graph* graph, const grappler::GraphProperties& graph_properties, - const std::vector& subgraph_nodes, - std::vector* connections, GraphDef* segment_def, - string* scope_name); + const std::vector& subgraph_nodes, EngineInfo* engine_info); // Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff // 'builder' successfully build the engine. If the result is not ok, 'engine' // will be set to nullptr -// Once returned, 'builder' is not needed any more and can be safely detroyed. +// Once returned, 'builder' is not needed any more and can be safely destroyed. // -// - convert_successfully: indicates whether the converson to TensorRT network +// - convert_successfully: indicates whether the conversion to TensorRT network // is successful. This is different than successfully building the engine: // building can still fail afterwards. +// Note: When 'cluster' is not null, it contains the graph to be converted. +// We may perform additional optimizations to the graph before converting +// the graph. Status ConvertGraphDefToEngine( - const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size, - size_t max_workspace_size_bytes, - const std::vector& input_shapes, Logger* logger, - nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, + const GraphDef& gdef, OpKernelContext* ctx, TrtPrecisionMode precision_mode, + int max_batch_size, size_t max_workspace_size_bytes, + const std::vector& input_shapes, + nvinfer1::ILogger* logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool use_calibration, - bool* convert_successfully); + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles, absl::string_view engine_name, + bool use_explicit_precision, + tensorflow::grappler::Cluster* cluster = nullptr, + const string& device = ""); // Helper class for the segmenter to determine whether an output edge from the // TRT segment is valid. @@ -165,206 +173,6 @@ class OutputEdgeValidator { bool operator()(const Edge* out_edge) const; }; -string DebugString(const nvinfer1::DataType trt_dtype); -string DebugString(const nvinfer1::Dims& dims); -string DebugString(const nvinfer1::Permutation& permutation, int len); -string DebugString(const nvinfer1::ITensor& tensor); -int64_t TrtWeightDimsNumElements(const nvinfer1::Dims& dims); -int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims); - -// Class to convert TF compile-time constants (e.g. Const nodes) to TRT weight. -class TRT_ShapedWeights { - public: - explicit TRT_ShapedWeights( - nvinfer1::DataType type = nvinfer1::DataType::kFLOAT); - - // Copy from another weights. - // - // NOTE: this does not copy the underlying buffer but only increase its - // reference count. - TRT_ShapedWeights(const TRT_ShapedWeights& rhs); - - nvinfer1::Weights GetTrtWeights() const; - - const Tensor& GetTensor() const { return tensor_; } - - // Returns the raw pointer to the underlying buffer which holds the weights - // value. 
- void* GetValues() const { - return const_cast(tensor_.tensor_data().data()); - } - - int64_t count() const; - - size_t size_bytes() const; - - string DebugString() const; - - template - absl::Span GetSpan() const { - return absl::Span(tensor_.flat().data(), count()); - } - - template - std::vector ToVector() const { - auto span = GetSpan(); - return std::vector(span.data(), span.data() + span.size()); - } - - nvinfer1::DataType TrtDType() const { return type_; } - - // TODO(aaroey): make these private. - nvinfer1::Dims shape_; // Note: shape.type[] is not used. - - private: - // This constructor is only used by TrtWeightStore, which creates the - // underlying buffer. - TRT_ShapedWeights(nvinfer1::DataType type, nvinfer1::Dims dims, - Tensor tensor); - - nvinfer1::DataType type_; - - // All weights should be stored inside TrtWeightStore to make sure lifetime of - // all the underlying tensors are available until the engine is built. For - // this reason, tensor_ should never be reassigned to a different value that - // is not already present in the TrtWeightStore. - Tensor tensor_; - - friend class TrtWeightStore; -}; - -// Container for TRT_ShapedWeights. We need this container because, TRT doesn't -// manage the lifetime of the weights buffer, it only keeps a pointer to it and -// requires that the data referenced by the pointer be available until the -// building of engine is complete. For more information see -// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html -// -// TODO(laigd): consider adding garbage collection to the unused weights. -class TrtWeightStore { - public: - // Get a TRT_ShapedWeights with 'type' and 'dims'. - TRT_ShapedWeights GetTempWeights(nvinfer1::DataType trt_type, - const nvinfer1::Dims& dims); - - // Get a TRT_ShapedWeights with the same data type and dimensions as - // 'weights'. - TRT_ShapedWeights GetTempWeights(const TRT_ShapedWeights& weights) { - return GetTempWeights(weights.TrtDType(), weights.shape_); - } - - private: - // The backend storage of the TRT_ShapedWeights. - std::vector store_; -}; - -// Represents a TRT-style input to a TF node, it can be either a -// nvinfer1::ITensor, or TRT_ShapedWeights which is compile-time constant. -// -// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument. -class TRT_TensorOrWeights { - public: - TRT_TensorOrWeights() {} - TRT_TensorOrWeights(ITensorProxyPtr); - TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size); - - // Constructor that makes it an ITensor, doesn't take ownership of 'tensor'. - // This is used by Converter when building the TRT network, where the ITensor - // is owned by the TRT network being built. See comment for 'trt_tensor_' - // in trt_proxy_tensor.h. - explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1); - - // Constructor that makes it an ITensor by creating one using provided data - // type and shape, and takes ownership of the created ITensor. This is used by - // TrtNodeValidator to encapsulate the type and shape information for - // validation of graph nodes, and the created ITensor is fake and temporary, - // and should not be used to build any TRT network. See comment for - // 'simple_tensor_' in trt_proxy_tensor.h. - explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, - const nvinfer1::Dims& trt_dims, int batch_size); - - // Constructor that makes it a TRT_TensorOrWeights. 
- explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights); - - TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs); - - void operator=(const TRT_TensorOrWeights& rhs); - - bool is_tensor() const { return initialized_ && is_tensor_; } - bool is_weights() const { return initialized_ && !is_tensor_; } - - ITensorProxyPtr tensor() const; - - TRT_ShapedWeights& weights() { - CHECK(is_weights()); - return weights_; - } - - const TRT_ShapedWeights& weights() const { - CHECK(is_weights()); - return weights_; - } - - nvinfer1::Dims GetTrtDims() const; - - int batch_size() const { return batch_size_; } - - string DebugString() const; - - private: - - void set_batch_size(int batch_size) { batch_size_ = batch_size; } - - // First dimension of the TF tensor (NOT tensor_) that is represented by - // tensor_ is treated as the "batch dimension" by TRT, and tensor_'s - // dimensions (obtained via tensor_->getDimensions()) do not contain the batch - // dimension. For example, when a TF tensor with shape (A,B,C) is represented - // in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A. - // - // This requires that all tensors in the subgraph that is converted to a TRT - // engine have the same batch size are represented by the first dimension of - // their shape, and Converter will verify this during conversion. The drawback - // is that currently it cannot convert a graph that doesn't have the batch - // size represented in the shapes or the batch sizes are different. See - // b/118387490 for more details. - ITensorProxyPtr tensor_proxy_ptr_ = nullptr; - int batch_size_ = -1; - - TRT_ShapedWeights weights_; - bool initialized_ = false; - bool is_tensor_ = false; - - friend class Converter; -}; - -class Converter; - -// Parameters for each op converter. -struct OpConverterParams { - // Constructor used for validation only. - OpConverterParams(const NodeDef& node_def, - const std::vector& inputs, - std::vector* outputs, - TrtWeightStore* weight_store, - TrtPrecisionMode precision_mode, bool use_calibration); - - // Constructor used for conversion. - OpConverterParams(Converter* converter, const NodeDef& node_def, - const std::vector& inputs, - std::vector* outputs, - TrtWeightStore* weight_store); - - Converter* converter = nullptr; - const NodeDef& node_def; - const std::vector& inputs; - std::vector* outputs; - const bool validation_only; - TrtWeightStore* weight_store; - const TrtPrecisionMode precision_mode; - const bool use_calibration; -}; - -using OpConverter = std::function; - // Class to verify if specific TF node is supported by TRT. class TrtNodeValidator { public: @@ -372,33 +180,39 @@ class TrtNodeValidator { // checked by IsTensorRTCandidate() later. It is used to get the shape and // data type information of a tensor for validation purpose. TrtNodeValidator(const grappler::GraphProperties& graph_properties, - TrtPrecisionMode precision_mode, bool use_calibration); + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision); // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added // to TRT subgraph and later converted into TRT engine. Status IsTensorRTCandidate(const Node* node); - private: static const std::set* quantize_ops; - void RegisterOpValidators(); + // Returns validator by op type. If no validator is registered for + // specific op, it means no validation is needed and ValidateNode() will + // return OK. 
+ ::stream_executor::port::StatusOr GetValidator( + const std::string& op); + private: // Convert a Const node to a TRT_TensorOrWeights. Status ConvertConstToWeights(const NodeDef& const_node_def, const std::vector& inputs, TRT_TensorOrWeights* output); + // Convert a VariableV2 node to a TRT_TensorOrWeights. + Status ConvertVariableToWeights( + const NodeDef& const_node_def, + const std::vector& inputs, + TRT_TensorOrWeights* output); + // Convert the output tensor at 'output_port' of 'node_def' to a // TRT_TensorOrWeights which will be later used as an input to other nodes and // passed to ValidateNode() below. Status ConvertToTensorOrWeights(const NodeDef& node_def, int output_port, TRT_TensorOrWeights* tensor_or_weights); - // Stores all the validators by op type. If no validator is registered for - // specific op, it means no validation is needed and ValidateNode() will - // return OK. - std::unordered_map op_validators_; - // Store the weights added during validation. Some validations (e.g. // validation for Const node) may produce weights. TrtWeightStore weight_store_; @@ -412,6 +226,10 @@ class TrtNodeValidator { const bool use_calibration_; + const bool use_implicit_batch_; + + const bool use_explicit_precision_; + friend class ValidatorTest; friend class OpConverterTest; }; @@ -432,8 +250,11 @@ class Converter { nvinfer1::DataType trt_dtype; }; - Converter(nvinfer1::INetworkDefinition* trt_network, - TrtPrecisionMode precision_mode, bool use_calibration); + static ::stream_executor::port::StatusOr> Create( + TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision = false, + OpKernelContext* ctx = nullptr); ////////////////////////////////////////////////////////////////////////////// // Methods used by the TRT engine builder to build a TRT network from a TF @@ -447,34 +268,44 @@ class Converter { Status AddInputTensor(const string& name, nvinfer1::DataType dtype, const nvinfer1::Dims& dims, int batch_size); + // Store the ResourceHandle as a TRT_TensorOrWeights object. This can be + // later used as input to other nodes. + Status AddInputResource(const string& name, const ResourceHandle& resource); + // Mark the tensors with names specified by source_tensor_name as output of // the TRT network, and set their names in the TRT network as dest_node_name. Status RenameAndMarkOutputTensors( const std::vector& output_tensors); + // Build a TRT engine using the created network. + Status BuildCudaEngine(TrtUniquePtrType* engine, + int max_batch_size, size_t max_workspace_size_bytes, + nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtShapeOptimizationProfile* profiles); + ////////////////////////////////////////////////////////////////////////////// // Methods used by op converters to convert individual TF node and add layers // to the TRT network. // Op converters (e.g. ConvertReshape) need to access the TRT network in order // to add TRT layers. - nvinfer1::INetworkDefinition* network() { return trt_network_; } + nvinfer1::INetworkDefinition* network() { return trt_network_.get(); } // What precision are we targeting? TrtPrecisionMode precision_mode() const { return precision_mode_; } + // Variable converters need the context to read variable values. + OpKernelContext* context() { return ctx_; } + // Calibration will be or was previously performed on this network? 
bool use_calibration() const { return use_calibration_; } - // This should be called on the inputs and outputs of any layer we create - // where we know that the quantization range does not change during that - // operation. (e.g. Reshape, Transpose, Identity, MaxPool). - void MarkQuantizationRangesAsInferrable(ITensorProxyPtr* input, - ITensorProxyPtr* output); + // Whether implicit batch mode is enabled + bool use_implicit_batch() const { return use_implicit_batch_; } // This function should be called when we know the quantization range of a - // tensor, either from a quantize/dequantize node or when the output is a - // fixed range (e.g. SoftMax, Relu6, Sigmoid). + // tensor from a quantize/dequantize node. void ProvideQuantizationRange(ITensorProxyPtr* tensor, float min_range, float max_range); @@ -487,28 +318,113 @@ class Converter { // Transpose 'input_tensor' with given permutation 'order_with_batch_dim' to // 'output_tensor'. The permutation 'order_with_batch_dim' contains the batch - // dimension which should always be 0. + // dimension which should always be 0. If this is for adding a transpose layer + // to support the conversion of 'node_def', callers need to provide a + // non-empty 'sub_op_name' appended to the name of 'node_def' to avoid layer + // name conflicts. Status TransposeTensor(ITensorProxyPtr input_tensor, const std::vector& order_with_batch_dim, - ITensorProxyPtr* output_tensor); + ITensorProxyPtr* output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name = ""); - // Converts 'input' into 'tensor' with shape specified by 'dims' (which - // doesn't contain the batch dimension). + // Reshapes a dynamic shape tensor by removing or adding dimensions of size 1, + // and/or permuting the dimensions. The new shape is derived from the shape of + // the input tensor according to the slices and size_for_added_dims arguments. // - // If validation_only is true, it doesn't do the conversion but only do some - // minimum validation for the eligibility of the conversion, and *tensor will - // be set to nullptr. - Status PrepareTensorForShape(const TRT_TensorOrWeights& input, - const nvinfer1::Dims& dims, - const bool validation_only, - ITensorProxyPtr* tensor); + // If there would be at most one unknown dimension, we could set the new shape + // using IShuffleLayer::setReshapeDimensions, which treats -1 as a special + // value (the same way as TF). In general, we can have more than one unknown + // dimensions, and we have to manipulate the shape tensors during runtime to + // define the new shape. This helper function defines the necessary shape + // inference layers and calls reshape using the calculated new shape. + // + // Example: + // + // Assume that we want to reshape a tensor from shape {A,B,C,D} to {C,D,A,B} + // (no transpose, just change the shape). In dynamic shape mode, the A,B,C,D + // values are not necessarily known at conversion time, they can be all -1. We + // can only define the new shape at runtime, when the actual shape is already + // known. To define the new shape: + // - We use an IShapeLayer to retrieve a shape tensor with the {A,B,C,D} + // values. + // - Create two slices {C,D} and {A,B} of the shape tensor. + // - Concatenate these slices {C,D,A,B}, + // - Set the {C,D,A,B} shape tensor as an input shape tensor for + // IShuffleLayer. + // + // This can be achieved by calling DynamicReshape(input, {{2,4},{0,2}}, + // params). 
+ // + // Before each slice we can insert new dims if the corresponding + // size_for_added_dims element is not negative. The size_for_added_dims array + // can have more than slices.size() elements, in order to insert a dimension + // after the last slice. For example, to add two leading 1 dimensions, and + // three trailing 1 dimensions, call DynamicReshape(input, {{0,nbDims}}, + // {2, 3}). + // + // Parameters: + // input - input tensor + // slices - [start, end) pairs of slices + // params - conversion parameters + // output - reshaped tensor + // size_for_added_dims - size of dimension inserted right before slice[i]. We + // only insert a new dim if size_for_added_dims[i] >= 0. + Status DynamicReshape(ITensorProxyPtr input, + std::vector> slices, + const OpConverterParams* params, + ITensorProxyPtr* output, + std::vector size_for_added_dims = {}, + absl::optional op_instance = absl::nullopt); + + // Inserts a singleton dimension at axis for a dynamic shape tensor. + Status DynamicExpandDims(ITensorProxyPtr input, const nvinfer1::Dims& dims, + int axis, const OpConverterParams* params, + ITensorProxyPtr* output, + absl::optional op_instance = absl::nullopt); + + // Helper function to add a squeeze op to the network. + // + // The input_dims argument stores the TRT dimensions of the input tensor, + // where the dimensions to be squeezed are replaced by 0. + Status SqueezeTensor(ITensorProxyPtr input, std::vector* input_dims, + const OpConverterParams* params, ITensorProxyPtr* output, + absl::optional op_instance = absl::nullopt); // Creates an IConstantLayer using 'weights' whose dimensions are specified by // 'dims', and returns the output ITensor. ITensorProxyPtr CreateConstantLayer(const TRT_ShapedWeights& weights, - const nvinfer1::Dims& dims); + const nvinfer1::Dims& dims); + + // Gets the min and max value in a TRT_ShapedWeights + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const; + + // Constructs a name and passed it to the TensorRT layer to support xprof. + void SetLayerName( + nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name = "", + absl::optional sub_op_instance = absl::nullopt, + absl::optional origin_node_name = absl::nullopt); + + void SetLayerName(nvinfer1::ILayer* layer, absl::string_view main_op_name, + absl::string_view sub_op_name, + absl::optional sub_op_instance = absl::nullopt); + + std::unordered_map& TensorsMap() { + return trt_tensors_; + } + + bool UseExplicitPrecision() const { return use_explicit_precision_; } private: + Converter(TrtPrecisionMode precision_mode, bool use_calibration, + nvinfer1::ILogger* trt_logger, const bool use_implicit_batch, + absl::string_view engine_name, bool use_explicit_precision, + OpKernelContext* ctx); + + Status Init(nvinfer1::ILogger* trt_logger); + // Verify the provided batch_size is consistent with batch_size_ and update it // if necessary. Status MaybeUpdateBatchSize(int batch_size); @@ -523,26 +439,21 @@ class Converter { Status GetInputs(const NodeDef& node_def, std::vector* inputs) const; - void RegisterOpConverters(); - - void PropagateQuantizationRanges(); - - // Gets the min and max value in a TRT_ShapedWeights - Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, - float* out_max) const; - - // Registered op converters by op type. - std::unordered_map op_registry_; - // Tensors/weights added during construction of trt_network_. std::unordered_map trt_tensors_; - // The TRT networking being built. 
- nvinfer1::INetworkDefinition* trt_network_; + // The TRT builder used to create the network and build the engine. Not owned. + TrtUniquePtrType trt_builder_; + + // The TRT network being built. + TrtUniquePtrType trt_network_; // Store the weights added during construction of trt_network_. TrtWeightStore weight_store_; + // Store the context. + OpKernelContext* ctx_; + // During conversion, this table is populated with quantization ranges per // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT // quantization ranges. Since TRT only supports symmetric ranges, we will @@ -552,49 +463,132 @@ class Converter { std::unordered_map quantization_ranges_proxy_; std::unordered_map quantization_ranges_; - // Edges where quantization ranges can be inferred (copied) across ops - from - // first tensor to second tensor. PropagateQuantizationRanges() will propagate - // known ranges from quantization_ranges_ across these edges, adding the new - // ranges to quantization_ranges_ so that they can be applied in - // MaybeApplyQuantizationRanges(). - std::vector> - quantization_infer_proxy_; - std::vector> - quantization_infer_; - const TrtPrecisionMode precision_mode_; const bool use_calibration_; + // If this is false, all dimensions including the batch dimension are + // set explicitely. + const bool use_implicit_batch_; + // Batch size of inputs to trt_network_ added by AddInputTensor(). During // network construction it will update this, use it to verify the batch // size of all inputs are compatible, and make sure individual TF node is // acceptable by TRT. int batch_size_ = -1; + // Assign a ID to each constant layer we create, so that we can assign a + // unique name to the layer. + int next_constant_layer_id_ = 0; + + // The name of the TRTEngineOp node. + absl::string_view engine_name_; + + // Indicates whether to use explicit precision in TensorRT (Q/DQ support). + bool use_explicit_precision_; + friend class ConverterTest; friend class OpConverterTest; }; +// Converts a TensorFlow tensor to TRT shaped weights. +Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store, + TRT_ShapedWeights* weights); + +// Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims' +// (which doesn't contain the batch dimension). +// +// If validation_only is true, it doesn't do the conversion but only do some +// minimum validation for the eligibility of the conversion, and *tensor will +// be set to nullptr. +// If validation_only is false converter must not be nullptr. +Status PrepareTensorForShape( + Converter* converter, const TRT_TensorOrWeights& input, + const DimsAdapter& dims, const bool validation_only, + ITensorProxyPtr* tensor, const NodeDef& node_def, + absl::optional op_instance = absl::nullopt, + absl::optional origin_node_name = absl::nullopt); + // Return OK if the broadcast scheme is supported and compute the shapes after // broadcasting. check_feasibility can be set to false in cases where dimensions // do not need to match exactly (as in the case of BatchMatMulV2). 
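// Illustrative example: in explicit batch mode, operands with TRT shapes
// [2, 3, 4] and [4] are right-aligned and padded with ones, producing
// operand_l_new_dims = [2, 3, 4] and operand_r_new_dims = [1, 1, 4], which an
// IElementWiseLayer can then broadcast together.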
Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, const bool check_feasibility, + const bool use_implicit_batch, nvinfer1::Dims* operand_l_new_dims, nvinfer1::Dims* operand_r_new_dims); -// Map of all supported UnaryOperations -const std::unordered_map* UnaryOperationMap(); -// Map of all supported ActivationTypes -const std::unordered_map* ActivationTypeMap(); +template +using OperationMap = std::unordered_map; + +// Map from Tensorflow operation names to TensorRT unary operations. +using UnaryOperationMapType = OperationMap; +const UnaryOperationMapType* UnaryOperationMap(); + +// Map from Tensorflow boolean operation names to TensorRT unary operations. +const UnaryOperationMapType* UnaryBooleanOperationMap(); + +// Map of all supported ActivationTypes. +using ActivationTypeMapType = OperationMap; +const ActivationTypeMapType* ActivationTypeMap(); + +// Map from Tensorflow binary operation names to TensorRT binary operations +// types. +using BinaryOperationMapType = OperationMap; +const BinaryOperationMapType* BinaryOperationMap(); + +// Map from Tensorflow boolean binary operation names to TensorRT binary +// operations types. +const BinaryOperationMapType* BinaryBooleanOperationMap(); + +template +absl::InlinedVector GetOperationNames(const T& set) { + absl::InlinedVector result; + absl::c_transform(set, std::back_inserter(result), + [](const auto x) { return x.first; }); + return result; +} + +// Adds a matrix multiplication operation to the TensorRT graph. The "params" +// pointer is only used to access the TRT network builder. The inputs and +// parameters for the op are fully specified by input_[a|b] and transpose_[a|b]. +::stream_executor::port::StatusOr ConvertMatMulImpl( + const OpConverterParams* params, TRT_TensorOrWeights input_a, + TRT_TensorOrWeights input_b, bool transpose_a, bool transpose_b); + +Status ApplyBroadcast(std::unique_ptr& operand, + const DimsAdapter& broadcasted_dims, + const OpConverterParams* params, + absl::optional op_instance); + +std::string convert_range_error_msg(float start, float limit, float delta); +std::string convert_range_expected_msg(const NodeDef& node_def); +std::string bool_weight_error_msg(const NodeDef& node_def); +std::string unexpected_type_error_msg(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, + const NodeDef& node_def, int idx = 0); +std::string then_else_dtypes_error_msg(nvinfer1::DataType type_then, + nvinfer1::DataType type_else, + const NodeDef& node_def); +std::string input_shapes_error_msg(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + const NodeDef& node, + bool then_vs_else = false); +std::string batch_size_error(absl::string_view name, absl::string_view comment); + +inline bool find_name(const string& name, const std::vector names) { + return std::find(names.begin(), names.end(), name) != names.end(); +} + +Status check_type(nvinfer1::DataType type_being_checked, + nvinfer1::DataType type_expected, const NodeDef& node_def, + int idx = 0); } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 8a0c963e0c8..0733c840d57 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ 
b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -15,238 +15,148 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include +#include +#include +#include #include +#include +#include #include #include +#include "absl/time/civil_time.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + #include #include +#include "Eigen/Core" // from @eigen_archive +#include "absl/algorithm/container.h" +#include "absl/base/call_once.h" +#include "absl/container/inlined_vector.h" #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/nn_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_factory.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT +#include "tensorflow/core/framework/resource_var.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/kernels/variable_ops.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/status_matchers.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { -namespace convert { - -using absl::StrCat; -using ::testing::ElementsAre; -using ::testing::ElementsAreArray; -using ::testing::NanSensitiveFloatNear; -// TODO(laigd): put this into some test utils file. 
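The hand-rolled ExpectStatus helper deleted just below is superseded by the status matchers from tensorflow/core/platform/status_matchers.h together with ::testing::HasSubstr. A minimal sketch of the replacement pattern used throughout this file, where SomeStatusReturningCall() is a hypothetical stand-in for any call under test:

// Failure with a specific canonical code and a message fragment:
EXPECT_THAT(SomeStatusReturningCall(),
            StatusIs(absl::StatusCode::kInvalidArgument,
                     HasSubstr("expected message fragment")));
// Plain success check:
EXPECT_THAT(SomeStatusReturningCall(), IsOk());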
-void ExpectStatus(Status status, error::Code code = error::OK, - const char* substr = nullptr) { - EXPECT_EQ(code, status.code()) - << status << " vs expected error code \"" << error::Code_Name(code) - << "\" and message \"" << substr << "\""; - if (substr) { - EXPECT_THAT(status.error_message(), ::testing::HasSubstr(substr)) << status; - } -} - -nvinfer1::Dims GetTestDims(const std::vector& d) { - nvinfer1::Dims dims; - dims.nbDims = d.size(); - for (int i = 0; i < d.size(); ++i) { - dims.d[i] = d[i]; - } - return dims; -} - -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); - } -} +// TensorRT modes for testing. We define the following three modes: +// 1. Implicit batch mode: The tensors have static (known) input shape and the +// the batch dimension (first dim) is removed from the TRT tensor shape. In +// a loose notation: trt_shape = tf_shape[1:]. +// 2. Explicit batch mode: static (known) input shape, but the batch dimension +// is part of the trt tensor shape. (trt_shape = tf_shape) +// 3. Dynamic shape mode allows unknown input shapes, and requires explicit +// batch size definition (trt_shape = tf_shape). +// +// Note that the Converter only distinguishes between two modes: +// - use_implicit_batch == true, this corresponds to kImplicitBatch, +// - use_implicit_batch == false which includes both kExplicitBatch and +// kDynamicShape. +// +// For the converter, the distinction between explicit batch or dynamic shape +// mode follows from the input tensors of the network: dynamic shape input +// implies dynamic shape mode, while static shape input tensors imply explicit +// batch mode. We want to test all these modes, therefore we define the +// TrtTestMode with the following three options. 
+enum class TrtTestMode { + kImplicitBatch = 0, + kExplicitBatch = 1, + kDynamicShape = 2 +}; -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; +string DebugString(const TrtTestMode mode) { + switch (mode) { + case TrtTestMode::kImplicitBatch: + return "kImplicitBatch"; + case TrtTestMode::kExplicitBatch: + return "kExplicitBatch"; + case TrtTestMode::kDynamicShape: + return "kDynamicShape"; default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } -} - -NodeDef MakeNodeDef(const string& name, const string& op, - const std::vector& inputs, - const std::map attrs = {}) { - NodeDef node_def; - node_def.set_name(name); - node_def.set_op(op); - for (const string& input : inputs) { - node_def.add_input(input); - } - for (const auto& attr : attrs) { - (*node_def.mutable_attr())[attr.first] = attr.second; - } - return node_def; -} - -template -NodeDef MakeConstNodeDef(const string& name, const std::vector& vals, - const TensorShape& shape) { - Scope s = Scope::NewRootScope(); - Tensor t = test::AsTensor(vals, shape); - auto const_op = ops::Const(s.WithOpName(name), t); - return const_op.node()->def(); -} - -template -NodeDef MakeConstNodeDef(const string& name, const std::vector& vals) { - TensorShape shape; - const std::vector shape_dims = {static_cast(vals.size())}; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(shape_dims, &shape)); - return MakeConstNodeDef(name, vals, shape); -} - -bool TrtDimsEquals(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) { - if (lhs.nbDims != rhs.nbDims) return false; - for (int i = 0; i < lhs.nbDims; ++i) { - if (lhs.d[i] != rhs.d[i]) return false; - // We don't check the types in the tests. + return "Invalid TrtTestMode"; } - return true; -} - -bool TrtDimsEqualsArray(const std::vector& lhs, - const nvinfer1::Dims& rhs) { - return TrtDimsEquals(GetTestDims(lhs), rhs); } -// TODO(laigd): define a parameterized matcher that can compare against the -// vector. -void ExpectTrtDimsEqualsArray(const std::vector& lhs, - const nvinfer1::Dims& rhs) { - EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs)) - << "expected: " << DebugString(GetTestDims(lhs)) << "\n" - << " actual: " << DebugString(rhs); -} - -template -void ExpectArrayNear(const std::vector& lhs, absl::Span rhs) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_FLOAT_EQ(lhs[i], rhs[i]); - } -} +namespace convert { -// Eigen::half cannot implicitly convert to float which is required for -// EXPECT_FLOAT_EQ. -template <> -void ExpectArrayNear(const std::vector& lhs, - absl::Span rhs) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]), - Eigen::half_impl::half_to_float(rhs[i])); - } -} +using absl::StrCat; +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; +using ::testing::HasSubstr; +using ::testing::Matcher; +using ::testing::PrintToString; -template -void ExpectArrayAlmostEqual(const std::vector& lhs, absl::Span rhs, - T tolerance) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_NEAR(lhs[i], rhs[i], tolerance); - } -} +using ::tensorflow::testing::IsOk; +using ::tensorflow::testing::StatusIs; -// Eigen::half cannot implicitly convert to float which is required for -// EXPECT_NEAR. 
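The three modes described above drive most of the rewritten tests. A rough sketch, not the actual fixture code, of how a test body can sweep all of them using the ValidTrtModes array defined a few lines further down and the Reset(precision, mode) overload that OpConverterTest gains later in this file:

for (const TrtTestMode mode : ValidTrtModes) {
  SCOPED_TRACE(DebugString(mode));       // label any failure with the mode
  Reset(TrtPrecisionMode::FP32, mode);   // rebuild the converter for this mode
  // ... add test tensors/weights and run the op converter under test ...
}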
-template <> -void ExpectArrayAlmostEqual(const std::vector& lhs, - absl::Span rhs, - Eigen::half tolerance) { - ASSERT_EQ(lhs.size(), rhs.size()); - for (int i = 0; i < lhs.size(); i++) { - EXPECT_NEAR(Eigen::half_impl::half_to_float(lhs[i]), - Eigen::half_impl::half_to_float(rhs[i]), - Eigen::half_impl::half_to_float(tolerance)); - } -} +constexpr std::array ValidTrtModes = { + TrtTestMode::kImplicitBatch, TrtTestMode::kExplicitBatch, + TrtTestMode::kDynamicShape}; bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs, const TRT_ShapedWeights& rhs) { - return TrtDimsEquals(lhs.shape_, rhs.shape_) && - lhs.TrtDType() == rhs.TrtDType() && lhs.GetValues() == rhs.GetValues(); + return lhs.Shape() == rhs.Shape() && lhs.TrtDType() == rhs.TrtDType() && + lhs.GetPointer() == rhs.GetPointer(); } template void ValidateWeights(const TRT_ShapedWeights& weights, const std::vector& expected_dims, const std::vector& expected_value) { - ExpectTrtDimsEqualsArray(expected_dims, weights.shape_); + EXPECT_EQ(weights.Shape(), DimsAdapter(expected_dims)); ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString(); - const T* actual_values = static_cast(weights.GetValues()); + const T* actual_values = weights.GetPointer(); for (int i = 0; i < expected_value.size(); ++i) { EXPECT_EQ(expected_value[i], actual_values[i]); } } -template -std::vector InitTestVector(int size, CType start_value = CType(0)) { - std::vector res; - res.reserve(size); - for (int i = 0; i < size; ++i) { - res.push_back(start_value + CType(i)); - } - return res; -} - -template -struct StaticCaster { - OutCType operator()(InCType in) const { return static_cast(in); } -}; - -template -std::vector CastTestVector(const std::vector& vals) { - std::vector res(vals.size()); - std::transform(vals.begin(), vals.end(), res.begin(), - StaticCaster()); - return res; -} - TEST(TRT_ShapedWeights_Test, Basic) { // Test constructor with no arguments. { @@ -258,7 +168,7 @@ TEST(TRT_ShapedWeights_Test, Basic) { EXPECT_EQ(nullptr, trt_weights.values); EXPECT_EQ(0, trt_weights.count); - EXPECT_EQ(nullptr, ptr->GetValues()); + EXPECT_EQ(nullptr, ptr->GetPointer()); EXPECT_EQ(0, ptr->count()); EXPECT_EQ(0, ptr->size_bytes()); } @@ -273,7 +183,7 @@ TEST(TRT_ShapedWeights_Test, Basic) { EXPECT_EQ(nullptr, trt_weights.values); EXPECT_EQ(0, trt_weights.count); - EXPECT_EQ(nullptr, ptr->GetValues()); + EXPECT_EQ(nullptr, ptr->GetPointer()); EXPECT_EQ(0, ptr->count()); EXPECT_EQ(0, ptr->size_bytes()); } @@ -282,7 +192,8 @@ TEST(TRT_ShapedWeights_Test, Basic) { { TrtWeightStore store; TRT_ShapedWeights weights = - store.GetTempWeights(nvinfer1::DataType::kFLOAT, GetTestDims({2, 5})); + store.GetTempWeights(nvinfer1::DataType::kFLOAT, CreateDims({2, 5})) + .value(); TRT_ShapedWeights copy(weights); for (auto ptr : {&weights, ©}) { nvinfer1::Weights trt_weights = ptr->GetTrtWeights(); @@ -290,12 +201,12 @@ TEST(TRT_ShapedWeights_Test, Basic) { EXPECT_NE(nullptr, trt_weights.values); EXPECT_EQ(10, trt_weights.count); - EXPECT_EQ(trt_weights.values, ptr->GetValues()); + EXPECT_EQ(trt_weights.values, ptr->GetPointer()); EXPECT_EQ(10, ptr->count()); EXPECT_EQ(40, ptr->size_bytes()); } // Test that it doesn't copy the underlying buffer. 
- EXPECT_EQ(weights.GetValues(), copy.GetValues()); + EXPECT_EQ(weights.GetPointer(), copy.GetPointer()); } } @@ -336,7 +247,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(1, ptr->batch_size()); } EXPECT_EQ(itensor->simple_tensor(), ptr->tensor()->simple_tensor()); - ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); + EXPECT_THAT(ptr->GetTrtDims(), DimsAreArray({1})); } } } @@ -355,7 +266,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(false, ptr->is_weights()); EXPECT_EQ(1, ptr->batch_size()); EXPECT_NE(nullptr, ptr->tensor()->simple_tensor()); - ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); + EXPECT_THAT(ptr->GetTrtDims(), DimsAreArray({1})); } } // Test constructor with TRT_ShapedWeights argument. @@ -369,18 +280,15 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(false, ptr->is_tensor()); EXPECT_EQ(true, ptr->is_weights()); EXPECT_TRUE(TrtShapedWeightsEquals(weights, ptr->weights())); - ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims()); + std::vector empty_dims; + EXPECT_THAT(ptr->GetTrtDims(), DimsAreArray(empty_dims)); } } } class ValidatorTest : public ::testing::Test { public: - std::unordered_map& op_validators( - TrtNodeValidator* validator) { - return validator->op_validators_; - } - + ValidatorTest() {} Status ConvertToTensorOrWeights(const Scope& scope, const Node* node, int output_port, TRT_TensorOrWeights* tensor_or_weights) { @@ -390,26 +298,14 @@ class ValidatorTest : public ::testing::Test { TF_EXPECT_OK(graph_properties.InferStatically(true)); TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32, - /*use_calibration=*/false); + /*use_calibration=*/false, + /*use_implicit_batch=*/true, + /*use_explicit_precision=*/false); return validator.ConvertToTensorOrWeights(node->def(), output_port, tensor_or_weights); } - - const std::set* GetQuantizeOps(TrtNodeValidator* validator) { - return validator->quantize_ops; - } }; -TEST_F(ValidatorTest, QuantizeOpsAreRegistered) { - grappler::GrapplerItem item; - grappler::GraphProperties graph_properties(item); - TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32, - /*use_calibration=*/false); - for (const string& quantize_op : *GetQuantizeOps(&validator)) { - QCHECK(op_validators(&validator).count(quantize_op)); - } -} - TEST_F(ValidatorTest, ConvertToTensorOrWeights) { // Convert Const. { @@ -417,13 +313,14 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) { auto node = ops::Const(s.WithOpName("my_const"), {1.0f, 2.0f}, TensorShape({2})); TRT_TensorOrWeights output; - ExpectStatus(ConvertToTensorOrWeights(s, node.op().node(), - /*output_port=*/0, &output)); + EXPECT_THAT(ConvertToTensorOrWeights(s, node.op().node(), + /*output_port=*/0, &output), + IsOk()); ValidateWeights(output.weights(), {2}, {1.0, 2.0}); } // Helper method to run ConvertToTensorOrWeights() with predefined parameters. - auto convert_to_tensor_or_weights = [this](const std::vector& dims, + auto convert_to_tensor_or_weights = [this](const std::vector& dims, TRT_TensorOrWeights* output) { Scope s = Scope::NewRootScope(); const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims}); @@ -435,30 +332,33 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) { // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1. 
{ TRT_TensorOrWeights output; - ExpectStatus( + EXPECT_THAT( convert_to_tensor_or_weights( - std::vector(nvinfer1::Dims::MAX_DIMS + 2, 1), &output), - error::OUT_OF_RANGE, "Input tensor rank is greater than 9"); + std::vector(nvinfer1::Dims::MAX_DIMS + 2, 1), &output), + StatusIs(absl::StatusCode::kOutOfRange, + HasSubstr("Input tensor rank is greater than 9"))); } // Convert non-Const with #dims < 1. { TRT_TensorOrWeights output; - ExpectStatus( - convert_to_tensor_or_weights({}, &output), error::INVALID_ARGUMENT, - "Scalar input tensor is not supported since the first dimension " - "is treated as batch dimension by TRT"); + EXPECT_THAT(convert_to_tensor_or_weights({}, &output), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Scalar input tensor is not supported since " + "the first dimension " + "is treated as batch dimension by TRT"))); } - // Convert non-Const. We test the case where the non-batch dimemsion is + // Convert non-Const. We test the case where the non-batch dimension is // unknown as well, to make sure the validator allows that. for (const int32 non_batch_dim : {-1, 2}) { const int32 batch_size = 12; TRT_TensorOrWeights output; - ExpectStatus( - convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output)); + EXPECT_THAT( + convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output), + IsOk()); ASSERT_TRUE(output.is_tensor()); EXPECT_EQ(batch_size, output.batch_size()); EXPECT_NE(nullptr, output.tensor()->simple_tensor()); - ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims()); + EXPECT_THAT(output.GetTrtDims(), DimsAreArray({non_batch_dim})); } } @@ -474,31 +374,39 @@ TEST_F(ValidatorTest, IsTensorRTCandidate_Basics) { grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32, - /*use_calibration=*/false); + /*use_calibration=*/false, + /*use_implicit_batch=*/true, + /*use_explicit_precision=*/false); + // Override the Add converter. bool start_conversion = false; bool should_fail = false; - auto op_converter = [&start_conversion, - &should_fail](OpConverterParams* params) -> Status { + auto op_converter = [&start_conversion, &should_fail]( + const OpConverterParams* params) -> Status { if (should_fail) return errors::InvalidArgument(""); if (!params->validation_only) start_conversion = true; return Status::OK(); }; // Validator not registered. - ASSERT_EQ(1, op_validators(&validator).erase("Add")); - ExpectStatus(validator.IsTensorRTCandidate(add_node), error::UNIMPLEMENTED, - "Op type Add is not supported."); - - // Register validator. - op_validators(&validator)["Add"] = op_converter; + auto original_op_converter = GetOpConverterRegistry()->LookUp("Add"); + ASSERT_TRUE(original_op_converter.ok()); + GetOpConverterRegistry()->Clear("Add"); + EXPECT_THAT(validator.IsTensorRTCandidate(add_node), + StatusIs(absl::StatusCode::kUnimplemented, + HasSubstr("Op type Add is not supported."))); + GetOpConverterRegistry()->Register("Add", kDefaultConverterPriority + 1, + op_converter); TF_EXPECT_OK(validator.IsTensorRTCandidate(add_node)); EXPECT_EQ(false, start_conversion); // Let the converter return error. 
should_fail = true; - ExpectStatus(validator.IsTensorRTCandidate(add_node), - error::INVALID_ARGUMENT); + EXPECT_THAT(validator.IsTensorRTCandidate(add_node), + StatusIs(absl::StatusCode::kInvalidArgument)); + GetOpConverterRegistry()->Clear("Add"); + GetOpConverterRegistry()->Register("Add", kDefaultConverterPriority, + *original_op_converter); } TEST(TrtNodeValidator, IsTensorRTCandidate) { @@ -527,7 +435,7 @@ TEST(TrtNodeValidator, IsTensorRTCandidate) { feed, const_1, matmul_attrs); // Unsupported op. - auto unsupported_op = ops::Erf(s.WithOpName("sin"), feed); + auto unsupported_op = ops::Erfc(s.WithOpName("sin"), feed); // Incompatible input. auto incompatible_feed = ops::Placeholder(s.WithOpName("feed"), DT_DOUBLE); @@ -553,25 +461,32 @@ TEST(TrtNodeValidator, IsTensorRTCandidate) { for (const TrtPrecisionMode precision_mode : {TrtPrecisionMode::FP32, TrtPrecisionMode::INT8}) { TrtNodeValidator validator(graph_properties, precision_mode, - /*use_calibration=*/false); + /*use_calibration=*/false, + /*use_implicit_batch=*/true, + /*use_explicit_precision=*/false); TF_EXPECT_OK(validator.IsTensorRTCandidate(matmul.operation.node())); - ExpectStatus( + EXPECT_THAT( validator.IsTensorRTCandidate(incompatible_matmul.operation.node()), - error::INVALID_ARGUMENT, - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions."); - ExpectStatus(validator.IsTensorRTCandidate(unsupported_op.operation.node()), - error::UNIMPLEMENTED, "Op type Erf is not supported"); - ExpectStatus(validator.IsTensorRTCandidate( - matmul_with_incompatible_input.operation.node()), - error::INTERNAL, - "Failed to convert input feed_1 to a TRT_TensorOrWeights"); + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("MatMul with 2D tensors requires explicit batch " + "mode, or that tensor A " + "is not transposed and B is a constant tensor."))); + EXPECT_THAT(validator.IsTensorRTCandidate(unsupported_op.operation.node()), + StatusIs(absl::StatusCode::kUnimplemented, + HasSubstr("Op type Erfc is not supported"))); + EXPECT_THAT(validator.IsTensorRTCandidate( + matmul_with_incompatible_input.operation.node()), + StatusIs(absl::StatusCode::kInternal, + HasSubstr("Failed to convert at least one input to a " + "TRT_TensorOrWeights:"))); if (precision_mode == TrtPrecisionMode::INT8) { TF_EXPECT_OK(validator.IsTensorRTCandidate(quantize.operation.node())); } else { - ExpectStatus(validator.IsTensorRTCandidate(quantize.operation.node()), - error::UNIMPLEMENTED, - "Op type FakeQuantWithMinMaxArgs is not supported"); + EXPECT_THAT( + validator.IsTensorRTCandidate(quantize.operation.node()), + StatusIs( + absl::StatusCode::kUnimplemented, + HasSubstr("Op type FakeQuantWithMinMaxArgs is not supported"))); } } } @@ -581,24 +496,21 @@ class ConverterTest : public ::testing::Test { ConverterTest() { Reset(); } void Reset() { - builder_.reset(nvinfer1::createInferBuilder(logger_)); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - const uint32_t flags = 0U; // Implicit Batch Mode - network_.reset(builder_->createNetworkV2(flags)); -#else - network_.reset(builder_->createNetwork()); -#endif // TRT >= 6 - converter_.reset(new Converter(network_.get(), TrtPrecisionMode::FP32, - /*use_calibration=*/false)); + GetOpConverterRegistry()->Clear("MyOp"); + GetOpConverterRegistry()->Clear("DummyOp"); + converter_ = + std::move(Converter::Create(TrtPrecisionMode::FP32, + /*use_calibration=*/false, &logger_, + /*use_implicit_batch=*/true, + /*engine_name=*/"TRTEngineOp_000_000", + /*use_explicit_precision=*/false) + 
.value()); weight_store_ = &converter_->weight_store_; } - void AddOpConverter(const string& op_name, OpConverter op_converter) { - converter_->op_registry_[op_name] = op_converter; - } - + // TODO(cbate): These should be removed or changed to public per black-box + // testing principle. // Below we expose private methods of Converter for testing. - Status MaybeUpdateBatchSize(int batch_size) { return converter_->MaybeUpdateBatchSize(batch_size); } @@ -621,10 +533,6 @@ class ConverterTest : public ::testing::Test { return converter_->GetWeightRange(weights, out_min, out_max); } - void PropagateQuantizationRanges() { - converter_->PropagateQuantizationRanges(); - } - int batch_size() const { return converter_->batch_size_; } std::unordered_map& quantization_ranges_proxy() { @@ -637,13 +545,6 @@ class ConverterTest : public ::testing::Test { private: Logger& logger_ = *Logger::GetLogger(); - // These members are ordered in a way such that the destruction order is: - // converter_ -> network_ -> builder_ - TrtUniquePtrType builder_; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config_; -#endif - TrtUniquePtrType network_; protected: std::unique_ptr converter_; @@ -652,7 +553,8 @@ class ConverterTest : public ::testing::Test { TEST_F(ConverterTest, ConvertNode) { ITensorProxyPtr output_tensors[2]; - auto op_converter = [&output_tensors](OpConverterParams* params) -> Status { + auto op_converter = + [&output_tensors](const OpConverterParams* params) -> Status { nvinfer1::Dims dims = params->inputs[0].tensor()->getDimensions(); for (int i = 0; i < 2; ++i) { dims.d[0] += 1; @@ -662,26 +564,33 @@ TEST_F(ConverterTest, ConvertNode) { return Status::OK(); }; NodeDef node_def = MakeNodeDef("my_op", "MyOp", {"my_input"}); - TF_EXPECT_OK(converter_->AddInputTensor( - "my_input", nvinfer1::DataType::kFLOAT, GetTestDims({123}), 1)); + + TF_ASSERT_OK(converter_->AddInputTensor( + "my_input", nvinfer1::DataType::kFLOAT, CreateDims({123}), 1)); // Converter not registered. - ExpectStatus(converter_->ConvertNode(node_def), error::UNIMPLEMENTED, - "No converter registered for op: MyOp"); + EXPECT_THAT(converter_->ConvertNode(node_def), + StatusIs(absl::StatusCode::kNotFound, + HasSubstr("No converter for op MyOp"))); // Register the converter and retry. 
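ConvertNode now resolves converters through a process-wide registry instead of the per-Converter op_registry_ map that the deleted AddOpConverter helper used to populate. A rough sketch of the registry surface exercised by these tests, where "MyOp" and my_stub_converter are illustrative only:

auto* registry = GetOpConverterRegistry();
// Attach a converter to an op name at a given priority.
registry->Register("MyOp", kDefaultConverterPriority, my_stub_converter);
// Look up the currently registered converter; the result is StatusOr-like.
auto active = registry->LookUp("MyOp");
ASSERT_TRUE(active.ok());
// Remove every registration for the op, as the fixtures' Reset() does.
registry->Clear("MyOp");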
- AddOpConverter("MyOp", op_converter); - TF_EXPECT_OK(converter_->ConvertNode(node_def)); + GetOpConverterRegistry()->Register("MyOp", kDefaultConverterPriority, + op_converter); + TF_ASSERT_OK(converter_->ConvertNode(node_def)); TRT_TensorOrWeights actual_output_1; TF_EXPECT_OK(GetTensorOrWeights("my_op", &actual_output_1)); - EXPECT_EQ(output_tensors[0]->simple_tensor(), actual_output_1.tensor()->simple_tensor()); + EXPECT_EQ(output_tensors[0]->simple_tensor(), + actual_output_1.tensor()->simple_tensor()); EXPECT_EQ(124, actual_output_1.tensor()->getDimensions().d[0]); TRT_TensorOrWeights actual_output_2; TF_EXPECT_OK(GetTensorOrWeights("my_op:1", &actual_output_2)); - EXPECT_EQ(output_tensors[1]->simple_tensor(), actual_output_2.tensor()->simple_tensor()); + EXPECT_EQ(output_tensors[1]->simple_tensor(), + actual_output_2.tensor()->simple_tensor()); EXPECT_EQ(125, actual_output_2.tensor()->getDimensions().d[0]); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, AddAndGetInputs) { @@ -693,24 +602,26 @@ TEST_F(ConverterTest, AddAndGetInputs) { node_def.add_input("weird_input:2:3:4:0"); TF_EXPECT_OK(converter_->AddInputTensor("input", nvinfer1::DataType::kFLOAT, - GetTestDims({1}), 1)); + CreateDims({1}), 1)); TF_EXPECT_OK(converter_->AddInputTensor("input:1", nvinfer1::DataType::kINT32, - GetTestDims({2, 3}), 1)); + CreateDims({2, 3}), 1)); TF_EXPECT_OK(converter_->AddInputTensor( - "weird_input:2:3:4", nvinfer1::DataType::kHALF, GetTestDims({5, 3}), 1)); + "weird_input:2:3:4", nvinfer1::DataType::kHALF, CreateDims({5, 3}), 1)); std::vector inputs; TF_EXPECT_OK(GetInputs(node_def, &inputs)); EXPECT_EQ(4, inputs.size()); - EXPECT_EQ(inputs[0].tensor()->simple_tensor(), inputs[1].tensor()->simple_tensor()); + EXPECT_EQ(inputs[0].tensor()->trt_tensor(), inputs[1].tensor()->trt_tensor()); EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType()); - ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()); - ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()); - ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()); + EXPECT_THAT(inputs[0].tensor()->getDimensions(), DimsAreArray({1})); + EXPECT_THAT(inputs[2].tensor()->getDimensions(), DimsAreArray({2, 3})); + EXPECT_THAT(inputs[3].tensor()->getDimensions(), DimsAreArray({5, 3})); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, RenameAndMarkOutputTensors) { @@ -720,7 +631,8 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { // Register a custom converter which shuffles the input. We use it to build a // TRT network whose output will be later marked. std::vector output_tensors; - auto op_converter = [&output_tensors](OpConverterParams* params) -> Status { + auto op_converter = + [&output_tensors](const OpConverterParams* params) -> Status { nvinfer1::Permutation perm; perm.order[0] = 1; perm.order[1] = 0; @@ -737,81 +649,96 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { params->outputs->emplace_back(output_weights); return Status::OK(); }; - AddOpConverter("MyOp", op_converter); + GetOpConverterRegistry()->Register("MyOp", kDefaultConverterPriority, + op_converter); // Run the conversion. 
NodeDef node_def = MakeNodeDef("my_op", "MyOp", {"my_input"}); TF_EXPECT_OK(converter_->AddInputTensor( - "my_input", nvinfer1::DataType::kFLOAT, GetTestDims({1, 2}), 1)); + "my_input", nvinfer1::DataType::kFLOAT, CreateDims({1, 2}), 1)); TF_EXPECT_OK(converter_->ConvertNode(node_def)); // Mark a weight as output, should fail. - ExpectStatus( + EXPECT_THAT( converter_->RenameAndMarkOutputTensors({{"my_op:2", "my_output"}}), - error::INVALID_ARGUMENT, "Output my_op:2 is weights not tensor"); + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Output my_op:2 is weights not tensor"))); // Mark tensors as output, should pass. TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors( {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}})); EXPECT_EQ(2, output_tensors.size()); for (auto output_tensor : output_tensors) { - ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions()); + EXPECT_THAT(output_tensor->getDimensions(), DimsAreArray({2, 1})); } EXPECT_EQ("my_output", string(output_tensors[0]->getName())); EXPECT_EQ("my_output_1", string(output_tensors[1]->getName())); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, TransposeTensor) { ITensorProxyPtr input_tensor = converter_->network()->addInput( - "", nvinfer1::DataType::kFLOAT, GetTestDims({2, 3, 5})); + "", nvinfer1::DataType::kFLOAT, CreateDims({2, 3, 5})); ITensorProxyPtr output_tensor = nullptr; - + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); // Rank doesn't match. - ExpectStatus( - converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor), - error::INVALID_ARGUMENT, - "Rank of perm for transpose does not match with that of the input"); + EXPECT_THAT(converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor, + dummy_node_def, "sub1"), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Rank of perm for transpose does not match " + "with that of the input"))); // Transpose at batch dimension. - ExpectStatus( - converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor), - error::UNIMPLEMENTED, "Transpose at batch dimension is not supported."); + EXPECT_THAT( + converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor, + dummy_node_def, "sub2"), + StatusIs(absl::StatusCode::kUnimplemented, + HasSubstr("Transpose at batch dimension is not supported."))); // OK. 
- TF_EXPECT_OK( - converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor)); - ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()); + TF_EXPECT_OK(converter_->TransposeTensor( + input_tensor, {0, 3, 1, 2}, &output_tensor, dummy_node_def, "sub3")); + EXPECT_THAT(output_tensor->getDimensions(), DimsAreArray({5, 2, 3})); + EXPECT_THAT( + converter_->network(), + LayerNamesAreArray({"TRTEngineOp_000_000/dummy_op-sub3:SHUFFLE"})); } void TestPrepareTensorForShape( const std::vector& input_dims, const std::vector& reshape_dims, const std::vector& expected_tensor_dims, bool input_is_tensor, Converter* converter, TrtWeightStore* weight_store, - error::Code expected_code = error::OK, + absl::StatusCode expected_code = absl::StatusCode::kOk, const char* expected_error_msg_substr = nullptr) { TRT_TensorOrWeights input; if (input_is_tensor) { input = TRT_TensorOrWeights(converter->network()->addInput( - "", nvinfer1::DataType::kFLOAT, GetTestDims(input_dims))); + "", nvinfer1::DataType::kFLOAT, CreateDims(input_dims))); } else { - input = TRT_TensorOrWeights(weight_store->GetTempWeights( - nvinfer1::DataType::kFLOAT, GetTestDims(input_dims))); + input = TRT_TensorOrWeights( + weight_store + ->GetTempWeights(nvinfer1::DataType::kFLOAT, CreateDims(input_dims)) + .value()); } ITensorProxyPtr output_tensor = nullptr; + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); for (bool validation_only : {false, true}) { - const Status status = converter->PrepareTensorForShape( - input, GetTestDims(reshape_dims), validation_only, &output_tensor); - if (expected_code == error::OK) { + const Status status = + PrepareTensorForShape(converter, input, DimsAdapter(reshape_dims), + validation_only, &output_tensor, dummy_node_def); + if (expected_code == absl::StatusCode::kOk) { TF_EXPECT_OK(status); if (validation_only) { EXPECT_EQ(nullptr, *output_tensor); } else { - ExpectTrtDimsEqualsArray(expected_tensor_dims, - output_tensor->getDimensions()); + EXPECT_THAT(output_tensor->getDimensions(), + DimsAreArray(expected_tensor_dims)); } } else { - ExpectStatus(status, expected_code, expected_error_msg_substr); + EXPECT_THAT(status, StatusIs(expected_code, + HasSubstr(expected_error_msg_substr))); } } } @@ -822,7 +749,8 @@ TEST_F(ConverterTest, PrepareTensorForShape) { Reset(); TestPrepareTensorForShape({2, 3, 5}, {2, 3, 6}, {}, input_is_tensor, converter_.get(), weight_store_, - error::INVALID_ARGUMENT, "Incompatible shapes"); + absl::StatusCode::kInvalidArgument, + "Incompatible shapes"); // Regular shape. 
Reset(); @@ -853,8 +781,10 @@ TEST_F(ConverterTest, PrepareTensorForShape) { Reset(); TestPrepareTensorForShape({2, 3, 5}, {-1, 2}, {15, 2}, /*input_is_tensor=*/false, converter_.get(), - weight_store_, error::INVALID_ARGUMENT, + weight_store_, absl::StatusCode::kInvalidArgument, "Shape is not fully defined"); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, MaybeUpdateBatchSize) { @@ -872,8 +802,11 @@ TEST_F(ConverterTest, MaybeUpdateBatchSize) { TF_EXPECT_OK(MaybeUpdateBatchSize(-1)); EXPECT_EQ(123, batch_size()); - ExpectStatus(MaybeUpdateBatchSize(124), error::INVALID_ARGUMENT, - "Provided batch size does not match converter batch size"); + EXPECT_THAT( + MaybeUpdateBatchSize(124), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr( + "Provided batch size does not match converter batch size"))); } TEST_F(ConverterTest, AddAndGetTensorOrWeights) { @@ -890,17 +823,19 @@ TEST_F(ConverterTest, AddAndGetTensorOrWeights) { EXPECT_EQ(123, added_tensor.batch_size()); // Add the same tensor again. - ExpectStatus(AddTensorOrWeights("my_tensor", tensor), error::ALREADY_EXISTS, - "tensor/weights my_tensor already exist"); + EXPECT_THAT(AddTensorOrWeights("my_tensor", tensor), + StatusIs(absl::StatusCode::kAlreadyExists, + HasSubstr("tensor/weights my_tensor already exist"))); } template void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) { - TRT_ShapedWeights weights = weight_store->GetTempWeights( - TfDataTypeToTrt(DataTypeToEnum::v()), GetTestDims({2, 3})); + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(DataTypeToEnum::v(), &trt_type)); + TRT_ShapedWeights weights = + weight_store->GetTempWeights(trt_type, CreateDims({2, 3})).value(); const std::vector values = {T(3), T(1), T(2), T(6), T(5), T(4)}; - memcpy(weights.GetValues(), values.data(), weights.size_bytes()); - + absl::c_copy(values, weights.GetPointer()); float out_min = 0.0f; float out_max = 0.0f; TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max)); @@ -916,7 +851,7 @@ TEST_F(ConverterTest, GetWeightRange) { TEST_F(ConverterTest, ProvideQuantizationRange) { ITensorProxyPtr simple_tensor; - // Assymetric range + // Asymmetric range converter_->ProvideQuantizationRange(&simple_tensor, 0.0f, 6.0f); EXPECT_EQ(6.0f, quantization_ranges_proxy()[&simple_tensor]); converter_->ProvideQuantizationRange(&simple_tensor, 1.0f, 6.0f); @@ -928,62 +863,27 @@ TEST_F(ConverterTest, ProvideQuantizationRange) { // Symmetric range converter_->ProvideQuantizationRange(&simple_tensor, -6.123f, 6.123f); EXPECT_EQ(6.123f, quantization_ranges_proxy()[&simple_tensor]); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { - // input -> infer1 -> infer2 -> infer3 - ITensorProxyPtr input, infer_1, infer_2, infer_3; + ITensorProxyPtr input; ITensorProxyPtr not_infer; - Converter int8_converter(/*trt_network=*/nullptr, TrtPrecisionMode::INT8, - /*use_calibration=*/true); - int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f); - int8_converter.ProvideQuantizationRange(¬_infer, -100.0f, 100.0f); - int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1); - int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2); - int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3); - - // Input range should be inferred along the chain and applied to tensors. 
- int8_converter.MaybeApplyQuantizationRanges(); -#if IS_TRT_VERSION_GE(8, 0, 0, 0) + Logger& logger = *Logger::GetLogger(); + auto int8_converter = Converter::Create(TrtPrecisionMode::INT8, + /*use_calibration=*/true, &logger, + /*use_implicit_batch=*/true, + /*engine_name=*/"") + .value(); + int8_converter->ProvideQuantizationRange(&input, -5.0f, 5.0f); + int8_converter->ProvideQuantizationRange(¬_infer, -100.0f, 100.0f); + + int8_converter->MaybeApplyQuantizationRanges(); EXPECT_EQ(input->getDynamicRangeMax(), 5.0f); - EXPECT_EQ(infer_1->getDynamicRangeMax(), 5.0f); - EXPECT_EQ(infer_2->getDynamicRangeMax(), 5.0f); - EXPECT_EQ(infer_3->getDynamicRangeMax(), 5.0f); EXPECT_EQ(not_infer->getDynamicRangeMax(), 100.0f); - EXPECT_EQ(input->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(infer_1->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(infer_2->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(infer_3->getDynamicRangeMin(), -5.0f); - EXPECT_EQ(not_infer->getDynamicRangeMin(), -100.0f); -#elif IS_TRT_VERSION_GE(5, 0, 0, 0) - EXPECT_EQ(input->getDynamicRange(), 5.0f); - EXPECT_EQ(infer_1->getDynamicRange(), 5.0f); - EXPECT_EQ(infer_2->getDynamicRange(), 5.0f); - EXPECT_EQ(infer_3->getDynamicRange(), 5.0f); - EXPECT_EQ(not_infer->getDynamicRange(), 100.0f); -#endif -} -TEST_F(ConverterTest, PropagateQuantizationRanges) { - // infer0 <-> infer1 <-> infer2 <-> infer3 - // | - // infer4 <-> infer5 - ITensorProxyPtr infer[6]; - ITensorProxyPtr not_infer; - converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f); - converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]); - converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]); - converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]); - converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]); - converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]); - - // Input range should be inferred along the chain. 
- PropagateQuantizationRanges(); - auto ranges = quantization_ranges_proxy(); - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(5.0f, ranges[&infer[i]]); - } - EXPECT_EQ(ranges.count(¬_infer), 0); + EXPECT_THAT(int8_converter->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, GetTrtBroadcastShape) { @@ -995,18 +895,19 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { const bool operand_2_is_tensor, const std::vector& expected_operand_1_shape, const std::vector& expected_operand_2_shape, - error::Code expected_code = error::OK, - const char* expected_error_msg_substr = nullptr, + absl::StatusCode expected_code = + absl::StatusCode::kOk, + const char* expected_error_msg_substr = "", const int operand_1_batch_size = -1, const int operand_2_batch_size = -1) { auto create_tensor_or_weights = [](const std::vector& shape, bool is_tensor, int batch_size = -1) { if (is_tensor) { - return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT, - GetTestDims(shape), batch_size}; + return TRT_TensorOrWeights(nvinfer1::DataType::kFLOAT, + CreateDims(shape), batch_size); } TRT_ShapedWeights weights; - weights.shape_ = GetTestDims(shape); + weights.Shape() = CreateDims(shape); return TRT_TensorOrWeights(weights); }; @@ -1017,28 +918,31 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { operand_2_shape, operand_2_is_tensor, operand_2_batch_size); // operand_1 broadcast operand_2 - ExpectStatus( + EXPECT_THAT( GetTrtBroadcastShape(operand_1, operand_2, /*check_feasibility=*/true, - &operand_1_new_dims, &operand_2_new_dims), - expected_code, expected_error_msg_substr); - if (expected_code == error::OK) { - ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); - ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + /*use_implicit_batch=*/true, &operand_1_new_dims, + &operand_2_new_dims), + StatusIs(expected_code, HasSubstr(expected_error_msg_substr))); + if (expected_code == absl::StatusCode::kOk) { + EXPECT_THAT(operand_1_new_dims, DimsAreArray(expected_operand_1_shape)); + EXPECT_THAT(operand_2_new_dims, DimsAreArray(expected_operand_2_shape)); } // operand_2 broadcast operand_1 - ExpectStatus( + EXPECT_THAT( GetTrtBroadcastShape(operand_2, operand_1, /*check_feasibility=*/true, - &operand_2_new_dims, &operand_1_new_dims), - expected_code, expected_error_msg_substr); - if (expected_code == error::OK) { - ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); - ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + /*use_implicit_batch=*/true, &operand_2_new_dims, + &operand_1_new_dims), + StatusIs(expected_code, HasSubstr(expected_error_msg_substr))); + if (expected_code == absl::StatusCode::kOk) { + EXPECT_THAT(operand_1_new_dims, DimsAreArray(expected_operand_1_shape)); + EXPECT_THAT(operand_2_new_dims, DimsAreArray(expected_operand_2_shape)); } }; // Both inputs are weights. symmetric_test( - {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT, + {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, + absl::StatusCode::kInvalidArgument, "Broadcasting requires at least one of the operands be tensors"); // One tensor and one weights. 
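Because use_implicit_batch is true in these cases, a tensor operand's TRT dims exclude the batch dimension while a weights operand keeps all of its dims, so the two inputs may legitimately differ in rank by one. A small sketch of the underlying call for the first mixed tensor/weights case in the next hunk (the operand objects here are hypothetical):

nvinfer1::Dims lhs_new_dims, rhs_new_dims;
// tensor_operand has TRT dims {1, 3, 4}; weights_operand has shape {1, 2, 1, 4}.
TF_EXPECT_OK(GetTrtBroadcastShape(tensor_operand, weights_operand,
                                  /*check_feasibility=*/true,
                                  /*use_implicit_batch=*/true,
                                  &lhs_new_dims, &rhs_new_dims));
// Expected result, as asserted by symmetric_test below: {1, 3, 4} and {2, 1, 4}.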
@@ -1054,51 +958,58 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4}, {2, 1, 4}); symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + absl::StatusCode::kInvalidArgument, + "Infeasible broadcast scheme"); symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, "Infeasible broadcast scheme", + absl::StatusCode::kInvalidArgument, + "Infeasible broadcast scheme", /*operand_1_batch_size=*/2); symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 4 vs broadcast #dims 5)"); symmetric_test({3}, {1, 1, 3}, kIsTensor, kIsNotTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 2 vs broadcast #dims 3)", /*operand_1_batch_size=*/2); // Both inputs are tensors. symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 3 vs broadcast #dims 4)"); symmetric_test({1, 3}, {3}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 2 vs broadcast #dims 3)"); symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4}, {2, 1, 4}); symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, + absl::StatusCode::kInvalidArgument, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 4 vs broadcast #dims 5)"); symmetric_test({2, 3}, {7, 5}, kIsTensor, kIsTensor, {}, {}, - error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + absl::StatusCode::kInvalidArgument, + "Infeasible broadcast scheme"); + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } TEST_F(ConverterTest, CreateConstantLayer) { for (auto dtype : {nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT32}) { TRT_ShapedWeights weights = - weight_store_->GetTempWeights(dtype, GetTestDims({2, 3, 5})); + weight_store_->GetTempWeights(dtype, CreateDims({2, 3, 5})).value(); ITensorProxyPtr tensor = - converter_->CreateConstantLayer(weights, GetTestDims({3, 10})); + converter_->CreateConstantLayer(weights, CreateDims({3, 10})); ASSERT_NE(nullptr, tensor->trt_tensor()); EXPECT_EQ(dtype, tensor->getType()) << "Expected " << DebugString(dtype) << " vs. actual " << DebugString(tensor->getType()); - ExpectTrtDimsEqualsArray({3, 10}, tensor->getDimensions()); + EXPECT_THAT(tensor->getDimensions(), DimsAreArray({3, 10})); } + + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); } class ConvertGraphDefToEngineTest : public ::testing::Test { @@ -1125,10 +1036,12 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { } // TODO(laigd): execute the engine and get outputs. 
return ConvertGraphDefToEngine( - gdef, TrtPrecisionMode::FP32, /*max_batch_size=*/1, + gdef, /*ctx=*/nullptr, TrtPrecisionMode::FP32, /*max_batch_size=*/1, /*max_workspace_size_bytes=*/64 << 20, input_shapes, &logger_, /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine_, - /*use_calibration=*/false, /*convert_successfully=*/nullptr); + /*use_calibration=*/false, /*use_implicit_batch=*/true, + /*convert_successfully=*/nullptr, /*profiles=*/nullptr, + "TRTEngineOp_000_000", /*use_explicit_precision=*/false); } protected: @@ -1154,258 +1067,409 @@ TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) { TF_EXPECT_OK(RunConvertGraphDefToEngine(&s)); } -// Input/output data format for OpConverterTest::BuildAndRun(). -struct InputOutputData { - void* Buffer() const { - return const_cast(tensor.tensor_data().data()); - } - - size_t TotalBytes() const { return tensor.TotalBytes(); } - - string name; - Tensor tensor; -}; - -template -Tensor ConstructTensor(int data_size, const T& value = T()) { - std::vector values(data_size, value); - return test::AsTensor(values); +// Returns a vector of shapes from a vector of input tensors. This can be used +// to create optimization profiles. +Status GetShapeFromDataVec(DataVec input_data, + std::vector* shape_vec) { + shape_vec->reserve(input_data.size()); + std::transform(input_data.begin(), input_data.end(), + std::back_inserter(*shape_vec), + [](InputOutputData x) { return x.tensor.shape(); }); + return Status::OK(); } -using DataVec = std::vector; - template inline absl::Span GetSpanForData(const InputOutputData& data) { const auto& tensor_map = data.tensor.flat(); return absl::Span(tensor_map.data(), tensor_map.size()); } +std::vector GetDataAsFloat(InputOutputData& data) { + const auto dType = data.tensor.dtype(); + if (dType == DT_FLOAT) { + auto span = GetSpanForData(data); + return std::vector(span.begin(), span.end()); + } + if (dType == DT_HALF) { + return CastVector(GetSpanForData(data)); + } + if (dType == DT_INT32) { + return CastVector(GetSpanForData(data)); + } +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + if (dType == DT_BOOL) { + return CastVector(GetSpanForData(data)); + } +#endif + LOG(FATAL) << "DataType not supported for testing " << DataTypeString(dType); + return {}; +} + // Class to test various op converters, using both a TrtNodeValidator and // Converter. class OpConverterTest : public ::testing::Test { public: - OpConverterTest() : scope_(Scope::NewRootScope()) { + OpConverterTest() + : tensor_buffer_allocator_(new GpuManagedAllocator()), + scope_(Scope::NewRootScope()) { QCHECK_EQ(0, cudaStreamCreate(&stream_)); Reset(); } - ~OpConverterTest() noexcept override { QCHECK_EQ(0, cudaStreamDestroy(stream_)); } + ~OpConverterTest() noexcept override { + QCHECK_EQ(0, cudaStreamDestroy(stream_)); + } Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output) { return converter_->GetTensorOrWeights(name, output); } - void Reset() { + void Reset(TrtPrecisionMode precision_mode_to_test = TrtPrecisionMode::FP32, + TrtTestMode trt_mode = TrtTestMode::kImplicitBatch, + OpKernelContext* ctx = nullptr) { + // Destroy existing TRT objects in a proper order. converter_.reset(nullptr); - - // Reset the INetworkDefinition. 
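// The Converter created below owns the TensorRT builder and network (see the
// new trt_builder_ / trt_network_ members in convert_nodes.h above), so the
// fixture only needs to release converter_ and engine_ here; the explicit
// builder_/network_ resets removed in this hunk are no longer required.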
engine_.reset(nullptr); - network_.reset(nullptr); - builder_.reset(nvinfer1::createInferBuilder(logger_)); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - builder_config_.reset(builder_->createBuilderConfig()); - builder_config_->setMaxWorkspaceSize(1 << 26); - if (precision_mode_to_test_ == TrtPrecisionMode::FP16) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (precision_mode_to_test_ == TrtPrecisionMode::INT8) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - builder_config_->setInt8Calibrator(nullptr); - } - const uint32_t flags = 0U; // Implicit Batch Mode - network_.reset(builder_->createNetworkV2(flags)); -#else - network_.reset(builder_->createNetwork()); - builder_->setMaxWorkspaceSize(1 << 26); -#endif // TRT >= 6 - // Reset the converter. - converter_.reset(new Converter(network_.get(), precision_mode_to_test_, - /*use_calibration=*/false)); + // Re-create them in proper order. + converter_ = + std::move(Converter::Create(precision_mode_to_test, + /*use_calibration=*/false, &logger_, + /*use_implicit_batch=*/trt_mode == + TrtTestMode::kImplicitBatch, + /*engine_name=*/"", + /*use_explicit_precision=*/false, ctx) + .value()); // Reset other related artifacts. scope_ = Scope::NewRootScope(); } + // Constructs a flat tensor with 'vals' in Unified Memory. + template + Tensor AsTensor(gtl::ArraySlice vals) { // non-absl ok + Tensor ret(tensor_buffer_allocator_.get(), DataTypeToEnum::value, + {static_cast(vals.size())}); + std::copy_n(vals.data(), vals.size(), ret.flat().data()); + return ret; + } + + // Constructs a tensor of "shape" with values "vals" in Unified Memory. + template + Tensor AsTensor(gtl::ArraySlice vals, // non-absl ok + const TensorShape& shape) { + Tensor ret(tensor_buffer_allocator_.get(), DataTypeToEnum::value, + {static_cast(vals.size())}); + CHECK(ret.CopyFrom(AsTensor(vals), shape)); + return ret; + } + + template + void transformTensor(const std::vector& vals, Tensor& ret) { + std::transform(vals.begin(), vals.end(), ret.flat().data(), + [](const T in_val) -> S { return static_cast(in_val); }); + } + + template + void transformWeights(const std::vector& vals, + TRT_ShapedWeights& weights) { + std::transform(vals.begin(), vals.end(), weights.GetPointer(), + [](const T in_val) -> S { return static_cast(in_val); }); + } + + // Constructs a tensor with given values (vals). The tensor type is defined by + // the tf_type argument, its shape is given by input_dims. The tensor is + // constructed using the allocator of OpConverterTest in Unified Memory. 
+ template + Tensor AsTensor(const std::vector& vals, + const std::vector& input_dims, DataType tf_type) { + Tensor ret(tensor_buffer_allocator_.get(), tf_type, + {static_cast(vals.size())}); + if (tf_type == DT_FLOAT) { + transformTensor(vals, ret); + } else if (tf_type == DT_HALF) { + transformTensor(vals, ret); + } else if (tf_type == DT_INT32) { + transformTensor(vals, ret); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + } else if (tf_type == DT_BOOL) { + transformTensor(vals, ret); +#endif + } else { + LOG(FATAL) << "Cannot create tensor with type " + << DataTypeString(tf_type); + } + TensorShape shape; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(input_dims, &shape)); + CHECK(ret.CopyFrom(ret, shape)); + return ret; + } + + template + Tensor AsTensor(const std::vector& vals, + const std::vector& input_dims, DataType tf_type) { + const auto& conv_vals = CastVector(vals); + return AsTensor(conv_vals, input_dims, tf_type); + } + + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value = T()) { + std::vector values(data_size, value); + return AsTensor(values); + } + + // Constructs a flat tensor in Unified Memory. + template + Tensor ConstructTensor(int data_size, const T& value, DataType tf_type) { + std::vector values(data_size, value); + return AsTensor(values, {data_size}, tf_type); + } + void CheckDataTypeMatches(const DataVec& datas) { + if (VLOG_IS_ON(2)) { + int nbBindings = engine_->getNbBindings(); + VLOG(2) << "Number of engine bindings: " << nbBindings; + for (int i = 0; i < nbBindings; i++) { + VLOG(2) << "Binding " << i << " name: " << engine_->getBindingName(i); + } + } for (const auto& data : datas) { + VLOG(2) << "Checking if data type matches for tensor " << data.name; const int input_index = engine_->getBindingIndex(data.name.c_str()); ASSERT_NE(-1, input_index); const nvinfer1::DataType trt_dtype = engine_->getBindingDataType(input_index); - const DataType tf_dtype = TrtDataTypeToTf(trt_dtype); - ASSERT_EQ(data.tensor.dtype(), tf_dtype) + DataType tf_type; + TF_ASSERT_OK(TrtTypeToTfType(trt_dtype, &tf_type)); + ASSERT_EQ(data.tensor.dtype(), tf_type) << DataTypeString(data.tensor.dtype()) << " vs. " - << DataTypeString(tf_dtype); + << DataTypeString(tf_type); } } - // TODO(laigd): test fp16 and int8 support for more converters. - void BuildAndRun(const DataVec& input_data, DataVec* output_data, - TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32, - const int batch_size = 1) { + Status BuildAndRun(const DataVec& input_data, DataVec* output_data, + const int batch_size = 1) { // Mark the output tensor as TRT engine output. 
std::vector output_info; for (const auto& data : *output_data) { - output_info.push_back( - {data.name, data.name, TfDataTypeToTrt(data.tensor.dtype())}); - } - TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info)); - - ASSERT_EQ(nullptr, engine_.get()); - builder_->setMaxBatchSize(batch_size); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - if (precision_mode == TrtPrecisionMode::FP16) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (precision_mode == TrtPrecisionMode::INT8) { - builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - builder_config_->setInt8Calibrator(nullptr); + nvinfer1::DataType trt_type; + TF_RETURN_IF_ERROR(TfTypeToTrtType(data.tensor.dtype(), &trt_type)); + output_info.push_back({data.name, data.name, trt_type}); } - engine_.reset( - builder_->buildEngineWithConfig(*converter_->network(), *builder_config_)); -#else + TF_RETURN_IF_ERROR(converter_->RenameAndMarkOutputTensors(output_info)); + // Build the TRT engine. - if (precision_mode == TrtPrecisionMode::FP16) { - builder_->setFp16Mode(true); - } else if (precision_mode == TrtPrecisionMode::INT8) { - // Setting FP16 mode as well allows TRT to also consider FP16 kernels and - // use them in situations where they are faster than INT8 or where INT8 is - // not supported for a given layer. - builder_->setFp16Mode(true); - builder_->setInt8Mode(true); + if (engine_.get() != nullptr) { + return errors::Internal("Engine already exists"); } - engine_.reset(builder_->buildCudaEngine(*converter_->network())); -#endif + TrtShapeOptimizationProfile profiles; + if (!converter_->use_implicit_batch()) { + std::vector input_mask(input_data.size()); + for (int i = 0; i < input_data.size(); i++) { + input_mask[i] = (input_data[i].tensor.dtype() != DataType::DT_RESOURCE); + } + profiles.SetInputMask(input_mask); + profiles.SetShapeTensorMask(converter_->network()); + TF_RETURN_IF_ERROR(profiles.CollectShapeValues(input_data)); + // Create a single optimization profile for explicit batch mode + std::vector input_shapes; + TF_RETURN_IF_ERROR(GetShapeFromDataVec(input_data, &input_shapes)); + profiles.AddShape(input_shapes); + std::vector input_partial_shapes; + TF_RETURN_IF_ERROR( + GetNetworkInputShapes(converter_->network(), &input_partial_shapes)); + profiles.InitProfiles(input_partial_shapes, ProfileStrategy::kRange); + } + TF_RETURN_IF_ERROR( + converter_->BuildCudaEngine(&engine_, + /*max_batch_size=*/batch_size, + /*max_workspace_size_bytes=*/1 << 26, + /*allocator=*/nullptr, + /*calibrator=*/nullptr, + /*profiles=*/&profiles)); CHECK_NOTNULL(engine_.get()); CheckDataTypeMatches(input_data); CheckDataTypeMatches(*output_data); - // Execute the TRT engine. 
const int num_bindings = input_data.size() + output_data->size(); std::vector buffers(num_bindings); - for (const auto& data : input_data) { - const int input_index = engine_->getBindingIndex(data.name.c_str()); - ASSERT_NE(-1, input_index); - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], data.TotalBytes())); - ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], data.Buffer(), - data.TotalBytes(), cudaMemcpyHostToDevice, - stream_)); + if (engine_->getNbBindings() != num_bindings) { + return errors::Internal("Number of bindings do not match"); } - struct SizeAndIndex { - SizeAndIndex(int in_size, int in_index) - : size(in_size), index(in_index) {} - int size; - int index; - }; - std::vector output_infos; - for (const auto& data : *output_data) { - const int output_index = engine_->getBindingIndex(data.name.c_str()); - ASSERT_NE(-1, output_index); - output_infos.emplace_back(data.TotalBytes(), output_index); - ASSERT_EQ(0, cudaMalloc(&buffers[output_index], data.TotalBytes())); - } - - ASSERT_EQ(engine_->getNbBindings(), num_bindings); + // Since we have only 1 optimization profile (which is enabled by default) + // it is fine to create execution context directly, instead of calling + // profiles.CreateExecutionContexts() TrtUniquePtrType execution_context( engine_->createExecutionContext()); - execution_context->enqueue(batch_size, buffers.data(), stream_, nullptr); - for (int i = 0; i < output_infos.size(); ++i) { - const auto& output_info = output_infos[i]; - ASSERT_EQ(0, cudaMemcpyAsync(output_data->at(i).Buffer(), - buffers[output_info.index], output_info.size, - cudaMemcpyDeviceToHost, stream_)); - } + // Prepare input bindings. + TF_RETURN_IF_ERROR( + SetTrtEngineInputs(engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, + profiles, nullptr, &input_data)); + // Prepare output bindings. + TF_RETURN_IF_ERROR(SetTrtEngineOutputs( + engine_.get(), execution_context.get(), 0, buffers, + converter_->use_implicit_batch(), batch_size, nullptr, output_data)); + // Execute the TRT engine. + TF_RETURN_IF_ERROR(TrtEnqueue(execution_context.get(), buffers, stream_, + converter_->use_implicit_batch(), + batch_size)); cudaStreamSynchronize(stream_); + return Status::OK(); + } + + // Adds ITensor for both validation and conversion, assuming explicit batch + // dimension is included in dims (ie for an NCHW tensor dims = {N, C, H, W}). + void AddTestTensorWithTFDims( + const string& name, const std::vector& dims, + nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT, + Status add_input_status = Status::OK()) { + DataType tf_type; + TF_ASSERT_OK(TrtTypeToTfType(trt_type, &tf_type)); + ops::Placeholder::Attrs attrs; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); + + auto input = ops::Placeholder(scope_.WithOpName(name), tf_type, attrs); + node_inputs_[name] = input.output; + + // Add a real ITensor for conversion conditionally. - for (int i = 0; i < num_bindings; ++i) { - ASSERT_EQ(0, cudaFree(buffers[i])); + auto dims_adap = + DimsAdapter::Create(attrs.shape_, converter_->use_implicit_batch()); + if (converter_->use_implicit_batch() && !dims_adap.ok()) { + ASSERT_EQ(add_input_status, dims_adap.status()); + return; + } else { + TF_EXPECT_OK(dims_adap.status()); + } + if (!converter_->use_implicit_batch() || dims_adap->IsStatic()) { + int batch_size = dims.size() > 0 ? 
dims[0] : 0; + Status status = converter_->AddInputTensor( + name, trt_type, dims_adap->AsTrtDims(), batch_size); + ASSERT_EQ(add_input_status, status); } } - bool HasStaticShape(const nvinfer1::Dims& dims) const { - if (dims.nbDims < 0) return false; - for (int i = 0; i < dims.nbDims; ++i) { - if (dims.d[i] < 0) return false; - } - return true; + Status AddTensorOrWeights(const string& name, TRT_TensorOrWeights input) { + return converter_->AddTensorOrWeights(name, input); } - // Add ITensor for both validation and conversion. + // Adds ITensor for both validation and conversion. The difference compared to + // AddTestTensorWithTFDims is in the meaning of the dims parameter. To define + // a tensor with NCHW shape, here we set dims = {C,H,W} and batch_size = N. + // TODO(tfeher) remove this function once all test are updated to use the + // other version of AddTestTensor (defined by + // ParameterizedOpConverterTestBase). void AddTestTensor( const string& name, const std::vector& dims, int batch_size = 1, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { - DataType tf_dtype = TrtDataTypeToTf(trt_dtype); - ops::Placeholder::Attrs attrs; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); - attrs.shape_.InsertDim(0, batch_size); - auto input = ops::Placeholder(scope_.WithOpName(name), tf_dtype, attrs); - node_inputs_[name] = input.output; - - // Add a real ITensor for conversion conditionally. - const nvinfer1::Dims trt_dims = GetTestDims(dims); - if (HasStaticShape(trt_dims)) { - TF_EXPECT_OK( - converter_->AddInputTensor(name, trt_dtype, trt_dims, batch_size)); + DimsAdapter adap(dims); + std::vector dims_vec; + TF_CHECK_OK(adap.Prepend(batch_size).Vector(&dims_vec)); + AddTestTensorWithTFDims(name, dims_vec, trt_dtype); + if (adap.IsStatic()) { ASSERT_EQ(batch_size, converter_->batch_size_); } } - // Add weights for both validation and conversion. - template + // Adds weights for both validation and conversion. The type of the weight is + // determined by tf_type. The initial value vector (values) can have any + // type (T) that can be statically casted to tf_type. + template void AddTestWeights(const string& name, const std::vector& dims, - const std::vector& values) { + const std::vector& values_inp, DataType tf_type, + bool fix_values = true) { + const DimsAdapter dims_adap(dims); + const int64_t num_elements = dims_adap.Volume(); + + std::vector values(values_inp); + if (num_elements != values.size()) { + if (fix_values) { + AdjustVectorByDims(values, num_elements, name, "AddTestWeights"); + } else { + FAIL() << "Unable to create test weights: " + << (num_elements > values.size() ? "not enough" : "to many") + << " values specified: " << values.size() << " vs. " + << num_elements << " defined by dims"; + } + } // Add weights for validation. - TensorShape shape; - TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &shape)); - Tensor t = test::AsTensor(values, shape); + Tensor t = AsTensor(values, dims, tf_type); node_inputs_[name] = ops::Const(scope_.WithOpName(name), t); // Add weights for conversion. 
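+    // The TF dtype is first mapped to the corresponding TRT type, then the
+    // values are copied into the converter's weight store via transformWeights.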
- const nvinfer1::DataType dtype = TfDataTypeToTrt(DataTypeToEnum::v()); - const nvinfer1::Dims trt_dims = GetTestDims(dims); - const int64_t num_elements = TrtWeightDimsNumElements(trt_dims); + nvinfer1::DataType dtype; + TF_ASSERT_OK(TfTypeToTrtType(tf_type, &dtype)); QCHECK_EQ(num_elements, values.size()) << num_elements << " vs " << values.size(); TRT_ShapedWeights weights(dtype); if (num_elements) { - weights = converter_->weight_store_.GetTempWeights(dtype, trt_dims); - QCHECK_EQ(weights.size_bytes(), sizeof(T) * values.size()) - << weights.size_bytes() << " vs " << sizeof(T) * values.size(); - memcpy(weights.GetValues(), values.data(), weights.size_bytes()); + weights = + converter_->weight_store_.GetTempWeights(dtype, dims_adap.AsTrtDims()) + .value(); + + if (tf_type == DT_FLOAT) { + transformWeights(values, weights); + } else if (tf_type == DT_HALF) { + transformWeights(values, weights); + } else if (tf_type == DT_INT32) { + transformWeights(values, weights); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + } else if (tf_type == DT_BOOL) { + transformWeights(values, weights); +#endif + } else { + LOG(FATAL) << "Cannot create tensor with type " + << DataTypeString(tf_type); + } } TF_EXPECT_OK( converter_->AddTensorOrWeights(name, TRT_TensorOrWeights{weights})); } + // Adds test weight without specifying tf_type arg. In this case the initial + // value type (T) will determine the type of the weights. + template + void AddTestWeights(const string& name, const std::vector& dims, + const std::vector& value, bool fix_values = true) { + AddTestWeights(name, dims, value, DataTypeToEnum::value, fix_values); + } + // Test validation in validation-only mode. - void RunValidation(const Node* node, error::Code expected_code = error::OK, - const char* expected_msg_substr = nullptr) { + Status RunValidation(const Node* node) { grappler::GrapplerItem item; TF_EXPECT_OK(scope_.ToGraphDef(&item.graph)); grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - TrtNodeValidator validator(graph_properties, precision_mode_to_test_, - /*use_calibration=*/false); - ExpectStatus(validator.IsTensorRTCandidate(node), expected_code, - expected_msg_substr); - } - - void RunConversion(const Node* node, error::Code expected_code = error::OK, - const char* expected_msg_substr = nullptr) { - ExpectStatus(converter_->ConvertNode(node->def()), expected_code, - expected_msg_substr); + TrtNodeValidator validator( + graph_properties, converter_->precision_mode(), + /*use_calibration=*/false, + /*use_implicit_batch=*/converter_->use_implicit_batch(), + /*use_explicit_precision=*/false); + return validator.IsTensorRTCandidate(node); + } + + void RunConversion(const Node* node, + absl::StatusCode expected_code = absl::StatusCode::kOk, + absl::string_view expected_msg_substr = "") { + EXPECT_THAT(converter_->ConvertNode(node->def()), + StatusIs(expected_code, HasSubstr(expected_msg_substr))); + if (expected_code == absl::StatusCode::kOk) { + EXPECT_THAT(converter_->network(), LayerNamesNonEmpty()); + } } // Helper method to run both validation and conversion, when the expected // output are same. 
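+  // The expected status code / message substring is checked against the
+  // validation result and, if validation succeeds, against the conversion
+  // result as well.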
- void RunValidationAndConversion(const NodeDef& node_def, - error::Code expected_code = error::OK, - const char* expected_msg_substr = nullptr, - bool should_run_conversion = true) { + void RunValidationAndConversion( + const NodeDef& node_def, + absl::StatusCode expected_code = absl::StatusCode::kOk, + absl::string_view expected_msg_substr = "", + bool should_run_conversion = true) { // Add the node to the graph. // TODO(laigd): we should accept a function that adds the node using // `scope_`, so individual test case can reuse the scope object and we don't @@ -1422,13 +1486,51 @@ class OpConverterTest : public ::testing::Test { graph->AddEdge(input.node(), input.index(), node, i); } - RunValidation(node, expected_code, expected_msg_substr); - if (should_run_conversion) { + status = RunValidation(node); + if (should_run_conversion && status.ok()) { RunConversion(node, expected_code, expected_msg_substr); + } else { + EXPECT_THAT(status, + StatusIs(expected_code, HasSubstr(expected_msg_substr))); + } + } + + // Helper method to run both validation and conversion, and check the output + // shapes. + void RunValidationAndConversion( + const NodeDef& node_def, const Status& status, + const std::string& output_name, + const std::vector>& exp_out_dims) { + RunValidationAndConversion(node_def, + static_cast(status.code()), + status.message(), true); + + if (status.ok()) { + // TODO(tfeher): Enable this check in explicit_batch_mode. + // In dynamic shape mode the output dims cannot be tested here. In that + // case we need to wait for the concrate input shapes to be defined (by + // setBindingDimensions before enqueue) before we can check the output + // dims. + if (converter_->use_implicit_batch()) { + for (int i = 0; i < exp_out_dims.size(); i++) { + TRT_TensorOrWeights output; + string name = i == 0 ? output_name : StrCat(output_name, ":", i); + TF_EXPECT_OK(GetTensorOrWeights(name.c_str(), &output)); + ASSERT_TRUE(output.is_tensor()); + if (!exp_out_dims[i].empty()) { + // Removing batch dim. + auto out_dims = std::vector(exp_out_dims[i].begin() + 1, + exp_out_dims[i].end()); + VLOG(2) << "Testing output shape for tensor " << name; + EXPECT_THAT(output.tensor()->getDimensions(), + DimsAreArray(out_dims)); + } + } + } } } - // Expose quantization_ranges_proxy for tests + // Expose quantization_ranges_ for tests std::unordered_map& quantization_ranges_proxy() { return converter_->quantization_ranges_proxy_; } @@ -1438,77 +1540,869 @@ class OpConverterTest : public ::testing::Test { return converter_->quantization_ranges_; } - void PropagateQuantizationRanges() { - converter_->PropagateQuantizationRanges(); + protected: + template + void AdjustVectorByDims(std::vector& values, size_t num_elements, + const string& name, const char* callingFunc) { + const auto old_size = values.size(); + if (num_elements > old_size) { + // Expending vector with 0's. + const std::vector zeros(num_elements - old_size, 0); + values.reserve(num_elements); + values.insert(values.end(), zeros.begin(), zeros.end()); + VLOG(2) << "In function " << callingFunc << " the vector '" << name + << "' was extended by " << num_elements - old_size << " zeros"; + } else { + // Removing unnecessary elements. + values.resize(num_elements); + VLOG(2) << "Only first " << num_elements << " out of " << old_size + << " elements of the vector '" << name + << "' will be used in function" << callingFunc; + } } + + public: std::unique_ptr converter_; protected: - // TODO(laigd): parameterize the test and make the precision mode a parameter. 
- TrtPrecisionMode precision_mode_to_test_ = TrtPrecisionMode::FP32; + Logger& logger_ = *Logger::GetLogger(); private: - Logger& logger_ = *Logger::GetLogger(); - TrtUniquePtrType builder_; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config_; -#endif - TrtUniquePtrType network_; TrtUniquePtrType engine_; cudaStream_t stream_; - // Used to create placeholders with shape and data type information. The - // created placeholders will be used as inputs to the node to be verified, - // thus we need the shape and data type information to get a non-empty - // GraphProperties. + std::unique_ptr tensor_buffer_allocator_; + + public: + // The scope that contains the graph being converted. Because + // tensor_buffer_allocator_ provides the storage for tensor contents that are + // represented as attributes for graph nodes within scope_, + // tensor_buffer_allocator_ needs to be available when destructing scope_. + // Therefore, scope_ comes after tensor_buffer_allocator_ in the class member + // field list. Scope scope_; + + protected: std::unordered_map node_inputs_; }; -template -void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { - out->Clear(); - if (tensor.NumElements() == 0) return; - - // TensorProto does not need to have all the elements present and can truncate - // trailing elements with the same value for compressed representation. Such - // elements are derived based on the tensor shape. - const auto flat = tensor.flat(); - int64 last_index = 0; - for (int64 i = 0; i < tensor.NumElements(); ++i) { - if (flat(i) != flat(last_index)) { - last_index = i; - } +// Extends the OpConverterTest for variable converters which require a properly +// setup context. +class VariableOpConverterTest : public OpConverterTest { + public: + void Reset(TrtPrecisionMode precision_mode_to_test = TrtPrecisionMode::FP32, + TrtTestMode trt_mode = TrtTestMode::kImplicitBatch) { + OpConverterTest::Reset(precision_mode_to_test, trt_mode, context_.get()); } - int num_out_elements = last_index + 1; - out->Reserve(num_out_elements); - out->AddNAlreadyReserved(num_out_elements); - const T* src = flat.data(); - T* dst = out->mutable_data(); - std::copy(src, src + num_out_elements, dst); -} + void CreateContext(const NodeDef& node_def, OpKernel** kernel, + OpKernelContext** context) { + std::unique_ptr device_( + DeviceFactory::NewDevice("GPU", {}, "/job:a/replica:0/task:0")); + Device* device_ptr = device_.get(); -template -void TestConvertConst(OpConverterTest* test) { - NodeDef node_def; - node_def.set_name("my_const"); - node_def.set_op("Const"); + device_mgr_ = std::make_unique(std::move(device_)); - auto reset_and_test = [&node_def, test]( - const Tensor& tensor, const bool as_tensor_content, - const std::vector& expected_dims, - const std::vector& expected_value) { - test->Reset(); + managed_allocator_ = std::make_unique(); + Allocator* allocator = managed_allocator_.get(); + step_container_ = + std::make_unique(0, [](const string&) {}); + slice_reader_cache_wrapper_ = + std::make_unique(); - TensorProto* tensor_attr = - (*node_def.mutable_attr())["value"].mutable_tensor(); - tensor_attr->Clear(); + flib_def_ = std::make_unique( + OpRegistry::Global(), FunctionDefLibrary()); - if (as_tensor_content) { - tensor.AsProtoTensorContent(tensor_attr); - } else { - tensor.shape().AsProto(tensor_attr->mutable_tensor_shape()); - tensor_attr->set_dtype(tensor.dtype()); + thread_pool_ = + std::make_unique(Env::Default(), "default", + /*num_threads=*/1); + pflr_ = std::make_unique( + 
device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + thread_pool_.get()); + + FunctionLibraryRuntime* flib = pflr_->GetFLR(device_ptr->name()); + ResourceMgr* resource_mgr = device_ptr->resource_manager(); + + TF_CHECK_OK(NodeProperties::CreateFromNodeDef( + node_def, OpRegistry::Global(), &props_)); + + OpKernel* kernel_ptr = nullptr; + TF_CHECK_OK(CreateOpKernel(DEVICE_GPU, device_ptr, allocator, flib, + resource_mgr, props_, TF_GRAPH_DEF_VERSION, + &kernel_ptr)); + op_kernel_ = std::unique_ptr(kernel_ptr); + + auto* dev_info = device_ptr->tensorflow_accelerator_device_info(); + CHECK_NOTNULL(dev_info); + DeviceContext* device_context = dev_info->default_context; + + // Note: this setup is not exhaustive. + params_.device = device_ptr; + params_.op_kernel = op_kernel_.get(); + params_.resource_manager = resource_mgr; + params_.frame_iter = FrameAndIter(0, 0); + params_.inputs = inputs_; + params_.step_container = step_container_.get(); + params_.function_library = flib; + params_.slice_reader_cache = slice_reader_cache_wrapper_.get(); + params_.op_device_context = device_context; + + context_ = std::make_unique(¶ms_); + + // Outputs. + *kernel = op_kernel_.get(); + *context = context_.get(); + } + + // Adds resource for resource variable op converters. + void AddTestResource(const string& name, const ResourceHandle& resource) { + // Add resource for validation. + node_inputs_[name] = + ops::Placeholder(scope_.WithOpName("my_handle"), DT_RESOURCE); + + // Add resource for conversion. + TF_EXPECT_OK(AddTensorOrWeights(name, TRT_TensorOrWeights{resource})); + } + + private: + // The following pointers manage the kernel context. + std::unique_ptr device_mgr_; + std::unique_ptr managed_allocator_; + std::unique_ptr step_container_; + std::unique_ptr + slice_reader_cache_wrapper_; + std::unique_ptr flib_def_; + std::unique_ptr thread_pool_; + std::unique_ptr pflr_; + OpKernelContext::Params params_; + std::unique_ptr op_kernel_; + std::unique_ptr context_; + std::shared_ptr props_; + absl::InlinedVector inputs_; +}; + +// General test parameters to be used with ops that take a single input tensor. +struct TestParamBase { + // Concrete input dimensions for the test (including the batch dim) + std::vector input_dims; + + // Dimensions to define an input with PartialTensorShape. This can be used to + // define networks with dynamic input shape. It can be left empty, in that + // case AddTestTensor sets partial shapes that are appropriate to TrtTestMode. + std::vector partial_input_dims; + + // Concrete (static) output dimensions, including batch size as first dim + std::vector expected_output_dims; + + // Parameter vector, has converter specific meaning. + std::vector param; + + // Expected status of conversion (with concrete error message) + Status status; + + // Expected status of BuildAndRun + Status runtime_status; +}; + +std::ostream& operator<<(std::ostream& os, const TestParamBase& p) { + os << "input_dims" << PrintToString(p.input_dims); + if (!p.partial_input_dims.empty()) { + os << ", partial_input_dims" << PrintToString(p.partial_input_dims); + } + if (!p.expected_output_dims.empty()) { + os << ", exp_out_dims" << PrintToString(p.expected_output_dims); + } + if (!p.param.empty()) { + os << ", param" << PrintToString(p.param); + } + os << ", " << p.status; + return os; +} + +// Printing vector with the numbers of type T which defines tensor or shape. 
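+// For example (illustrative), get_debug_string_for_vector({1, 2, 3},
+// "Using partial_shape for", "input") yields
+// "Using partial_shape for 'input': Dims(nbDims=3, d=1,2,3)".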
+template +const std::string get_debug_string_for_vector(const std::vector& vector, + absl::string_view pComment, + absl::string_view name, + absl::string_view type = "") { + const std::string t1 = absl::StrCat(pComment, " '", name, "': Dims(nbDims="); + const std::string t2 = absl::StrJoin(vector, ","); + const std::string t3 = type != "" ? absl::StrCat(") of type ", type) : ")"; + std::stringstream stream; + stream << t1 << vector.size() << ", d=" << t2 << t3; + return stream.str(); +} + +// Parameterized version of OpConverterTest. We have the following parameters: +// 1. TrtTestMode: implicit batch, explicit batch, dynamic shape modes +// 2. DataType of the input TF tensors: DT_FLOAT, DT_HALF, DT_INT32 +// 3. TrtPrecisionMode argument for the Converter: FP32, FP16, INT8 +// We will introduce subclasses that will be instantiated using different +// combinations of the DataType and TrtPrecisionMode parameters. +class ParameterizedOpConverterTestBase + : public OpConverterTest, + public ::testing::WithParamInterface< + std::tuple> { + public: + ParameterizedOpConverterTestBase() + : trt_mode_(std::get<0>(GetParam())), + tf_type_(std::get<1>(GetParam())), + converter_precision_(std::get<2>(GetParam())) { + LOG(INFO) << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"; + LOG(INFO) << "tf_type_: " << DebugString(tf_type_); + LOG(INFO) << "trt_mode_: " << DebugString(trt_mode_); + LOG(INFO) << "converter_precision_: " << DebugString(converter_precision_); + LOG(INFO) << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"; + } + + void Reset() { + OpConverterTest::Reset(converter_precision_, trt_mode_); + input_data_.clear(); + } + + void Reset(TrtPrecisionMode precision) { + OpConverterTest::Reset(precision, trt_mode_); + input_data_.clear(); + } + + // Getters of protected attributes + DataType get_tf_type() { return tf_type_; } + TrtTestMode get_trt_mode() { return trt_mode_; } + TrtPrecisionMode get_converter_precision() { return converter_precision_; } + + // Adds an input ITensor for TRT network. Also creates the corresponding TF + // tensor, and stores it in the list of inputs (input_data_). + // + // The TF tensor is always created with concrete static input shape given by + // dims. The ITensor can have static or dynamic shape based on the trt_mode + // attribute. The ITensor shape is set automatically according to the trt_mode + // parameter, unless the user overrides it with an explicit + // partial_input_shape_dims argument. + // + // Parameters: + // - name of the input node + // - dims actual dimensions of the tensor that we will use during the test + // (including explicit batch dim) + // - values initial values for the TF tensor + // - dtype data type of the tensor + // - partial_input_shape dimensions which can include unknown shapes. This can + // be empty, in that case the partial_input_shape will be set automatically + // depending on the trt_mode argument. (This argument also includes explicit + // batch dim). + // - add_input_status adding ITensor to the network can fail in implicit batch + // mode if the batch size is inconsistent. Using the add_input_status arg we + // can test such errors. 
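+  // Example (illustrative): AddTestTensor("input", {1, 1, 2, 3}, DT_FLOAT,
+  // {1, 2, 3, 4, 5, 6}) creates a [1,1,2,3] TF tensor for the test run and an
+  // ITensor whose shape is static or fully unknown depending on trt_mode_.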
+ // + template + void AddTestTensor(const string& name, const std::vector& dims, + DataType tf_type, const std::vector& values_inp, + const std::vector& partial_input_shape_dims = {}, + Status add_input_status = Status::OK(), + bool fix_values = true) { + std::vector values(values_inp); + VLOG(2) << "**** AddTestTensor for " << name + << " ***** dims empty() = " << dims.empty() + << " tf_type = " << DebugString(tf_type); + if (!dims.empty()) { + const auto num_elements = std::accumulate( + std::begin(dims), std::end(dims), 1, std::multiplies()); + if (!values.empty() && num_elements != values.size()) { + if (fix_values) { + AdjustVectorByDims(values, num_elements, name, "AddTestTensor"); + } else { + // Note: for conversion only tests, it is valid to have empty values, + // otherwise the number of elements should match. + LOG(WARNING) << "Expected Test Tensor Shape: " << DebugString(dims) + << ", Received Input Tensor: " << DebugString(values); + } + } + } + + std::vector partial_shape; + if (!partial_input_shape_dims.empty()) { + partial_shape = partial_input_shape_dims; + } else { + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // In dynamic shape mode we make all dims unknown. + partial_shape = std::vector(dims.size(), -1); + } else { + // Use static (known) input shapes. + partial_shape = dims; + } + if (VLOG_IS_ON(2)) { + VLOG(2) << get_debug_string_for_vector(partial_shape, + "Using partial_shape for", name); + } + } + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(tf_type, &trt_type)); + AddTestTensorWithTFDims(name, partial_shape, trt_type, add_input_status); + if (!values.empty()) { + if (VLOG_IS_ON(2)) { + VLOG(2) << get_debug_string_for_vector(values, "Adding test tensor for", + name, DataTypeString(tf_type)); + } + InputOutputData data{name, AsTensor(values, dims, tf_type)}; + VLOG(2) << "Added tensor: " << data.name << " with dtype " + << DataTypeString(data.tensor.dtype()); + input_data_.push_back(data); + } + } + + // Adds test tensor (same as above) but with the default tf_type defined by + // the test params. + template + void AddTestTensor(const string& name, const std::vector& dims, + const std::vector& values = {}, + const std::vector& partial_input_shape_dims = {}) { + AddTestTensor(name, dims, tf_type_, values, partial_input_shape_dims); + } + + // Builds and runs the converted network. Checks output tensor shape. Tests + // output values using a matcher. The network can have multiple input and + // output tensors. The inputs are defined by the input_data_ member variable. + void BuildAndRun(const string& name, + const std::vector>& expected_output_dims, + const Status& expected_runtime_status, + const std::vector>>& matcher, + const std::vector& out_tf_types = {}) { + TensorShape shape; + const int n_output = expected_output_dims.size(); + ASSERT_EQ(n_output, matcher.size()); + DataVec output_data; + for (int i = 0; i < n_output; i++) { + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + string out_name = (i == 0) ? name : StrCat(name, ":", i); + DataType out_tf_type = + out_tf_types.size() > i ? out_tf_types[i] : tf_type_; + InputOutputData data{ + out_name, ConstructTensor(shape.num_elements(), 0, out_tf_type)}; + output_data.push_back(data); + } + const int batch_size = + input_data_.empty() || + TensorShapeUtils::IsScalar(input_data_[0].tensor.shape()) + ? 
1 + : input_data_[0].tensor.shape().dim_size(0); + Status stat = + OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); + ASSERT_EQ(expected_runtime_status.ok(), stat.ok()) + << "expected status: " << expected_runtime_status + << ", actual status: " << stat; + if (expected_runtime_status.ok() && stat.ok()) { + for (int i = 0; i < n_output; i++) { + // Check the shape of the actual output tensors + TF_EXPECT_OK( + TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); + EXPECT_TRUE(output_data[i].tensor.shape() == shape) + << "Expected shape: " << shape.DebugString() << ", actual shape: " + << output_data[i].tensor.shape().DebugString(); + EXPECT_THAT(GetDataAsFloat(output_data[i]), matcher[i]); + } + } + } + + // Runs validation and conversion. If conversion is successfull then builds + // the TRT network, executes it and checks the output. Handles multiple output + // tensors. + void TestOpConverterMultiOut( + const NodeDef& node_def, + const std::vector>& expected_output_dims, + const Status& expected_conversion_status, + const Status& expected_runtime_status, + const std::vector>>& matcher, + const std::vector& out_tf_type = {}) { + const auto& name = node_def.name(); + RunValidationAndConversion(node_def, expected_conversion_status, name, + expected_output_dims); + if (expected_conversion_status.ok()) { + BuildAndRun(name, expected_output_dims, expected_runtime_status, matcher, + out_tf_type); + } + } + + // Runs validation and conversion. If conversion is successfull then builds + // the TRT network, executes it and checks the output. + void TestOpConverter(const NodeDef& node_def, + const std::vector& expected_output_dims, + const Status& expected_conversion_status, + const Status& expected_runtime_status, + const Matcher>& matcher, + const std::vector& out_tf_types = {}) { + TestOpConverterMultiOut( + node_def, std::vector>({expected_output_dims}), + expected_conversion_status, expected_runtime_status, + std::vector>>({matcher}), out_tf_types); + } + + protected: + const TrtTestMode trt_mode_; + const DataType tf_type_; + const TrtPrecisionMode converter_precision_; + DataVec input_data_; +}; + +template +class OpConverter_UnaryTest : public ParameterizedOpConverterTestBase { + public: + template + void RunTests( + const string& testName, const OperationMap& map, + std::map, T (*)(T)>>& op_map, + const std::vector input_values, const std::string input_name = "input", + float max_abs_error = 0.0001, bool nan_sensitive = true) { + // Prepare test parameters. + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; + + // Get list of ops to test. + std::vector ops_to_test; + for (auto& pair : map) { + ops_to_test.push_back(pair.first); + } + + for (const string& op_name : ops_to_test) { + SCOPED_TRACE(op_name); + if (!op_map.count(op_name)) { + FAIL() << testName << " op test map does not contain op " << op_name; + } + + const DataType tf_type = get_tf_type(); + const NodeDef& node = op_map[op_name].first(tf_type); + runExpectedToFailTest(node, input_name, input_values, op_name); + + Status conv_status = Status::OK(); + if (trt_mode_ == TrtTestMode::kImplicitBatch && + (op_name == "Sign" || op_name == "Round" || + op_name == "LogicalNot")) { + const auto& err = + convert_not_supported_implicit(op_name, node.name(), "Unary"); + conv_status = errors::Unimplemented(err); + } + + Reset(); + const DataType input_tf_type = op_name == "Cast" ? 
DT_HALF : tf_type; + const DataType output_tf_type = op_name == "Cast" ? DT_FLOAT : tf_type; + + AddTestTensor("input", p.input_dims, input_tf_type, input_values); + + std::vector output; + std::transform(input_values.begin(), input_values.end(), + std::back_inserter(output), op_map[op_name].second); + + TestOpConverter(node, p.expected_output_dims, conv_status, Status::OK(), + ArrayFloatNear(output, max_abs_error, nan_sensitive), + {output_tf_type}); + } + } + void runExpectedToFailTest(const NodeDef& node_def, + const std::string& input_name, + const std::vector& input_values, + const std::string& op_name) { + // Input is weights, should fail. + Reset(); + std::string error = + "The input \"" + input_name + "\" for " + op_name + " must be a tensor"; + AddTestWeights("input", {1, 2, 3}, input_values, get_tf_type()); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + error); + + // Input has 0 dimensions, should fail. + Reset(); + std::vector dims{}; + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + dims = {1}; + } + error = "At least 1 dimension is required for UNARY operation '" + op_name + + "'"; + AddTestTensor("input", dims); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + error); + } +}; + +template +class OpConverter_BinaryTest : public ParameterizedOpConverterTestBase { + public: + template + void RunTests( + const OperationMap& map, + std::map, std::vector>>& + op_test_info, + const std::vector>& data) { + const std::vector bool_types{DT_BOOL}, default_types{}; + std::vector logical_ops{"Greater", "Less", "Equal"}; + std::vector combined_ops{"GreaterEqual", "LessEqual"}; + const DataType tf_type = get_tf_type(); + AttrValue dtype; + dtype.set_type(tf_type); + std::map nodes; + for (const auto op_name : combined_ops) { + nodes[op_name] = MakeNodeDef("my_binary", op_name, {"input1", "input2"}, + {{"T", dtype}}); + } + + for (auto& iter : map) { + const string& op_name = iter.first; + if (!op_test_info.count(op_name)) { + FAIL() << "Binary op test map does not contain op " << op_name; + } + const auto comb_op = find_name(op_name, combined_ops); + const auto& node_def = + comb_op ? nodes[op_name] : op_test_info[op_name].first(tf_type); + + for (const bool operand_1_is_tensor : {true, false}) { + for (const bool operand_2_is_tensor : {true, false}) { + SCOPED_TRACE(StrCat(op_name, "_", operand_1_is_tensor ? "T" : "W", + operand_2_is_tensor ? 
"T" : "W")); + Reset(); + if (!operand_1_is_tensor && !operand_2_is_tensor) { + // In that case the only test which should be launched is in + // runExpectedToFailTest + runExpectedToFailTest(op_name, node_def); + continue; + } + + const bool logical_op = comb_op || find_name(op_name, logical_ops); + auto conv_status = Status::OK(); + if (tf_type == DT_BOOL || logical_op) { + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + conv_status = + errors::Unimplemented(convert_not_supported_implicit( + op_name, node_def.name(), "Binary")); + } else if (!logical_op && + (!operand_1_is_tensor || !operand_2_is_tensor)) { + conv_status = errors::InvalidArgument( + "Both inputs of '", op_name, "' are expected to be tensors"); + } + } + + if (operand_1_is_tensor) { + AddTestTensor("input1", {2, 1, 2}, data[0]); + } else { + AddTestWeights("input1", {1, 2}, data[1], tf_type); + } + if (operand_2_is_tensor) { + AddTestTensor("input2", {2, 2, 1}, data[2]); + } else { + AddTestWeights("input2", {2, 1}, data[3], tf_type); + } + + TestOpConverter(node_def, {2, 2, 2}, conv_status, Status::OK(), + ElementsAreArray(op_test_info[op_name].second), + logical_op ? bool_types : default_types); + } + } + } + } + + void runExpectedToFailTest(const std::string& op_name, const NodeDef& node) { + AddTestWeights("input1", {1}, {1}, tf_type_); + AddTestWeights("input2", {1}, {1}, tf_type_); + const string error = + "Constant folding is falled back to TensorFlow, " + "binary op '" + + op_name + "' received both input as constant"; + RunValidationAndConversion(node, absl::StatusCode::kUnimplemented, error); + } +}; + +// Op converter test in FP32 mode. While for debugging purposes it might make +// sense to run over all possible combinations, normally a subset of them +// would be sufficient: +// - All valid options to TrtTestMode (implicit, explicit, dynamic shape) +// - DataType: is the TF data type of the input tensors. This usually only +// influences the data type added by Converter::AddInputTensor. We test the +// valid combinations of input data types in AddAndGetInputs, therefore +// for most of the OpConverterTest its is sufficient to test for DT_FLOAT. +// - TrtPrecisionMode: valid options are FP32, FP16 and INT8. This influences +// how TRT handles the precision inside the TRT network, but should not matter +// for the TF -> TRT conversion. Therefore it should be sufficient to test +// for FP32. +typedef ParameterizedOpConverterTestBase OpConverter_FP32_Test; +// Base class for tests that need to be tested for both FP32 and FP16. 
+typedef ParameterizedOpConverterTestBase OpConverter_FP32_FP16_Test; +// Base class for Binary tests that need to be tested +typedef OpConverter_BinaryTest OpConverter_FP32_FP16_BinaryTest; +typedef OpConverter_BinaryTest OpConverter_BOOL_BinaryTest; +// Base class for tests that need to be tested for FP32, FP16, and INT32 +typedef ParameterizedOpConverterTestBase OpConverter_FP32_FP16_INT32_Test; +// Base class for tests that need to be tested for INT32 +typedef ParameterizedOpConverterTestBase OpConverter_INT32_Test; +// Base class for Unary tests that need to be tested +typedef OpConverter_UnaryTest OpConverter_FP32_UnaryTest; +typedef OpConverter_UnaryTest OpConverter_BOOL_Test; + +// Instantiate parameter combinations to OpConverter__Test +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_FP16_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_FP16_INT32_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF, DT_INT32), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_INT32_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_INT32), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_UnaryTest, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_BOOL_Test, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_BOOL), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_FP32_FP16_BinaryTest, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF), + ::testing::Values(TrtPrecisionMode::FP32))); + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_BOOL_BinaryTest, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_BOOL), + ::testing::Values(TrtPrecisionMode::FP32))); + +template +void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { + out->Clear(); + if (tensor.NumElements() == 0) return; + + // TensorProto does not need to have all the elements present and can truncate + // trailing elements with the same value for compressed representation. Such + // elements are derived based on the tensor shape. 
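+  // For example, a float tensor of shape {2, 3} holding {2, 2, 1, 1, 1, 1}
+  // only needs float_val = {2, 2, 1}; the trailing repeated 1s are implied by
+  // the shape.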
+ const auto flat = tensor.flat(); + int64 last_index = 0; + for (int64 i = 0; i < tensor.NumElements(); ++i) { + if (flat(i) != flat(last_index)) { + last_index = i; + } + } + + int num_out_elements = last_index + 1; + out->Reserve(num_out_elements); + out->AddNAlreadyReserved(num_out_elements); + const T* src = flat.data(); + T* dst = out->mutable_data(); + std::copy(src, src + num_out_elements, dst); +} + +template +void TestConvertVariableV2(VariableOpConverterTest* test) { + struct TestParam { + string container; + string shared_name; + std::vector dims; + float epsilon; + Status conversion_status; + }; + + std::vector test_param = { + {"", "var0", {}, 0.001, Status::OK()}, + {"", "var0", {64}, 0.001, Status::OK()}, + {"", "var0", {8, 16}, 0.001, Status::OK()}, + {"box", "var", {8, 16}, 0.001, Status::OK()}}; + for (auto p : test_param) { + // Create node definition. + NodeDef node_def; + std::vector dims_64(p.dims.begin(), p.dims.end()); + TensorShape shape = TensorShape(absl::Span(dims_64)); + TF_CHECK_OK(NodeDefBuilder("my_var", "VariableV2") + .Attr("dtype", dtype) + .Attr("shape", shape) + .Attr("container", p.container) + .Attr("shared_name", p.shared_name) + .Finalize(&node_def)); + + OpKernel* kernel; + OpKernelContext* context; + test->CreateContext(node_def, &kernel, &context); + + test->Reset(TrtPrecisionMode::FP32, TrtTestMode::kDynamicShape); + + // Set the value of the variable according to p.dims. + int var_size = std::accumulate(p.dims.begin(), p.dims.end(), 1, + std::multiplies()); + std::vector expected_value; + expected_value.reserve(var_size); + for (int i = 0; i < var_size; i++) { + expected_value.push_back((CType)i); + } + + // To set the variable, we get the tensor by executing the VariableV2 op + // rather than creating the resource directly in the manager, because: + // 1) LegacyVar defined in `variable_ops.cc` is not accessible. + // 2) Tensor::set_shape is private, VariableOp is a friend class. + kernel->Compute(context); + Tensor* tensor_ptr = context->mutable_output(0); + CHECK_NOTNULL(tensor_ptr); + // We allocate the tensor in the temporary memory. Note that creating a + // tensor in this scope and sharing the underlying storage by copy would + // lead to double destruction. + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + OP_REQUIRES_OK(context, + context->allocate_temp(dtype, shape, tensor_ptr, attr)); + // The tensor is allocated on GPU. We copy the values from the CPU. 
+ auto tensor_flat = tensor_ptr->flat(); + CHECK_NOTNULL(tensor_flat.data()); + auto ret = cudaMemcpy(tensor_flat.data(), expected_value.data(), + expected_value.size() * sizeof(CType), + cudaMemcpyHostToDevice); + CHECK_EQ(ret, 0); + + test->RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_var", &output)); + EXPECT_THAT(output.weights(), + ShapedWeightsHasDimsAndValues(p.dims, expected_value)); + } +} + +TEST_F(VariableOpConverterTest, ConvertVariableV2) { + TestConvertVariableV2(this); + TestConvertVariableV2(this); +} + +template +void TestConvertReadVariableOp(VariableOpConverterTest* test) { + struct TestParam { + string container; + string name; + std::vector dims; + float epsilon; + Status conversion_status; + }; + + std::vector test_param = { + {"", "var0", {}, 0.001, Status::OK()}, + {"", "var0", {64}, 0.001, Status::OK()}, + {"", "var0", {8, 16}, 0.001, Status::OK()}, + {"box", "var", {8, 16}, 0.001, Status::OK()}}; + for (auto p : test_param) { + // Create node definition. + NodeDefBuilder::NodeOut rvo_input = + NodeDefBuilder::NodeOut("my_handle", 0, DT_RESOURCE); + NodeDef node_def; + std::vector dims_64(p.dims.begin(), p.dims.end()); + TensorShape shape = + TensorShape(gtl::ArraySlice(dims_64)); // non-absl ok + TF_CHECK_OK(NodeDefBuilder("my_var", "ReadVariableOp") + .Attr("dtype", dtype) + .Attr("_shape", shape) + .Input(rvo_input) + .Finalize(&node_def)); + + OpKernel* kernel; + OpKernelContext* context; + test->CreateContext(node_def, &kernel, &context); + + test->Reset(TrtPrecisionMode::FP32, TrtTestMode::kDynamicShape); + + // Set the value of the variable according to p.dims. + int var_size = std::accumulate(p.dims.begin(), p.dims.end(), 1, + std::multiplies()); + std::vector expected_value; + expected_value.reserve(var_size); + for (int i = 0; i < var_size; i++) { + // Set expected_value[i] = (cast)i. + expected_value.push_back((CType)i); + } + + // Create a resource handle. + DtypeAndPartialTensorShape dtype_and_shape; + dtype_and_shape.dtype = dtype; + TF_CHECK_OK(PartialTensorShape::BuildPartialTensorShape( + gtl::ArraySlice(dims_64), // non-absl ok + &dtype_and_shape.shape)); + ResourceHandle handle = MakeResourceHandle( + context, p.container, p.name, + std::vector{dtype_and_shape}); + + // Create input resource with the handle. + test->AddTestResource("my_handle", handle); + + // Create a resource with this handle. + Var* resource = new Var(dtype); + TF_EXPECT_OK(CreateResource(context, handle, resource)); + + // Setup the tensor of the variable. + // We allocate the tensor in the temporary memory. Note that creating a + // tensor in this scope and sharing the underlying storage by copy would + // lead to double destruction. + AllocatorAttributes attr_value; + attr_value.set_gpu_compatible(true); + attr_value.set_nic_compatible(true); + TF_EXPECT_OK( + context->allocate_temp(dtype, shape, resource->tensor(), attr_value)); + // The tensor is allocated on GPU. We copy the values from the CPU. 
+ auto tensor_flat = resource->tensor()->flat(); + CHECK(tensor_flat.data()); + auto ret = cudaMemcpy(tensor_flat.data(), expected_value.data(), + expected_value.size() * sizeof(CType), + cudaMemcpyHostToDevice); + CHECK_EQ(ret, 0); + + test->RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_var", &output)); + EXPECT_THAT(output.weights(), + ShapedWeightsHasDimsAndValues(p.dims, expected_value)); + } +} + +TEST_F(VariableOpConverterTest, ConvertReadVariableOp) { + TestConvertReadVariableOp(this); + TestConvertReadVariableOp(this); +} + +template +void TestConvertConst(OpConverterTest* test) { + NodeDef node_def; + node_def.set_name("my_const"); + node_def.set_op("Const"); + + auto reset_and_test = [&node_def, test]( + const Tensor& tensor, const bool as_tensor_content, + const std::vector& expected_dims, + const std::vector& expected_value) { + test->Reset(); + + TensorProto* tensor_attr = + (*node_def.mutable_attr())["value"].mutable_tensor(); + tensor_attr->Clear(); + + if (as_tensor_content) { + tensor.AsProtoTensorContent(tensor_attr); + } else { + tensor.shape().AsProto(tensor_attr->mutable_tensor_shape()); + tensor_attr->set_dtype(tensor.dtype()); if (tensor.dtype() == DT_FLOAT) { CopyTensorElements(tensor, tensor_attr->mutable_float_val()); @@ -1521,7 +2415,8 @@ void TestConvertConst(OpConverterTest* test) { test->RunValidationAndConversion(node_def); TRT_TensorOrWeights output; TF_EXPECT_OK(test->GetTensorOrWeights("my_const", &output)); - ValidateWeights(output.weights(), expected_dims, expected_value); + EXPECT_THAT(output.weights(), ShapedWeightsHasDimsAndValues( + expected_dims, expected_value)); }; auto& attr = *node_def.mutable_attr(); @@ -1534,17 +2429,20 @@ void TestConvertConst(OpConverterTest* test) { } { Tensor t = test::AsScalar(12); - reset_and_test(t, false, {1}, {12}); - reset_and_test(t, true, {1}, {12}); + std::vector expected_dims{1}; + // Scalars are represented as rank 0 tensors. + expected_dims.clear(); + reset_and_test(t, false, expected_dims, {12}); + reset_and_test(t, true, expected_dims, {12}); } { - Tensor t = test::AsTensor({1, 2}); + Tensor t = test->AsTensor({1, 2}); reset_and_test(t, false, {2}, {1, 2}); reset_and_test(t, true, {2}, {1, 2}); } { Tensor t = - test::AsTensor({1, 2, 3, 4, 5, 6}, TensorShape({2, 3})); + test->AsTensor({1, 2, 3, 4, 5, 6}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6}); reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6}); } @@ -1552,7 +2450,7 @@ void TestConvertConst(OpConverterTest* test) { // Set all tensor elements to the same value. Such tensors are encoded // using a single element list in tensor proto. Tensor t = - test::AsTensor({1, 1, 1, 1, 1, 1}, TensorShape({2, 3})); + test->AsTensor({1, 1, 1, 1, 1, 1}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {1, 1, 1, 1, 1, 1}); reset_and_test(t, true, {2, 3}, {1, 1, 1, 1, 1, 1}); } @@ -1560,7 +2458,7 @@ void TestConvertConst(OpConverterTest* test) { // Set trailing tensor elements to the same value. Such tensors are // encoded by truncating all equal elements except the first one. 
Tensor t = - test::AsTensor({2, 2, 1, 1, 1, 1}, TensorShape({2, 3})); + test->AsTensor({2, 2, 1, 1, 1, 1}, TensorShape({2, 3})); reset_and_test(t, false, {2, 3}, {2, 2, 1, 1, 1, 1}); reset_and_test(t, true, {2, 3}, {2, 2, 1, 1, 1, 1}); } @@ -1570,15 +2468,15 @@ TEST_F(OpConverterTest, ConvertConst) { { Reset(); NodeDef node_def = MakeConstNodeDef("my_const", {}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Unsupported data type double"); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Unsupported tensorflow data type double"); } { Reset(); Tensor tensor = - test::AsTensor({1, std::numeric_limits::max(), 1, 1, 1, - std::numeric_limits::lowest()}, - TensorShape({2, 3})); + AsTensor({1, std::numeric_limits::max(), 1, 1, 1, + std::numeric_limits::lowest()}, + TensorShape({2, 3})); NodeDef node_def; node_def.set_name("my_const"); node_def.set_op("Const"); @@ -1587,7 +2485,7 @@ TEST_F(OpConverterTest, ConvertConst) { (*node_def.mutable_attr())["value"].mutable_tensor(); tensor_attr->Clear(); tensor.AsProtoTensorContent(tensor_attr); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, "outside the range of int32"); } @@ -1602,244 +2500,684 @@ TEST_F(OpConverterTest, ConvertConst) { TestConvertConst(this); } -TEST_F(OpConverterTest, ConvertTranspose) { +template +NodeDef CreateFusedBatchNormOp(DataType tf_type, std::string data_format, + bool is_training, float epsilon) { + Scope s = Scope::NewRootScope(); + auto x = ops::Placeholder(s.WithOpName("x"), tf_type); + auto scale = ops::Placeholder(s.WithOpName("scale"), tf_type); + auto offset = ops::Placeholder(s.WithOpName("offset"), tf_type); + auto mean = ops::Placeholder(s.WithOpName("mean"), tf_type); + auto variance = ops::Placeholder(s.WithOpName("variance"), tf_type); + typename T::Attrs attrs; + attrs.data_format_ = data_format; + attrs.is_training_ = is_training; + if (epsilon > 0) { + attrs.epsilon_ = epsilon; + } else { + EXPECT_GE(epsilon, 0); + } + return T(s.WithOpName("my_batchnorm"), x, scale, offset, mean, variance, + attrs) + .operation.node() + ->def(); +} + +TEST_P(OpConverter_FP32_Test, ConvertFusedBatchNorm) { + using OpFunc = std::function; + std::vector get_node_def_vec{ + CreateFusedBatchNormOp, + CreateFusedBatchNormOp, + CreateFusedBatchNormOp}; + + struct TestParam { + std::string data_format; + int tensor_input_idx; // Index of an input that will be provided as tensor. 
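+    // (inputs other than x and this one are added as weights)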
+ bool is_training; + float epsilon; + Status conversion_status; + bool keep_channel_unknown; + }; + + struct NodeInput { + std::string name; + std::vector dims; + std::vector val; + }; + std::vector node_input_nchw{ + {"x", {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}}, + {"scale", {3}, {7, 8, 9}}, + {"offset", {3}, {10, 20, 30}}, + {"mean", {3}, {1, 2, 3}}, + {"variance", {3}, {4, 5, 6}}}; + + std::vector node_input_nhwc{ + {"x", {2, 2, 1, 3}, {1, 3, 5, 2, 4, 6, 7, 9, 11, 8, 10, 12}}, + {"scale", {3}, {7, 8, 9}}, + {"offset", {3}, {10, 20, 30}}, + {"mean", {3}, {1, 2, 3}}, + {"variance", {3}, {4, 5, 6}}}; + + std::vector expected_output_nchw{ + 10.0, 13.495633, 23.574135, 27.148273, 37.342354, 41.013527, + 30.9738, 34.469433, 45.018955, 48.59309, 59.369415, 63.04059}; + + std::vector expected_output_nhwc{ + 10.0, 23.574135, 37.342354, 13.495633, 27.148273, 41.013527, + 30.9738, 45.018955, 59.369415, 34.469433, 48.59309, 63.04059}; + + for (auto get_node_def : get_node_def_vec) { + NodeDef tmp_node_def = get_node_def(tf_type_, "NCHW", true, 0); + std::string op_name = tmp_node_def.op(); + std::vector test_param{ + {"NCHW", 0, true, 0, + errors::Unimplemented( + StrCat(op_name, " only supports is_training=false"))}, + {"NCHW", 1, false, 0, + errors::Unimplemented(StrCat("The input \"scale\" for ", op_name, + " must be a constant"))}, + {"NCHW", 2, false, 0, + errors::Unimplemented(StrCat("The input \"offset\" for ", op_name, + " must be a constant"))}, + {"NCHW", 3, false, 0, + errors::Unimplemented(StrCat("The input \"mean\" for ", op_name, + " must be a constant"))}, + {"NCHW", 4, false, 0, + errors::Unimplemented(StrCat("The input \"variance\" for ", op_name, + " must be a constant"))}, + {"NCHW", 0, false, 0.01}, + {"NHWC", 0, false, 0.01}}; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + test_param.push_back( + {"NCHW", 0, false, 0.01, + errors::InvalidArgument("Channel dimension must be static"), true}); + test_param.push_back( + {"NHWC", 0, false, 0.01, + errors::InvalidArgument("Channel dimension must be static"), true}); + } + for (auto p : test_param) { + Reset(); + NodeDef node_def = + get_node_def(tf_type_, p.data_format, p.is_training, p.epsilon); + std::vector node_input = + p.data_format == "NCHW" ? node_input_nchw : node_input_nhwc; + std::vector expected_output = + p.data_format == "NCHW" ? expected_output_nchw : expected_output_nhwc; + for (int i = 0; i < node_input.size(); i++) { + if (i == 0 || i == p.tensor_input_idx) { + // The first input (x) is always added as a tensor, and it has shape + // NCHW/NHWC. The other inputs are per channel values (1D, size C). + // + // In implicit batch mode, it is not possible to add any of the 1D + // inputs as a tensor: the first dim is always treated as batch dim in + // implicit batch mode, and that has to agree for all tensors. We have + // two input tensors with shapes NCHW and C and in general N != C. + // The converter already picked up N from the fist input, and reports + // an error when we try to add any other tensors with not matching + // first dim. + // + // This restriction does not apply in explicit batch mode: the tensors + // can have different first dim. The converter still expects that only + // the first arg is a tensor. TODO(tfeher) Check if one can relax this + // restriction. + Status expected_status = + (i != 0 && trt_mode_ == TrtTestMode::kImplicitBatch) + ? 
errors::InvalidArgument( + batch_size_error(node_input[i].name, + "Provided batch size does not match " + "converter batch size: 3 vs 2")) + : Status::OK(); + std::vector partial_input_shape; + if (i == 0 && trt_mode_ == TrtTestMode::kDynamicShape && + !p.keep_channel_unknown) { + // keep channel dim static (known) + partial_input_shape.resize(4, -1); + int channel_dim = (p.data_format == "NCHW" ? 1 : 3); + partial_input_shape[channel_dim] = node_input[i].dims[channel_dim]; + } + AddTestTensor(node_input[i].name, node_input[i].dims, tf_type_, + node_input[i].val, partial_input_shape, + expected_status); + + } else { + AddTestWeights(node_input[i].name, node_input[i].dims, + node_input[i].val, tf_type_); + } + } + TestOpConverter(node_def, node_input[0].dims, p.conversion_status, + Status::OK(), ArrayFloatNear(expected_output)); + } + } +} + +TEST_P(OpConverter_FP32_Test, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto transpose = ops::Transpose(s.WithOpName("my_transpose"), input, weights); const NodeDef& node_def = transpose.operation.node()->def(); - { - // Permutation is a tensor, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("weights", {3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"perm\" for Transpose must be a constant, at my_transpose"); - } - { - // Transpose at batch dimension, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {4}, {1, 0, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Transpose at batch dimension is not supported"); - } - { - // Permutation rank doesn't match, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {3}, {0, 1, 2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Rank of perm for transpose does not match with that of the input."); + std::vector test_params = { + // For the first test we leave param empty. This signals to use a + // input as weight which will be invalid + TestParamBase{{3, 1, 2, 1}, + {}, + {}, + {}, + Status(absl::StatusCode::kUnimplemented, + "The input \"perm\" for Transpose must be a " + "constant")}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {0, 1, 2}, + Status(absl::StatusCode::kInvalidArgument, + "Rank of perm for transpose does not match with " + "that of the input.")}, + // Transpose batch dim + TestParamBase{ + {1, 1, 2, 3}, + {}, + {3, 2, 1, 1}, + {3, 2, 1, 0}, + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? 
Status(absl::StatusCode::kUnimplemented, + "Transpose at batch dimension is not supported") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 3, 1, 2}, {0, 3, 1, 2}}, + }; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Dynamic shape tests where some shapes are known + test_params.push_back(TestParamBase{ + {1, 1, 2, 3}, {-1, 1, 2, -1}, {1, 3, 1, 2}, {0, 3, 1, 2}}); + } + std::vector expected_values{1, 4, 2, 5, 3, 6}; + for (auto p : test_params) { + SCOPED_TRACE(p); + Reset(); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + if (p.param.empty()) { + AddTestTensor("weights", {3}); + } else { + AddTestWeights("weights", {static_cast(p.param.size())}, + p.param); + } + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(expected_values)); } - { - // Ok. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {4}, {0, 3, 1, 2}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()); +} - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_transpose", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 4, 2, 5, 3, 6)); +TEST_P(OpConverter_FP32_Test, ConvertTile) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); + auto tile = ops::Tile(s.WithOpName("my_tile"), input, weights); + const NodeDef& node_def = tile.operation.node()->def(); + + struct TileParam { + std::vector input_dims; + std::vector multiplier; + std::vector tensor; + // Concrete (static) output dimensions, including batch size as first dim. + std::vector expected_output_dims; + std::vector expected_results; + int test_ID; + // Expected status of conversion (with concrete error message). + Status status; + }; + + std::vector test_params = { + // Tests to be rejected by ConvertTile::Validate() for any trt_mode_. + TileParam{{1, 2, 3}, // input_dims + {1, -2, 1}, // multiplier + {}, // tensor + {}, // expected_output_dims + {}, // expected_results + 1, // test_ID + Status(absl::StatusCode::kInvalidArgument, + "All replications of the Tile operation in " + "'my_tile' should be positive, got (1, -2, 1).")}, + TileParam{{1, 2, 3}, // input_dims + {1, 2, 1, 3}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {}, // expected_output_dims + {}, // expected_results + 2, // test_ID + Status(absl::StatusCode::kInvalidArgument, + "The length of the replication vector (4) of the " + "Tile operation in 'my_tile' is expected to be equal " + "to the rank of the input vector (3).")}, + // Tests passed ConvertTile::Validate() for at least some trt_mode_. 
+ TileParam{{1, 2}, // input_dims + {1, 3}, // multiplier + {2, 3}, // tensor + {1, 6}, // expected_output_dims + {2, 3, 2, 3, 2, 3}}, // out values + TileParam{{1, 2, 3}, // input_dims + {1, 2, 1}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {1, 4, 3}, // output dims + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5}}, // expected_results + TileParam{{1, 2, 3}, // input_dims + {1, 1, 2}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {1, 2, 6}, // expected_output_dims + {0, 1, 2, 0, 1, 2, 3, 4, 5, 3, 4, 5}}, // expected_results + TileParam{{1, 2, 3}, // input_dims + {1, 2, 2}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {1, 4, 6}, // expected_output_dims + {0, 1, 2, 0, 1, 2, 3, 4, 5, 3, 4, 5, + 0, 1, 2, 0, 1, 2, 3, 4, 5, 3, 4, 5}}, // expected_results + // Tests with non trivial batch size multiplier. + TileParam{{1, 2}, // input_dims + {2, 3}, // multiplier + {2, 3}, // tensor + {2, 6}, // expected_output_dims + {2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}}, // out values + TileParam{{1, 2, 3}, // input_dims + {2, 2, 1}, // multiplier + {0, 1, 2, 3, 4, 5}, // tensor + {2, 4, 3}, // output dims + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5}}, // expected_results + }; + + for (bool multiplier_is_tensor : {true, false}) { + for (bool input_is_tensor : {true, false}) { + for (auto p : test_params) { + std::vector num_mults = {static_cast(p.multiplier.size())}; + std::vector partial_input_dims = {}; + if (multiplier_is_tensor) { + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + p.status = + Status(absl::StatusCode::kInvalidArgument, + "Conversion for Tile is not implemented for multipliers " + "passed as a tensor in implicit batch mode"); + num_mults = {1, static_cast(p.multiplier.size())}; + } else { + if (p.test_ID == 1) { + // Skip this test because in that situation it is impossible + // to do a valid check for negative multipliers. + continue; + } + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + partial_input_dims = num_mults; + p.status = Status::OK(); + } + + if (p.test_ID == 2) { + p.status = Status(absl::StatusCode::kInvalidArgument, + "When replications are defined as a tensor, " + "the number of its elements (4) must be equal " + "to the rank of the input tensor (3)."); + } + } + } else { + if (trt_mode_ == TrtTestMode::kImplicitBatch && p.multiplier[0] > 1) { + p.status = + Status(absl::StatusCode::kUnimplemented, + "The Tile operation along " + "the batch dimension in 'my_tile' is not implemented."); + } + } + + Reset(); + if (input_is_tensor) { + AddTestTensor("input", p.input_dims, p.tensor); + } else { + AddTestWeights("input", p.input_dims, p.tensor, tf_type_); + } + + if (multiplier_is_tensor) { + AddTestTensor("weights", num_mults, DT_INT32, p.multiplier, + partial_input_dims); + } else { + AddTestWeights("weights", num_mults, p.multiplier); + } + + TestOpConverter(node_def, p.expected_output_dims, p.status, + Status::OK(), ElementsAreArray(p.expected_results)); + } + } } } -TEST_F(OpConverterTest, ConvertReshape) { +TEST_P(OpConverter_FP32_Test, ConvertReshape) { // Get the NodeDef for Reshape. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto reshape = ops::Reshape(s.WithOpName("my_reshape"), input, weights); const NodeDef& node_def = reshape.operation.node()->def(); - { - // Shape is a tensor, should fail. 
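+  // A non-constant shape input is only accepted in explicit batch / dynamic
+  // shape mode and requires TRT >= 7.1.3 (see the checks below).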
+ if (trt_mode_ == TrtTestMode::kImplicitBatch) { + // Shape is a tensor, should fail in implicit batch mode. Reset(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 2, 1}); AddTestTensor("weights", {3}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"shape\" for Reshape must be a constant, at my_reshape"); - } - { - // Reshape to scalar, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {0}, {}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Reshape to shape=[] is not supported, at my_reshape"); - } - { - // Reshape tensor with zero rank to empty tensor, should fail. - Reset(); - AddTestTensor("input", {}); - AddTestWeights("weights", {1, 0, 1}, {}); + node_def, absl::StatusCode::kInvalidArgument, + "The input \"shape\" for Reshape must be a constant in implicit batch " + "mode"); + } else if (!IS_TRT_VERSION_GE(7, 1, 3, 0)) { + // Shape is a tensor, should fail before TRT 7.1.3 even in explicit batch / + // dynamic shape mode. + Reset(); + AddTestTensor("input", {3, 2, 1}); + AddTestTensor("weights", {3}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Reshape to shape=[] is not supported, at my_reshape"); - } + node_def, absl::StatusCode::kInvalidArgument, + "Non constant shape input tensor for Reshape requires minimum TRT " + "7.1.3"); + } + + Status reshape_from_scalar_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Internal( + "Failed to convert at least one input to a TRT_TensorOrWeights:" + " Scalar input tensor is not supported since the first " + "dimension is treated as batch dimension by TRT") + : Status::OK(); + Status add_scalar_tensor_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::InvalidArgument( + "removing first dim requires explicit batch dimension") + : Status::OK(); + Status reshape_to_scalar_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented("Reshape to shape=[] is not supported") + : Status::OK(); + Status reshape_batch_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented("Reshape on batch dimension is not supported") + : Status::OK(); struct TestParams { - int batch_size; std::vector tensor_dims; std::vector shape; + std::vector expected_shape; + Status conversion_status; + Status runtime_status; + std::vector shape_prof; // needed concrete values if shape == -1. + Status add_test_tensor_status; }; - // Reshape at batch dimension, should fail. - const int kReshapeBatchDimsCases = 5; - TestParams params[kReshapeBatchDimsCases] = { - TestParams{1, {1, 2, 3}, {3, 1, 1, 2}}, - TestParams{1, {1, 2, -1}, {-1, 1, 1, 2}}, - TestParams{1, {1, 2, 3}, {-1, 1, 1, 2}}, - TestParams{-1, {1, 2, 3}, {1, 1, 1, 2}}, - TestParams{-1, {-1, 2, 3}, {1, 1, 1, 6}}, // TODO(laigd): it should pass. + std::vector params = { + // Reshape scalar to tensor, should fail in implicit batch mode. + TestParams{{}, + {1, 1}, + {}, + reshape_from_scalar_status, + {}, + {}, + add_scalar_tensor_status}, + // Reshape tensor to scalar, should fail in implicit batch mode. + // - In explicit batch mode if shape is set as weight it works. + // - In explicit batch mode && using shape as tensor input it should + // fail. In that case we set the expected conversion status in the + // test loop. + TestParams{{1, 1}, {}, {}, reshape_to_scalar_status}, + // Reshape at batch dimension, should fail in implicit batch mode. 
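+      // (In implicit batch mode TensorRT fixes the batch dimension when the
+      // engine is built, so a reshape such as {1, 1, 2, 3} -> {3, 1, 1, 2},
+      // which changes the batch size, cannot be expressed and is expected to
+      // be rejected with reshape_batch_status.)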
+ TestParams{{1, 1, 2, 3}, {3, 1, 1, 2}, {}, reshape_batch_status}, + TestParams{{2, 1, 2, 3}, {-1, 1, 4}, {3, 1, 4}, reshape_batch_status}, + // Tests that should succeed in every trt_mode. + TestParams{{1, 1, 2, 3}, {-1, 1, 3, 2}, {1, 1, 3, 2}}, + TestParams{{1, 1, 2, 3}, {1, 1, -1}, {1, 1, 6}}, + TestParams{{1, 1, 2, 3}, {1, 1, 3, 2}}, + TestParams{{2, 1, 2, 3}, {2, 1, 3, 2}}, + TestParams{{1, 1, 1}, {1}}, + TestParams{{1}, {1, 1}}, + TestParams{{2, 1, 1}, {2}}, + TestParams{{2}, {2, 1}}, }; - for (int i = 0; i < kReshapeBatchDimsCases; ++i) { - Reset(); - const std::vector& dims = params[i].tensor_dims; - AddTestTensor("input", dims, params[i].batch_size); - AddTestWeights("weights", {4}, params[i].shape); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Reshape on batch dimension is not supported, at my_reshape", - /*should_run_conversion=*/(dims[0] > 0 && dims[1] > 0 && dims[2] > 0)); - } - - // Reshape on non batch dimensions, ok. - const int kReshapeOKCases = 8; - TestParams ok_params[kReshapeOKCases] = { - TestParams{-1, {1, 2, 3}, {-1, 1, 3, 2}}, - TestParams{1, {1, 2, 3}, {-1, 1, 3, 2}}, - TestParams{1, {1, 2, 3}, {1, 1, 3, 2}}, - TestParams{2, {1, 2, 3}, {2, 1, 3, 2}}, - TestParams{1, {1, 1}, {1}}, - TestParams{1, {}, {1, 1}}, - TestParams{2, {1, 1}, {2}}, - TestParams{2, {}, {2, 1}}, + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + // Reshape tensor with zero rank using an empty shape tensor, should fail in + // implicit batch mode. In explicit batch mode this is an identity operation + // and does not add a reshape layer therefore we do not test it. + params.push_back(TestParams{{}, + {}, + {}, + reshape_from_scalar_status, + {}, + {}, + add_scalar_tensor_status}); + } + // Testing the methods for representing the reshape shape for IShuffleLayer: + // as a weight (true) or as a tensor (false). + std::vector shape_input_options(1, true); + + if (trt_mode_ != TrtTestMode::kImplicitBatch && + IS_TRT_VERSION_GE(7, 1, 3, 0)) { + shape_input_options.push_back(false); + } + + for (auto p : params) { + for (auto shape_as_weight : shape_input_options) { + std::ostringstream oss; + oss << "shape " << PrintToString(p.shape); + SCOPED_TRACE(StrCat(oss.str(), shape_as_weight ? " weight" : " tensor")); + if (!shape_as_weight && p.shape.empty()) { + p.conversion_status = errors::Unimplemented( + "Reshape with dynamic input requires 1D input tensor"); + } + Reset(); + const int n_elements = + std::accumulate(p.tensor_dims.begin(), p.tensor_dims.end(), 1, + std::multiplies()); + std::vector input_vec(n_elements); + std::iota(input_vec.begin(), input_vec.end(), 1); + AddTestTensor("input", p.tensor_dims, tf_type_, input_vec, {}, + p.add_test_tensor_status); + if (shape_as_weight) { + AddTestWeights("weights", {static_cast(p.shape.size())}, + p.shape); + } else { + std::vector dims; + std::vector values{p.shape}; + if (!p.shape.empty()) { + dims.push_back(p.shape.size()); + } else { + // If the shape is empty we use a dummy value to ensure that + // AddTestTensor creates the corresponding entry in InputOutputData. + values.push_back(1); + } + AddTestTensor("weights", dims, DT_INT32, values, dims); + } + std::vector expected_shape = + p.expected_shape.empty() ? p.shape : p.expected_shape; + VLOG(2) << "Calling TestOpConverter"; + TestOpConverter(node_def, expected_shape, p.conversion_status, + p.runtime_status, ElementsAreArray(input_vec)); + } + } +} + +TEST_P(OpConverter_FP32_Test, ConvertShape) { + // Get the NodeDef for Shape op. 
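+  // (Overview: Shape should convert to a 1D DT_INT32 tensor holding the input
+  // dimensions. In explicit batch mode a fully known input shape is folded
+  // into a constant layer, which is why some cases below do not feed any
+  // input data at all.)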
+ Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto shape = ops::Shape(s.WithOpName("my_shape"), input); + const NodeDef& node_def = shape.operation.node()->def(); + + Status conversion_status = + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? errors::Unimplemented( + "Shape is only supported for explicit batch mode.") + : Status::OK(); + std::vector test_params = { +// TODO(b/166274212): Enable the test parameter for TensorRT 7.1.3. +#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + TestParamBase{{1, 2, 3}, {}, {3}, {}, conversion_status}, +#endif + // Add input as weight (we use non empty param ({1}) to trigger this). + TestParamBase{{1, 2, 3}, {}, {3}, {1}, conversion_status}, }; - for (int i = 0; i < kReshapeOKCases; ++i) { - const int batch_size = std::max(1, ok_params[i].batch_size); - const auto& shape = ok_params[i].shape; - Reset(); - AddTestTensor("input", ok_params[i].tensor_dims, batch_size); - AddTestWeights("weights", {static_cast(shape.size())}, shape); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output)); - ASSERT_TRUE(output.is_tensor()); - const std::vector expected_output_dims(shape.begin() + 1, shape.end()); - const nvinfer1::Dims actual_output_dims = output.tensor()->getDimensions(); - ExpectTrtDimsEqualsArray(expected_output_dims, actual_output_dims); - - std::vector input_vec(TrtTensorDimsNumElements(actual_output_dims) * - batch_size); - std::iota(input_vec.begin(), input_vec.end(), 1); - const DataVec input_data{{"input", test::AsTensor(input_vec)}}; - DataVec output_data{ - {"my_reshape", ConstructTensor(input_vec.size())}}; - BuildAndRun(input_data, &output_data, TrtPrecisionMode::FP32, batch_size); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(input_vec)); + auto input_is_weight = [](const TestParamBase p) { return !p.param.empty(); }; + for (auto p : test_params) { + SCOPED_TRACE(p); + Reset(); + // The number of elements of the input tensor. We leave it 0 in case we do + // not need to add an input tensor. This happens in explicit batch mode: the + // shape is known at conversion time and therefore the shape is added to the + // network as a constant layer. In this case the single node network that + // we use for the unit test have no actual input tensor when it is converted + // to a TensorRT network. + int n_elements = 0; + if (input_is_weight(p) || trt_mode_ != TrtTestMode::kExplicitBatch) { + // Calculate the number of elements for adding input data. + n_elements = std::accumulate(p.input_dims.begin(), p.input_dims.end(), 1, + std::multiplies()); + } + std::vector input_val(n_elements, 1); + if (!input_is_weight(p)) { + AddTestTensor("input", p.input_dims, input_val); + } else { + AddTestWeights("input", p.input_dims, input_val, tf_type_); + } + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(p.input_dims), + {DT_INT32}); } } -// Helper function for testing MatMul and BatchMatMul -// get_matmul corresponds to the function used to generate the node. It should -// accept (DataType, transpose_a, transpose_b) as parameters. +struct MatMulTestParams { + std::vector shape_a; + std::vector values_a; + bool transpose_a; + std::vector shape_b; + std::vector values_b; + bool transpose_b; + std::vector expected_shape; + std::vector expected_output; +}; + +// Helper function for testing MatMul and BatchMatMul. get_matmul is a function +// used to generate the node. 
It accepts (DataType, transpose_a, transpose_b) as +// parameters. void TestMatMulHelper( - OpConverterTest* test, + ParameterizedOpConverterTestBase* test, const std::function& get_matmul, - const std::string& op_name) { - // HACK: This needs to be done in a better way. - const bool is_batch_matmul = op_name == "BatchMatMul"; + const std::vector& params) { { // Unsupported data type. test->Reset(); NodeDef node_def = get_matmul(DT_INT32, false, false); - test->AddTestTensor("input", {2}, /*batch_size=*/1, - nvinfer1::DataType::kINT32); + test->AddTestTensor("input", {1, 2}, DT_INT32, {}); test->AddTestWeights("weights", {2, 1}, {3, 5}); + const std::vector allowed_types{DT_FLOAT, DT_HALF}; test->RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - StrCat("Data type int32 is not supported for ", op_name, - ", must be one of [float, half], at my_matmul") - .c_str()); - } - // OK. - for (bool transpose_a : {false, true}) { - for (bool transpose_b : {false, true}) { - test->Reset(); - NodeDef node_def = get_matmul(DT_FLOAT, transpose_a, transpose_b); - test->AddTestTensor("input", {2}, /*batch_size=*/1); - test->AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - if (is_batch_matmul) { - test->RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not support batched constants."); - continue; - } else if (transpose_a) { - test->RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions"); - continue; - } - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); - - const DataVec input_data{{"input", test::AsTensor({0, 1})}}; - DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); - if (transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); - } else { - EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(2, 3)); + node_def, absl::StatusCode::kUnimplemented, + convert_not_supported_dtype_msg(allowed_types, DT_INT32, node_def)); + } + + // FC conversion depends on whether the last dim of A is known or not. In + // Dynamic shape mode, we will check whether A is handled correctly if it has + // a partially known input shape (last dim known). + std::vector a_test_partial_shape_values{false}; + if (test->get_trt_mode() == TrtTestMode::kDynamicShape) { + a_test_partial_shape_values.push_back(true); + } + + for (auto p : params) { + for (bool a_is_tensor : {true, false}) { + for (bool b_is_tensor : {true, false}) { + for (bool a_partial_shape : a_test_partial_shape_values) { + if (a_partial_shape && !a_is_tensor) { + // Only tensors can have partial shape. + continue; + } + if (!a_is_tensor && !b_is_tensor) { + // Skip test when both args are weights. We do not convert this + // since const folding eliminates this case. + continue; + } + SCOPED_TRACE(StrCat("A", p.transpose_a ? ".T" : "", " is ", + a_is_tensor ? "tensor" : "weight", ", B", + p.transpose_b ? ".T" : "", " is ", + b_is_tensor ? 
"tensor " : "weight, rank A ", + p.shape_a.size(), ", rank B ", p.shape_b.size())); + test->Reset(); + + NodeDef node_def = + get_matmul(test->get_tf_type(), p.transpose_a, p.transpose_b); + const bool is_batch_matmul = node_def.op() == "BatchMatMul"; + + if (a_is_tensor) { + if (a_partial_shape) { + // Prepare a partial shape for A where only the last dim is known. + std::vector partial_shape(p.shape_a.size(), -1); + int k = p.shape_a.size() - 1; + partial_shape.at(k) = p.shape_a.at(k); + test->AddTestTensor("input", p.shape_a, test->get_tf_type(), + p.values_a, partial_shape); + } else { + test->AddTestTensor("input", p.shape_a, p.values_a); + } + } else { + test->AddTestWeights("input", p.shape_a, p.values_a, + test->get_tf_type()); + } + if (b_is_tensor) { + if (a_is_tensor && p.shape_a[0] != p.shape_b[0] && + test->get_trt_mode() == TrtTestMode::kImplicitBatch) { + VLOG(2) << "Skipping test with inpcompatible batch dimensions"; + continue; + } + test->AddTestTensor("weights", p.shape_b, p.values_b); + } else { + test->AddTestWeights("weights", p.shape_b, p.values_b, + test->get_tf_type()); + } + + Status conversion_status = Status::OK(); + if (test->get_trt_mode() == TrtTestMode::kImplicitBatch) { + // Implicit batch mode has several restriction. We change conversion + // status accordingly. + if (is_batch_matmul) { + if (a_is_tensor && p.shape_a.size() < p.shape_b.size()) { + conversion_status = errors::InvalidArgument( + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims ", + p.shape_a.size(), " vs broadcast #dims ", p.shape_b.size(), + ")"); + } + if (b_is_tensor && p.shape_b.size() < p.shape_a.size()) { + conversion_status = errors::InvalidArgument( + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims ", + p.shape_b.size(), " vs broadcast #dims ", p.shape_a.size(), + ")"); + } + if ((!a_is_tensor || !b_is_tensor) && p.shape_a[0] != 1) { + conversion_status = errors::Unimplemented( + "TensorRT does not support batched constants in implicit " + "batch mode."); + } + } else if ((a_is_tensor && p.shape_a.size() <= 2 && + (p.transpose_a || b_is_tensor)) || + (b_is_tensor && p.shape_b.size() <= 2)) { + conversion_status = errors::InvalidArgument( + "MatMul with 2D tensors requires explicit batch mode, or that" + " tensor A is not transposed and B is a constant tensor."); + } + } + + test->TestOpConverter(node_def, p.expected_shape, conversion_status, + Status::OK(), + ElementsAreArray(p.expected_output)); + if (!conversion_status.ok()) { + VLOG(2) << "Converted with status " << conversion_status; + } + VLOG(2) << "== Finished test iteration =="; + } } } } - // OK, 3D inputs - for (bool transpose_b : {false, true}) { - test->Reset(); - NodeDef node_def = get_matmul(DT_FLOAT, /*transpose_a=*/false, transpose_b); - test->AddTestTensor("input", {2}, /*batch_size=*/1); - test->AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - if (is_batch_matmul) { - test->RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not support batched constants."); - continue; - } - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1})}}; - DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - test->BuildAndRun(input_data, &output_data); - if (transpose_b) { - 
EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); - } else { - EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(2, 3)); - } - } } template @@ -1854,7 +3192,39 @@ void CheckAddedLayers(OpConverterTest* test, bool expect_found) { EXPECT_EQ(expect_found, layer_found); } -TEST_F(OpConverterTest, ConvertMatMul) { +std::vector GetMatMulTestParams() { + std::vector params{ + // clang-format off + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, false, // A (shape, val, T?) + {2, 2}, {0, 1, 2, 3}, false, // B (shape, val, T?) + {2, 2}, {2, 3, 6, 11}}, // result (shape, val) + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, false, + {2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {1, 3, 3, 13}}, + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {0, 1, 2, 3}, false, + {2, 2}, {4, 6, 6, 10}}, + MatMulTestParams{{2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {0, 1, 2, 3}, true, + {2, 2}, {2, 6, 3, 11}}, + MatMulTestParams{{2, 3}, {0, 1, 2, 3, 4, 5}, false, + {2, 3}, {1, 2, 3, 4, 5, 6}, true, + {2, 2}, {8, 17, 26, 62}}, + MatMulTestParams{{2, 3}, {0, 1, 2, 3, 4, 5}, true, + {2, 3}, {1, 2, 3, 4, 5, 6}, false, + {3, 3}, {12, 15, 18, 17, 22, 27, 22, 29, 36}}, + MatMulTestParams{{3, 2}, {0, 1, 2, 3, 4, 5}, false, + {2, 3}, {1, 2, 3, 4, 5, 6}, false, + {3, 3}, {4, 5, 6, 14, 19, 24, 24, 33, 42}}, + MatMulTestParams{{3, 2}, {0, 1, 2, 3, 4, 5}, true, + {2, 3}, {1, 2, 3, 4, 5, 6}, true, + {2, 2}, {16, 34, 22, 49}}, + // clang-format on + }; + return params; +} + +TEST_P(OpConverter_FP32_Test, ConvertMatMul) { // Get the NodeDef for MatMul. auto get_matmul_nodedef = [](DataType dtype, bool transpose_a, bool transpose_b) -> NodeDef { @@ -1868,68 +3238,10 @@ TEST_F(OpConverterTest, ConvertMatMul) { return matmul.operation.node()->def(); }; - // Additional test cases specific to MatMul - { - // Can only transpose A if it is 2D in TRT - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, true, false); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Cannot transpose first input if it is a tensor with fewer than 2 " - "non-batch dimensions."); - } - { - // B must always have 2 non-batch dimensions - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestTensor("weights", {2}, /*batch_size=*/1); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Second input must either be a constant, or contain at least 2 " - "non-batch dimensions."); - } - { - // We can never transpose weights that are not 2D. - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, true, false); - AddTestWeights("input", {1, 1, 2}, {0, 1}); - AddTestTensor("weights", {2, 2}, /*batch_size=*/1); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Cannot currently transpose constant input if it is not 2 dimensional"); - } - { - // Make sure that INT8 mode uses IFullyConnectedLayer when possible. - precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {2, 1, 1}); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion(node_def); - CheckAddedLayers(this, false); - CheckAddedLayers(this, true); - precision_mode_to_test_ = TrtPrecisionMode::FP32; - } - { - // Make sure that INT8 mode doesn't try to use IFullyConnectedLayer when not - // compatible. In this case we can't use FC because weights is a tensor. 
- precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); - NodeDef node_def = get_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {2, 1, 1}); - AddTestTensor("weights", {2, 2}); - RunValidationAndConversion(node_def); - CheckAddedLayers(this, true); - CheckAddedLayers(this, false); - precision_mode_to_test_ = TrtPrecisionMode::FP32; - } - TestMatMulHelper(this, get_matmul_nodedef, "MatMul"); + TestMatMulHelper(this, get_matmul_nodedef, GetMatMulTestParams()); } -TEST_F(OpConverterTest, ConvertBatchMatMul) { +TEST_P(OpConverter_FP32_Test, ConvertBatchMatMul) { // Get the NodeDef for BatchMatMul. auto get_batch_matmul_nodedef = [](DataType dtype, bool transpose_a, bool transpose_b) -> NodeDef { @@ -1943,304 +3255,229 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { return matmul.operation.node()->def(); }; - { - // Can't broadcast two tensor inputs of different rank. - Reset(); - NodeDef node_def = get_batch_matmul_nodedef(DT_FLOAT, false, false); - AddTestTensor("input", {1, 2, 2}, /*batch_size=*/2); - AddTestTensor("weights", {2}, /*batch_size=*/2); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Inputs must have the same rank if they are both tensors."); - } - { - // Make sure that INT8 mode doesn't try to use IFullyConnectedLayer when not - // compatible. In this case we can't use FC because transpose_a is true. - precision_mode_to_test_ = TrtPrecisionMode::INT8; - Reset(); - NodeDef node_def = get_batch_matmul_nodedef(DT_FLOAT, true, false); - AddTestTensor("input", {1, 2, 2}); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion(node_def); - CheckAddedLayers(this, true); - CheckAddedLayers(this, false); - precision_mode_to_test_ = TrtPrecisionMode::FP32; - } + // We derive test data from the MatMul test params by adding extra leading + // dimensions. + std::vector params_2d = GetMatMulTestParams(); + std::vector params; + params.reserve(params_2d.size() * 3 + 1); - for (bool transpose_a : {false, true}) { - for (bool transpose_b : {false, true}) { - Reset(); - NodeDef node_def = - get_batch_matmul_nodedef(DT_FLOAT, transpose_a, transpose_b); - AddTestTensor("input", {2, 2}, /*batch_size=*/1); - AddTestWeights("weights", {1, 2, 2}, {1, 2, 3, 4}); - - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); - const DataVec input_data{{"input", test::AsTensor({0, 1, 2, 3})}}; - DataVec output_data{{"my_matmul", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); - if (!transpose_a && !transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(3, 4, 11, 16)); - } else if (transpose_a && transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(4, 8, 7, 15)); - } else if (transpose_a) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(6, 8, 10, 14)); - } else if (transpose_b) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(2, 4, 8, 18)); - } - } - } + auto insert_ones = [](std::vector v, int n) { + std::vector ones(n, 1); + ones.insert(ones.end(), v.begin(), v.end()); + return ones; + }; - TestMatMulHelper(this, get_batch_matmul_nodedef, "BatchMatMul"); + // Add a leading 1 dimension to A, B and result. 
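+  // For example, the 2D case A = {2, 2}, B = {2, 2} -> result {2, 2} becomes
+  // A = {1, 2, 2}, B = {1, 2, 2} -> result {1, 2, 2}; the values themselves
+  // are unchanged, only a batch of size 1 is prepended.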
+ std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [](MatMulTestParams p) { + p.shape_a.insert(p.shape_a.begin(), 1); + p.shape_b.insert(p.shape_b.begin(), 1); + p.expected_shape.insert(p.expected_shape.begin(), 1); + return p; + }); + + // Test with N > 1: weights cannot be batched in implicit batch mode. + // clang-format off + params.push_back( + MatMulTestParams{{2, 2, 2}, {0, 1, 2, 3, 0, 1, 2, 3}, false, // A + {2, 2, 2}, {0, 1, 2, 3, 0, 1, 2, 3}, false, // B + {2, 2, 2}, {2, 3, 6, 11, 2, 3, 6, 11}} // result + ); + + params.push_back( + MatMulTestParams{{2, 2, 3}, {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5}, + false, + {2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, true, + {2, 2, 2}, {8, 17, 26, 62, 8, 17, 26, 62}}); + // clang-format on + + // Add two leading 1 dimensions to A, B and result. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a = insert_ones(p.shape_a, 2); + p.shape_b = insert_ones(p.shape_b, 2); + p.expected_shape = insert_ones(p.expected_shape, 2); + return p; + }); + + // Test broadcast: add two leading 1 dimensions to A, but not to B. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a = insert_ones(p.shape_a, 2); + p.expected_shape = insert_ones(p.expected_shape, 2); + return p; + }); + + // Test broadcast: add a leading 1 dimension to A and two leading 1s to B. + // Broadcasting A need a dynamic brodacast which will be incompatible with + // FC layer. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a = insert_ones(p.shape_a, 1); + p.shape_b = insert_ones(p.shape_b, 2); + p.expected_shape = insert_ones(p.expected_shape, 2); + return p; + }); + + // Test with N > 1: since weights cannot be batched in implicit batch mode. + // We tests with batch size 2. + std::transform(params_2d.begin(), params_2d.end(), std::back_inserter(params), + [insert_ones](MatMulTestParams p) { + p.shape_a.insert(p.shape_a.begin(), 2); + p.values_a.reserve(p.values_a.size() * 2); + p.values_a.insert(p.values_a.end(), p.values_a.begin(), + p.values_a.end()); + + p.shape_b.insert(p.shape_b.begin(), 2); + p.values_b.reserve(p.values_b.size() * 2); + p.values_b.insert(p.values_b.end(), p.values_b.begin(), + p.values_b.end()); + + p.expected_shape.insert(p.expected_shape.begin(), 2); + p.expected_output.reserve(p.expected_output.size() * 2); + p.expected_output.insert(p.expected_output.end(), + p.expected_output.begin(), + p.expected_output.end()); + return p; + }); + + // 4D tensor where the second "batch dim" is not 1 + params.push_back(MatMulTestParams{ + {1, 2, 4, 5}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}, + false, // A + {1, 2, 3, 5}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}, + true, // B + {1, 2, 4, 3}, + {40, 90, 140, 115, 290, 465, 190, 490, + 790, 265, 690, 1115, 1990, 2540, 3090, 2440, + 3115, 3790, 2890, 3690, 4490, 3340, 4265, 5190}}); // result + + TestMatMulHelper(this, get_batch_matmul_nodedef, params); } -template -void TestConvertBiasAdd(OpConverterTest* test) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertBiasAdd) { + // Note that kINT32 is not supported by IScaleLayer, so we don't test + // DT_INT32 type here. 
DT_FLOAT and DT_HALF are tested. // Get the NodeDef for BiasAdd. - auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef { + auto get_biasadd_nodedef = [](const string& data_format, + DataType tf_type) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto weights = ops::Placeholder(s.WithOpName("weights"), tf_type); const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format); auto biasadd = ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs); return biasadd.operation.node()->def(); }; - typedef typename EnumToDataType::Type CType; for (const string& data_format : {"NHWC", "NCHW"}) { for (const int trt_input_rank : {1, 2, 3, 4}) { - test->Reset(); - NodeDef node_def = get_biasadd_nodedef(data_format); + Reset(); + NodeDef node_def = get_biasadd_nodedef(data_format, tf_type_); // Add input, dims_array will be like {2, 1, ..., 1, 3} - std::vector dims_array(trt_input_rank, 1); + std::vector dims_array(trt_input_rank + 1, 1); if (trt_input_rank == 1) { - dims_array[0] = (data_format == "NHWC" ? 3 : 2); + dims_array[1] = (data_format == "NHWC" ? 3 : 2); } else { - dims_array[0] = 2; - dims_array[trt_input_rank - 1] = 3; + dims_array[1] = 2; + dims_array[trt_input_rank] = 3; } - test->AddTestTensor("input", dims_array, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); + const int64_t num_input = DimsAdapter(dims_array).Volume(); + ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2), + num_input); + std::vector input_data(num_input, 0); + + AddTestTensor("input", dims_array, input_data); - // Add bias weights. const int channel_size = (data_format == "NHWC" ? 3 : 2); - std::vector bias(channel_size); + std::vector bias(channel_size); for (int i = 0; i < channel_size; ++i) { - bias[i] = CType(i + 1); // bias will be {1, 2, 3, ...} + bias[i] = i + 1; // bias will be {1, 2, 3, ...} } - test->AddTestWeights("weights", {channel_size}, bias); - - // Run the conversion. - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions()); + AddTestWeights("weights", {channel_size}, bias, tf_type_); // Build and run the engine. - const int num_input = TrtTensorDimsNumElements(GetTestDims(dims_array)); - ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 
3 : 2), - num_input); + std::vector output_data; - const DataVec input_data{ - {"input", ConstructTensor(num_input, CType(0))}}; - DataVec output_data{{"my_biasadd", ConstructTensor(num_input)}}; - test->BuildAndRun(input_data, &output_data); if (trt_input_rank == 1) { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3))); + output_data = {1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2))); + output_data = {1, 2}; } } else { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3), CType(1), - CType(2), CType(3))); + output_data = {1, 2, 3, 1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(1), CType(1), CType(2), - CType(2), CType(2))); + output_data = {1, 1, 1, 2, 2, 2}; } } + TestOpConverter(node_def, dims_array, Status::OK(), Status::OK(), + ElementsAreArray(output_data)); } } } -TEST_F(OpConverterTest, ConvertBiasAdd) { - // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test - // DT_INT32 type here. - TestConvertBiasAdd(this); - TestConvertBiasAdd(this); -} - template -NodeDef GetBinaryOpNodeDef(const string& input_name_l, - const string& input_name_r, DataType dtype) { +NodeDef GetBinaryOpNodeDef(DataType dtype) { Scope s = Scope::NewRootScope(); - auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype); - auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype); + auto input_l = ops::Placeholder(s.WithOpName("input1"), dtype); + auto input_r = ops::Placeholder(s.WithOpName("input2"), dtype); auto op = OpType(s.WithOpName("my_binary"), input_l, input_r); return op.operation.node()->def(); } - -template -void TestBinaryOp(OpConverterTest* test, bool operand_1_is_tensor, - bool operand_2_is_tensor) { - typedef typename EnumToDataType::Type CType; - test->Reset(); - const NodeDef node_def = - GetBinaryOpNodeDef("input1", "input2", dtype); - if (operand_1_is_tensor) { - test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/2, - TfDataTypeToTrt(dtype)); - } else { - test->AddTestWeights("input1", /*dims=*/{1, 2}, - /*values=*/std::vector{CType(3), CType(6)}); - } - if (operand_2_is_tensor) { - test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/2, - TfDataTypeToTrt(dtype)); - } else { - test->AddTestWeights("input2", /*dims=*/{2, 1}, - /*values=*/std::vector{CType(2), CType(3)}); - } - test->RunValidationAndConversion(node_def); - - DataVec input_data; - if (operand_1_is_tensor) { - input_data.push_back( - {"input1", - test::AsTensor({CType(3), CType(6), CType(3), CType(6)})}); - } - if (operand_2_is_tensor) { - input_data.push_back( - {"input2", - test::AsTensor({CType(2), CType(3), CType(2), CType(3)})}); - } - DataVec output_data{{"my_binary", ConstructTensor(8)}}; - // Check output dims. - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); - // After broadcasting first input becomes {3, 6, 3, 6} and second input - // becomes {2, 3, 2, 3}. - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/2); - if (node_def.op() == "Add") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({5, 8, 6, 9, 5, 8, 6, 9}))); - } else if (node_def.op() == "Sub") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({1, 4, 0, 3, 1, 4, 0, 3}))); - } else if (node_def.op() == "Mul") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray( - CastTestVector({6, 12, 9, 18, 6, 12, 9, 18}))); - } else if (node_def.op() == "Div") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector( - {1.5, 3, 1, 2, 1.5, 3, 1, 2}))); - } else if (node_def.op() == "RealDiv") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector( - {1.5, 3, 1, 2, 1.5, 3, 1, 2}))); - } else if (node_def.op() == "FloorDiv") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray( - CastTestVector({1, 3, 1, 2, 1, 3, 1, 2}))); - } else if (node_def.op() == "Minimum") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({2, 2, 3, 3, 2, 2, 3, 3}))); - } else if (node_def.op() == "Maximum") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({3, 6, 3, 6, 3, 6, 3, 6}))); - } else if (node_def.op() == "Pow") { - ExpectArrayNear( - CastTestVector({9, 36, 27, 216, 9, 36, 27, 216}), - GetSpanForData(output_data[0])); - } else { - ASSERT_TRUE(false); - } + +TEST_P(OpConverter_FP32_FP16_BinaryTest, ConvertBinary) { + using OpFunc = std::function; + std::map>> op_test_info; +#define ADD_OP(name, op, v1, v2, v3, v4, v5, v6, v7, v8) \ + op_test_info[name] = \ + std::make_pair(GetBinaryOpNodeDef, \ + std::vector(v1, v2, v3, v4, v5, v6, v7, v8)) + ADD_OP("Add", ops::Add, {5, 8, 6, 9, 5, 8, 6, 9}); + ADD_OP("AddV2", ops::AddV2, {5, 8, 6, 9, 5, 8, 6, 9}); + ADD_OP("Sub", ops::Sub, {1, 4, 0, 3, 1, 4, 0, 3}); + ADD_OP("Mul", ops::Mul, {6, 12, 9, 18, 6, 12, 9, 18}); + ADD_OP("Div", ops::Div, {1.5, 3, 1, 2, 1.5, 3, 1, 2}); + ADD_OP("RealDiv", ops::RealDiv, {1.5, 3, 1, 2, 1.5, 3, 1, 2}); + ADD_OP("FloorDiv", ops::FloorDiv, {1, 3, 1, 2, 1, 3, 1, 2}); + ADD_OP("Minimum", ops::Minimum, {2, 2, 3, 3, 2, 2, 3, 3}); + ADD_OP("Maximum", ops::Maximum, {3, 6, 3, 6, 3, 6, 3, 6}); + ADD_OP("Pow", ops::Pow, {9, 36, 27, 216, 9, 36, 27, 216}); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + ADD_OP("Greater", ops::Greater, {1, 1, 0, 1, 1, 1, 0, 1}); + ADD_OP("Less", ops::Less, {0, 0, 0, 0, 0, 0, 0, 0}); + ADD_OP("Equal", ops::Equal, {0, 0, 1, 0, 0, 0, 1, 0}); + ADD_OP("GreaterEqual", ops::Less, {1, 1, 1, 1, 1, 1, 1, 1}); + ADD_OP("LessEqual", ops::Greater, {0, 0, 1, 0, 0, 0, 1, 0}); +#endif +#undef ADD_OP + std::vector> data = { + {3, 6, 3, 6}, {3, 6}, {2, 3, 2, 3}, {2, 3}}; + RunTests(*BinaryOperationMap(), op_test_info, data); } -TEST_F(OpConverterTest, ConvertBinary) { - AttrValue dtype; - dtype.set_type(DT_FLOAT); - { - // Both inputs are weights. - Reset(); - NodeDef node_def = - MakeNodeDef("my_add", "Add", {"weights1", "weights2"}, {{"T", dtype}}); - AddTestWeights("weights1", {1}, {1}); - AddTestWeights("weights2", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Constant folding is falled back to TensorFlow, binary op received " - "both input as constant at: my_add"); - } - - // Test combinations of tensor vs weight inputs (except when both inputs are - // weights). 
- for (const bool operand_1_is_tensor : {true, false}) { - for (const bool operand_2_is_tensor : {true, false}) { - if (!operand_1_is_tensor && !operand_2_is_tensor) continue; - // FP32 tests - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - // FP16 tests - // TODO(tmorris): Use templates to avoid duplication. - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - TestBinaryOp(this, operand_1_is_tensor, - operand_2_is_tensor); - } - } +TEST_P(OpConverter_BOOL_BinaryTest, ConvertBooleanBinary) { + using OpFunc = std::function; + std::map>> op_test_info; +#define ADD_OP(name, op, v1, v2, v3, v4, v5, v6, v7, v8) \ + op_test_info[name] = \ + std::make_pair(GetBinaryOpNodeDef, \ + std::vector(v1, v2, v3, v4, v5, v6, v7, v8)) + ADD_OP("LogicalOr", ops::LogicalOr, {1, 1, 0, 1, 1, 1, 0, 1}); + ADD_OP("LogicalAnd", ops::LogicalAnd, {0, 1, 0, 0, 0, 1, 0, 0}); +#undef ADD_OP +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + std::vector> data = { + {0, 1, 0, 1}, {0, 1}, {1, 0, 1, 0}, {1, 0}}; + RunTests(*BinaryBooleanOperationMap(), op_test_info, data); +#endif } NodeDef GetAddNNodeDef(const std::vector& input_names, DataType dtype) { @@ -2253,94 +3490,136 @@ NodeDef GetAddNNodeDef(const std::vector& input_names, DataType dtype) { return op.operation.node()->def(); } -template -void TestAddN(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - { - // All inputs are tensors. - test->Reset(); - DataVec input_data; - for (const auto name : {"inp1", "inp2", "inp3"}) { - test->AddTestTensor(name, /*dims=*/{1, 2}, /*batch_size=*/2, - TfDataTypeToTrt(dtype)); - input_data.push_back({name, test::AsTensor({CType(1), CType(2), - CType(3), CType(4)})}); - } - const NodeDef node_def = GetAddNNodeDef({"inp1", "inp2", "inp3"}, dtype); - test->RunValidationAndConversion(node_def); +struct AddNTestParams { + std::vector input_values; + std::vector input_names; + std::vector dimensions; + std::vector expected_output; + Status status; +}; - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_addn", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); +void TestAddN(ParameterizedOpConverterTestBase* test, AddNTestParams& p) { + // All inputs are tensors. + test->Reset(); + const NodeDef node_def = GetAddNNodeDef(p.input_names, test->get_tf_type()); - DataVec output_data{{"my_addn", ConstructTensor(4)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/2); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({3, 6, 9, 12}))); + if (p.input_values.size() % p.input_names.size() != 0) { + LOG(ERROR) << "The number of input values: `" << p.input_values.size() + << "` is not a multiple of the number of inputs: `" + << p.input_names.size() << "`"; + ASSERT_TRUE(false); } - { - // Input contains tensors and weights. - test->Reset(); - DataVec input_data; - for (const auto name : {"inp1", "inp2"}) { - test->AddTestTensor(name, /*dims=*/{1, 2}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - input_data.push_back({name, test::AsTensor({CType(1), CType(2)})}); - } - test->AddTestWeights("inp3", /*dims=*/{1, 1, 2}, - /*values=*/std::vector{CType(3), CType(4)}); - const NodeDef node_def = GetAddNNodeDef({"inp1", "inp2", "inp3"}, dtype); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_addn", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2}, output.tensor()->getDimensions()); - - DataVec output_data{{"my_addn", ConstructTensor(2)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(CastTestVector({5, 8}))); - } + DataVec input_data; + int input_offset = 0; + const int window_size = p.input_values.size() / p.input_names.size(); + for (const string& name : p.input_names) { + std::vector::const_iterator start_pos = + p.input_values.begin() + input_offset; + std::vector::const_iterator end_pos = start_pos + window_size; + std::vector sub_input_val(start_pos, end_pos); + input_offset += window_size; + + test->AddTestTensor(name, p.dimensions, test->get_tf_type(), sub_input_val); + } + + test->TestOpConverter(node_def, p.dimensions, + /*expected_conversion_status=*/p.status, + /*expected_runtime_status=*/p.status, + /*matcher=*/ElementsAreArray(p.expected_output), + /*out_tf_types=*/{test->get_tf_type()}); } -TEST_F(OpConverterTest, ConvertAddN) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertAddN) { { // Weights with batch dim that is not 1. 
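+    // (This exercises the validation path: in implicit batch mode the leading
+    // dimension of a constant AddN operand is treated as its batch size, and
+    // only a batch of 1 can be broadcast against the tensor input, hence the
+    // expected error below.)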
Reset(); - const NodeDef node_def = GetAddNNodeDef({"tensor", "weights"}, DT_FLOAT); - AddTestTensor("tensor", /*dims=*/{1, 2}, /*batch_size=*/2); + const NodeDef node_def = GetAddNNodeDef({"tensor", "weights"}, tf_type_); + AddTestTensor("tensor", /*dims=*/{1, 2}); AddTestWeights("weights", {2, 1, 2}, {0, 1, 2, 3}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, + node_def, absl::StatusCode::kInvalidArgument, "Weights input to AddN is required to have batch dimension 1."); } - TestAddN(this); - TestAddN(this); + + const std::vector common_input = CreateVectorIota(6); + + std::vector params = { + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{1, 1, 2, 1, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 1, 3, 1, 1}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{1, 2, 1, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 1, 3, 1}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{1, 2, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 1, 3}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_value=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{2, 1}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{1, 3}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3"}, + /*dimensions=*/{2}, + /*expected_output=*/{6, 9}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2"}, + /*dimensions=*/{3}, + /*expected_output=*/{3, 5, 7}, + /*status=*/Status::OK()}, + {/*input_values=*/common_input, + /*input_names=*/{"inp1", "inp2", "inp3", "inp4", "inp5", "inp6"}, + /*dimensions=*/{1}, + /*expected_output=*/{15}, + /*status=*/Status::OK()}, + }; + + for (auto p : params) { + TestAddN(this, p); + } } -TEST_F(OpConverterTest, ConvertQuantize) { - precision_mode_to_test_ = TrtPrecisionMode::INT8; +TEST_P(OpConverter_FP32_Test, ConvertQDQDynamicRangeMode) { { // FakeQuantWithMinMaxArgs attributes are empty, should fail. - Reset(); + Reset(TrtPrecisionMode::INT8); NodeDef node_def = MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"}); AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Min or max attribute not found for FakeQuantWithMinMaxArgs " - "at my_quantize"); + RunValidationAndConversion(node_def, absl::StatusCode::kNotFound, + "No attr named 'min'"); } { // FakeQuantWithMinMaxArgs ranges set via attributes, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); @@ -2358,7 +3637,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // FakeQuantWithMinMaxVars ranges set via inputs, ok. 
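+    // (In this dynamic-range INT8 path the quantize/dequantize ops act as
+    // identity at inference time; conversion only records the [min, max]
+    // range taken from the weight inputs, so the cases here mainly exercise
+    // validation and conversion rather than numerical output.)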
- Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2379,7 +3658,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV2 ranges set via inputs, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2400,7 +3679,7 @@ TEST_F(OpConverterTest, ConvertQuantize) { } { // QuantizeAndDequantizeV2 Range inputs are tensors, should fail. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2411,14 +3690,13 @@ TEST_F(OpConverterTest, ConvertQuantize) { AddTestTensor("input", {1, 2, 3}); AddTestTensor("weights_min", {1}); AddTestTensor("weights_max", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input_min\" for QuantizeAndDequantizeV2 must be a constant" - ", at my_quantize"); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"input_min\" for " + "QuantizeAndDequantizeV2 must be a constant"); } { // QuantizeAndDequantizeV3 ranges set via inputs, ok. - Reset(); + Reset(TrtPrecisionMode::INT8); Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); @@ -2441,80 +3719,409 @@ TEST_F(OpConverterTest, ConvertQuantize) { } } -template -void TestConvertSquare(OpConverterTest* test) { - test->Reset(); - typedef typename EnumToDataType::Type CType; +TEST_P(OpConverter_FP32_FP16_Test, ConvertSquare) { + { + // Input is weights, should fail. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto square = ops::Square(s.WithOpName("my_square"), input); + NodeDef node_def = square.operation.node()->def(); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type_); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"x\" for Square must be a tensor"); + } + + Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); - test->AddTestTensor("input", {1, 20}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions()); - const int num_inputs = 20; - std::vector inputs(num_inputs); - std::vector expected_outputs(num_inputs); + std::vector inputs(num_inputs); + std::vector expected_outputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { - const CType value = CType(i - 9); + const float value = (i - 9); inputs[i] = value; expected_outputs[i] = value * value; } - const DataVec input_data{{"input", test::AsTensor(inputs)}}; - // Engine outputs are converted to FP16 automatically if we set FP16 mode in - // the builder. 
- DataVec output_data{{"my_square", ConstructTensor(num_inputs)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - ExpectArrayNear(expected_outputs, GetSpanForData(output_data[0])); + AddTestTensor("input", {1, 1, 20}, tf_type_, inputs); + + TestOpConverter(node_def, {1, 1, 20}, Status::OK(), Status::OK(), + ArrayFloatNear(expected_outputs, 0)); } -TEST_F(OpConverterTest, ConvertSquare) { - { - // Input is weights, should fail. +// A function that builds the next lexicographically greater configuration +// for the current one. The configuration is described as a (0,1)-vector +// config, where config[i] is 0 or 1 when the i-th parameter is passed as +// a weight or tensor, respectively. The function returns TRUE if such +// a configuration is built, or FALSE otherwise. +bool nextTensorWeightConfiguration(std::vector& config) { + for (int i = config.size(); i-- > 0;) { + if ((config[i] = 1 - config[i])) return true; + } + return false; +} + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertFill) { + Scope s = Scope::NewRootScope(); + auto dims = ops::Placeholder(s.WithOpName("dims"), DT_INT32); + auto value = ops::Placeholder(s.WithOpName("value"), tf_type_); + auto fill = ops::Fill(s.WithOpName("my_fill"), dims, value); + const NodeDef& node_def = fill.operation.node()->def(); + + if (trt_mode_ == TrtTestMode::kImplicitBatch) { Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto square = ops::Square(s.WithOpName("my_square"), input); - NodeDef node_def = square.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}); + // random data + AddTestWeights("dims", {2}, {2, 2}, DT_INT32); + AddTestWeights("value", {1}, {42}, tf_type_); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"x\" for Square must be a tensor, at my_square"); + node_def, absl::StatusCode::kUnimplemented, + convert_not_supported_implicit(node_def.op(), node_def.name())); + return; + } + + std::vector> output_dims_params = { + {8}, {8, 2, 4}, {32, 32, 3200}}; + std::vector> value_dims_params = {{}, {1}}; + + float val = 42.0; + Status status = Status::OK(); + for (bool dims_is_tensor : {true, false}) { + for (bool value_is_tensor : {true, false}) { + for (auto output_dims : output_dims_params) { + for (auto value_dims : value_dims_params) { + Reset(); + std::vector dims_dims = { + static_cast(output_dims.size())}; + if (dims_is_tensor) { + AddTestTensor("dims", dims_dims, DT_INT32, output_dims, dims_dims); + } else { + AddTestWeights("dims", dims_dims, output_dims, DT_INT32); + } + if (value_is_tensor) { + AddTestTensor("value", value_dims, tf_type_, + {static_cast(val)}); + } else { + AddTestWeights("value", value_dims, {static_cast(val)}, + tf_type_); + } + size_t nb_el = 1; + for (auto d : output_dims) { + nb_el *= d; + } + std::vector expected_output(nb_el, val); + TestOpConverter(node_def, output_dims, status, status, + ElementsAreArray(expected_output)); + } + } + } } +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertRange) { + auto get_casted_value = [this](const float value, const DataType dtype) { + return dtype == DT_INT32 ? 
static_cast(value) : value; + }; + + auto set_parameters = [this](const std::array& name, + const std::array, 3>& value, + const std::array& type, + const std::vector& config, + int shape_idx = -1) { + Reset(); + for (int i = 0; i < 3; i++) { + if (config[i]) { + std::vector partial_shape_dims = {}; + // The correct partial shape will be provided + // (a) for all parameters, when shape_idx > 3 + // (b) for all parameters, except shape_idx, when shape_idx >= 0 + // (c) for none of the shape_idx < 0 + if (shape_idx > 3 || (shape_idx >= 0 && shape_idx != i)) { + partial_shape_dims = {1}; + } + AddTestTensor(name[i], {1}, type[i], value[i], partial_shape_dims); + } else { + AddTestWeights(name[i], {1}, value[i], type[i]); + } + } + }; + + const float start = 1.0; + const float limit = 43.0; + const float delta = 2.0; + + const std::array param_name = {"start", "limit", "delta"}; + std::array, 3> param_value; + param_value[0] = {start}; + param_value[1] = {limit}; + param_value[2] = {delta}; + const auto start_type = tf_type_; + std::array param_type = {tf_type_, tf_type_, tf_type_}; + + Scope s = Scope::NewRootScope(); + const auto range = + ops::Range(s.WithOpName("my_range"), + ops::Placeholder(s.WithOpName(param_name[0]), param_type[0]), + ops::Placeholder(s.WithOpName(param_name[1]), param_type[1]), + ops::Placeholder(s.WithOpName(param_name[2]), param_type[2])); + + const NodeDef& ndef = range.operation.node()->def(); + const std::vector param_types{DT_FLOAT, DT_HALF, DT_INT32}; + + // ConverterRange is not implemented for Implicite batch mode. + std::vector config(3, 0); + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + const auto& err = convert_not_supported_implicit(ndef.op(), ndef.name()); + do { + set_parameters(param_name, param_value, param_type, config); + RunValidationAndConversion(ndef, absl::StatusCode::kUnimplemented, err); + } while (nextTensorWeightConfiguration(config)); + + return; + } + + const auto& expect_msg = convert_range_expected_msg(ndef); + bool all_weights = true; + do { + for (auto limit_type : param_types) { + param_type[1] = limit_type; + for (auto delta_type : param_types) { + param_type[2] = delta_type; + + const auto all_integers = start_type == DT_INT32 && + limit_type == DT_INT32 && + delta_type == DT_INT32; + + if (all_weights || (all_integers && !config[2])) { + // Reject invalid parameters if delta = 0 and it's passed as a weight. + param_value[2] = {0}; + set_parameters(param_name, param_value, param_type, config); + RunValidationAndConversion( + ndef, absl::StatusCode::kInvalidArgument, + "The delta parameter of Range operation cannot be equal to 0"); + + if (!all_weights && !config[2]) { + param_value[2] = {-1}; + set_parameters(param_name, param_value, param_type, config); + const string err = StrCat( + "The delta parameter of Range operation " + "cannot be negative, when one of (start, limit) is passed as " + "a tensor, but got ", + param_value[2][0]); + RunValidationAndConversion(ndef, absl::StatusCode::kInvalidArgument, + err); + } + } + + if (all_weights) { + // Reject invalid parameters preventing the limit from + // being reached for fixed values of start and delta. 
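+            // (Concretely, the loop below builds start = 1, limit = 43 with
+            // delta = -2, and start = 43, limit = 1 with delta = 2; in both
+            // cases the sequence can never reach the limit, so conversion is
+            // expected to fail with convert_range_error_msg.)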
+ for (int j = 0; j <= 1; j++) { + param_value[j] = {get_casted_value(start, tf_type_)}; + param_value[1 - j] = {get_casted_value(limit, limit_type)}; + param_value[2] = {(2 * j - 1) * + get_casted_value(delta, delta_type)}; + set_parameters(param_name, param_value, param_type, config); + const auto error = convert_range_error_msg( + param_value[0][0], param_value[1][0], param_value[2][0]); + RunValidationAndConversion(ndef, absl::StatusCode::kInvalidArgument, + error); + } + } + + param_value[0] = {start}; + param_value[2] = {delta}; + if (all_integers) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Wrong dimension for the parameter passed as a tensor. + for (int j = 0; j < 3; j++) { + if (!config[j]) continue; + + const string err = + StrCat("Dimension for '", param_name[j], + "' of Range operator should be equal to 1"); + set_parameters(param_name, param_value, param_type, config, j); + RunValidationAndConversion( + ndef, absl::StatusCode::kInvalidArgument, err); + } + } + } else { + if (!all_weights) { + // The following test should fail, when + // (a) at least one parameter is passed as a tensor; + // (b) at least one parameter is not of type DT_INT32. + set_parameters(param_name, param_value, param_type, config); + RunValidationAndConversion(ndef, absl::StatusCode::kUnimplemented, + expect_msg); + } + } + } + } + // All other configs will be set so that at least one parameter + // will be passed as a tensor + all_weights = false; + } while (nextTensorWeightConfiguration(config)); + + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(DT_BOOL, &trt_type)); + const std::string error_msg = + "Unsupported data type " + DebugString(trt_type) + " used for '"; + do { + for (auto limit_type : param_types) { + param_type[1] = limit_type; + for (auto delta_type : param_types) { + param_type[2] = delta_type; + + for (int i = 0; i < 3; i++) { + if (!config[i]) { + const auto saved_type = param_type[i]; + param_type[i] = DT_BOOL; + set_parameters(param_name, param_value, param_type, config); + param_type[i] = saved_type; + RunValidationAndConversion(ndef, absl::StatusCode::kInvalidArgument, + error_msg + param_name[i] + "'"); + } + } + } + } + } while (nextTensorWeightConfiguration(config)); + + // The tests that pass all checks in ConvertRange::Validate(). + const Status status = Status::OK(); + const std::vector int_type{DT_INT32}; + int partial_shape_idx = -1; + all_weights = true; + do { + // For now when at least one of (start, limit, delta) is passed as a tensor + // (a) all these parameters should be of DT_INT32 type; + // (b) only positive delta could be used. + const auto& types = all_weights ? param_types : int_type; + const auto jEnd = all_weights ? 1 : 0; + for (auto limit_type : types) { + param_type[1] = limit_type; + for (auto delta_type : types) { + param_type[2] = delta_type; + // Loop for positive and negative deltas. + for (int j = 0; j <= jEnd; j++) { + // Define the expected result which should match the usage + // of DT_INT32 for one of (start, limit, delta). + const int mult = (1 - 2 * j); + param_value[j] = {get_casted_value(start, tf_type_)}; + param_value[1 - j] = {get_casted_value(limit, limit_type)}; + param_value[2] = {mult * get_casted_value(delta, delta_type)}; + + // Create expected output. 
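+            // (The number of elements follows the tf.range definition,
+            // num_values = ceil((limit - start) / delta); e.g. start = 1,
+            // limit = 43, delta = 2 gives the 21 values 1, 3, ..., 41, which
+            // is what the loop below reproduces element by element.)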
+ std::vector expected_output; + const float limit_curr = param_value[1][0]; + const float delta_curr = param_value[2][0]; + float value = param_value[0][0]; + int num_values = 0; + while (mult * (limit_curr - value) > 0) { + num_values++; + expected_output.push_back(value); + value += delta_curr; + } + + set_parameters(param_name, param_value, param_type, config, + partial_shape_idx); + const std::vector output_dims = {num_values}; + TestOpConverter(ndef, output_dims, status, status, + ElementsAreArray(expected_output)); + } + } + } + + if (all_weights) { + if (start_type != DT_INT32) break; + if (trt_mode_ == TrtTestMode::kDynamicShape) partial_shape_idx = 3; + + // All other configs will be set so that at least one parameter + // will be passed as a tensor + all_weights = false; + } + } while (nextTensorWeightConfiguration(config)); +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertLikeOps) { + auto get_node = [&](int value) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + if (value == 0) { + auto zeros_like = ops::ZerosLike(s.WithOpName("Zeros"), input); + return zeros_like.operation.node()->def(); + } + auto ones_like = ops::OnesLike(s.WithOpName("Ones"), input); + return ones_like.operation.node()->def(); + }; + + for (int value : {0, 1}) { + Reset(); + const NodeDef& node_def = get_node(value); + + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + std::vector input_data(8, 42.0f); + AddTestTensor("input", {8}, tf_type_, input_data); + const auto& err = convert_not_supported_implicit(node_def.name() + "Like", + node_def.name()); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + err); + continue; + } - // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't - // test DT_INT32 type here. - TestConvertSquare(this); - TestConvertSquare(this); + std::vector> output_dims_params = { + {8}, {8, 2, 4}, {32, 32, 3200}}; + + float val = 42.0; + Status status = Status::OK(); + for (bool input_is_tensor : {true, false}) { + for (auto output_dims : output_dims_params) { + Reset(); + size_t nb_el = 1; + for (auto d : output_dims) { + nb_el *= d; + } + std::vector input_data(nb_el, val); + if (input_is_tensor) { + AddTestTensor("input", output_dims, tf_type_, input_data); + } else { + AddTestWeights("input", output_dims, input_data, tf_type_); + } + std::vector expected_output(nb_el, value); + TestOpConverter(node_def, output_dims, status, status, + ElementsAreArray(expected_output)); + } + } + } } -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -// TODO: @mconley @jdekhtiar - Reactivate when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -TEST_F(OpConverterTest, ConvertCombinedNMS) { +#endif // IS_TRT_VERSION_GE(8, 2, 0, 0) + +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + +TEST_P(OpConverter_FP32_Test, ConvertCombinedNMS) { // Get the NodeDef for CombinedNMS. 
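+ // CombinedNonMaxSuppression returns four outputs: nmsed_boxes,
+ // nmsed_scores, nmsed_classes and valid_detections. The cases below also
+ // exercise the clip_boxes attribute (box coordinates clamped to [0, 1], see
+ // the "clip boxes" case) and pad_per_class (output size capped at
+ // max_output_size_per_class * num_classes, see "per class size and pad").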
- auto get_nms_nodedef = []() -> NodeDef { + auto get_nms_nodedef = [](DataType tf_type, bool clip_boxes = true, + bool pad_per_class = false) -> NodeDef { Scope s = Scope::NewRootScope(); - auto boxes_tensor = ops::Placeholder(s.WithOpName("boxes"), DT_FLOAT); - auto scores_tensor = ops::Placeholder(s.WithOpName("scores"), DT_FLOAT); + auto boxes_tensor = ops::Placeholder(s.WithOpName("boxes"), tf_type); + auto scores_tensor = ops::Placeholder(s.WithOpName("scores"), tf_type); auto max_output_size_per_class = ops::Placeholder(s.WithOpName("max_output_size_per_class"), DT_INT32); auto max_total_size = ops::Placeholder(s.WithOpName("max_total_size"), DT_INT32); auto iou_threshold = - ops::Placeholder(s.WithOpName("iou_threshold"), DT_FLOAT); + ops::Placeholder(s.WithOpName("iou_threshold"), tf_type); auto score_threshold = - ops::Placeholder(s.WithOpName("score_threshold"), DT_FLOAT); - auto nms_attrs = ops::CombinedNonMaxSuppression::Attrs().PadPerClass(false); + ops::Placeholder(s.WithOpName("score_threshold"), tf_type); + auto nms_attrs = ops::CombinedNonMaxSuppression::Attrs() + .PadPerClass(pad_per_class) + .ClipBoxes(clip_boxes); auto nms_op = ops::CombinedNonMaxSuppression( s.WithOpName("my_nms"), boxes_tensor, scores_tensor, @@ -2524,212 +4131,376 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { }; struct TestParams { + const std::string description; const std::vector boxes_tensor_dims; const std::vector scores_tensor_dims; + const std::vector boxes_values; + const std::vector scores_values; const int32 max_output_size_per_class; const int32 max_total_size; const float iou_threshold; const float score_threshold; - const std::vector expected_nmsed_boxes_dims; - const std::vector expected_nmsed_scores_dims; - const std::vector expected_nmsed_classes_dims; + const bool pad_per_class; + const bool clip_boxes; + const std::vector> expected_output_dims; + const std::vector exp_boxes; + const std::vector exp_scores; + const std::vector exp_classes; + const std::vector exp_num_detections; + Status conversion_status; + Status runtime_status; }; - // Ok. - const int kCombinedNMSOKCases = 1; - TestParams ok_params[kCombinedNMSOKCases] = { +#if IS_TRT_VERSION_GE(8, 2, 1, 6) || defined(TF_TRT_USE_EFFICIENT_NMS_PLUGIN) + Status conv_status = + trt_mode_ == TrtTestMode::kImplicitBatch + ? 
errors::Unimplemented(convert_not_supported_implicit( + "CombinedNonMaxSuppression", "my_nms")) + : Status::OK(); + + std::vector params = { + TestParams{"Test 1: clip boxes", + {1, 1, 3, 4}, // boxes dims + {1, 1, 3}, // scores dims + // boxes values: + {0, 0, 0.3, 1.4, 0, 0, 0.3, 1.4, 0, 0, 0.3, 1.4}, + {0.4, 0.7, 0.3}, // scores values + 3, // max_output_size_per_class + 2, // max_total_size + 0.1, // IOU threshold + 0, // score_threshold + false, // pad_per_class + true, // clip_boxes + {{1, 2, 4}, // expected_nmsed_boxes_dims + {1, 2}, // expected_nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 0.3, 1.0, 0, 0, 0.3, 1.0}, + {0.7, 0.4}, // exp_scores + {1, 0}, // exp_classes + {2}, // exp_num_detections + conv_status}, + TestParams{ + "Test 2: iou threshold", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 1, 5, 11, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.7, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 3, 2, 1}, // exp_scores + {0, 0, 0, 0}, // exp_classes + {4}, // exp_num_detections + conv_status}, + TestParams{ + "Test 3: score threshold", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 1, 5, 11, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 2, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0}, + {5, 3, 0, 0}, // exp_scores + {0, 0, 0, 0}, // exp_classes + {2}, // exp_num_detections + conv_status}, + TestParams{ + "Test 4: per class size and pad", + {1, 5, 1, 4}, // boxes dims + {1, 5, 2}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 1, 5, 11, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + // scores values: + {5, 0, 0, 4, 3, 0, 2, 0, 1, 0}, + 1, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + true, // pad_per_class + false, // clip_boxes + {{1, 2, 4}, // expected nmsed_boxes_dims + {1, 2}, // expected nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 0, 1, 5, 11}, + {5, 4}, // exp_scores + {0, 1}, // exp_classes + {2}, // exp_num_detections + conv_status}, + TestParams{ + "Test 5: different box coordinate order", + {1, 5, 1, 4}, // boxes dims + {1, 5, 2}, // scores dims + // boxes values: + {5, 10, 0, 0, 5, 11, 0, 1, 12, 4, 8, 0, 10, 6, 6, 2, 11, 12, 8, 9}, + // scores values: + {5, 0, 0, 4, 3, 0, 2, 0, 1, 0}, + 1, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + true, // pad_per_class + false, // clip_boxes + {{1, 2, 4}, // expected nmsed_boxes_dims + {1, 2}, // expected nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // 
expected_valid_detections_dims + // exp_boxes_values: + {5, 10, 0, 0, 5, 11, 0, 1}, + {5, 4}, // exp_scores + {0, 1}, // exp_classes + {2}, // exp_num_detections + conv_status}, + }; +#else // IS_TRT_VERSION_GE(7, 1, 3, 0) + Status conv_status = + trt_mode_ == TrtTestMode::kDynamicShape + ? errors::Unimplemented( + "TensorRT BatchedNMS Plugin requires input with static shape") + : Status::OK(); + + std::vector params = { // TODO(aaroey): there is a bug in TRT's CombinedNonMaxSuppression // implementation that, the extra output classes that are outside of the // range specified by valid_detections[i] are not zeros but -1s. - TestParams{{1, 1, 4}, {1, 3}, 3, 2, .5f, 0, {2, 4}, {2}, {2}}}; + TestParams{ + "Test 1: Original test", + {1, 1, 3, 4}, // boxes dims + {1, 1, 3}, // scores dims + {0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4}, // boxes values + {0.4, 0.7, 0.3}, // scores values + 3, // max_output_size_per_class + 2, // max_total_size + .5f, // IOU threshold + 0, // score_threshold + false, // pad_per_class + true, // clip_boxes + {{1, 2, 4}, // expected_nmsed_boxes_dims + {1, 2}, // expected_nmsed_scores_dims + {1, 2}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + {0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4}, // exp_boxes_values + {0.7, 0.4}, // exp_scores + {1, 0}, // exp_classes + {2}, // exp_num_detections + conv_status}, + // Test with clip_boxes = False + TestParams{ + "Test 2: clip_boxes", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 4, 5, 14, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 8, 9, 11, 12, 0, 0, 0, 0}, + {5, 3, 1, 0}, // exp_scores + {0, 0, 0, -1}, // exp_classes + {3}, // exp_num_detections + conv_status}, + // Test with clip_boxes = False, and nonzero score threshold + TestParams{ + "Test 3: score threshold", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {0, 0, 5, 10, 0, 4, 5, 14, 8, 0, 12, 4, 6, 2, 10, 6, 8, 9, 11, 12}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 2, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {0, 0, 5, 10, 8, 0, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0}, + {5, 3, 0, 0}, // exp_scores + {0, 0, -1, -1}, // exp_classes + {2}, // exp_num_detections + conv_status}, + // Test where the boxes are defined as with max value first for the box + // coordinates. This test fails before TRT 7.1.3. 
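+ // The expected results match the normally ordered boxes in the case above,
+ // so the converter/plugin is expected to normalize the coordinate order
+ // before computing IOU.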
+ TestParams{ + "Test 4: max coord first", + {1, 5, 1, 4}, // boxes dims + {1, 5, 1}, // scores dims + // boxes values: + {5, 10, 0, 0, 5, 14, 0, 4, 12, 4, 8, 0, 10, 6, 6, 2, 11, 12, 8, 9}, + {5, 4, 3, 2, 1}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {{1, 4, 4}, // expected nmsed_boxes_dims + {1, 4}, // expected nmsed_scores_dims + {1, 4}, // expected_nmsed_classes_dims + {1}}, // expected_valid_detections_dims + // exp_boxes_values: + {5, 10, 0, 0, 12, 4, 8, 0, 11, 12, 8, 9, 0, 0, 0, 0}, + {5, 3, 1, 0}, // exp_scores + {0, 0, 0, -1}, // exp_classes + {3}, // exp_num_detections + conv_status}, + TestParams{"Test 5: TopK error", + {1, 5000, 1, 4}, // boxes dims + {1, 5000, 1}, // scores dims + {}, // boxes values: + {}, // scores values + 4, // max_output_size_per_class + 4, // max_total_size + 0.1, // IOU threshold + 0, // score threshold + false, // pad_per_class + false, // clip_boxes + {}, // expected_valid_detections_dims + {}, // exp_boxes_values + {}, // exp_scores + {}, // exp_classes + {}, // exp_num_detections + conv_status.ok() + ? errors::InvalidArgument( + "TRT NMS plugin allow top_k<=4096, where top_k = " + "max(num_boxes, max_total_size). You can override " + "this by setting TF_TRT_ALLOW_NMS_TOPK_OVERRIDE=1 " + "environment variable, but this can result in a " + "loss of accuracy.") + : conv_status}, + }; +#endif - for (int i = 0; i < kCombinedNMSOKCases; ++i) { + for (auto p : params) { Reset(); - - AddTestTensor("boxes", ok_params[i].boxes_tensor_dims); - AddTestTensor("scores", ok_params[i].scores_tensor_dims); + SCOPED_TRACE(p.description); + AddTestTensor("boxes", p.boxes_tensor_dims, p.boxes_values); + AddTestTensor("scores", p.scores_tensor_dims, p.scores_values); AddTestWeights("max_output_size_per_class", {1}, - {ok_params[i].max_output_size_per_class}); - AddTestWeights("max_total_size", {1}, {ok_params[i].max_total_size}); - AddTestWeights("iou_threshold", {1}, {ok_params[i].iou_threshold}); - AddTestWeights("score_threshold", {1}, - {ok_params[i].score_threshold}); - - RunValidationAndConversion(get_nms_nodedef()); - - TRT_TensorOrWeights nmsed_boxes; - TRT_TensorOrWeights nmsed_scores; - TRT_TensorOrWeights nmsed_classes; - TRT_TensorOrWeights valid_detections; - - TF_EXPECT_OK(GetTensorOrWeights("my_nms", &nmsed_boxes)); - TF_EXPECT_OK(GetTensorOrWeights("my_nms:1", &nmsed_scores)); - TF_EXPECT_OK(GetTensorOrWeights("my_nms:2", &nmsed_classes)); - TF_EXPECT_OK(GetTensorOrWeights("my_nms:3", &valid_detections)); - - ASSERT_TRUE(nmsed_boxes.is_tensor()); - ASSERT_TRUE(nmsed_scores.is_tensor()); - ASSERT_TRUE(nmsed_classes.is_tensor()); - ASSERT_TRUE(valid_detections.is_tensor()); - - ExpectTrtDimsEqualsArray(ok_params[i].expected_nmsed_boxes_dims, - nmsed_boxes.tensor()->getDimensions()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_nmsed_scores_dims, - nmsed_scores.tensor()->getDimensions()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_nmsed_classes_dims, - nmsed_classes.tensor()->getDimensions()); - ExpectTrtDimsEqualsArray({}, valid_detections.tensor()->getDimensions()); - - DataVec output_data{ - {"my_nms", ConstructTensor(8)}, - {"my_nms:1", ConstructTensor(2)}, - {"my_nms:2", ConstructTensor(2)}, - {"my_nms:3", ConstructTensor(1)}, - }; - const DataVec input_data{ - {"boxes", test::AsTensor({0, 0, 0.3, 0.4})}, - {"scores", test::AsTensor({0.4, 0.7, 0.3})}}; - BuildAndRun(input_data, &output_data); - 
EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(0, 0, 0.3, 0.4, 0, 0, 0.3, 0.4)); - EXPECT_THAT(GetSpanForData(output_data[1]), ElementsAre(0.7, 0.4)); - EXPECT_THAT(GetSpanForData(output_data[2]), ElementsAre(1, 0)); - EXPECT_THAT(GetSpanForData(output_data[3]), ElementsAre(2)); + {p.max_output_size_per_class}); + AddTestWeights("max_total_size", {1}, {p.max_total_size}); + AddTestWeights("iou_threshold", {1}, {p.iou_threshold}, tf_type_); + AddTestWeights("score_threshold", {1}, {p.score_threshold}, + tf_type_); + + auto node_def = get_nms_nodedef(tf_type_, p.clip_boxes, p.pad_per_class); + + TestOpConverterMultiOut(node_def, p.expected_output_dims, + p.conversion_status, p.runtime_status, + { + ElementsAreArray(p.exp_boxes), + ElementsAreArray(p.exp_scores), + ElementsAreArray(p.exp_classes), + ElementsAreArray(p.exp_num_detections), + }, + {tf_type_, tf_type_, tf_type_, DT_INT32}); } } +#endif -#endif // TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -#endif // CombinedNonMaxSuppression +template +NodeDef CreateUnaryOp(DataType tf_type) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + return T(s.WithOpName("my_unary"), input).operation.node()->def(); +} -TEST_F(OpConverterTest, ConvertActivation) { - { - // Input is weights, should fail. - Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto relu = ops::Relu(s.WithOpName("my_act"), input); - const NodeDef& node_def = relu.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Relu must be a tensor, at my_act"); - } +constexpr float kLeakyReluAlpha = 0.2f; +template <> +NodeDef CreateUnaryOp(DataType tf_type) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + return ops::internal::LeakyRelu( + s.WithOpName("my_unary"), input, + ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)) + .operation.node() + ->def(); +} - constexpr float kLeakyReluAlpha = 0.2f; +TEST_P(OpConverter_FP32_UnaryTest, ConvertActivation) { constexpr float kSeluAlpha = 1.7580993408473768599402175208123f; constexpr float kSeluScale = 1.0507009873554804934193349852946f; + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; + +#define ADD_OP(name, op, compute) \ + op_map[name] = std::make_pair(CreateUnaryOp, compute) + ADD_OP("LeakyRelu", ops::internal::LeakyRelu, + [](float x) { return (x > 0.0f) ? x : x * kLeakyReluAlpha; }); + ADD_OP("Relu", ops::Relu, [](float x) { return (x > 0.0f) ? x : 0.0f; }); + ADD_OP("Relu6", ops::Relu6, + [](float x) { return std::min(std::max(x, 0.0f), 6.0f); }); + ADD_OP("Sigmoid", ops::Sigmoid, + [](float x) { return 1.0f / (1.0f + std::exp(-x)); }); + ADD_OP("Tanh", ops::Tanh, static_cast(std::tanh)); + ADD_OP("Elu", ops::Elu, + [](float x) { return (x > 0.0f) ? x : std::exp(x) - 1; }); + ADD_OP("Selu", ops::Selu, [](float x) { + return (x > 0.0f) ? kSeluScale * x + : kSeluScale * kSeluAlpha * (std::exp(x) - 1); + }); + ADD_OP("Softsign", ops::Softsign, + [](float x) { return x / (std::abs(x) + 1); }); + ADD_OP("Softplus", ops::Softplus, + [](float x) { return std::log(std::exp(x) + 1); }); +#undef ADD_OP + + // std::exp in Softplus will overflow for input > 88 + const std::vector input = {-100, -2, -1, 0, 1, 88}; + const bool nan_sensitive = false; - // Get nodedef for activation layer. 
- auto get_act_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "LeakyRelu") { - auto act = ops::internal::LeakyRelu( - s.WithOpName("my_act"), input, - ops::internal::LeakyRelu::Alpha(kLeakyReluAlpha)); - return act.operation.node()->def(); - } else if (op_name == "Relu") { - auto act = ops::Relu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Relu6") { - auto act = ops::Relu6(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Sigmoid") { - auto act = ops::Sigmoid(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Tanh") { - auto act = ops::Tanh(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Elu") { - auto act = ops::Elu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Selu") { - auto act = ops::Selu(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softsign") { - auto act = ops::Softsign(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } else if (op_name == "Softplus") { - auto act = ops::Softplus(s.WithOpName("my_act"), input); - return act.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for activation layer. - auto get_act_output = [](string op_name, float input) -> float { - if (op_name == "LeakyRelu") { - return (input > 0.0f) ? input : input * kLeakyReluAlpha; - } else if (op_name == "Relu") { - return (input > 0.0f) ? input : 0.0f; - } else if (op_name == "Relu6") { - return std::min(std::max(input, 0.0f), 6.0f); - } else if (op_name == "Sigmoid") { - return 1.0f / (1.0f + std::exp(-input)); - } else if (op_name == "Tanh") { - return std::tanh(input); - } else if (op_name == "Elu") { - return (input > 0.0f) ? input : std::exp(input) - 1; - } else if (op_name == "Selu") { - return (input > 0.0f) ? kSeluScale * input - : kSeluScale * kSeluAlpha * (std::exp(input) - 1); - } else if (op_name == "Softsign") { - return input / (std::abs(input) + 1); - } else if (op_name == "Softplus") { - return std::log(std::exp(input) + 1); - } - EXPECT_TRUE(false); - return 0; - }; - - // Get list of ops to test. - std::vector ops_to_test; - // Add all ops supported by ConvertUnary. - auto* map = ActivationTypeMap(); - ops_to_test.reserve(map->size()); - for (auto& pair : *map) { - ops_to_test.push_back(pair.first); - } - // Add other activation ops to test. - ops_to_test.push_back("Relu6"); - ops_to_test.push_back("LeakyRelu"); - // Ok. - for (const string& op_name : ops_to_test) { - Reset(); - NodeDef node_def = get_act_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_act", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - // Certain activations should set quantization range automatically. 
- auto ranges = quantization_ranges(); - if (op_name == "Relu6") { - EXPECT_EQ(ranges[output.tensor()->trt_tensor()], 6.0f); - } else if (op_name == "Sigmoid" || op_name == "Tanh" || - op_name == "Softsign") { - EXPECT_EQ(ranges[output.tensor()->trt_tensor()], 1.0f); - } - - // std::exp in Softplus will overflow for input > 88 - const std::vector input = {-100, -2, -1, 0, 1, 88}; - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_act", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); i++) { - const float expected_output = get_act_output(op_name, input[i]); - EXPECT_NEAR(GetSpanForData(output_data[0])[i], expected_output, - 1e-4); - } - } +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + // NVBug # 3322482 - Known bug with TRT 8.0 on specific GPU architectures + const float max_abs_error = 1e-4; +#else + const float max_abs_error = 0.; +#endif + RunTests("Activation", *ActivationTypeMap(), op_map, input, "input", + max_abs_error, nan_sensitive); } -TEST_F(OpConverterTest, ConvertExpandDims) { +TEST_P(OpConverter_FP32_Test, ConvertExpandDims) { // Get the NodeDef for ExpandDims. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto expanddims = ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights); @@ -2739,227 +4510,241 @@ TEST_F(OpConverterTest, ConvertExpandDims) { Reset(); AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestWeights("weights", {1}, {1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"input\" for ExpandDims must be a " - "tensor, at my_expanddims"); + "tensor"); } { // Axis is a tensor, should fail. Reset(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 2, 1}); AddTestTensor("weights", {3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"axis\" for ExpandDims must be a " - "constant, at my_expanddims"); - } - { - // Add dim at batch dimension, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("weights", {1}, {0}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the batch dimension, at " - "my_expanddims"); - } - { - // Add dim at batch dimension via negative axis, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - // Input is rank 4 (batch dim included) - AddTestWeights("weights", {1}, {-5}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the batch dimension, at " - "my_expanddims"); - } - { - // Axis > rank(input), should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}); - // Input is rank 4 (batch dim included) - AddTestWeights("weights", {1}, {5}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of 5 is out of bounds, must be in range [-5, 5), at " - "my_expanddims"); - } - { - // Axis < -rank(input)-1, should fail. 
- Reset(); - AddTestTensor("input", {1, 2, 3}); - // Input is rank 4 (batch dim included) - AddTestWeights("weights", {1}, {-6}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of -6 is out of bounds, must be in range [-5, 5), at " - "my_expanddims"); + "constant"); } - - struct TestParams { - std::vector input_dims; - int axis; - std::vector expected_output_dims; + std::vector test_params = { + TestParamBase{{1, 1, 2, 3}, + {}, + {1, 1, 1, 2, 3}, + {0}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status(absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, + {}, + {1, 1, 1, 2, 3}, + {-5}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status(absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension") + : Status::OK()}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {5}, + Status(absl::StatusCode::kInvalidArgument, + "Axis value of 5 is out of bounds, must be in range" + " [-5, 5)")}, + TestParamBase{{1, 1, 2, 3}, + {}, + {}, + {-6}, + Status(absl::StatusCode::kInvalidArgument, + "Axis value of -6 is out of bounds, must be in range" + " [-5, 5)")}, + TestParamBase{{1, 2, 3}, {}, {1, 1, 2, 3}, {1}}, + TestParamBase{{1, 2, 3}, {}, {1, 1, 2, 3}, {-3}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 3, 1}, {3}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 3, 1}, {-1}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 1, 3}, {2}}, + TestParamBase{{1, 2, 3}, {}, {1, 2, 1, 3}, {-2}}, + TestParamBase{{1, 6}, {}, {1, 1, 6}, {1}}, + TestParamBase{{1, 6}, {}, {1, 6, 1}, {-1}}, }; - - // Ok. - const int kExpandDimsOKCases = 8; - TestParams ok_params[kExpandDimsOKCases] = { - TestParams{{2, 3}, 1, {1, 2, 3}}, TestParams{{2, 3}, -3, {1, 2, 3}}, - TestParams{{2, 3}, 3, {2, 3, 1}}, TestParams{{2, 3}, -1, {2, 3, 1}}, - TestParams{{2, 3}, 2, {2, 1, 3}}, TestParams{{2, 3}, -2, {2, 1, 3}}, - TestParams{{6}, 1, {1, 6}}, TestParams{{6}, -1, {6, 1}}, - }; - for (int i = 0; i < kExpandDimsOKCases; ++i) { + for (auto p : test_params) { Reset(); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", {1}, {ok_params[i].axis}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_expanddims", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_expanddims", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 2, 3, 4, 5, 6)); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}); + AddTestWeights("weights", {1}, {p.param[0]}); + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray({1, 2, 3, 4, 5, 6})); } } -TEST_F(OpConverterTest, ConvertSqueeze) { - { - // No attrs, should fail. - Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input); - const NodeDef& node_def = squeeze.operation.node()->def(); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Squeeze is only implemented for explicit dims, at my_squeeze"); - } - - // Get the NodeDef for Squeeze. 
- auto get_squeeze_nodedef = [](std::vector axis) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - ops::Squeeze::Attrs squeeze_attrs; - squeeze_attrs.axis_ = gtl::ArraySlice(axis); // non-absl ok - auto squeeze = - ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs); - return squeeze.operation.node()->def(); - }; +TEST_P(OpConverter_FP32_FP16_Test, ConvertSoftmax) { + // Get the NodeDef for SoftMax. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("logits"), tf_type_); + auto softmax = ops::Softmax(s.WithOpName("my_softmax"), input); + const NodeDef& node_def = softmax.operation.node()->def(); - { - // Input is weights, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({0}); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Squeeze must be a tensor, at my_squeeze"); - } - { - // Squeeze batch dim, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({0}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_squeeze"); - } - { - // Squeeze batch dim via negative axis, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({-4}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_squeeze"); - } - { - // Squeeze >= rank(input), should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({4}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in range [-4, 4), at " - "my_squeeze"); - } - { - // Squeeze < -rank(input), should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({-5}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in range [-4, 4), at " - "my_squeeze"); - } - { - // Squeeze an axis with size != 1, should fail. - Reset(); - NodeDef node_def = get_squeeze_nodedef({2}); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 2 with size 2 cannot be squeezed because it must be size 1, " - "at my_squeeze"); + struct TestParams { + std::vector input_dims; + std::vector expected_values; + }; + std::vector test_params = { + TestParams{/*input_dims=*/{2, 3}, + /*expected_values=*/{0.09003057, 0.24472848, 0.66524094, + 0.09003057, 0.24472848, 0.66524094}}, + TestParams{/*input_dims=*/{6, 1}, + /*expected_values=*/{1, 1, 1, 1, 1, 1}}, // works w/ std input + TestParams{/*input_dims=*/{1, 6}, // this works w/ arange(1,7) input + /*expected_values=*/{0.00426978, 0.01160646, 0.03154963, + 0.08576079, 0.23312202, 0.6336913}}}; + std::vector input_values{1, 2, 3, 4, 5, 6}; + for (auto p : test_params) { + Reset(); + AddTestTensor("logits", p.input_dims, input_values); + TestOpConverter(node_def, p.input_dims, Status::OK(), Status::OK(), + ArrayFloatNear(p.expected_values, 1e-3)); } +} + +TEST_P(OpConverter_FP32_FP16_Test, ConvertLogSoftmax) { + // Get the NodeDef for LogSoftMax. 
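+ // log_softmax(x_i) = x_i - log(sum_j exp(x_j)) along the last axis. For an
+ // input row {1, 2, 3} this gives {1, 2, 3} - log(e^1 + e^2 + e^3) ~=
+ // {-2.408, -1.408, -0.408}, and for a trailing axis of size 1 the result is
+ // 0, which matches the expected_values below.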
+ Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("logits"), tf_type_); + auto logsoftmax = ops::LogSoftmax(s.WithOpName("my_logsoftmax"), input); + const NodeDef& node_def = logsoftmax.operation.node()->def(); struct TestParams { std::vector input_dims; - std::vector axis; - std::vector expected_output_dims; + std::vector expected_values; }; - // Ok. - const int kSqueezeOKCases = 10; - TestParams ok_params[kSqueezeOKCases] = { - TestParams{{1, 2, 3}, {1}, {2, 3}}, - TestParams{{1, 2, 3}, {-3}, {2, 3}}, - TestParams{{2, 3, 1}, {3}, {2, 3}}, - TestParams{{2, 3, 1}, {-1}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {1, 3, 5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {3, 1, 5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {-1, -3, -5}, {2, 3}}, - TestParams{{1, 2, 1, 3, 1}, {1, -3, 5}, {2, 3}}, - TestParams{{1, 6}, {1}, {6}}, - TestParams{{6, 1}, {2}, {6}}, - }; - for (int i = 0; i < kSqueezeOKCases; ++i) { - Reset(); - NodeDef node_def = get_squeeze_nodedef(ok_params[i].axis); - AddTestTensor("input", ok_params[i].input_dims); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_squeeze", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + std::vector test_params = { + TestParams{/*input_dims=*/{2, 3}, + /*expected_values=*/{-2.4076061, -1.407606, -0.40760604, + -2.4076061, -1.407606, -0.40760604}}, + TestParams{/*input_dims=*/{1, 6}, + /*expected_values=*/{-5.4561934, -4.4561934, -3.4561934, + -2.4561934, -1.4561933, -0.45619333}}, + TestParams{/*input_dims=*/{6, 1}, + /*expected_values=*/{0, 0, 0, 0, 0, 0}}}; + std::vector input_values{1, 2, 3, 4, 5, 6}; + for (auto p : test_params) { + Reset(); + AddTestTensor("logits", p.input_dims, input_values); + TestOpConverter(node_def, p.input_dims, Status::OK(), Status::OK(), + ArrayFloatNear(p.expected_values, 1e-3)); + } +} - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_squeeze", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(1, 2, 3, 4, 5, 6)); +TEST_P(OpConverter_FP32_Test, ConvertSqueeze) { + const bool use_implicit_batch = (trt_mode_ == TrtTestMode::kImplicitBatch); + // Get the NodeDef for Squeeze. + auto get_squeeze_nodedef = [](std::vector axes, + DataType tf_type) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + if (!axes.empty()) { + ops::Squeeze::Attrs squeeze_attrs; + squeeze_attrs.axis_ = gtl::ArraySlice(axes); // non-absl ok + auto squeeze = + ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs); + return squeeze.operation.node()->def(); + } else { + auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input); + return squeeze.operation.node()->def(); + } + }; + std::vector test_params = { + TestParamBase{ + {1, 2, 1, 3}, // input dims + {}, // input partial dims + {2, 3}, // expected output dims + {}, // axis + trt_mode_ == TrtTestMode::kExplicitBatch + ? Status::OK() + : Status{absl::StatusCode::kUnimplemented, + "Squeeze is not implemented for empty squeeze_dims"}}, + TestParamBase{{1, 2, 1, 3}, + {}, + {2, 1, 3}, + {0}, + use_implicit_batch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension"} + : Status::OK()}, + TestParamBase{{1, 2, 1, 3}, + {}, + {2, 1, 3}, + {-4}, + use_implicit_batch + ? 
Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the " + "batch dimension"} + : Status::OK()}, + TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {4}, + Status{absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in range [-4, 4)"}}, + TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {-5}, + Status{ + absl::StatusCode::kInvalidArgument, + "Axis value of -5 is out of bounds, must be in range [-4, 4)"}}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 2, 3}, {1}}, + TestParamBase{{1, 1, 2, 3}, {}, {1, 2, 3}, {-3}}, + TestParamBase{{1, 2, 3, 1}, {}, {1, 2, 3}, {3}}, + TestParamBase{{1, 2, 3, 1}, {}, {1, 2, 3}, {-1}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {1, 3, 5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {3, 1, 5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {-1, -3, -5}}, + TestParamBase{{1, 1, 2, 1, 3, 1}, {}, {1, 2, 3}, {1, -3, 5}}, + TestParamBase{{1, 1, 6}, {}, {1, 6}, {1}}, + TestParamBase{{1, 6, 1}, {}, {1, 6}, {2}}, + }; + auto squeeze_non_singleton = TestParamBase{ + {1, 1, 2, 3}, + {}, + {}, + {2}, + Status{absl::StatusCode::kInvalidArgument, + "Dimension 2 with size 2 cannot be squeezed because it must be " + "size 1"}}; + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // In this test we try to squeeze axis=2 which has size > 1. In dynamic + // shape mode the converter sees only -1, so it cannot catch this error. + squeeze_non_singleton.status = Status::OK(); // conversion status + squeeze_non_singleton.runtime_status = + errors::InvalidArgument("Negative number of dimensions -1"); + // Dynamic shape tests with partially known input shape + test_params.push_back(TestParamBase{{2, 1, 3}, {2, -1, 3}, {2, 3}, {1}}); + test_params.push_back(TestParamBase{{2, 1, 3}, {2, 1, -1}, {2, 3}, {1}}); + } + test_params.push_back(squeeze_non_singleton); + + for (TestParamBase p : test_params) { + SCOPED_TRACE(p); + Reset(); + NodeDef node_def = get_squeeze_nodedef(p.param, tf_type_); + AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, + p.partial_input_dims); + TestOpConverter(node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray({1, 2, 3, 4, 5, 6})); } } -TEST_F(OpConverterTest, ConvertStridedSlice) { +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertStridedSlice) { // Get nodedef for StridedSlice layer. auto get_strided_slice_nodedef = - [](int64 begin_mask = 0, int64 end_mask = 0, int64 ellipsis_mask = 0, - int64 new_axis_mask = 0, int64 shrink_axis_mask = 0) -> NodeDef { + [](DataType tf_type, int64 begin_mask = 0, int64 end_mask = 0, + int64 ellipsis_mask = 0, int64 new_axis_mask = 0, + int64 shrink_axis_mask = 0) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32); auto end = ops::Placeholder(s.WithOpName("end"), DT_INT32); auto strides = ops::Placeholder(s.WithOpName("strides"), DT_INT32); @@ -2977,105 +4762,26 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { { // Input is weights, should fail. 
Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); + NodeDef node_def = get_strided_slice_nodedef(tf_type_); + AddTestWeights("input", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestWeights("begin", {4}, {0, 0, 0, 0}); AddTestWeights("end", {4}, {1, 1, 2, 3}); AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"input\" for StridedSlice must be a " - "tensor, at my_strided_slice"); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for StridedSlice must " + "be a tensor"); } { // Begin, end, strides are tensors, should fail. Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); + NodeDef node_def = get_strided_slice_nodedef(tf_type_); + AddTestTensor("input", {4, 1, 1, 1}); AddTestTensor("begin", {4}); AddTestTensor("end", {4}); AddTestTensor("strides", {4}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"begin\" for StridedSlice must be a constant, at " - "my_strided_slice"); - } - { - // Modify batch dim, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {0, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_strided_slice"); - } - { - // Dynamic batch size without end_mask, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {1, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_strided_slice"); - } - { - // Dynamic batch size but using end_mask, ok. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(/*begin_mask=*/0, - /*end_mask=*/1); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {0, 1, 2, 2}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion(node_def); - } -// TRT 5.1+ supports strides (disabled until 5.1.3.1 due to bugs) -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - { - // Negative strides, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {1, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, -1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Negative or zero stride values are not " - "supported for StridedSlice, at " - "my_strided_slice"); - } -#else - { - // Stride is not 1, should fail. - Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("end", {4}, {1, 1, 2, 3}); - AddTestWeights("strides", {4}, {1, 2, 1, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Strides other than 1 are not supported with " - "this version of TRT, at my_strided_slice"); - } -#endif - { - // Size of sliced dim is negative, should fail. 
- Reset(); - NodeDef node_def = get_strided_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 2, 0}); - AddTestWeights("end", {4}, {1, 1, 0, 3}); - AddTestWeights("strides", {4}, {1, 1, 1, 1}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "\"size\" cannot be negative or zero for " - "StridedSlice, at my_strided_slice"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"begin\" for StridedSlice must be a constant"); } struct TestParams { @@ -3090,6 +4796,9 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { int shrink_axis_mask; std::vector expected_output_dims; std::vector expected_output; + Status conversion_status; + Status runtime_status; + std::vector partial_input_dims; }; auto get_mask = [](const std::vector& mask) { @@ -3101,634 +4810,956 @@ TEST_F(OpConverterTest, ConvertStridedSlice) { }; // Same input is used for all tests. - const std::vector ok_input = {1, 2, 3, 4, 5, 6}; - -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - const int kStridedSliceOKCases = 31; -#else - const int kStridedSliceOKCases = 27; -#endif - // Ok. - TestParams ok_params[kStridedSliceOKCases] = { - // 2D Crop. - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 0, 1, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1, 1}, - /*end=*/{0, 0, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{5, 6}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1, 1}, - /*end=*/{0, 1, 2, 3}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{5, 6}, - }, - // 2D Crop, with transpose. 
- TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 1, 2, 1}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 1, 1, 0}, - /*end=*/{0, 2, 3, 1}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{5, 6}, - }, - TestParams{ - /*input_dims=*/{2, 1, 3}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 1, 1, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{2, 1, 3}, - /*begin=*/{0, 1, 0, 1}, - /*end=*/{0, 2, 1, 3}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 2}, - /*expected_output=*/{5, 6}, - }, - // 2D Crop, with reshape. - TestParams{ - /*input_dims=*/{2, 3}, - /*begin=*/{0, 0, 0}, - /*end=*/{0, 1, 2}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2}, - /*expected_output=*/{1, 2}, - }, - TestParams{ - /*input_dims=*/{2, 3}, - /*begin=*/{0, 1, 1}, - /*end=*/{0, 0, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2}, - /*expected_output=*/{5, 6}, - }, - // 1D Crop. - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 0, 0, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 2}, - /*expected_output=*/{1, 2, 4, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1, 0}, - /*end=*/{0, 0, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 1, 3}, - /*expected_output=*/{4, 5, 6}, - }, - // 1D Crop, with transpose. - TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 0, 0, 0}, - /*end=*/{0, 1, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 3, 1}, - /*expected_output=*/{1, 2, 3}, - }, - TestParams{ - /*input_dims=*/{2, 3, 1}, - /*begin=*/{0, 1, 0, 0}, - /*end=*/{0, 0, 0, 0}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 1, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 3, 1}, - /*expected_output=*/{4, 5, 6}, - }, - // 1D Crop, with reshape. 
- TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 3}, - /*strides=*/{1, 1}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{1, 2, 3}, - }, - TestParams{ - /*input_dims=*/{1, 6}, - /*begin=*/{0, 0, 2}, - /*end=*/{0, 0, 5}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 3}, - /*expected_output=*/{3, 4, 5}, - }, - TestParams{ - /*input_dims=*/{6, 1}, - /*begin=*/{0, 2, 0}, - /*end=*/{0, 5, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3, 1}, - /*expected_output=*/{3, 4, 5}, - }, - // Negative axis. - TestParams{ - /*input_dims=*/{6, 1}, - /*begin=*/{0, -6, 0}, - /*end=*/{0, -3, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3, 1}, - /*expected_output=*/{1, 2, 3}, - }, - TestParams{ - /*input_dims=*/{6, 1}, - /*begin=*/{0, 0, 0}, - /*end=*/{0, -1, 0}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 1}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{5, 1}, - /*expected_output=*/{1, 2, 3, 4, 5}, - }, - // Clamp out of bounds begin and end. - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, -9999, -9}, - /*end=*/{0, 1, 1000, 4}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/{1, 2, 3, 4, 5, 6}, - }, -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - // Strides - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 5}, - /*strides=*/{1, 2}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{1, 3, 5}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 6}, - /*strides=*/{1, 2}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{1, 3, 5}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 1}, - /*end=*/{0, 6}, - /*strides=*/{1, 2}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{3}, - /*expected_output=*/{2, 4, 6}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 2}, - /*end=*/{0, 6}, - /*strides=*/{1, 3}, - /*begin_mask=*/get_mask({0, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{2}, - /*expected_output=*/{3, 6}, - }, -#endif - // ellipsis_mask - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 1}, - /*end=*/{0, 2}, - /*strides=*/{1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - 
/*ellipsis_mask=*/get_mask({1, 0, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 1}, - /*end=*/{0, 0, 2}, - /*strides=*/{1, 1, 1}, - /*begin_mask=*/get_mask({1, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{0, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{1, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 0, 1}, - /*end=*/{0, 1, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1, 1}, - /*begin_mask=*/get_mask({0, 0, 0, 0}), - /*end_mask=*/get_mask({0, 0, 0, 0}), - /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/0, - /*expected_output_dims=*/{1, 2, 1}, - /*expected_output=*/{2, 5}, - }, - // shrink_axis_mask - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{0, 0, 0, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({1, 1, 1, 0}), - /*end_mask=*/get_mask({1, 1, 1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/get_mask({0, 0, 0, 1}), - /*expected_output_dims=*/{1, 2}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{1, 2, 3}, - /*begin=*/{0, 0, 0, 1}, - /*end=*/{0, 1, 2, 2}, - /*strides=*/{1, 1, 1, 1}, - /*begin_mask=*/get_mask({1, 0, 0, 0}), - /*end_mask=*/get_mask({1, 0, 0, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/get_mask({0, 1, 0, 1}), - /*expected_output_dims=*/{2}, - /*expected_output=*/{2, 5}, - }, - TestParams{ - /*input_dims=*/{6}, - /*begin=*/{0, 0}, - /*end=*/{0, 1}, - /*strides=*/{1, 1}, - /*begin_mask=*/get_mask({1, 0}), - /*end_mask=*/get_mask({1, 0}), - /*ellipsis_mask=*/0, - /*new_axis_mask=*/0, - /*shrink_axis_mask=*/get_mask({0, 1}), - /*expected_output_dims=*/{}, - /*expected_output=*/{1}, - }, + const std::vector ok_input = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + Status modified_batch_dim_status = + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? errors::Unimplemented( + "TensorRT does not allow modifications to " + "the batch dimension") + : Status::OK(); + std::vector params = { + // Modify batch dim, should fail in implicit batch mode. + TestParams{/*input_dims=*/{2, 1, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{1, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + /*conversion_status=*/modified_batch_dim_status, + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{}}, + // Unknown batch size without end_mask. 
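+ // partial_input_dims = {-1, 1, 1, 3} leaves the batch size unknown while
+ // begin/end still pin the batch dimension, so implicit batch mode rejects
+ // the conversion (modified_batch_dim_status).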
+ TestParams{ + /*input_dims=*/{2, 1, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{1, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + modified_batch_dim_status, + Status::OK(), + /*partial_input_dims=*/{-1, 1, 1, 3}, + }, + // Test Case 2: Unknown batch size with end_mask. + TestParams{ + /*input_dims=*/{2, 1, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({1, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{2, 1, 1, 2}, + /*expected_output=*/{1, 2, 4, 5}, + Status::OK(), + Status::OK(), + /*partial_input_dims=*/{-1, 1, 1, 3}, + }, + // Invalid parameters: end[2] < begin[2] + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 2, 0}, + /*end=*/{1, 1, 0, 3}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/0, + /*end_mask=*/0, + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + errors::InvalidArgument("\"size\" cannot be negative for " + "StridedSlice"), + Status::OK(), + /*partial_input_dims=*/{}}, + // Slice on the last two dimensions. All dimensions are static. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + }, + // Slice on the last two dimensions. The slice is fully + // specified for the dynamic dimensions. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + Status::OK(), + Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}, + }, + // End mask is provided on all dimensions. This should override the fact + // that the end value is 0. For dynamic shape, it tests + // that we can infer tensor size when "end mask" is provided. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 1}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{5, 6}, + Status::OK(), + Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}, + }, + // End mask is provided for the batch dimension to overwrite the end value + // 0 for that dimension. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 1}, + /*end=*/{0, 1, 2, 3}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{5, 6}, + }, + // Test slice on two dimensions with negative stride, without end_mask set + // on crop dimensions. 
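+ // With strides {1, 1, -1, -1}, begin {0, 0, 1, 2} and exclusive end
+ // {0, 0, 0, 0}, dim 2 yields index {1} and dim 3 yields {2, 1}, so the
+ // expected output is {6, 5}.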
+ TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 2}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, -1, -1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{6, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}}, + // Test slice on two dimensions with negative stride, with end_mask set on + // crop dimensions. In dynamic shape mode, this tests the runtime size + // computation. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 1}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, -1, -1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{5, 4, 2, 1}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}}, + // Test slice on two dimensions with negative stride, with begin_mask set + // on the crop dimensions. In dynamic shape mode, this tests the runtime + // size computation. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, -1, -1}, + /*begin_mask=*/get_mask({0, 0, 1, 1}), + /*end_mask=*/get_mask({1, 1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{6, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, -1, -1}}, + // Test the reversal of all non-batch dimensions by providing the begin + // masks, end masks, and -1 as strides. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, -1, -1, -1}, + /*begin_mask=*/get_mask({1, 1, 1, 1}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{6, 5, 4, 3, 2, 1}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, -1, -1, -1}}, + // Slice on dimensions 1 and 2. + TestParams{ + /*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 2, 1}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{1, 2}, + }, + // Slice on dimensions 1 and 2. + TestParams{ + /*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 1, 1, 0}, + /*end=*/{0, 2, 3, 1}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{5, 6}, + }, + // Slice on dimensions 1 and 3. + TestParams{ + /*input_dims=*/{1, 2, 1, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 1, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{1, 2}, + }, + // Slice on dimensions 1 and 3 with non-zero slice start. 
+ TestParams{ + /*input_dims=*/{1, 2, 1, 3}, + /*begin=*/{0, 1, 0, 1}, + /*end=*/{0, 2, 1, 3}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 2}, + /*expected_output=*/{5, 6}, + }, + // Slice on 3D tensor. + TestParams{ + /*input_dims=*/{1, 2, 3}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 1, 2}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2}, + /*expected_output=*/{1, 2}, + }, + // Slice on 3D tensor using end_mask. For dynamic shape, all + // dimensions are dynamic. + TestParams{/*input_dims=*/{1, 2, 3}, + /*begin=*/{0, 1, 1}, + /*end=*/{0, 0, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2}, + /*expected_output=*/{5, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1}}, + // Slice on 3D tensor using end_mask. For dynamic shape, all + // dimensions are dynamic. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 0, 0, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{1, 2, 4, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 1, 3}, + /*expected_output=*/{4, 5, 6}, + }, + // 1D simple slice. + TestParams{/*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 0, 0, 0}, + /*end=*/{0, 1, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 3, 1}, + /*expected_output=*/{1, 2, 3}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + TestParams{ + /*input_dims=*/{1, 2, 3, 1}, + /*begin=*/{0, 1, 0, 0}, + /*end=*/{0, 0, 0, 0}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 3, 1}, + /*expected_output=*/{4, 5, 6}, + }, + // Simple 1D slice on 2D input. 
+ TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0}, + /*end=*/{0, 3}, + /*strides=*/{1, 1}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{1, 2, 3}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{ + /*input_dims=*/{1, 1, 6}, + /*begin=*/{0, 0, 2}, + /*end=*/{0, 0, 5}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 3}, + /*expected_output=*/{3, 4, 5}, + }, + TestParams{ + /*input_dims=*/{1, 6, 1}, + /*begin=*/{0, 2, 0}, + /*end=*/{0, 5, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3, 1}, + /*expected_output=*/{3, 4, 5}, + }, + // Negative axis. + TestParams{ + /*input_dims=*/{1, 6, 1}, + /*begin=*/{0, -6, 0}, + /*end=*/{0, -3, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3, 1}, + /*expected_output=*/{1, 2, 3}, + }, + TestParams{ + /*input_dims=*/{1, 6, 1}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, -1, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 5, 1}, + /*expected_output=*/{1, 2, 3, 4, 5}, + }, + // Clamp out of bounds begin and end. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, -9999, -9}, + /*end=*/{0, 1, 1000, 4}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + }, + // Stride values >= 2. 
+ TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0}, + /*end=*/{0, 5}, + /*strides=*/{1, 2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{1, 3, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0}, + /*end=*/{0, 6}, + /*strides=*/{1, 2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{1, 3, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 1}, + /*end=*/{0, 6}, + /*strides=*/{1, 2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{2, 4, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 2}, + /*end=*/{0, 6}, + /*strides=*/{1, 3}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 2}, + /*expected_output=*/{3, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + // Stride values <= -2. + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 5}, + /*end=*/{0, 0}, + /*strides=*/{1, -2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{6, 4, 2}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 5}, + /*end=*/{0, 0}, + /*strides=*/{1, -2}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{6, 4, 2}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 5}, + /*end=*/{0, 1}, + /*strides=*/{1, -3}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 2}, + /*expected_output=*/{6, 3}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1}}, + // Ellipsis_mask causes leading dimensions to be ignored. Begin, end, + // stride, and mask values of size 2 should be interpreted as applying to + // the last 2 dimensions, while the ellipsis applies to the first 2 (for a + // 4D input tensor). 
+ TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 1}, + /*end=*/{0, 2}, + /*strides=*/{1, 1}, + /*begin_mask=*/get_mask({0, 0}), + /*end_mask=*/get_mask({0, 0}), + /*ellipsis_mask=*/get_mask({1, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Ellipsis_mask on single inner dimension. + TestParams{ + /*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 1}, + /*end=*/{0, 0, 2}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({1, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + }, + // Ellipsis_mask on single leading dimension. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 1}, + /*end=*/{0, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Ellipsis_mask on single inner dimension overrides that dimensions' + // begin/end values. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 1, 0, 1}, + /*end=*/{1, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({0, 1, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Ellipsis mask on single leading dimension should throw out extra + // leading values of begin/end vectors so that only the last N-1 values of + // each remain. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 0, 1}, + /*end=*/{0, 1, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1, 1}, + /*begin_mask=*/get_mask({0, 0, 0, 0}), + /*end_mask=*/get_mask({0, 0, 0, 0}), + /*ellipsis_mask=*/get_mask({1, 0, 0, 0}), + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/0, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1, -1}}, + // Shrink-axis mask set for the final dimension of final size 1 should + // remove that dimension from the final shape. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 1}, + /*end=*/{0, 0, 0, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({1, 1, 1, 0}), + /*end_mask=*/get_mask({1, 1, 1, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/get_mask({0, 0, 0, 1}), + /*expected_output_dims=*/{1, 1, 2}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, 2, -1}}, + // Shrink-axis mask set for multiple dimensions that have a final size of + // 1 should remove those dimensions from the final shape. 
+ TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*begin=*/{0, 0, 0, 1}, + /*end=*/{0, 1, 2, 2}, + /*strides=*/{1, 1, 1, 1}, + /*begin_mask=*/get_mask({1, 0, 0, 0}), + /*end_mask=*/get_mask({1, 0, 0, 0}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/get_mask({0, 1, 0, 1}), + /*expected_output_dims=*/{1, 2}, + /*expected_output=*/{2, 5}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 1, 2, -1}}, + // Shrink-axis mask set for multiple sequential dimensions of final size 1 + // should + // remove those dimensions from the final shape. + TestParams{/*input_dims=*/{6, 1, 1}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({1, 1, 1}), + /*end_mask=*/get_mask({1, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/get_mask({0, 1, 1}), + /*expected_output_dims=*/{6}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*conversion_status=*/Status::OK(), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{-1, -1, -1}}, + // The new_axis_mask parameter is not supported. + TestParams{/*input_dims=*/{1, 6}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 0}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/ + get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/get_mask({1, 0, 0}), + /*shrink_axis_mask=*/get_mask({0, 0, 0}), + /*expected_output_dims=*/{1, 1, 6}, + /*expected_output=*/{1, 1, 6}, + /*conversion_status=*/ + errors::Unimplemented( + "new_axis_mask is not supported for StridedSlice"), + /*runtime_status=*/Status::OK(), + /*partial_input_dims=*/{1, 6}}, + // Test all axes dynamic inputs with shrink_axis_mask + TestParams{/*input_dims=*/{1, 3, 2}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 3}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/1, + /*expected_output_dims=*/{3, 2}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*conversion_status=*/modified_batch_dim_status, Status::OK(), + /*partial_input_dims=*/{-1, -1, -1}}, + // Test dynamic input with shrink_axis_mask along axis=0 + TestParams{/*input_dims=*/{2, 3, 2}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 3}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/1, + /*expected_output_dims=*/{3, 2}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*conversion_status=*/modified_batch_dim_status, Status::OK(), + /*partial_input_dims=*/{-1, -1, 2}}, + // Test dynamic input sizes with multiple axes shrinking + TestParams{/*input_dims=*/{2, 3, 2}, + /*begin=*/{0, 0, 0}, + /*end=*/{0, 0, 3}, + /*strides=*/{1, 1, 1}, + /*begin_mask=*/get_mask({0, 1, 1}), + /*end_mask=*/get_mask({0, 1, 1}), + /*ellipsis_mask=*/0, + /*new_axis_mask=*/0, + /*shrink_axis_mask=*/3, + /*expected_output_dims=*/{2}, + /*expected_output=*/{1, 2}, + /*conversion_status=*/modified_batch_dim_status, Status::OK(), + /*partial_input_dims=*/{-1, -1, 2}}, }; - for (int i = 0; i < kStridedSliceOKCases; i++) { + int i = 0; + for (auto p : params) { Reset(); NodeDef node_def = get_strided_slice_nodedef( - ok_params[i].begin_mask, ok_params[i].end_mask, - ok_params[i].ellipsis_mask, ok_params[i].new_axis_mask, - ok_params[i].shrink_axis_mask); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("begin", - {static_cast(ok_params[i].begin.size())}, - ok_params[i].begin); 
- AddTestWeights("end", {static_cast(ok_params[i].end.size())}, - ok_params[i].end); - AddTestWeights("strides", - {static_cast(ok_params[i].strides.size())}, - ok_params[i].strides); - RunValidationAndConversion(node_def); + tf_type_, p.begin_mask, p.end_mask, p.ellipsis_mask, p.new_axis_mask, + p.shrink_axis_mask); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_strided_slice", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + VLOG(2) << "Preparing test case " << i++ << " with dims " + << DebugString(p.input_dims); + + switch (trt_mode_) { + case TrtTestMode::kImplicitBatch: { + AddTestTensor("input", p.input_dims, ok_input); + break; + } + case TrtTestMode::kExplicitBatch: { + AddTestTensor("input", p.input_dims, ok_input); + break; + } + case TrtTestMode::kDynamicShape: { + if (p.partial_input_dims.size() > 0) { + AddTestTensor("input", p.input_dims, tf_type_, ok_input, + p.partial_input_dims); + } else { + AddTestTensor("input", p.input_dims, tf_type_, ok_input, + p.input_dims); + } + break; + } + } - const DataVec input_data{{"input", test::AsTensor(ok_input)}}; - DataVec output_data{ - {"my_strided_slice", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + VLOG(2) << "Adding weights begin: " << DebugString(p.begin) + << ", end: " << DebugString(p.end) + << ", strides: " << DebugString(p.strides); + AddTestWeights("begin", {static_cast(p.begin.size())}, p.begin); + AddTestWeights("end", {static_cast(p.end.size())}, p.end); + AddTestWeights("strides", {static_cast(p.strides.size())}, + p.strides); + + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); } } -TEST_F(OpConverterTest, ConvertSlice) { +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertSlice) { // Get nodedef for Slice layer. - auto get_slice_nodedef = []() -> NodeDef { + auto get_slice_nodedef = [](DataType tf_type) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32); auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32); auto slice = ops::Slice(s.WithOpName("my_slice"), input, begin, size); return slice.operation.node()->def(); }; - { - // Begin is below bounds, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, -1, 0}); - AddTestWeights("size", {4}, {1, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" for dimension 2 in Slice is out of range, at my_slice"); - } - { - // Begin is above bounds, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 3, 0}); - AddTestWeights("size", {4}, {1, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" for dimension 2 in Slice is out of range, at my_slice"); - } - { - // Size is below bounds, should fail. 
- Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {1, 1, 2, -2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" + \"size\" for dimension 3 in Slice is out of range, at " - "my_slice"); - } - { - // Size is above bounds, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {1, 1, 3, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "\"begin\" + \"size\" for dimension 2 in Slice is out of range, at " - "my_slice"); - } - { - // Modify batch dim, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {0, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_slice"); - } - { - // Dynamic batch size with size[0] not -1, should fail. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {1, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow modifications to the batch dimension, at " - "my_slice"); - } - { - // Dynamic batch size but using size[0] of -1, ok. - Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1); - AddTestWeights("begin", {4}, {0, 0, 0, 0}); - AddTestWeights("size", {4}, {-1, 1, 2, 2}); - RunValidationAndConversion(node_def); - } - struct TestParams { std::vector input_dims; + std::vector + partial_input_dims; // Symbolic shape in dynamic shape mode. std::vector begin; std::vector size; std::vector expected_output_dims; std::vector expected_output; + Status conversion_status; + Status runtime_status; }; - // Ok. - const int kSliceOKCases = 5; - TestParams ok_params[kSliceOKCases] = { - TestParams{{1, 2, 3}, + std::vector params = { + // Slice start points must always be >= 0. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, -1, 0}, + /*size=*/{1, 1, 2, 3}, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + /*conversion_status=*/ + errors::InvalidArgument("\"begin\" in Slice " + "is out of range")}, + // In implicit batch mode, slicing the batch dimension is not allowed. + TestParams{/*input_dims=*/{2, 1, 1, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, 1, 1, 3}, + /*expected_output_dims=*/{1, 1, 1, 3}, + /*expected_output=*/{1, 2, 3}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "TensorRT does not allow modifications to the batch " + "dimension in implicit batch mode") + : Status::OK()}, + // Dynamic batch size but using size[0] of -1, ok. 
+ TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + {0, 0, 0, 0}, + {-1, 1, 2, 2}, + {1, 1, 2, 2}, + {1, 2, 4, 5}, + Status::OK()}, + TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, {0, 0, 0, 0}, {-1, -1, -1, -1}, - {1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, + Status::OK()}, + TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + {0, 0, 0, 0}, + {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}}, + TestParams{{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, -1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, + /*expected_output=*/{1, 2, 4, 5}, + Status::OK()}, + TestParams{/*input_dims=*/{1, 6}, + /*partial_input_dims=*/{-1, -1}, + /*being=*/{0, 1}, + /*size=*/{1, 5}, + /*expected_output_dims=*/{1, 5}, + /*expected_output=*/{2, 3, 4, 5, 6}}, + TestParams{/*input_dims=*/{1, 6}, + /*partial_input_dims=*/{-1, -1}, + /*begin=*/{0, 1}, + /*size=*/{-1, 3}, + /*expected_output_dims=*/{1, 3}, + /*expected_output=*/{2, 3, 4}, Status::OK()}, + // In dynamic shape mode we do not know the input shape during + // conversion, therfore we cannot check out of bound access. TestParams{ - {1, 2, 3}, {0, 0, 0, 0}, {1, 1, 2, 3}, {1, 2, 3}, {1, 2, 3, 4, 5, 6}}, + {1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 3, 0}, + /*end=*/{1, 1, 2, 3}, + {}, + {}, + trt_mode_ == TrtTestMode::kDynamicShape + ? Status::OK() + : errors::InvalidArgument("\"begin\" + \"size\" for dimension " + "2 in Slice is out of range"), + errors::Internal("Internal: Failed to build TensorRT engine")}, + // The slice operation should expect that the "size[i]" values are not + // less than -1. + TestParams{/*input_dims=*/{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, 1, 2, -2}, + {}, + {}, + errors::InvalidArgument("\"size\" in Slice is out of range")}, TestParams{ - {1, 2, 3}, {0, 0, 0, 0}, {1, -1, 2, 2}, {1, 2, 2}, {1, 2, 4, 5}}, - TestParams{{6}, {0, 1}, {1, 5}, {5}, {2, 3, 4, 5, 6}}, - TestParams{{6}, {0, 1}, {-1, 3}, {3}, {2, 3, 4}}, + /*input_dims=*/{1, 1, 2, 3}, + /*partial_input_dims=*/{-1, -1, -1, -1}, + /*begin=*/{0, 0, 0, 0}, + /*size=*/{1, 1, 3, 2}, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kDynamicShape + ? 
Status::OK() + : errors::InvalidArgument("\"begin\" + \"size\" for dimension " + "2 in Slice is out of range"), + errors::Internal("Internal: Failed to build TensorRT engine")}, }; - for (int i = 0; i < kSliceOKCases; i++) { + logger_.unsuppressAllLoggerMsgs(); + int i = 0; + for (auto p : params) { Reset(); - NodeDef node_def = get_slice_nodedef(); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("begin", - {static_cast(ok_params[i].begin.size())}, - ok_params[i].begin); - AddTestWeights("size", {static_cast(ok_params[i].size.size())}, - ok_params[i].size); - RunValidationAndConversion(node_def); + NodeDef node_def = get_slice_nodedef(tf_type_); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_slice", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + VLOG(2) << "Preparing test case " << i++ << " with dims " + << DebugString(p.input_dims); - const DataVec input_data{ - {"input", test::AsTensor({1, 2, 3, 4, 5, 6})}}; - DataVec output_data{{"my_slice", ConstructTensor( - ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + // The input tensor always has size 6. + std::vector input_vals = {1, 2, 3, 4, 5, 6}; + + switch (trt_mode_) { + case TrtTestMode::kImplicitBatch: { + AddTestTensor("input", p.input_dims, input_vals); + break; + } + case TrtTestMode::kExplicitBatch: { + AddTestTensor("input", p.input_dims, input_vals); + break; + } + case TrtTestMode::kDynamicShape: { + if (p.partial_input_dims.size() > 0) { + AddTestTensor("input", p.input_dims, tf_type_, input_vals, + p.partial_input_dims); + + } else { + AddTestTensor("input", p.input_dims, tf_type_, input_vals, + p.input_dims); + } + break; + } + } + + AddTestWeights("begin", {static_cast(p.begin.size())}, p.begin); + AddTestWeights("size", {static_cast(p.size.size())}, p.size); + + const bool flag = + trt_mode_ == TrtTestMode::kDynamicShape && (i == 9 || i == 11); + if (flag) logger_.suppressLoggerMsgs(nvinfer1::ILogger::Severity::kERROR); + + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); + if (flag) logger_.unsuppressLoggerMsgs(nvinfer1::ILogger::Severity::kERROR); } } -TEST_F(OpConverterTest, ConvertConv2D) { +TEST_P(OpConverter_FP32_Test, ConvertConv2D) { // Get nodedef for Conv2D layer. 
+ DataType tf_type = tf_type_; auto get_conv2d_nodedef = - [](std::vector strides = {1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCHW", std::vector dilations = {1, 1, 1, 1}, - bool is_conv2d_backprop_input = false) -> NodeDef { + [tf_type](std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); - if (is_conv2d_backprop_input) { - auto input_sizes = - ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); - ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs() - .DataFormat(data_format) - .Dilations(dilations); - auto conv2d = - ops::Conv2DBackpropInput(s.WithOpName("my_conv2d"), input_sizes, - filter, input, strides, padding, attrs); - return conv2d.operation.node()->def(); - } else { - ops::Conv2D::Attrs attrs = - ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); - auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, - strides, padding, attrs); - return conv2d.operation.node()->def(); - } + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); + ops::Conv2D::Attrs attrs = + ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); + auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, + padding, attrs); + return conv2d.operation.node()->def(); }; { @@ -3738,96 +5769,93 @@ TEST_F(OpConverterTest, ConvertConv2D) { AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Conv2D must be a tensor, at my_conv2d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for Conv2D must be a tensor"); } { // Filter is tensor, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 1, 2, 1}); AddTestTensor("weights", {3, 3, 1, 1}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"filter\" for Conv2D must be a constant, at my_conv2d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"filter\" for Conv2D must be a constant"); } { // Filter is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Conv2D expects kernel of dimension 4, at my_conv2d"); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Conv2D expects kernel of dimension 4"); } { // Dilations is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution dilations field must specify 4 dimensions, at my_conv2d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution dilations field must specify 4 dimensions"); } { // Dilation value is not 1 for channel, should fail. 
Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv2d"); + "dimensions"); } { // Dilation value is not 1 for channel (NHWC), should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2}); - AddTestTensor("input", {2, 3, 1}); + AddTestTensor("input", {1, 2, 3, 1}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv2d"); - } - { - // Dilation + Conv2DBackpropInput, should fail. - Reset(); - NodeDef node_def = - get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 2, 1}, true); - AddTestTensor("input", {2, 3, 1}); - AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Dilation with Conv2DBackpropInput " - "(conv2d_transpose) is not supported, " - "at my_conv2d"); + "dimensions"); } { // Strides is not 4D, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution strides field must specify 4 dimensions, at my_conv2d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution strides field must specify 4 dimensions"); } { // Stride value is not 1 for channel, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Stride must be 1 for batch and channel dimensions, at my_conv2d"); + node_def, absl::StatusCode::kUnimplemented, + "Stride must be 1 for batch and channel dimensions"); + } + if (trt_mode_ == TrtTestMode::kDynamicShape) { + Reset(); + NodeDef node_def = get_conv2d_nodedef(); + // Channel dim unknown, should fail. + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(tf_type_, &trt_type)); + AddTestTensorWithTFDims("input", {-1, -1, -1, -1}, trt_type); + AddTestWeights("weights", {1, 2, 1, 1}, {-1, 1}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Channel dimension must be static"); } struct TestParams { @@ -3839,15 +5867,14 @@ TEST_F(OpConverterTest, ConvertConv2D) { string padding; string data_format; std::vector dilations; - bool is_conv2d_backprop_input; std::vector expected_output_dims; std::vector expected_output; }; // Ok. 
- std::vector ok_params{ + std::vector ok_params = { // Basic - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3855,11 +5882,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, // SAME padding (Asymmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3867,11 +5893,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 1, -2, 0, 1, -4}}, // SAME padding (Symmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 3, 1, 1}, /*filter=*/{-1, 0, 1}, @@ -3879,11 +5904,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, // NHWC - TestParams{/*input_dims=*/{2, 3, 1}, + TestParams{/*input_dims=*/{1, 2, 3, 1}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3891,11 +5915,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{2, 2, 1}, + /*expected_output_dims=*/{1, 2, 2, 1}, /*expected_output=*/{1, 1, 0, 1}}, // Dilated - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3903,11 +5926,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 2}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 1}, + /*expected_output_dims=*/{1, 1, 2, 1}, /*expected_output=*/{2, 1}}, // Strided - TestParams{/*input_dims=*/{1, 2, 4}, + TestParams{/*input_dims=*/{1, 1, 2, 4}, /*input=*/{0, 1, 2, 2, 3, 4, 4, 7}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3915,11 +5937,74 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, + }; + + for (int i = 0; i < ok_params.size(); i++) { + Reset(); + NodeDef node_def = + get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, + ok_params[i].data_format, ok_params[i].dilations); + std::vector partial_input_shape; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // The channel dim cannot have unknown size, fix that. + partial_input_shape.resize(ok_params[i].input_dims.size(), -1); + int channel_id = (ok_params[i].data_format == "NCHW") ? 
1 : 3; + partial_input_shape[channel_id] = ok_params[i].input_dims[channel_id]; + } + + AddTestTensor("input", ok_params[i].input_dims, tf_type_, + ok_params[i].input, partial_input_shape); + AddTestWeights("weights", ok_params[i].filter_dims, + ok_params[i].filter); + + TestOpConverter(node_def, ok_params[i].expected_output_dims, Status::OK(), + Status::OK(), + ElementsAreArray(ok_params[i].expected_output)); + } +} + +TEST_P(OpConverter_FP32_Test, ConvertConv2DBackpropInput) { + // Get nodedef for Conv2D layer. + auto get_conv2d_backprop_input_nodedef = + [](DataType tf_type, std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); + auto input_sizes = ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); + ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs() + .DataFormat(data_format) + .Dilations(dilations); + auto conv2d = ops::Conv2DBackpropInput( + s.WithOpName("my_conv2d_backprop_input"), input_sizes, filter, input, + strides, padding, attrs); + return conv2d.operation.node()->def(); + }; + + struct TestParams { + std::vector input_dims; + std::vector input; + std::vector filter_dims; + std::vector filter; + std::vector strides; + string padding; + string data_format; + std::vector dilations; + std::vector expected_output_dims; + std::vector expected_output; + Status conversion_status; + // For dynamic shape mode, we must use the partial_input_dims for + // creating the test tensor if any of the input_dims are -1. + std::vector partial_input_dims; + }; + + // Ok. + std::vector params = { // Transpose Strided - TestParams{/*input_dims=*/{1, 2, 2}, + TestParams{/*input_dims=*/{1, 1, 2, 2}, /*input=*/{0, 1, 2, 3}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3927,11 +6012,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, - /*expected_output_dims=*/{1, 2, 4}, + /*expected_output_dims=*/{1, 1, 2, 4}, /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}}, // Transpose Strided NHWC - TestParams{/*input_dims=*/{2, 2, 1}, + TestParams{/*input_dims=*/{1, 2, 2, 1}, /*input=*/{0, 1, 2, 3}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -3939,11 +6023,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, - /*expected_output_dims=*/{2, 4, 1}, + /*expected_output_dims=*/{1, 2, 4, 1}, /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}}, // Transpose Strided NHWC with VALID padding - TestParams{/*input_dims=*/{3, 1, 1}, + TestParams{/*input_dims=*/{1, 3, 1, 1}, /*input=*/{0, 1, 2}, /*filter_dims=*/{2, 1, 1, 1}, /*filter=*/{-1, 1}, @@ -3951,438 +6034,965 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*is_conv2d_backprop_input=*/true, - /*expected_output_dims=*/{7, 1, 1}, + /*expected_output_dims=*/{1, 7, 1, 1}, /*expected_output=*/{0, 0, -1, 1, -2, 2, 0}}, - + TestParams{/*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"EXPLICIT", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 4}, + 
/*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}, + errors::Unimplemented("EXPLICIT padding type not " + "implemented, only VALID and SAME are" + " supported")}, + // Dilation + Conv2DBackpropInput, should fail. + TestParams{/*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 2}, + {1, 1, 2, 2}, + {}, + errors::Unimplemented("Dilation with Conv2DBackpropInput " + "(conv2d_transpose) is not supported")}, }; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + params.push_back( + TestParams{/*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 4}, + /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}, + errors::InvalidArgument("Channel dimension must be static"), + /*partial input dims=*/{1, -1, 2, 2}}); + // Test dynamic batch dimension. + params.push_back( + TestParams{/*input_dims=*/{2, 1, 2, 2}, + /*input=*/ + // clang-format off + {0, 1, 2, 3, + 3, 2, 1, 0}, + // clang-format on + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{2, 1, 2, 4}, + /*expected_output=*/ + // clang-format off + { 0, 0, -1, 1, -2, 2, -3, 3, + -3, 3, -2, 2, -1, 1, 0, 0}, + // clang-format on + /*conversion_status=*/Status::OK(), + /*partial input dims=*/{-1, 1, 2, 2}}); + + // Test dynamic height and width. + params.push_back(TestParams{ + /*input_dims=*/{1, 1, 2, 2}, + /*input=*/{0, 1, 2, 3}, + /*filter_dims=*/{1, 2, 1, 1}, + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 2}, + /*padding=*/"SAME", + /*data_format=*/"NCHW", + /*dilations=*/{1, 1, 1, 1}, + /*expected_output_dims=*/{1, 1, 2, 4}, + /*expected_output=*/ + {0, 0, -1, 1, -2, 2, -3, 3}, + /*conversion_status=*/ + errors::Unimplemented( + "Conv2dBackpropInput does not support input with unknown spatial " + "shape"), + /*partial input dims=*/{1, 1, -1, -1}}); + } + for (auto p : params) { + for (int input_sizes_length : {2, 4}) { + Reset(); + NodeDef node_def = get_conv2d_backprop_input_nodedef( + tf_type_, p.strides, p.padding, p.data_format, p.dilations); - for (int i = 0; i < ok_params.size(); i++) { - Reset(); - NodeDef node_def = get_conv2d_nodedef( - ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, - ok_params[i].dilations, ok_params[i].is_conv2d_backprop_input); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", ok_params[i].filter_dims, - ok_params[i].filter); + switch (trt_mode_) { + case TrtTestMode::kImplicitBatch: { + AddTestTensor("input", p.input_dims, p.input); + break; + } + case TrtTestMode::kExplicitBatch: { + AddTestTensor("input", p.input_dims, p.input); + break; + } + case TrtTestMode::kDynamicShape: { + AddTestTensor("input", p.input_dims, tf_type_, p.input, + p.partial_input_dims.size() > 0 ? p.partial_input_dims + : p.input_dims); + break; + } + default: { ASSERT_TRUE(false) << "unknown test mode"; } + } + AddTestWeights("weights", p.filter_dims, p.filter, tf_type_); - if (ok_params[i].is_conv2d_backprop_input) { - std::vector tf_input_sizes = ok_params[i].expected_output_dims; - tf_input_sizes.insert(tf_input_sizes.begin(), 1); // Add batch dimension. 
- QCHECK_EQ(4, tf_input_sizes.size()); - AddTestWeights("input_sizes", {4}, tf_input_sizes); + if (input_sizes_length == 4) { + AddTestWeights("input_sizes", {4}, p.expected_output_dims); + } else { + std::vector tf_input_sizes(2); + // Remove the channel and batch dimensions. + if (p.data_format == "NHWC") { + std::copy(p.expected_output_dims.begin() + 1, + p.expected_output_dims.end() - 1, tf_input_sizes.begin()); + } else { + std::copy(p.expected_output_dims.begin() + 2, + p.expected_output_dims.end(), tf_input_sizes.begin()); + } + QCHECK_EQ(2, tf_input_sizes.size()); + AddTestWeights("input_sizes", {2}, tf_input_sizes); + } + + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + Status::OK(), ElementsAreArray(p.expected_output)); } - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + } +} - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv2d", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); +// Get the NodeDef for Pack. +NodeDef GetConv3DNodeDef(std::vector strides = {1, 1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCDHW", + std::vector dilations = {1, 1, 1, 1, 1}, + bool is_conv3d_backprop_input = false) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); + + if (is_conv3d_backprop_input) { + auto input_sizes = ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); + ops::Conv3DBackpropInputV2::Attrs attrs = + ops::Conv3DBackpropInputV2::Attrs() + .DataFormat(data_format) + .Dilations(dilations); + auto conv3d = + ops::Conv3DBackpropInputV2(s.WithOpName("my_conv3d"), input_sizes, + filter, input, strides, padding, attrs); + return conv3d.operation.node()->def(); + } else { + ops::Conv3D::Attrs attrs = + ops::Conv3D::Attrs().DataFormat(data_format).Dilations(dilations); + auto conv3d = ops::Conv3D(s.WithOpName("my_conv3d"), input, filter, strides, + padding, attrs); + return conv3d.operation.node()->def(); } } -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -TEST_F(OpConverterTest, ConvertConv3D) { - // Get nodedef for Conv3D layer. 
- auto get_conv3d_nodedef = - [](std::vector strides = {1, 1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCDHW", - std::vector dilations = {1, 1, 1, 1, 1}, - bool is_conv3d_backprop_input = false) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); - - if (is_conv3d_backprop_input) { - auto input_sizes = - ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32); - ops::Conv3DBackpropInputV2::Attrs attrs = - ops::Conv3DBackpropInputV2::Attrs() - .DataFormat(data_format) - .Dilations(dilations); - auto conv3d = - ops::Conv3DBackpropInputV2(s.WithOpName("my_conv3d"), input_sizes, - filter, input, strides, padding, attrs); - return conv3d.operation.node()->def(); - } else { - ops::Conv3D::Attrs attrs = - ops::Conv3D::Attrs().DataFormat(data_format).Dilations(dilations); - auto conv3d = ops::Conv3D(s.WithOpName("my_conv3d"), input, filter, - strides, padding, attrs); - return conv3d.operation.node()->def(); - } - }; +struct Conv3DTestParams { + std::vector input_dims; + std::vector input; + std::vector filter_dims; + std::vector filter; + std::vector strides; + string padding; + string data_format; + std::vector dilations; + bool is_conv3d_backprop; + std::vector expected_output_dims; + std::vector expected_output; + bool allow_dynamic_channel_dim; + Status validation_status; +}; + +void TestConv3D(ParameterizedOpConverterTestBase* test, Conv3DTestParams& p) { + test->Reset(); + NodeDef node_def = GetConv3DNodeDef(p.strides, p.padding, p.data_format, + p.dilations, p.is_conv3d_backprop); + + std::vector partial_input_shape; + if (!p.allow_dynamic_channel_dim && + test->get_trt_mode() == TrtTestMode::kDynamicShape) { + // The channel dim cannot have unknown size, fix that. + partial_input_shape.resize(p.input_dims.size(), -1); + int channel_id = (p.data_format == "NCDHW") ? 1 : 4; + partial_input_shape[channel_id] = p.input_dims[channel_id]; + } + + test->AddTestTensor("input", p.input_dims, test->get_tf_type(), p.input, + partial_input_shape); + test->AddTestWeights("weights", p.filter_dims, p.filter); + + if (p.is_conv3d_backprop) { + test->AddTestWeights("input_sizes", + {static_cast(p.expected_output.size())}, + p.expected_output); + } + test->TestOpConverter(node_def, p.expected_output_dims, + /*expected_conversion_status=*/p.validation_status, + /*expected_runtime_status=*/Status::OK(), + /*matcher=*/ElementsAreArray(p.expected_output), + /*out_tf_types=*/{test->get_tf_type()}); +} + +TEST_P(OpConverter_FP32_FP16_Test, ConvertConv3D) { { // Input is weights, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef(); + NodeDef node_def = GetConv3DNodeDef(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + AddTestWeights("input", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + AddTestWeights("weights", {1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"input\" for Conv3D must be a tensor, at my_conv3d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for Conv3D must be a tensor"); } { // Filter is tensor, should fail. 
Reset(); - NodeDef node_def = get_conv3d_nodedef(); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("weights", {3, 3, 1, 1, 3, 3, 1, 1}); + NodeDef node_def = GetConv3DNodeDef(); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); + AddTestTensor("weights", {1, 3, 3, 1}, tf_type_, + CreateVectorIota(9)); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"filter\" for Conv3D must be a constant, at my_conv3d"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"filter\" for Conv3D must be a constant"); } { // Filter is not 5D, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + NodeDef node_def = GetConv3DNodeDef(); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Conv3D expects kernel of dimension 5, at my_conv3d"); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Conv3D expects kernel of dimension 5"); } { // Dilations is not 5D, should fail. Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1}); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights( "weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); // Dimensions, then values RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution dilations field must specify 5 dimensions, at my_conv3d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution dilations field must specify 5 dimensions"); } { // Dilation value is not 1 for channel, should fail. Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 2, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 2, 1, 1, 1}); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv3d"); + "dimensions"); } { // Dilation value is not 1 for channel (NDHWC), should fail. Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", {1, 1, 1, 1, 2}); - AddTestTensor("input", {2, 3, 1}); + GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NDHWC", {1, 1, 1, 1, 2}); + AddTestTensor("input", {1, 2, 3, 1}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation rate must be 1 for batch and channel " - "dimensions, at my_conv3d"); + "dimensions"); } { // Dilation + Conv3DBackpropInputV2, should fail. 
Reset(); - NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", - {1, 1, 2, 1, 1}, true); - AddTestTensor("input", {2, 3, 1}); + NodeDef node_def = GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NDHWC", + {1, 1, 2, 1, 1}, true); + AddTestTensor("input", {1, 2, 3, 1}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); AddTestWeights("input_sizes", {4}, {1, 2, 3, 1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Dilation with Conv3DBackpropInputV2 " - "(conv3d_transpose) is not supported, " - "at my_conv3d"); + "(conv3d_transpose) is not supported"); } { // Asymmetric+ Conv3DBackpropInputV2, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1}, "SAME", "NDHWC", - {1, 1, 1, 1, 1}, true); - AddTestTensor("input", {1, 2, 2, 2}); + NodeDef node_def = GetConv3DNodeDef({1, 1, 1, 1, 1}, "SAME", "NDHWC", + {1, 1, 1, 1, 1}, true); + AddTestTensor("input", {1, 2, 2, 2}, tf_type_, CreateVectorIota(8)); AddTestWeights("weights", {1, 1, 2, 1, 1}, {1, 1}); AddTestWeights("input_sizes", {8}, {1, 2, 3, 4, 5, 6, 7, 8}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "Asymmetric padding with Conv3DBackpropInputV2 " - "(conv3d_transpose) is not supported, at " - "my_conv3d"); + "(conv3d_transpose) is not supported"); } { // Strides is not 5D, should fail. Reset(); - NodeDef node_def = get_conv3d_nodedef({1, 1, 1, 1, 1, 1}, "SAME", "NCDHW", - {1, 1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 2, 2}); + NodeDef node_def = + GetConv3DNodeDef({1, 1, 1, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); + AddTestTensor("input", {1, 2, 2, 2}, tf_type_, CreateVectorIota(8)); AddTestWeights("weights", {1, 1, 2, 1, 1}, {1, 1}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Convolution strides field must specify 5 dimensions, at my_conv3d"); + node_def, absl::StatusCode::kInvalidArgument, + "Convolution strides field must specify 5 dimensions"); } { // Stride value is not 1 for channel, should fail. 
Reset(); NodeDef node_def = - get_conv3d_nodedef({1, 2, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + GetConv3DNodeDef({1, 2, 1, 1, 1}, "SAME", "NCDHW", {1, 1, 1, 1, 1}); + AddTestTensor("input", {1, 1, 2, 3}, tf_type_, CreateVectorIota(6)); AddTestWeights("weights", {3, 3, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Stride must be 1 for batch and channel dimensions, at my_conv3d"); + node_def, absl::StatusCode::kUnimplemented, + "Stride must be 1 for batch and channel dimensions"); + } + + // Start here + std::vector ok_params = { + // Basic - just 1x1 conv - input = output + {/*input_dims=*/{1, 1, 3, 3, 3}, // CDHW + /*input=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, 56, 36, 1, 1, 105, + 1, 16, -28, 1, 42, 9, 3, 1, 7, 1, 11, 61, 5}, + /*filter_dims=*/{1, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_output=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, + 56, 36, 1, 1, 105, 1, 16, -28, 1, + 42, 9, 3, 1, 7, 1, 11, 61, 5}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Basic - 2x1 filter + {/*input_dims=*/{1, 1, 3, 3, 3}, // CDHW + /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 3, 3}, + /*expected_output=*/ + {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // SAME padding (Asymmetric) + {/*input_dims=*/{1, 1, 2, 3, 2}, // CDHW + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{-1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 3, 2}, + // Diff in first 2 depths is const 6. 
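+       // With filter {-1, 1} along depth: out[d0] = -in[d0] + in[d1] = 6 at every position; out[d1] = -in[d1] + 0 (zero pad), giving -6..-11.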
+ /*expected_output=*/{6, 6, 6, 6, 6, 6, -6, -7, -8, -9, -10, -11}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // SAME padding (Symmetric) + {/*input_dims=*/{1, 1, 2, 3, 2}, // CDHW + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{3, 1, 1, 1, 1}, // DRSCK + /*filter=*/{-1, 0, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 3, 2}, + // Swaps front two depths, negates + /*expected_output=*/{6, 7, 8, 9, 10, 11, 0, -1, -2, -3, -4, -5}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK() + + }, + // NDHWC (multi-channel) + {/*input_dims=*/{1, 2, 3, 2, 2}, // DHWC + /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + /*filter_dims=*/{2, 1, 1, 2, 1}, // DRSCK + /*filter=*/{-1, 1, 1, -1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 3, 2, 1}, + /*expected_output=*/{0, 0, 0, 0, 0, 0}, // Filters oppose each-other + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Dilated + {/*input_dims=*/{1, 1, 3, 3, 3}, // CDHW + /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, -10, -10, -10, -10, -10, + -10, -10, -10, -10, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK + /*filter=*/{1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 2, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 1, 3, 3}, + // Only front depth is valid, skips neg values + /*expected_output=*/{8, 8, 8, 8, 8, 8, 8, 8, 8}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Strided + {/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{1, 1, 2, 2, 2}, + // Should only pick up the corners + /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + // Transpose Strided + {/*input_dims=*/{1, 1, 2, 2, 2}, // CDHW + /*input=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/true, + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_output=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, // Cube expands and + 0, 0, 0, 0, 0, 0, 0, 0, 0, // fills center + 5, 0, 6, 0, 0, 0, 7, 0, 8}, // with zeroes + /*allow_dynamic_channel_dim=*/false, + /*validation_status=*/Status::OK()}, + }; + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + ok_params.reserve(ok_params.size() + 2); + const std::vector common_input = CreateVectorIota(3 * 3 * 3); + // NCDHW - Dynamic Channel - Should fail in kDynamicShape + ok_params.push_back(Conv3DTestParams{ + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 1, 2, 2, 2}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + 
/*expected_output_dims=*/{}, // ignore, will fail anyway + /*expected_output=*/{}, // ignore, will fail anyway + /*allow_dynamic_channel_dim=*/true, + /*validation_status=*/ + Status{absl::StatusCode::kInvalidArgument, + "Channel dimension must be static"}}); + // NDHWC - Dynamic Channel - Should fail in kDynamicShape + ok_params.push_back(Conv3DTestParams{ + /*input_dims=*/{1, 3, 3, 3, 1}, + /*input=*/common_input, + /*filter_dims=*/{1, 1, 1, 1, 1}, + /*filter=*/{1}, + /*strides=*/{1, 2, 2, 2, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*dilations=*/{1, 1, 1, 1, 1}, + /*is_conv3d_backprop=*/false, + /*expected_output_dims=*/{}, // ignore, will fail anyway + /*expected_output=*/{}, // ignore, will fail anyway + /*allow_dynamic_channel_dim=*/true, + /*validation_status=*/ + Status{absl::StatusCode::kInvalidArgument, + "Channel dimension must be static"}}); + } + + for (auto p : ok_params) { + TestConv3D(this, p); + } +} + +template +NodeDef CreatePoolOp(DataType tf_type, std::vector ksize, + std::vector strides, string padding, + string data_format) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + typename T::Attrs attrs; + attrs.data_format_ = data_format; + return T(s.WithOpName("my_pool"), input, ksize, strides, padding, attrs) + .operation.node() + ->def(); +} +TEST_P(OpConverter_FP32_Test, ConvertPool) { + // Get nodedef for MaxPool and AvgPool layers (2D or 3D). + auto get_pool_nodedef = + [](DataType tf_type, int nDim, std::vector ksize = {}, + std::vector strides = {}, string padding = "SAME", + string data_format = "", const bool is_max_pooling = true) -> NodeDef { + if (ksize.empty()) { + ksize = nDim == 2 ? std::vector{1, 1, 1, 1} + : std::vector{1, 1, 1, 1, 1}; + } + if (strides.empty()) { + strides = nDim == 2 ? std::vector{1, 1, 1, 1} + : std::vector{1, 1, 1, 1, 1}; + } + if (data_format == "") { + data_format = nDim == 2 ? "NCHW" : "NCDHW"; + } + if (is_max_pooling) { + if (nDim == 3) { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } else { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } + } else { + if (nDim == 3) { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } else { + return CreatePoolOp(tf_type, ksize, strides, padding, + data_format); + } + } + }; + + std::vector test_nDims{2, 3}; + + for (int nDim : test_nDims) { + // Input is weights, should fail. + Reset(); + NodeDef node_def = get_pool_nodedef(tf_type_, nDim); + + AddTestWeights("input", {1, 1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + StrCat("The input \"input\" for ", node_def.op(), " must be a tensor")); } + struct TestParams { std::vector input_dims; std::vector input; - std::vector filter_dims; - std::vector filter; + std::vector ksize; std::vector strides; string padding; string data_format; - std::vector dilations; - bool is_conv3d_backprop_input; std::vector expected_output_dims; - std::vector expected_output; + // The expected outputs for the following operations: MaxPool2D, AvgPool2D, + // MaxPool3D, AvgPool3D + std::vector> expected_outputs; + Status status; + std::set skip_dims; }; - // Start here - const int kConv3DOKCases = 8; - TestParams ok_params[kConv3DOKCases] = { - // Basic - just 1x1 conv - input = output + // We use common_input as the input to test both 2D and 3D pooling operations, + // to simplify TestParams. For 2D operations, only the first 1/3 of the values + // are used. 
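+  // (i.e. the first nine values -4..88; their max is 88 and mean 130/9 = 14.44, while all 27 values have max 105 and mean 459/27 = 17 - these reappear as the 3x3 pooling expectations below)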
+ const std::vector common_input{-4, 2, 15, 3, 6, -3, 22, 1, 88, + 56, 36, 1, 1, 105, 1, 16, -28, 1, + 42, 9, 3, 1, 7, 1, 11, 61, 5}; + // The output of 2D ops for the case where the op is equivalent to the + // identity op. + const std::vector common_2d_output{-4, 2, 15, 3, 6, -3, 22, 1, 88}; + std::vector test_params = { + // Validation failure - kernel size too large for TRT TestParams{ - /*input_dims=*/{1, 3, 3, 3}, // CDHW - /*input=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, 56, 36, 1, 1, 105, - 1, 16, -28, 1, 42, 9, 3, 1, 7, 1, 11, 61, 5}, - /*filter_dims=*/{1, 1, 1, 1, 1}, // DRSCK - /*filter=*/{1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1000, 1000, 1000}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/{1, 2, 15, 3, 6, -3, 22, 1, 88, - 56, 36, 1, 1, 105, 1, 16, -28, 1, - 42, 9, 3, 1, 7, 1, 11, 61, 5}}, - // Basic - 2x1 filter - TestParams{/*input_dims=*/{1, 3, 3, 3}, // CDHW - /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6}, - /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK - /*filter=*/{1, 1}, - /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"VALID", - /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3, 3}, - /*expected_output=*/ - {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7}}, - // SAME padding (Asymmetric) + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds")}, + // Validation failure for 3D ops - negative kernel depth TestParams{ - /*input_dims=*/{1, 2, 3, 2}, // CDHW - /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK - /*filter=*/{-1, 1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, -1, 1, 1}, /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"SAME", + /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3, 2}, - /*expected_output=*/ - {6, 6, 6, 6, 6, 6, -6, -7, -8, -9, -10, - -11} // Diff in first 2 depths is const 6 - }, - // SAME padding (Symmetric) + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds"), + /*skip_dims=*/{2}}, + // Validation failure - negative kernel height TestParams{ - /*input_dims=*/{1, 2, 3, 2}, // CDHW - /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - /*filter_dims=*/{3, 1, 1, 1, 1}, // DRSCK - /*filter=*/{-1, 0, 1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, -1, 1}, /*strides=*/{1, 1, 1, 1, 1}, - /*padding=*/"SAME", + /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 3, 2}, - /*expected_output=*/ - {6, 7, 8, 9, 10, 11, 0, -1, -2, -3, -4, - -5} // Swaps front two depths, negates - }, - - // NDHWC (multi-channel) + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + 
Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds")}, + // Validation failure - negative kernel width TestParams{ - /*input_dims=*/{2, 3, 2, 2}, // DHWC - /*input=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - /*filter_dims=*/{2, 1, 1, 2, 1}, // DRSCK - /*filter=*/{-1, 1, 1, -1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, -1}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", - /*data_format=*/"NDHWC", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 3, 2, 1}, - /*expected_output=*/{0, 0, 0, 0, 0, 0} // Each filter opposes the - // other - }, - - // Dilated + /*data_format=*/"NCDHW", + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}, + /*status=*/ + Status(absl::StatusCode::kInvalidArgument, + "Window dimensions are not within bounds")}, + // Basic - just 1x1 max pooling - input = output TestParams{ - /*input_dims=*/{1, 3, 3, 3}, // CDHW - /*input=*/{1, 1, 1, 1, 1, 1, 1, 1, 1, -10, -10, -10, -10, -10, - -10, -10, -10, -10, 7, 7, 7, 7, 7, 7, 7, 7, 7}, - /*filter_dims=*/{2, 1, 1, 1, 1}, // DRSCK - /*filter=*/{1, 1}, + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, 1}, /*strides=*/{1, 1, 1, 1, 1}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 2, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 1, 3, 3}, - /*expected_output=*/{8, 8, 8, 8, 8, 8, 8, 8, 8} // Only front depth - // is valid, skips - // neg values - }, - // Strided + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}}, + // Basic - just 1x1 max pooling - input = output, SAME padding TestParams{ - /*input_dims=*/{1, 3, 3, 3}, - /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}, - /*filter_dims=*/{1, 1, 1, 1, 1}, - /*filter=*/{1}, - /*strides=*/{1, 1, 2, 2, 2}, - /*padding=*/"VALID", + /*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 1, 1, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"SAME", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/false, - /*expected_output_dims=*/{1, 2, 2, 2}, - /*expected_output=*/{1, 2, 3, 4, 5, 6, 7, 8} // Should only pick up - // the corners - }, - // Transpose Strided - TestParams{/*input_dims=*/{1, 2, 2, 2}, // CDHW - /*input=*/{1, 2, 3, 4, 5, 6, 7, 8}, - /*filter_dims=*/{1, 1, 1, 1, 1}, - /*filter=*/{1}, + /*expected_output_dims=*/{1, 1, 3, 3, 3}, + /*expected_outputs=*/ + {common_2d_output, common_2d_output, common_input, common_input}}, + // 3x3 pooling NCDHW + TestParams{/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/common_input, + /*ksize=*/{1, 1, 3, 3, 3}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NCDHW", + /*expected_output_dims=*/{1, 1, 1, 1, 1}, + /*expected_outputs=*/{{88}, {14.444445}, {105}, {17}}}, + // 3x3 pooling, NDHWC + TestParams{/*input_dims=*/{1, 3, 3, 3, 1}, + /*input=*/common_input, + /*ksize=*/{1, 3, 3, 3, 1}, + /*strides=*/{1, 1, 1, 1, 1}, + /*padding=*/"VALID", + /*data_format=*/"NDHWC", + /*expected_output_dims=*/{1, 1, 1, 1, 1}, + /*expected_outputs=*/{{88}, {14.444445}, {105}, {17}}}, + // Strided + TestParams{/*input_dims=*/{1, 1, 3, 3, 3}, + /*input=*/{1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 6, 
0, 0, 0, 7, 0, 8}, + /*ksize=*/{1, 1, 1, 1, 1}, /*strides=*/{1, 1, 2, 2, 2}, /*padding=*/"VALID", /*data_format=*/"NCDHW", - /*dilations=*/{1, 1, 1, 1, 1}, - /*is_conv3d_backprop_input=*/true, - /*expected_output_dims=*/{1, 3, 3, 3}, - /*expected_output=*/ - {1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 6, 0, 0, 0, 7, 0, 8}}, // Cube - // expands and - // fills - // center with - // zeroes - + /*expected_output_dims=*/{1, 1, 2, 2, 2}, + /*expected_outputs=*/ + {{1, 2, 3, 4}, // Should only pick up the corners + {1, 2, 3, 4}, + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}}}, }; - for (int i = 0; i < kConv3DOKCases; i++) { - Reset(); - NodeDef node_def = get_conv3d_nodedef( - ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, - ok_params[i].dilations, ok_params[i].is_conv3d_backprop_input); - AddTestTensor("input", ok_params[i].input_dims); - AddTestWeights("weights", ok_params[i].filter_dims, - ok_params[i].filter); - if (ok_params[i].is_conv3d_backprop_input) { - AddTestWeights( - "input_sizes", - {static_cast(ok_params[i].expected_output.size())}, - ok_params[i].expected_output); + for (auto p : test_params) { + int test_counter = 0; + for (int nDim : test_nDims) { + if (p.skip_dims.find(nDim) != p.skip_dims.end()) { + continue; + } + auto input = p.input; + auto input_dims = p.input_dims; + auto ksize = p.ksize; + auto strides = p.strides; + auto expected_output_dims = p.expected_output_dims; + std::string data_format = p.data_format; + if (nDim == 2) { + input.resize(9); + data_format = p.data_format == "NDHWC" ? "NHWC" : "NCHW"; + // Remove one of the spatial dimensions + input_dims.erase(input_dims.begin() + 2); + ksize.erase(ksize.begin() + 2); + strides.erase(strides.begin() + 2); + expected_output_dims.erase(expected_output_dims.begin() + 2); + } + for (bool is_max_pooling : {true, false}) { + Reset(); + NodeDef node = get_pool_nodedef(tf_type_, nDim, ksize, strides, + p.padding, data_format, is_max_pooling); + AddTestTensor("input", input_dims, input); + TestOpConverter(node, expected_output_dims, p.status, Status::OK(), + ElementsAreArray(p.expected_outputs.at(test_counter))); + test_counter++; + } } - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv3d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); + } +} - const DataVec input_data{ - {"input", test::AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv3d", - ConstructTensor(ok_params[i].expected_output.size())}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); +TEST_P(OpConverter_FP32_FP16_Test, ConvertTopK) { + // Get the NodeDef for TopKV2. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); + auto topk = ops::TopK(s.WithOpName("my_topk"), input, weights); + const NodeDef& node_def = topk.operation.node()->def(); + { + // K is a tensor, should fail. + Reset(); + AddTestTensor("input", {1, 1, 2, 3}); + AddTestTensor("weights", {1}, DT_INT32, {}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"k\" for TopKV2 must be a constant"); + } + { + // Ok. 
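+    // k=2 on rows {-9, 3, 5, 1, 6} and {-5, 7, 1, 0, -1}: top values {6, 5} and {7, 1} at indices {4, 2} and {1, 2}.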
+ Reset(); + AddTestTensor("input", {1, 1, 2, 5}, {-9, 3, 5, 1, 6, -5, 7, 1, 0, -1}); + AddTestWeights("weights", {1}, {2}); + std::vector> expected_output_dims{{1, 1, 2, 2}, + {1, 1, 2, 2}}; + TestOpConverterMultiOut(node_def, expected_output_dims, Status::OK(), + Status::OK(), + {ElementsAre(6, 5, 7, 1), ElementsAre(4, 2, 1, 2)}, + {tf_type_, DT_INT32}); } } -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) -TEST_F(OpConverterTest, ConvertTopK) { - // TODO(tmorris): This test isn't setting the input dtype properly. TopK with - // int32 is unsupported by TRT. - for (const auto dtype : {DT_FLOAT}) { - // Get the NodeDef for TopKV2. - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); - auto topk = ops::TopK(s.WithOpName("my_topk"), input, weights); - const NodeDef& node_def = topk.operation.node()->def(); - { - // K is a tensor, should fail. - Reset(); - AddTestTensor("input", {1, 2, 3}, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - AddTestTensor("weights", {2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"k\" for TopKV2 must be a constant, at my_topk"); - } - { - // Ok. - Reset(); - AddTestTensor("input", {1, 2, 5}); - AddTestWeights("weights", {1}, {2}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights outputs[2]; - TF_EXPECT_OK(GetTensorOrWeights("my_topk", &outputs[0])); - TF_EXPECT_OK(GetTensorOrWeights("my_topk:1", &outputs[1])); - for (auto& output : outputs) { - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 2}, output.tensor()->getDimensions()); - } +struct DataFormatVecPermuteTestParams { + string dst_format; + string src_format; + std::vector x_shape; + std::vector x; + bool x_is_tensor; + std::vector expected_output; + Status conversion_status; +}; + +NodeDef GetDataFormatVecPermuteNodeDef(string dst_format, string src_format, + std::vector& x_shape) { + Scope s = Scope::NewRootScope(); + PartialTensorShape tensor_shape; + auto x = ops::Placeholder(s.WithOpName("x"), DT_INT32); + const auto attrs = ops::DataFormatVecPermute::Attrs() + .DstFormat(dst_format) + .SrcFormat(src_format); + auto dfvp = ops::DataFormatVecPermute(s.WithOpName("my_dfvp"), x, attrs); + return dfvp.operation.node()->def(); +} + +TEST_P(OpConverter_INT32_Test, ConvertDataFormatVecPermute) { + const auto& error = convert_not_supported_implicit( + string("DataFormatVecPermute"), string("my_dfvp")); + const Status implicit_error = Status{absl::StatusCode::kUnimplemented, error}; + const auto conversion_status = + trt_mode_ == TrtTestMode::kImplicitBatch ? implicit_error : Status::OK(); + std::vector test_params = { + // 1D case with tensor. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4}, + /*x=*/{1, 2, 3, 4}, + /*x_is_tensor=*/true, + /*expected_output=*/{1, 4, 2, 3}, + /*conversion_status=*/conversion_status}, + // 1D case with weights. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4}, + /*x=*/{1, 2, 3, 4}, + /*x_is_tensor=*/false, + /*expected_output=*/{1, 4, 2, 3}, + /*conversion_status=*/conversion_status}, + // 2D case with tensor. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/true, + /*expected_output=*/{1, 2, 7, 8, 3, 4, 5, 6}, + /*conversion_status=*/conversion_status}, + // 2D case with weights. 
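+      // (for the {4, 2} inputs each row holds one of the N, H, W, C sizes; NHWC -> NCHW takes rows in order 0, 3, 1, 2, i.e. (1,2),(3,4),(5,6),(7,8) -> (1,2),(7,8),(3,4),(5,6))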
+ DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/false, + /*expected_output=*/{1, 2, 7, 8, 3, 4, 5, 6}, + /*conversion_status=*/conversion_status}, + // Format of size 5. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCDHW", + /*src_format=*/"NDHWC", + /*x_shape=*/{5, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + /*x_is_tensor=*/true, + /*expected_output=*/{1, 2, 9, 10, 3, 4, 5, 6, 7, 8}, + /*conversion_status=*/conversion_status}, + // Input of size 2: treat the elements as spatial dimensions. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCWH", + /*src_format=*/"NHWC", + /*x_shape=*/{2, 2}, + /*x=*/{1, 2, 3, 4}, + /*x_is_tensor=*/true, + /*expected_output=*/{3, 4, 1, 2}, + /*conversion_status=*/conversion_status}, + // Input of size 3: treat the elements as spatial dimensions. + DataFormatVecPermuteTestParams{/*dst_format=*/"NCHWD", + /*src_format=*/"NDHWC", + /*x_shape=*/{3}, + /*x=*/{1, 2, 3}, + /*x_is_tensor=*/true, + /*expected_output=*/{2, 3, 1}, + /*conversion_status=*/conversion_status}, + // Invalid rank, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{2, 2, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "Input must be a vector or matrix, but got rank 3, at " + "my_dfvp"}}, + // Invalid size for 1D input, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{3}, + /*x=*/{1, 2, 3}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "1D input must be of size 2 or 4, but got size 3, at " + "my_dfvp"}}, + // Invalid first dim for 2D input, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCDHW", + /*src_format=*/"NDHWC", + /*x_shape=*/{4, 2}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "First dimension of 2D input must be of size 3 or 5, " + "but got shape (4, 2), at my_dfvp"}}, + // Invalid second dim for 2D input, should fail. + DataFormatVecPermuteTestParams{ + /*dst_format=*/"NCHW", + /*src_format=*/"NHWC", + /*x_shape=*/{4, 3}, + /*x=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, + /*x_is_tensor=*/true, + /*expected_output=*/{}, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? 
implicit_error + : Status{absl::StatusCode::kInvalidArgument, + "Second dimension of 2D input must be of size 2, but " + "got shape (4, 3), at my_dfvp"}}, + }; - const DataVec input_data{ - {"input", test::AsTensor({-9, 3, 5, 1, 6, -5, 7, 1, 0, -1})}}; - DataVec output_data{{"my_topk", ConstructTensor(4)}, - {"my_topk:1", ConstructTensor(4)}}; - BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(6, 5, 7, 1)); - EXPECT_THAT(GetSpanForData(output_data[1]), - ElementsAre(4, 2, 1, 2)); + for (auto p : test_params) { + Reset(); + const NodeDef node_def = + GetDataFormatVecPermuteNodeDef(p.dst_format, p.src_format, p.x_shape); + + if (p.x_is_tensor) { + AddTestTensor("x", p.x_shape, DT_INT32, p.x, p.x_shape); + } else { + AddTestWeights("x", p.x_shape, p.x, DT_INT32); } + + TestOpConverter(node_def, p.x_shape, p.conversion_status, Status::OK(), + ElementsAreArray(p.expected_output)); } } -template -void TestConvertGather(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - +NodeDef CreateGatherOp(DataType tf_type, int batch_dims) { // Get the NodeDef for GatherV2. Scope s = Scope::NewRootScope(); - auto params = ops::Placeholder(s.WithOpName("params"), dtype); + auto params = ops::Placeholder(s.WithOpName("params"), tf_type); auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32); auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); - auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis); + ops::GatherV2::Attrs op_attrs; + op_attrs.batch_dims_ = batch_dims; + auto gather = + ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis, op_attrs); const NodeDef& node_def = gather.operation.node()->def(); + return node_def; +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertGather) { + auto node_def = CreateGatherOp(tf_type_, /*batch_dims*/ 0); + + { + // Axis is a tensor, should fail. + Reset(); + AddTestTensor("params", {1, 1, 2, 3}, tf_type_, {}); + AddTestTensor("indices", {1, 2}, DT_INT32, {}); + AddTestTensor("axis", {1}, DT_INT32, {}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for GatherV2 must be a constant"); + } + { + // Axis is out of bounds, should fail. + Reset(); + AddTestTensor("params", {1, 1, 2, 3}); + AddTestTensor("indices", {1, 2}, DT_INT32, {}); + AddTestWeights("axis", {1}, {4}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in " + "range [-4, 4)"); + } struct TestParams { // TF shape of the input 'params' (including batch dimension). @@ -4391,407 +7001,536 @@ void TestConvertGather(OpConverterTest* test) { std::vector indices_shape; std::vector indices; int axis; + int batch_dims; // Expected TF shape of the output (including batch dimension). std::vector expected_output_shape; std::vector expected_output; bool params_is_tensor; + bool indices_is_tensor; + Status conversion_status; + Status runtime_status; + Status add_index_status; }; // Input is the same {1, 2, 3, 4, 5, 6} for all cases. - const int kGatherOKCases = 11; - const std::vector params_input = {CType(1), CType(2), CType(3), - CType(4), CType(5), CType(6)}; - TestParams ok_params[kGatherOKCases] = { + const std::vector params_input = {1, 2, 3, 4, 5, 6}; + + std::vector test_params = { + // Axis is batch dimension, should fail in implicit batch mode. 
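+      // (in implicit batch mode the batch dimension is not part of the TRT network, so it cannot be gathered over)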
+ TestParams{/*params_shape=*/{2, 1, 1, 3}, + /*indices_shape=*/{2}, + /*indices=*/{1, 0}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 1, 3}, + /*expected_output=*/{4, 5, 6, 1, 2, 3}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow " + "manipulation of the batch dimension"} + : Status::OK()}, + // Batch size of indices is not 1 when params and indices are tensors. + TestParams{/*params_shape=*/{2, 1, 3}, + /*indices_shape=*/{2, 1}, + /*indices=*/{2, 0}, + /*axis=*/2, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 2, 1}, + /*expected_output=*/{3, 1, 6, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "Params and indices must have a" + " batch size of 1 when params and indices are " + "both tensors or both" + " constants."} + : Status::OK()}, + // Batch size of indices is not 1 when params is tensor and indices are + // constant. + TestParams{/*params_shape=*/{2, 1, 3}, + /*indices_shape=*/{2, 1}, + /*indices=*/{2, 0}, + /*axis=*/2, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 2, 1}, + /*expected_output=*/{3, 1, 6, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/false, + /*conversion_status=*/Status::OK()}, + // Axis is not zero when params is a weight, should fail in implicit batch + // mode. + TestParams{/*params_shape=*/{2, 1, 3}, + /*indices_shape=*/{2}, + /*indices=*/{1, 2}, + /*axis=*/2, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 1, 2}, + /*expected_output=*/{2, 3, 5, 6}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "The input axis must be zero when " + "params is a weight."} + : Status::OK()}, + // Params with only batch dimension. + TestParams{ + /*params_shape=*/{6}, + /*indices_shape=*/{2}, + /*indices=*/{1, 3}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{2}, + /*expected_output=*/{2, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow " + "manipulation of the batch dimension"} + : Status::OK(), + /*runtime_status=*/Status::OK(), + /*add_index_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kInvalidArgument, + batch_size_error("indices", + "Provided batch size does not match " + "converter batch size: 2 vs 6")} + : Status::OK()}, // Vector indices, and output rank is rank(params). TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1}, /*indices=*/{0}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1}, /*expected_output=*/{1, 4}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1}, /*indices=*/{1}, /*axis=*/2, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 1, 3}, /*expected_output=*/{4, 5, 6}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, - // Indices with rank>1, and output rank is rank(params)+rank(indices)-1. 
+ // Indices with rank>1, and output rank is rank(params) + rank(indices) - + // 1 TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 1}, /*indices=*/{0}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 1}, /*expected_output=*/{1, 4}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 1}, /*indices=*/{1}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 1}, /*expected_output=*/{2, 5}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 1}, /*indices=*/{2}, /*axis=*/-1, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 1}, /*expected_output=*/{3, 6}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 1, 2, 3}, /*indices_shape=*/{1, 3}, /*indices=*/{2, 0, 1}, /*axis=*/3, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 1, 3}, /*expected_output=*/{3, 1, 2, 6, 4, 5}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 3, 2}, /*indices_shape=*/{1, 2, 2}, /*indices=*/{0, 0, 1, 0}, /*axis=*/2, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 3, 1, 2, 2}, /*expected_output=*/{1, 1, 2, 1, 3, 3, 4, 3, 5, 5, 6, 5}, /*params_is_tensor=*/true, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{1, 2, 3}, /*indices_shape=*/{1}, /*indices=*/{0}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 2, 3}, /*expected_output=*/{1, 2, 3, 4, 5, 6}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{3, 2}, /*indices_shape=*/{1, 2}, /*indices=*/{0, 1}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 2, 2}, /*expected_output=*/{1, 2, 3, 4}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{2, 3}, /*indices_shape=*/{1, 1, 2}, /*indices=*/{0, 1}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, 3, 4, 5, 6}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, }, TestParams{ /*params_shape=*/{3, 2}, /*indices_shape=*/{2, 2}, /*indices=*/{0, 2, 1, 0}, /*axis=*/0, + /*batch_dims=*/0, /*expected_output_shape=*/{2, 2, 2}, /*expected_output=*/{1, 2, 5, 6, 3, 4, 1, 2}, /*params_is_tensor=*/false, + /*indices_is_tensor=*/true, + }, + // Test cases in which indices constant + TestParams{ + /*params_shape=*/{1, 1, 2, 3}, + /*indices_shape=*/{1, 1}, + /*indices=*/{0}, + /*axis=*/3, + /*batch_dims=*/0, + /*expected_output_shape=*/{1, 1, 2, 1, 1}, + /*expected_output=*/{1, 4}, + /*params_is_tensor=*/true, + /*indices_is_tensor=*/false, }, + // Test cases in which both input and indices constant + TestParams{/*params_shape=*/{1, 2, 3}, + /*indices_shape=*/{1}, + /*indices=*/{0}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{1, 2, 3}, + /*expected_output=*/{1, 2, 3, 4, 5, 6}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/false, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? 
Status{absl::StatusCode::kUnimplemented, + "Params and indices must have a" + " batch size of 1 when params and indices are " + "both tensors or both" + " constants."} + : Status::OK()}, + TestParams{/*params_shape=*/{3, 2}, + /*indices_shape=*/{2, 2}, + /*indices=*/{0, 2, 1, 0}, + /*axis=*/0, + /*batch_dims=*/0, + /*expected_output_shape=*/{2, 2, 2}, + /*expected_output=*/{1, 2, 5, 6, 3, 4, 1, 2}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/false, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "Params and indices must have a" + " batch size of 1 when params and indices are " + "both tensors or both" + " constants."} + : Status::OK()}, + TestParams{ + /*params_shape=*/{2, 3}, + /*indices_shape=*/{2, 2}, + /*indices=*/{0, 1, 1, 2}, + /*axis=*/1, + /*batch_dims=*/1, + /*expected_output_shape=*/{2, 2}, + /*expected_output=*/{1, 2, 5, 6}, + /*params_is_tensor=*/false, + /*indices_is_tensor=*/false, + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "The input axis must be zero when params is a weight."} + : Status::OK()}, }; - // Ok. - for (int i = 0; i < kGatherOKCases; i++) { - test->Reset(); - const auto& params_shape = ok_params[i].params_shape; - if (ok_params[i].params_is_tensor) { - std::vector params_dims(params_shape.begin() + 1, - params_shape.end()); - test->AddTestTensor("params", params_dims, params_shape[0], - TfDataTypeToTrt(dtype)); + for (auto p : test_params) { + Reset(); + + auto node_def = CreateGatherOp(tf_type_, p.batch_dims); + + if (p.params_is_tensor) { + AddTestTensor("params", p.params_shape, params_input); } else { - test->AddTestWeights("params", params_shape, params_input); + AddTestWeights("params", p.params_shape, params_input, tf_type_); } - const auto& indices_shape = ok_params[i].indices_shape; - test->AddTestTensor( - "indices", - std::vector(indices_shape.begin() + 1, indices_shape.end()), - indices_shape[0], nvinfer1::DataType::kINT32); - test->AddTestWeights("axis", {1}, {ok_params[i].axis}); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_gather", &output)); - ASSERT_TRUE(output.is_tensor()); - - const auto& expected_output_shape = ok_params[i].expected_output_shape; - const auto& expected_output = ok_params[i].expected_output; - ASSERT_EQ(expected_output.size(), - TrtWeightDimsNumElements(GetTestDims(expected_output_shape))); - const std::vector expected_output_dims( - expected_output_shape.begin() + 1, expected_output_shape.end()); - ExpectTrtDimsEqualsArray(expected_output_dims, - output.tensor()->getDimensions()); - - // Create input in CType and convert expected output to CType. - std::vector converted_expected_output(expected_output.begin(), - expected_output.end()); - - DataVec input_data; - if (ok_params[i].params_is_tensor) { - input_data = {{"params", test::AsTensor(params_input)}, - {"indices", test::AsTensor(ok_params[i].indices)}}; + if (p.indices_is_tensor) { + AddTestTensor("indices", p.indices_shape, DT_INT32, p.indices, {}, + p.add_index_status); } else { - input_data = {{"indices", test::AsTensor(ok_params[i].indices)}}; + std::vector indices_shape(p.indices_shape); + AddTestWeights("indices", indices_shape, p.indices, DT_INT32); } - DataVec output_data{ - {"my_gather", ConstructTensor(expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, - /*batch_size=*/expected_output_shape[0]); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(converted_expected_output)); + + AddTestWeights("axis", {1}, {p.axis}); + TestOpConverter(node_def, p.expected_output_shape, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); } } -TEST_F(OpConverterTest, ConvertGather) { - // Get the NodeDef for GatherV2. +template +NodeDef CreateReduceOp(DataType tf_type, bool keep_dims) { Scope s = Scope::NewRootScope(); - auto params = ops::Placeholder(s.WithOpName("params"), DT_FLOAT); - auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); - auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis); - const NodeDef& node_def = gather.operation.node()->def(); - { - // Axis is a tensor, should fail. - Reset(); - AddTestTensor("params", {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestTensor("axis", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"axis\" for GatherV2 must be a constant, at my_gather"); - } - { - // Axis is out of bounds, should fail. - Reset(); - AddTestTensor("params", {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestWeights("axis", {1}, {4}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_gather"); - } - { - // Axis is batch dimension, should fail. - Reset(); - AddTestTensor("params", {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestWeights("axis", {1}, {0}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_gather"); - } - { - // Axis is not zero when params is a weight, should fail. - Reset(); - AddTestWeights("params", {1, 3}, {1, 2, 3}); - AddTestTensor("indices", {2}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input axis must be zero when params is a weight."); + typename OpType::Attrs op_attrs; + op_attrs.keep_dims_ = keep_dims; + auto op = OpType(s.WithOpName("my_reduce"), input, axis, op_attrs); + return op.operation.node()->def(); +} + +// Applies reduction op on sub-sequences of input +// output[i] = reduce(input[m * i : m * (i +1)]) +std::vector CalcReduce(string op_name, std::vector input, int m, + float (*op)(float, float), float init) { + std::vector output(input.size() / m); + for (int i = 0; i < output.size(); i++) { + auto begin = input.begin() + i * m; + auto end = input.begin() + (i + 1) * m; + output[i] = std::accumulate(begin, end, init, op); + if (op_name == "Mean") { + output[i] /= m; + } } + return output; +} +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertReduce) { { - // Batch size of indices is not 1 when params is a tensor. + // Input is weights, should fail. 
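+    // (the reduce converters require the data operand to be a tensor; only the axis operand may be a constant, as checked next)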
Reset(); - AddTestTensor("params", {1, 2, 3}, /*batch_size=*/2); - AddTestTensor("indices", {2}, /*batch_size=*/2); + const NodeDef node_def = CreateReduceOp(tf_type_, false); + AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Indices must have a batch size of 1 when params is a tensor."); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"input\" for Sum must be a tensor"); } - - Reset(); - TestConvertGather(this); - TestConvertGather(this); - TestConvertGather(this); -} - -TEST_F(OpConverterTest, ConvertUnary) { { - // Input is weights, should fail. + // Axis is weights, should fail. Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto neg = ops::Neg(s.WithOpName("my_unary"), input); - const NodeDef& node_def = neg.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"x\" for Neg must be a tensor, at my_unary"); + const NodeDef node_def = CreateReduceOp(tf_type_, false); + AddTestTensor("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); + AddTestTensor("axis", {1}, DT_INT32, {1}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for Sum must be a constant"); } - - // Get nodedef for unary layer. - auto get_unary_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "Abs") { - auto unary = ops::Abs(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acos") { - auto unary = ops::Acos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acosh") { - auto unary = ops::Acosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asin") { - auto unary = ops::Asin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asinh") { - auto unary = ops::Asinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atan") { - auto unary = ops::Atan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atanh") { - auto unary = ops::Atanh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Ceil") { - auto unary = ops::Ceil(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cos") { - auto unary = ops::Cos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cosh") { - auto unary = ops::Cosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Exp") { - auto unary = ops::Exp(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Floor") { - auto unary = ops::Floor(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Log") { - auto unary = ops::Log(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Neg") { - auto unary = ops::Neg(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Reciprocal") { - auto unary = 
ops::Reciprocal(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Rsqrt") { - auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sin") { - auto unary = ops::Sin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sinh") { - auto unary = ops::Sinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sqrt") { - auto unary = ops::Sqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Tan") { - auto unary = ops::Tan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); + using OpFunc = std::function; + using ValFunc = float (*)(float, float); + struct ReduceTestDescriptor { + string name; + OpFunc get_node; + ValFunc val_func; + float init_val; }; - // Get expected output for unary layer. - auto get_unary_output = [](string op_name, float input) -> float { - if (op_name == "Abs") { - return std::abs(input); - } else if (op_name == "Acos") { - return std::acos(input); - } else if (op_name == "Acosh") { - return std::acosh(input); - } else if (op_name == "Asin") { - return std::asin(input); - } else if (op_name == "Asinh") { - return std::asinh(input); - } else if (op_name == "Atan") { - return std::atan(input); - } else if (op_name == "Atanh") { - return std::atanh(input); - } else if (op_name == "Ceil") { - return std::ceil(input); - } else if (op_name == "Cos") { - return std::cos(input); - } else if (op_name == "Cosh") { - return std::cosh(input); - } else if (op_name == "Exp") { - return std::exp(input); - } else if (op_name == "Floor") { - return std::floor(input); - } else if (op_name == "Log") { - return std::log(input); - } else if (op_name == "Neg") { - return -input; - } else if (op_name == "Reciprocal") { - return 1.0 / input; - } else if (op_name == "Rsqrt") { - return 1.0 / std::sqrt(input); - } else if (op_name == "Sin") { - return std::sin(input); - } else if (op_name == "Sinh") { - return std::sinh(input); - } else if (op_name == "Sqrt") { - return std::sqrt(input); - } else if (op_name == "Tan") { - return std::tan(input); - } - EXPECT_TRUE(false); - return 0; + std::vector op_test_info{ + {"Sum", CreateReduceOp, [](float x, float y) { return x + y; }, + 0}, + {"Prod", CreateReduceOp, + [](float x, float y) { return x * y; }, 1}, + {"Mean", CreateReduceOp, + [](float x, float y) { return x + y; }, 0}, + {"Min", CreateReduceOp, + [](float x, float y) { return y < x ? y : x; }, 1000}, + {"Max", CreateReduceOp, + [](float x, float y) { return x < y ? 
y : x; }, -1000}}; + + std::vector input_values{1, 2, 3, 4, 5, 6}; + struct TestParams { + std::vector input_dims; + std::vector input_values; + // Helper array contains the same elements as input but permuted in a way + // that the reduction can be calculated over contiguous elements using + // CalcReduce + std::vector helper_array; + std::vector axis; + int stride; // product of input_dims along axis + Status conversion_status; + }; + std::vector params{ + // Out of range tests + TestParams{{2, 3, 1}, input_values, input_values, {3}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-4}, 3}, + // Ok tests + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {0}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {1}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {2}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {0, 1}, 6}, + // Ok tests with negative axis values + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {-3}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {-2}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-1}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {-3, 1}, 6}, }; - // Get list of ops to test. - std::vector ops_to_test; - // Add all ops supported by ConvertUnary. - auto* map = UnaryOperationMap(); - ops_to_test.reserve(map->size()); - for (auto& pair : *map) { - ops_to_test.push_back(pair.first); - } - // Add other unary ops to test. - ops_to_test.push_back("Rsqrt"); - // Ok. - for (const string& op_name : ops_to_test) { - Reset(); - NodeDef node_def = get_unary_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - const std::vector input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_unary", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); ++i) { - const float expected_output = get_unary_output(op_name, input[i]); - EXPECT_THAT(GetSpanForData(output_data[0])[i], - NanSensitiveFloatNear(expected_output, 0.0001)); + for (bool keep_dims : {false, true}) { + for (auto& op : op_test_info) { + VLOG(2) << "Processing " << op.name << " with keep_dims=" << keep_dims; + for (auto p : params) { + SCOPED_TRACE(StrCat(op.name, keep_dims ? " & keep_dims" : "")); + Reset(); + NodeDef node_def = op.get_node(tf_type_, keep_dims); + + AddTestTensor("input", p.input_dims, p.input_values); + AddTestWeights("axis", {static_cast(p.axis.size())}, + p.axis); + std::vector expected_output_dims(p.input_dims); + + // Set expected output dim and conversion error messages + for (int ax : p.axis) { + int rank = p.input_dims.size(); + if (ax >= rank || ax < -rank) { + p.conversion_status = + errors::InvalidArgument("Axis value of ", ax, + " is out of bounds, must be in " + "range [", + -rank, ", ", rank, ")"); + } else { + int ax_positive = ax >= 0 ? ax : ax + rank; + // Zero marks elements that we will remove later. + expected_output_dims[ax_positive] = keep_dims ? 
1 : 0; + if (trt_mode_ == TrtTestMode::kImplicitBatch && + (ax == 0 || ax == -rank)) { + p.conversion_status = errors::Unimplemented( + "TensorRT does not allow manipulation of the batch " + "dimension"); + } + } + } + expected_output_dims.erase(std::remove(expected_output_dims.begin(), + expected_output_dims.end(), 0), + expected_output_dims.end()); + VLOG(2) << "out dims " + << absl::StrCat("[", absl::StrJoin(expected_output_dims, ","), + "]"); + std::vector expected_values = CalcReduce( + op.name, p.helper_array, p.stride, op.val_func, op.init_val); + + if (tf_type_ == DT_INT32) { + // We need to floor the float values in the `expected_values` vector. + std::for_each(expected_values.begin(), expected_values.end(), + [](float& _n) { _n = std::floor(_n); }); + } + + TestOpConverter(node_def, expected_output_dims, p.conversion_status, + Status::OK(), ArrayFloatNear(expected_values)); + } } } } +NodeDef CreateCastOp(DataType tf_type) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + +TEST_P(OpConverter_FP32_UnaryTest, ConvertUnary) { + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("Abs", ops::Abs, std::abs); + ADD_OP("Acos", ops::Acos, std::acos); + ADD_OP("Acosh", ops::Acosh, std::acosh); + ADD_OP("Asin", ops::Asin, std::asin); + ADD_OP("Asinh", ops::Asinh, std::asinh); + ADD_OP("Atan", ops::Atan, std::atan); + ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); + ADD_OP("Ceil", ops::Ceil, std::ceil); + ADD_OP("Cos", ops::Cos, std::cos); + ADD_OP("Cosh", ops::Cosh, std::cosh); + ADD_OP("Exp", ops::Exp, std::exp); + ADD_OP("Erf", ops::Erf, std::erf); + ADD_OP("Floor", ops::Floor, std::floor); + ADD_OP("Log", ops::Log, std::log); + ADD_OP("Neg", ops::Neg, [](float x) { return -x; }); + ADD_OP("Reciprocal", ops::Reciprocal, [](float x) { return 1.0f / x; }); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + ADD_OP("Round", ops::Round, [](float x) { return (float)std::round(x); }); + ADD_OP("Sign", ops::Sign, + [](float x) { return x > 0 ? 1.0f : (x < 0 ? -1.0f : 0.0f); }); +#endif + ADD_OP("Rsqrt", ops::Rsqrt, [](float x) { return 1.0f / std::sqrt(x); }); + ADD_OP("Sin", ops::Sin, std::sin); + ADD_OP("Sinh", ops::Sinh, std::sinh); + ADD_OP("Sqrt", ops::Sqrt, std::sqrt); + ADD_OP("Tan", ops::Tan, std::tan); +#undef ADD_OP + + std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; + RunTests("Unary", *UnaryOperationMap(), op_map, input_values, "x"); +} + +TEST_P(OpConverter_BOOL_Test, ConvertBoolean) { + std::vector input_values{1, 0, 1, 0, 0, 1}; + using OpFunc = std::function; + + using ValFunc = int (*)(int); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("LogicalNot", ops::LogicalNot, [](int x) { return 1 - x; }); +#undef ADD_OP + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + // The test does not actually run for TRT versions less than 8.2 + RunTests("LogicalUnary", *UnaryBooleanOperationMap(), op_map, input_values, + "x"); +#endif +} + // Get the NodeDef for ConcatV2. // TODO(hinsu): Consider switching this to static function. 
auto get_concat_nodedef = [](DataType dtype, int num_inputs) -> NodeDef { Scope s = Scope::NewRootScope(); std::vector values; + values.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { const string input_name = StrCat("values_", i); values.push_back(ops::Placeholder(s.WithOpName(input_name), dtype)); @@ -4802,172 +7541,169 @@ auto get_concat_nodedef = [](DataType dtype, int num_inputs) -> NodeDef { return concat.operation.node()->def(); }; -template -void TestConvertConcat(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertConcat) { + { + // Axis is a tensor, should fail. + Reset(); + NodeDef node_def = get_concat_nodedef(tf_type_, 2); + AddTestTensor("values_0", {1, 1, 2, 3}); + AddTestTensor("values_1", {1, 1, 2, 3}); + AddTestTensor("axis", {1}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for ConcatV2 must be a constant"); + } + { + // Axis is out of bounds, should fail. + Reset(); + NodeDef node_def = get_concat_nodedef(tf_type_, 2); + AddTestTensor("values_0", {1, 1, 2, 3}); + AddTestTensor("values_1", {1, 1, 2, 3}); + AddTestWeights("axis", {1}, {4}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in " + "range [-4, 4)"); + } + { + // Inputs have inconsistent ranks, should fail. + Reset(); + NodeDef node_def = get_concat_nodedef(tf_type_, 2); + AddTestTensor("values_0", {1, 1, 2, 3}); + AddTestTensor("values_1", {1, 1, 6}); + AddTestWeights("axis", {1}, {1}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Received inputs with inconsistent rank"); + } struct TestParams { std::vector> input_shapes; - std::vector> input_values; + std::vector> input_values; + std::vector inputs_are_tensors; int axis; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; + Status conversion_status; + Status run_status; }; - const std::vector> common_input{ - InitTestVector(6), - InitTestVector(6, /*start_value=*/CType(6))}; - // TODO(hinsu): Use std::vector instead of an array to avoid use of explicit - // size. 
- const int kConcatOKCases = 4; - TestParams ok_params[kConcatOKCases] = { + const std::vector> common_input{CreateVectorIota(6), + CreateVectorIota(6, 6)}; + + std::vector params = { { - /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, /*axis=*/1, - /*expected_output_dims=*/{2, 2, 3}, - /*expected_output=*/InitTestVector(12), + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), }, { - /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, /*axis=*/2, - /*expected_output_dims=*/{1, 4, 3}, - /*expected_output=*/InitTestVector(12), + /*expected_output_dims=*/{1, 1, 4, 3}, + /*expected_output=*/CreateVectorIota(12), }, { - /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, /*axis=*/3, - /*expected_output_dims=*/{1, 2, 6}, + /*expected_output_dims=*/{1, 1, 2, 6}, /*expected_output=*/ - {CType(0), CType(1), CType(2), CType(6), CType(7), CType(8), CType(3), - CType(4), CType(5), CType(9), CType(10), CType(11)}, + {0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11}, }, { - /*input_shapes=*/{{1}, {2}, {3}, {1}, {1}, {2}}, + /*input_shapes=*/{{1, 1}, {1, 2}, {1, 3}, {1, 1}, {1, 1}, {1, 2}}, /*input_values=*/ - {{CType(1)}, - {CType(2), CType(3)}, - {CType(4), CType(5), CType(6)}, - {CType(7)}, - {CType(8)}, - {CType(9), CType(10)}}, + {{1}, {2, 3}, {4, 5, 6}, {7}, {8}, {9, 10}}, + /*inputs_are_tensors=*/{true, true, true, true, true, true}, /*axis=*/1, - /*expected_output_dims=*/{10}, + /*expected_output_dims=*/{1, 10}, /*expected_output=*/ - InitTestVector(10, /*start_value=*/CType(1)), + CreateVectorIota(10, /*start_value=*/1), }, - }; + { + // An input is a weight + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, false}, + /*axis=*/1, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "The input \"values_1\" for ConcatV2 must be a tensor") + : Status::OK(), + /*run_status=*/Status::OK(), + }, + { + // An input is a weight + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{false, false}, + /*axis=*/1, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "The input \"values_0\" for ConcatV2 must be a tensor") + : Status::OK(), + /*run_status=*/Status::OK(), + }, + { + // Axis is batch dimension, should fail in implicit batch mode. + /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 2, 3}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, + /*axis=*/0, + /*expected_output_dims=*/{2, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented( + "TensorRT does not allow manipulation of the " + "batch dimension") + : Status::OK(), + }, + { + // Inconsistent input shape, runtime error in dynamic shape mode. 
+ /*input_shapes=*/{{1, 1, 2, 3}, {1, 1, 3, 2}}, + /*input_values=*/common_input, + /*inputs_are_tensors=*/{true, true}, + /*axis=*/1, + /*expected_output_dims=*/{2, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + trt_mode_ != TrtTestMode::kDynamicShape + ? errors::InvalidArgument( + "Received inputs with inconsistent shape") + : Status::OK(), + errors::InvalidArgument(""), + }}; + + for (auto p : params) { + Reset(); + const int num_inputs = p.input_shapes.size(); + EXPECT_EQ(num_inputs, p.input_values.size()); + + NodeDef node_def = get_concat_nodedef(tf_type_, num_inputs); - for (int i = 0; i < kConcatOKCases; ++i) { - test->Reset(); - const int num_inputs = ok_params[i].input_shapes.size(); - EXPECT_EQ(num_inputs, ok_params[i].input_values.size()); - NodeDef node_def = get_concat_nodedef(dtype, num_inputs); // Create inputs. for (int j = 0; j < num_inputs; ++j) { - test->AddTestTensor(StrCat("values_", j), ok_params[i].input_shapes[j], 1, - TfDataTypeToTrt(dtype)); - } - test->AddTestWeights("axis", {1}, {ok_params[i].axis}); - test->RunValidationAndConversion(node_def); + string name = StrCat("values_", j); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_concat", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - // Create input data for tensors. - DataVec input_data; - for (int j = 0; j < num_inputs; ++j) { - input_data.push_back( - {StrCat("values_", j), - test::AsTensor(ok_params[i].input_values[j])}); + if (!p.inputs_are_tensors[j]) { + AddTestWeights(name, p.input_shapes[j], p.input_values[j], tf_type_); + } else { + AddTestTensor(name, p.input_shapes[j], p.input_values[j]); + } } - DataVec output_data{ - {"my_concat", - ConstructTensor(ok_params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); - } -} + AddTestWeights("axis", {1}, {p.axis}); -TEST_F(OpConverterTest, ConvertConcat) { - { - // Axis is a tensor, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 2, 3}); - AddTestTensor("axis", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"axis\" for ConcatV2 must be a constant, at my_concat"); - } - { - // Axis is out of bounds, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 2, 3}); - AddTestWeights("axis", {1}, {4}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_concat"); - } - { - // Axis is batch dimension, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 2, 3}); - AddTestWeights("axis", {1}, {0}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_concat"); - } - { - // Inputs have inconsistent rank, should fail. 
- Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 6}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent rank, at my_concat"); - } - { - // An input is a weight, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestWeights("values_1", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"values_1\" for ConcatV2 must be a tensor, at my_concat"); - } - { - // Inputs have inconsistent non-axis shapes, should fail. - Reset(); - NodeDef node_def = get_concat_nodedef(DT_FLOAT, 2); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 3, 2}); - AddTestWeights("axis", {1}, {1}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent shape, at my_concat"); + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.run_status, ElementsAreArray(p.expected_output)); } - - TestConvertConcat(this); - TestConvertConcat(this); - // TODO(tmorris): Enable once TRT adds support. - // TestConvertConcat(this); } // Get the NodeDef for Split. @@ -4992,13 +7728,12 @@ void TestConvertSplit(OpConverterTest* test) { std::vector> expected_outputs; }; - const std::vector common_input = InitTestVector(6); - const int kSplitOKCases = 4; - TestParams ok_params[kSplitOKCases] = { + const std::vector common_input = CreateVectorIota(6); + std::vector ok_params = { // Identity (num_split = 1) {/*input_shape=*/{1, 2, 3}, /*value=*/common_input, /*axis=*/1, /*num_split=*/1, /*expected_output_dims=*/{1, 2, 3}, - /*expected_outputs=*/{InitTestVector(6)}}, + /*expected_outputs=*/{CreateVectorIota(6)}}, {/*input_shape=*/{1, 2, 3}, /*value=*/common_input, /*axis=*/3, @@ -5024,16 +7759,17 @@ void TestConvertSplit(OpConverterTest* test) { /*num_split=*/2, /*expected_output_dims=*/{1, 3}, /*expected_outputs=*/ - {InitTestVector(3), InitTestVector(3, CType(3))}}, + {CreateVectorIota(3), CreateVectorIota(3, CType(3))}}, }; - for (int i = 0; i < kSplitOKCases; ++i) { + for (int i = 0; i < ok_params.size(); ++i) { test->Reset(); NodeDef node_def = get_split_nodedef(dtype, ok_params[i].num_split); // Create inputs. test->AddTestWeights("axis", {1}, {ok_params[i].axis}); - test->AddTestTensor("value", ok_params[i].input_shape, 1, - TfDataTypeToTrt(dtype)); + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(dtype, &trt_type)); + test->AddTestTensor("value", ok_params[i].input_shape, 1, trt_type); // Convert. test->RunValidationAndConversion(node_def); @@ -5045,20 +7781,18 @@ void TestConvertSplit(OpConverterTest* test) { const string name = j == 0 ? StrCat("my_split") : StrCat("my_split:", j); TF_EXPECT_OK(test->GetTensorOrWeights(name, &outputs[j])); EXPECT_TRUE(outputs[j].is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - outputs[j].tensor()->getDimensions()); + EXPECT_THAT(outputs[j].tensor()->getDimensions(), + DimsAreArray(ok_params[i].expected_output_dims)); // Create buffer to store output. output_data.push_back( - {name, - ConstructTensor(ok_params[i].expected_outputs[j].size())}); + {name, test->ConstructTensor( + ok_params[i].expected_outputs[j].size())}); } // Verify output values are correct. 
const DataVec input_data{ - {"value", test::AsTensor(ok_params[i].value)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + {"value", test->AsTensor(ok_params[i].value)}}; + TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); for (int j = 0; j < outputs.size(); ++j) { EXPECT_THAT(GetSpanForData(output_data[j]), ElementsAreArray(ok_params[i].expected_outputs[j])); @@ -5074,8 +7808,8 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestTensor("axis", {1}); AddTestTensor("value", {1, 2, 3}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"axis\" for Split must be a constant, at my_split"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"axis\" for Split must be a constant"); } { // Axis is out of bounds, should fail. @@ -5083,9 +7817,9 @@ TEST_F(OpConverterTest, ConvertSplit) { NodeDef node_def = get_split_nodedef(DT_FLOAT, 1); AddTestWeights("axis", {1}, {4}); AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_split"); + "range [-4, 4)"); } { // Axis is out of bounds (negative), should fail. @@ -5093,9 +7827,9 @@ TEST_F(OpConverterTest, ConvertSplit) { NodeDef node_def = get_split_nodedef(DT_FLOAT, 1); AddTestWeights("axis", {1}, {-5}); AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, "Axis value of -5 is out of bounds, must be in " - "range [-4, 4), at my_split"); + "range [-4, 4)"); } { // Axis is batch dimension, should fail. @@ -5103,9 +7837,9 @@ TEST_F(OpConverterTest, ConvertSplit) { NodeDef node_def = get_split_nodedef(DT_FLOAT, 1); AddTestWeights("axis", {1}, {0}); AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "TensorRT does not allow manipulation of the " - "batch dimension, at my_split"); + "batch dimension"); } { // Value is a weight, should fail. @@ -5114,8 +7848,8 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestWeights("axis", {1}, {1}); AddTestWeights("value", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"value\" for Split must be a tensor, at my_split"); + node_def, absl::StatusCode::kUnimplemented, + "The input \"value\" for Split must be a tensor"); } { // Dim is not evenly divisibly by num_split, should fail. @@ -5124,8 +7858,8 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestWeights("axis", {1}, {3}); AddTestTensor("value", {1, 2, 3}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 3 of size 3 is not evenly divisble by 2, at my_split"); + node_def, absl::StatusCode::kInvalidArgument, + "Dimension 3 of size 3 is not evenly divisible by 2"); } { // num_split > dim size, should fail. 
@@ -5134,15 +7868,13 @@ TEST_F(OpConverterTest, ConvertSplit) { AddTestWeights("axis", {1}, {3}); AddTestTensor("value", {1, 2, 3}); RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 3 of size 3 is not evenly divisble by 4, at my_split"); + node_def, absl::StatusCode::kInvalidArgument, + "Dimension 3 of size 3 is not evenly divisible by 4"); } TestConvertSplit(this); TestConvertSplit(this); -#if IS_TRT_VERSION_GE(5, 1, 3, 1) TestConvertSplit(this); -#endif } // Get the NodeDef for Unpack (Unstack in TF API). @@ -5155,164 +7887,174 @@ auto get_unpack_nodedef = [](DataType dtype, int num, int axis) -> NodeDef { return unstack.operation.node()->def(); }; -template -void TestConvertUnpack(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +struct UnpackTestParams { + std::vector input_shape; + std::vector input_value; + int axis; + int num; + std::vector expected_output_dims; + std::vector> expected_outputs; + Status run_status; +}; - struct TestParams { - std::vector input_shape; - std::vector value; - int axis; - int num; - std::vector expected_output_dims; - std::vector> expected_outputs; - }; +void TestConvertUnpack(ParameterizedOpConverterTestBase* test, + UnpackTestParams& p) { + test->Reset(); + NodeDef node_def = get_unpack_nodedef(test->get_tf_type(), p.num, p.axis); + // Create inputs. + test->AddTestTensor("value", p.input_shape, test->get_tf_type(), + p.input_value); + + std::vector>> matcher_vec; + std::vector datatype_vec; + std::vector> expected_output_dims; + + for (int j = 0; j < p.expected_outputs.size(); ++j) { + matcher_vec.push_back(ElementsAreArray(p.expected_outputs[j])); + datatype_vec.push_back(test->get_tf_type()); + expected_output_dims.push_back(p.expected_output_dims); + } + + test->TestOpConverterMultiOut(/*node_def=*/node_def, + /*expected_output_dims=*/expected_output_dims, + /*expected_conversion_status=*/p.run_status, + /*expected_runtime_status=*/p.run_status, + /*matcher=*/matcher_vec, + /*out_tf_type=*/datatype_vec); +} - const std::vector common_input = InitTestVector(6); - const int kUnpackOKCases = 4; - TestParams ok_params[kUnpackOKCases] = { - {/*input_shape=*/{1, 2, 3}, /*value=*/common_input, /*axis=*/1, - /*num=*/1, /*expected_output_dims=*/{2, 3}, - /*expected_outputs=*/{InitTestVector(6)}}, - {/*input_shape=*/{1, 2, 3}, - /*value=*/common_input, - /*axis=*/3, +// TODO: Reactivate when INT32 Segfault fixed +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertUnpack) { + // We need to skip error testing for Dynamic Shape mode, as it is impossible + // to convert Unpack in Dynamic Shape Mode. + if (trt_mode_ != TrtTestMode::kDynamicShape) { + { + // Value is weights, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/3, /*axis=*/3); + AddTestWeights("value", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"value\" for Unpack must be a tensor"); + } + { + // Axis is out of bounds, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/4); + AddTestTensor("value", {1, 1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of 4 is out of bounds, must be in " + "range [-4, 4)"); + } + { + // Axis is out of bounds (negative), should fail. 
+ Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/-5); + AddTestTensor("value", {1, 1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kInvalidArgument, + "Axis value of -5 is out of bounds, must be " + "in range [-4, 4)"); + } + { + if (trt_mode_ != TrtTestMode::kExplicitBatch) { + // Axis is batch dimension, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/0); + AddTestTensor("value", {1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of " + "the batch dimension"); + } + } + { + // Dim size does not match num, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/5, /*axis=*/2); + AddTestTensor("value", {1, 1, 6}); + RunValidationAndConversion( + node_def, absl::StatusCode::kInvalidArgument, + "Dimension 2 has size 6 which is not equal to num of 5"); + } + { + // Output would be TF scalar, should fail. + Reset(); + NodeDef node_def = get_unpack_nodedef(tf_type_, /*num=*/1, /*axis=*/0); + AddTestTensor( + "value", {}, tf_type_, {}, {}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::InvalidArgument( + "removing first dim requires explicit batch dimension") + : Status::OK()); + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + RunValidationAndConversion( + node_def, absl::StatusCode::kInternal, + "Failed to convert at least one input to a TRT_TensorOrWeights: " + "Scalar input tensor is not supported since the first dimension is " + "treated as batch dimension by TRT"); + } else { + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "Input \"value\" for Unpack must be rank 2 " + "or greater"); + } + } + } + + const std::vector common_input = CreateVectorIota(6); + + Status run_status = + trt_mode_ == TrtTestMode::kDynamicShape + ? 
errors::InvalidArgument( + "The argument `strided_slice_spec` is " + "`absl::nullopt` with `dynamic_input_size_indices` non empty.") + : Status::OK(); + + std::vector params = { + {/*input_shape=*/{1, 1, 2, 1, 3, 1}, + /*input_value=*/common_input, + /*axis=*/4, /*num=*/3, - /*expected_output_dims=*/{1, 2}, - /*expected_outputs=*/ - {{CType(0), CType(3)}, {CType(1), CType(4)}, {CType(2), CType(5)}}}, - {/*input_shape=*/{6, 1}, - /*value=*/common_input, + /*expected_output_dims=*/{1, 1, 2, 1, 1}, + /*expected_outputs=*/{{0, 3}, {1, 4}, {2, 5}}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 1, 2, 1, 3}, + /*input_value=*/common_input, + /*axis=*/4, + /*num=*/3, + /*expected_output_dims=*/{1, 1, 2, 1}, + /*expected_outputs=*/{{0, 3}, {1, 4}, {2, 5}}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 1, 2, 3}, + /*input_value=*/common_input, + /*axis=*/1, + /*num=*/1, + /*expected_output_dims=*/{1, 2, 3}, + /*expected_outputs=*/{CreateVectorIota(6)}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 6, 1}, + /*input_value=*/common_input, /*axis=*/-2, /*num=*/6, - /*expected_output_dims=*/{1}, - /*expected_outputs=*/ - {{CType(0)}, - {CType(1)}, - {CType(2)}, - {CType(3)}, - {CType(4)}, - {CType(5)}}}, - {/*input_shape=*/{6}, - /*value=*/common_input, + /*expected_output_dims=*/{1, 1}, + /*expected_outputs=*/{{0}, {1}, {2}, {3}, {4}, {5}}, + /*run_status=*/run_status}, + {/*input_shape=*/{1, 6}, + /*input_value=*/common_input, /*axis=*/1, /*num=*/6, - /*expected_output_dims=*/{}, - /*expected_outputs=*/ - {{CType(0)}, - {CType(1)}, - {CType(2)}, - {CType(3)}, - {CType(4)}, - {CType(5)}}}, + /*expected_output_dims=*/{1}, + /*expected_outputs=*/{{0}, {1}, {2}, {3}, {4}, {5}}, + /*run_status=*/run_status}, }; - - for (int i = 0; i < kUnpackOKCases; ++i) { - test->Reset(); - NodeDef node_def = - get_unpack_nodedef(dtype, ok_params[i].num, ok_params[i].axis); - // Create inputs. - test->AddTestTensor("value", ok_params[i].input_shape, 1, - TfDataTypeToTrt(dtype)); - // Convert. - test->RunValidationAndConversion(node_def); - - // Get output tensors and verify output dims. - EXPECT_EQ(ok_params[i].expected_outputs.size(), ok_params[i].num); - std::vector outputs(ok_params[i].num); - DataVec output_data; - for (int j = 0; j < outputs.size(); ++j) { - const string name = j == 0 ? "my_unpack" : StrCat("my_unpack:", j); - TF_EXPECT_OK(test->GetTensorOrWeights(name, &outputs[j])); - EXPECT_TRUE(outputs[j].is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - outputs[j].tensor()->getDimensions()); - // Create buffer to store output. - output_data.push_back( - {name, - ConstructTensor(ok_params[i].expected_outputs[j].size())}); - } - - // Verify output values are correct. - const DataVec input_data{ - {"value", test::AsTensor(ok_params[i].value)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - for (int j = 0; j < outputs.size(); ++j) { - EXPECT_THAT(GetSpanForData(output_data[j]), - ElementsAreArray(ok_params[i].expected_outputs[j])); - } - } -} - -TEST_F(OpConverterTest, ConvertUnpack) { - { - // Value is weights, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/3, /*axis=*/3); - AddTestWeights("value", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"value\" for Unpack must be a tensor, at my_unpack"); - } - { - // Axis is out of bounds, should fail. 
- Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/4); - AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of 4 is out of bounds, must be in " - "range [-4, 4), at my_unpack"); - } - { - // Axis is out of bounds (negative), should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/-5); - AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in " - "range [-4, 4), at my_unpack"); - } - { - // Axis is batch dimension, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/0); - AddTestTensor("value", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_unpack"); - } - { - // Dim size does not match num, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/5, /*axis=*/2); - AddTestTensor("value", {1, 6}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Dimension 2 has size 6 which is not equal to num of 5, at my_unpack"); - } - { - // Output would be TF scalar, should fail. - Reset(); - NodeDef node_def = get_unpack_nodedef(DT_FLOAT, /*num=*/1, /*axis=*/0); - AddTestTensor("value", {}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Input \"value\" for Unpack must be rank 2 or greater, at my_unpack"); - } - - TestConvertUnpack(this); - TestConvertUnpack(this); -#if IS_TRT_VERSION_GE(5, 1, 3, 1) - TestConvertUnpack(this); -#endif + for (auto p : params) { + TestConvertUnpack(this, p); + } } // Get the NodeDef for Pack. NodeDef GetPackNodeDef(DataType dtype, int num_inputs, int axis) { Scope s = Scope::NewRootScope(); std::vector values; + values.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { const string input_name = StrCat("values_", i); values.push_back(ops::Placeholder(s.WithOpName(input_name), dtype)); @@ -5324,154 +8066,165 @@ NodeDef GetPackNodeDef(DataType dtype, int num_inputs, int axis) { return pack.operation.node()->def(); } -template -void TestConvertPack(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertPack) { struct TestParams { std::vector> input_shapes; - std::vector> input_values; + std::vector> partial_input_shapes; + std::vector> input_values; int axis; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; + Status conversion_status; + Status runtime_status; + bool input_1_is_weight; }; - const std::vector> common_input{ - InitTestVector(6), - InitTestVector(6, /*start_value=*/CType(6))}; + const std::vector> common_input{ + CreateVectorIota(6), + CreateVectorIota(6, /*start_value=*/6)}; std::vector params = { + // Second input is weight, should fail in implicit batch mode + {/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "The input \"values_1\" for Pack must be a tensor"} + : Status::OK(), + /*runtime_status*/ Status::OK(), + /*weight_input*/ true}, + // Axis is out of bounds, should fail. 
+ { + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/-5, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + Status{absl::StatusCode::kInvalidArgument, + "Axis value of -5 is out of bounds, must be in" + " range [-4, 4)"}, + }, + // Axis is batch dimension, should fail in implicit batch mode. + {/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/-4, + /*expected_output_dims=*/{2, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(12), + trt_mode_ == TrtTestMode::kImplicitBatch + ? Status{absl::StatusCode::kUnimplemented, + "TensorRT does not allow manipulation of the batch " + "dimension"} + : Status::OK()}, + // Inconsistent rank, should fail. + { + /*input_shapes=*/{{1, 2, 3}, {1, 6}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{}, + /*expected_output=*/{}, + Status{absl::StatusCode::kInvalidArgument, + "Received inputs with inconsistent rank"}, + }, { - /*input_shapes=*/{{2, 3}, {2, 3}}, + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, /*input_values=*/common_input, /*axis=*/1, - /*expected_output_dims=*/{2, 2, 3}, - /*expected_output=*/InitTestVector(12), + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/CreateVectorIota(12), }, { - /*input_shapes=*/{{2, 3}, {2, 3}}, + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, /*input_values=*/common_input, /*axis=*/2, - /*expected_output_dims=*/{2, 2, 3}, + /*expected_output_dims=*/{1, 2, 2, 3}, /*expected_output=*/ - {CType(0), CType(1), CType(2), CType(6), CType(7), CType(8), CType(3), - CType(4), CType(5), CType(9), CType(10), CType(11)}, + {0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11}, }, { - /*input_shapes=*/{{2, 3}, {2, 3}}, + /*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{}, {}}, /*input_values=*/common_input, /*axis=*/3, - /*expected_output_dims=*/{2, 3, 2}, + /*expected_output_dims=*/{1, 2, 3, 2}, /*expected_output=*/ - {CType(0), CType(6), CType(1), CType(7), CType(2), CType(8), CType(3), - CType(9), CType(4), CType(10), CType(5), CType(11)}, + {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}, }, { - /*input_shapes=*/{{2, 3}}, - /*input_values=*/{InitTestVector(6)}, + /*input_shapes=*/{{1, 2, 3}}, + /*partial_input_shapes=*/{{}}, + /*input_values=*/{CreateVectorIota(6)}, /*axis=*/1, - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/InitTestVector(6), + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/CreateVectorIota(6), }, { - /*input_shapes=*/{{2, 3}}, - /*input_values=*/{InitTestVector(6)}, + /*input_shapes=*/{{1, 2, 3}}, + /*partial_input_shapes=*/{{}}, + /*input_values=*/{CreateVectorIota(6)}, /*axis=*/2, - /*expected_output_dims=*/{2, 1, 3}, - /*expected_output=*/InitTestVector(6), + /*expected_output_dims=*/{1, 2, 1, 3}, + /*expected_output=*/CreateVectorIota(6), }, }; - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - const int num_inputs = params[i].input_shapes.size(); - EXPECT_EQ(num_inputs, params[i].input_values.size()); - - NodeDef node_def = GetPackNodeDef(dtype, num_inputs, params[i].axis); + // Inputs have inconsistent shapes, should fail. 
+ if (trt_mode_ != TrtTestMode::kDynamicShape) { + params.push_back( + TestParams{/*input_shapes=*/{{1, 2, 3}, {1, 3, 2}}, + /*partial_input_shapes=*/{{}, {}}, + /*input_values=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{}, + /*expected_output=*/CreateVectorIota(12), + Status{absl::StatusCode::kInvalidArgument, + "Received inputs with inconsistent shape"}}); + } else { + // In dynamic shape mode we cannot catch inconsistent shapes at conversion + // time, only during runtime. But TensorRT does not raise a proper runtime + // error, instead it aborts the program with the following message: + // Assertion failed: t->start.d[i] + t->extent.d[i] <= r.dims.d[i] + // ../builder/cudnnBuilderGraph.cpp:862 + // Aborting... + // TODO(tfeher) Add dynamic shapes test once TRT handles shape error + // decently + } + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Test with mixed dynamic / static shape input tensors + params.push_back( + TestParams{/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, + /*partial_input_shapes=*/{{-1, -1, -1}, {1, 2, 3}}, + /*input_values=*/common_input, + /*axis=*/2, + /*expected_output_dims=*/{1, 2, 2, 3}, + /*expected_output=*/ + {0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11}}); + } + for (auto p : params) { + Reset(); + const int num_inputs = p.input_shapes.size(); + EXPECT_EQ(num_inputs, p.input_values.size()); + + NodeDef node_def = GetPackNodeDef(tf_type_, num_inputs, p.axis); // Create inputs. for (int j = 0; j < num_inputs; ++j) { - test->AddTestTensor(StrCat("values_", j), params[i].input_shapes[j], 1, - TfDataTypeToTrt(dtype)); - } - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_pack", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - // Create input data for tensors. - DataVec input_data; - for (int j = 0; j < num_inputs; ++j) { - input_data.push_back({StrCat("values_", j), - test::AsTensor(params[i].input_values[j])}); + if (j == 1 && p.input_1_is_weight) { + AddTestWeights(StrCat("values_", j), p.input_shapes[j], + p.input_values[j], tf_type_); + } else { + AddTestTensor(StrCat("values_", j), p.input_shapes[j], tf_type_, + p.input_values[j], p.partial_input_shapes[j]); + } } - DataVec output_data{ - {"my_pack", ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -TEST_F(OpConverterTest, ConvertPack) { - { - // An input is a weight, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/1); - AddTestTensor("values_0", {1, 2, 3}); - AddTestWeights("values_1", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"values_1\" for Pack must be a tensor, at my_pack"); - } - { - // Axis is out of bounds, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/-5); - AddTestTensor("values_0", {2, 3}); - AddTestTensor("values_1", {2, 3}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Axis value of -5 is out of bounds, must be in " - "range [-4, 4), at my_pack"); - } - { - // Axis is batch dimension, should fail. 
- Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/-4); - AddTestTensor("values_0", {2, 3}); - AddTestTensor("values_1", {2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the " - "batch dimension, at my_pack"); - } - { - // Inputs have inconsistent rank, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/1); - AddTestTensor("values_0", {1, 2, 3}); - AddTestTensor("values_1", {1, 6}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent rank, at my_pack"); - } - { - // Inputs have inconsistent shapes, should fail. - Reset(); - NodeDef node_def = GetPackNodeDef(DT_FLOAT, 2, /*axis=*/1); - AddTestTensor("values_0", {1, 2}); - AddTestTensor("values_1", {2, 2}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "Received inputs with inconsistent shape, at my_pack"); + TestOpConverter(node_def, p.expected_output_dims, p.conversion_status, + p.runtime_status, ElementsAreArray(p.expected_output)); } - - TestConvertPack(this); - TestConvertPack(this); - - // TODO(hinsu): Enable INT32 with TensorRT version 5.1.3 after testing. - // TestConvertPack(this); } // Get the NodeDef for ArgMin or ArgMax. @@ -5485,134 +8238,160 @@ NodeDef GetArgMinMaxNodeDef(DataType input_dtype, DataType output_dtype) { return arg.operation.node()->def(); } -template -void TestConvertArgMinMax(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +struct ArgMinMaxTestParams { + std::vector input_shape; + std::vector input_value; + int axis; + std::vector expected_output_dims; + std::vector expected_argmax_output; + std::vector expected_argmin_output; + Status status; +}; - struct TestParams { - std::vector input_shape; - std::vector input_value; - int axis; - std::vector expected_output_dims; - std::vector expected_argmax_output; - std::vector expected_argmin_output; - }; +template +void TestConvertArgMinMax(ParameterizedOpConverterTestBase* test, + DataType _tf_type, ArgMinMaxTestParams& p) { + test->Reset(); - const std::vector common_input = InitTestVector(6); - std::vector params = { + NodeDef node_def = GetArgMinMaxNodeDef(_tf_type, + /*output_dtype=*/DT_INT32); + + std::vector expected_out; + if (node_def.op() == "ArgMax") { + expected_out = p.expected_argmax_output; + } else if (node_def.op() == "ArgMin") { + expected_out = p.expected_argmin_output; + } else { + ASSERT_TRUE(false); + } + + test->AddTestTensor("input", p.input_shape, _tf_type, p.input_value); + test->AddTestWeights("dimension", {1}, {p.axis}, DT_INT32); + + test->TestOpConverter(node_def, p.expected_output_dims, + /*expected_conversion_status=*/p.status, + /*expected_runtime_status=*/Status::OK(), + /*matcher=*/ElementsAreArray(expected_out), {DT_INT32}); +} + +TEST_P(OpConverter_FP32_FP16_Test, ConvertArgMinMax) { + { + // Dimension is a tensor, should fail. + Reset(); + NodeDef node_def = + GetArgMinMaxNodeDef(tf_type_, + /*output_dtype=*/DT_INT32); + AddTestTensor("input", {1, 2, 3}); + AddTestTensor("dimension", {1}); + RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "The input \"dimension\" for ArgMax must be a constant"); + } + { + // Output type is INT64, should fail. 
+ Reset(); + NodeDef node_def = + GetArgMinMaxNodeDef(tf_type_, + /*output_dtype=*/DT_INT64); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("dimension", {1}, {3}, DT_INT32); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "Output type int64 is not supported"); + } + + const std::vector common_input = CreateVectorIota(6); + std::vector params = { + {/*input_shape=*/{2, 3}, + /*input_value=*/common_input, + /*axis=*/0, + /*expected_output_dims=*/{3}, + /*expected_argmax_output=*/{1, 1, 1}, + /*expected_argmin_output=*/{0, 0, 0}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::Unimplemented("TensorRT does not allow manipulation of " + "the batch dimension") + : Status::OK()}, + { + /*input_shape=*/{1, 6}, + /*input_value=*/common_input, + /*axis=*/1, + /*expected_output_dims=*/{1}, + /*expected_argmax_output=*/{5}, + /*expected_argmin_output=*/{0}, + }, + { + /*input_shape=*/{1, 10}, + /*input_value=*/ + {-5.0f, 3.0f, 5.0f, 1.0f, 6.0f, -9.0f, 7.0f, 1.0f, 0.0f, -1.0f}, + /*axis=*/-1, + /*expected_output_dims=*/{1}, + /*expected_argmax_output=*/{6}, + /*expected_argmin_output=*/{5}, + }, { - /*input_shape=*/{2, 3}, + /*input_shape=*/{1, 2, 3}, /*input_value=*/common_input, /*axis=*/2, - /*expected_output_dims=*/{2}, + /*expected_output_dims=*/{1, 2}, /*expected_argmax_output=*/{2, 2}, /*expected_argmin_output=*/{0, 0}, }, { - /*input_shape=*/{2, 3}, + /*input_shape=*/{1, 2, 3}, /*input_value=*/common_input, /*axis=*/-2, - /*expected_output_dims=*/{3}, + /*expected_output_dims=*/{1, 3}, /*expected_argmax_output=*/{1, 1, 1}, /*expected_argmin_output=*/{0, 0, 0}, }, { - /*input_shape=*/{6}, + /*input_shape=*/{1, 2, 1, 3}, /*input_value=*/common_input, - /*axis=*/1, - /*expected_output_dims=*/{}, - /*expected_argmax_output=*/{5}, - /*expected_argmin_output=*/{0}, + /*axis=*/3, + /*expected_output_dims=*/{1, 2, 1}, + /*expected_argmax_output=*/{2, 2}, + /*expected_argmin_output=*/{0, 0}, }, { - /*input_shape=*/{10}, - /*input_value=*/ - {CType(-5), CType(3), CType(5), CType(1), CType(6), CType(-9), - CType(7), CType(1), CType(0), CType(-1)}, - /*axis=*/-1, - /*expected_output_dims=*/{}, - /*expected_argmax_output=*/{6}, - /*expected_argmin_output=*/{5}, + /*input_shape=*/{1, 2, 1, 3}, + /*input_value=*/common_input, + /*axis=*/-3, + /*expected_output_dims=*/{1, 1, 3}, + /*expected_argmax_output=*/{1, 1, 1}, + /*expected_argmin_output=*/{0, 0, 0}, + }, + {/*input_shape=*/{1, 2, 1, 1, 3}, + /*input_value=*/common_input, + /*axis=*/4, + /*expected_output_dims=*/{1, 2, 1, 1}, + /*expected_argmax_output=*/{2, 2}, + /*expected_argmin_output=*/{0, 0}, +#if !IS_TRT_VERSION_GE(7, 0, 0, 11) + errors::Unimplemented("op is not able to support tensors with 4+" + " dimensions (excluding batch size)") +#else + Status::OK() +#endif + }, + {/*input_shape=*/{1, 2, 1, 1, 3}, + /*input_value=*/common_input, + /*axis=*/-4, + /*expected_output_dims=*/{1, 1, 1, 3}, + /*expected_argmax_output=*/{1, 1, 1}, + /*expected_argmin_output=*/{0, 0, 0}, +#if !IS_TRT_VERSION_GE(7, 0, 0, 11) + errors::Unimplemented("op is not able to support tensors with 4+" + " dimensions (excluding batch size)") +#else + Status::OK() +#endif }, }; - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetArgMinMaxNodeDef(dtype, DT_INT32); - // Create inputs. 
- test->AddTestTensor("input", params[i].input_shape, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - test->AddTestWeights("dimension", {1}, {params[i].axis}); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_arg", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - // Create input data for tensors. - const DataVec input_data{ - {"input", test::AsTensor(params[i].input_value)}}; - DataVec output_data{ - {"my_arg", - ConstructTensor(params[i].expected_argmax_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - - if (node_def.op() == "ArgMax") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_argmax_output)); - } else if (node_def.op() == "ArgMin") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_argmin_output)); - } else { - ASSERT_TRUE(false); - } - } -} - -TEST_F(OpConverterTest, ConvertArgMinMax) { - { - // Dimension is a tensor, should fail. - Reset(); - NodeDef node_def = GetArgMinMaxNodeDef(DT_FLOAT, DT_INT32); - AddTestTensor("input", {1, 2, 3}); - AddTestTensor("dimension", {1}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"dimension\" for ArgMax must be a constant, at my_arg"); - } - { - // Output type is INT64, should fail. - Reset(); - NodeDef node_def = GetArgMinMaxNodeDef(DT_FLOAT, DT_INT64); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("dimension", {1}, {3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "Output type int64 is not supported, at my_arg"); + for (auto p : params) { + TestConvertArgMinMax(this, tf_type_, p); + TestConvertArgMinMax(this, tf_type_, p); } - { - // Axis is batch dimension, should fail - Reset(); - NodeDef node_def = GetArgMinMaxNodeDef(DT_FLOAT, DT_INT32); - AddTestTensor("input", {1, 2, 3}); - AddTestWeights("dimension", {1}, {0}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "TensorRT does not allow manipulation of the batch dimension, at " - "my_arg"); - } - - TestConvertArgMinMax(this); - TestConvertArgMinMax(this); - TestConvertArgMinMax(this); - TestConvertArgMinMax(this); - // TRT does not support int32 for TopK layer which is used to implement ArgMin - // and ArgMax. - // TestConvertArgMinMax(this); - // TestConvertArgMinMax(this); } // Get the NodeDef for DepthToSpace or SpaceToSpace. 
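A note on the ArgMin/ArgMax coverage above: every ArgMinMaxTestParams entry is exercised twice by the closing loop, once as ArgMax and once as ArgMin (the loop's template arguments, elided in this rendering of the patch, are ops::ArgMax and ops::ArgMin). A minimal sketch of adding one more case inside the TEST_P body, with hypothetical values and using only the helpers shown in this hunk:

  ArgMinMaxTestParams extra{
      /*input_shape=*/{1, 3, 2},
      /*input_value=*/{4, 0, 1, 7, 2, 3},  // rows: {4,0}, {1,7}, {2,3}
      /*axis=*/-1,
      /*expected_output_dims=*/{1, 3},
      /*expected_argmax_output=*/{0, 1, 1},
      /*expected_argmin_output=*/{1, 0, 0},
      /*status=*/Status::OK()};
  TestConvertArgMinMax<ops::ArgMax>(this, tf_type_, extra);
  TestConvertArgMinMax<ops::ArgMin>(this, tf_type_, extra);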
@@ -5626,363 +8405,297 @@ NodeDef GetDepthSpaceShuffleNodeDef(DataType dtype, int block_size, return shuffle.operation.node()->def(); } -template struct DepthSpaceShuffleTestParams { std::vector input_dims; - std::vector input_value; + std::vector input_value; int block_size; string data_format; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; }; -template +template void TestConvertDepthSpaceShuffle( - OpConverterTest* test, - const std::vector>& params) { - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetDepthSpaceShuffleNodeDef( - dtype, params[i].block_size, params[i].data_format); - test->AddTestTensor("input", params[i].input_dims, 1, - TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_shuffle", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - - DataVec input_data{{"input", test::AsTensor(params[i].input_value)}}; - DataVec output_data{{"my_shuffle", ConstructTensor( - params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -template -void TestConvertDepthToSpace(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - const std::vector common_input = InitTestVector(16); - std::vector> params = { - { - /*input_shape=*/{4, 2, 2}, - /*input_value=*/common_input, - /*block_size=*/2, - /*data_format=*/"NCHW", - /*expected_output_dims=*/{1, 4, 4}, - /*expected_output=*/ - CastTestVector( - {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15}), - }, - { - /*input_shape=*/{2, 2, 4}, - /*input_value=*/common_input, - /*block_size=*/2, - /*data_format=*/"NHWC", - /*expected_output_dims=*/{4, 4, 1}, - /*expected_output=*/ - CastTestVector( - {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}), - }, - { - /*input_shape=*/{16, 1, 1}, - /*input_value=*/common_input, - /*block_size=*/4, - /*data_format=*/"NCHW", - /*expected_output_dims=*/{1, 4, 4}, - /*expected_output=*/InitTestVector(16), - }, - { - /*input_shape=*/{2, 2, 8}, - /*input_value=*/InitTestVector(32), - /*block_size=*/2, - /*data_format=*/"NHWC", - /*expected_output_dims=*/{4, 4, 2}, - /*expected_output=*/CastTestVector({0, 1, 2, 3, 8, - 9, 10, 11, 4, 5, - 6, 7, 12, 13, 14, - 15, 16, 17, 18, 19, - 24, 25, 26, 27, 20, - 21, 22, 23, 28, 29, - 30, 31}), - }, - }; - - TestConvertDepthSpaceShuffle(test, params); -} + ParameterizedOpConverterTestBase* test, + const std::vector& params) { + Status status = Status::OK(); -TEST_F(OpConverterTest, ConvertDepthToSpace) { { // Input is a weight, should fail. 
- Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestWeights("input", {4, 1, 1}, {1, 2, 3, 4}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"input\" for DepthToSpace must be a " - "tensor, at my_shuffle"); + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 2, "NCHW"); + test->AddTestWeights("input", {1, 4, 1, 1}, {1, 2, 3, 4}); + test->RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + StrCat("The input \"input\" for ", node_def.op(), " must be a tensor")); } { // Input rank != 4 - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestTensor("input", {16, 32}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "The input to DepthToSpace must be rank 4, at my_shuffle"); - } - { - // Channels not divisible by block_size, should fail. - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 3, "NCHW"); - AddTestTensor("input", {16, 32, 32}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Number of channels must be divisible by " - "block_size*block_size, at my_shuffle"); + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 2, "NCHW"); + test->AddTestTensor("input", {1, 16, 32}); + test->RunValidationAndConversion( + node_def, absl::StatusCode::kInvalidArgument, + StrCat("The input to ", node_def.op(), " must be rank 4")); } { // Unsupported format, should fail. - Reset(); + test->Reset(); NodeDef node_def = GetDepthSpaceShuffleNodeDef( - DT_FLOAT, 2, "NCHW_VECT_C"); - AddTestTensor("input", {16, 32, 32}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Data format NCHW_VECT_C is not supported, at my_shuffle"); + test->get_tf_type(), 2, "NCHW_VECT_C"); + test->AddTestTensor("input", {1, 16, 32, 32}); + test->RunValidationAndConversion( + node_def, absl::StatusCode::kUnimplemented, + "Data format NCHW_VECT_C is not supported"); + } + if (test->get_trt_mode() != TrtTestMode::kDynamicShape) { + // In dynamic shape mode, we cannot check input dimension values at + // conversion time therefore we cannot confirm block_size vs input dim + // consistency. We rely on the user to provide a valid TF graph. Otherwise + // TRT will fail with a runtime error. + if (std::is_same::value) { + // Channels not divisible by block_size, should fail. + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 3, "NCHW"); + test->AddTestTensor("input", {1, 16, 32, 32}); + test->RunValidationAndConversion(node_def, + absl::StatusCode::kInvalidArgument, + "Number of channels must be divisible by" + " block_size*block_size"); + } else { + { // Width not divisible by block_size, should fail. + test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 3, "NCHW"); + test->AddTestTensor("input", {1, 16, 9, 32}); + test->RunValidationAndConversion(node_def, + absl::StatusCode::kInvalidArgument, + "Width and height must be divisible by" + " block_size"); + } + { + // Height not divisible by block_size, should fail. 
+ test->Reset(); + NodeDef node_def = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), 3, "NCHW"); + test->AddTestTensor("input", {1, 16, 32, 9}); + test->RunValidationAndConversion(node_def, + absl::StatusCode::kInvalidArgument, + "Width and height must be divisible by" + " block_size"); + } + } } - TestConvertDepthToSpace(this); - TestConvertDepthToSpace(this); - TestConvertDepthToSpace(this); + for (auto p : params) { + test->Reset(); + const NodeDef node = GetDepthSpaceShuffleNodeDef( + test->get_tf_type(), p.block_size, p.data_format); + test->AddTestTensor("input", p.input_dims, p.input_value); + test->TestOpConverter(node, p.expected_output_dims, status, Status::OK(), + ElementsAreArray(p.expected_output)); + } } -template -void TestConvertSpaceToDepth(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - const std::vector common_input = InitTestVector(16); - std::vector> params = { +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertDepthToSpace) { + const std::vector common_input = CreateVectorIota(16); + std::vector params = { { - /*input_shape=*/{1, 4, 4}, + /*input_shape=*/{1, 4, 2, 2}, /*input_value=*/common_input, /*block_size=*/2, /*data_format=*/"NCHW", - /*expected_output_dims=*/{4, 2, 2}, + /*expected_output_dims=*/{1, 1, 4, 4}, /*expected_output=*/ - CastTestVector( - {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15}), + {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15}, }, { - /*input_shape=*/{4, 4, 1}, + /*input_shape=*/{1, 2, 2, 4}, /*input_value=*/common_input, /*block_size=*/2, /*data_format=*/"NHWC", - /*expected_output_dims=*/{2, 2, 4}, + /*expected_output_dims=*/{1, 4, 4, 1}, /*expected_output=*/ - CastTestVector( - {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}), + {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}, }, { - /*input_shape=*/{1, 4, 4}, + /*input_shape=*/{1, 16, 1, 1}, /*input_value=*/common_input, /*block_size=*/4, /*data_format=*/"NCHW", - /*expected_output_dims=*/{16, 1, 1}, - /*expected_output=*/InitTestVector(16), + /*expected_output_dims=*/{1, 1, 4, 4}, + /*expected_output=*/CreateVectorIota(16), }, { - /*input_shape=*/{4, 4, 2}, - /*input_value=*/InitTestVector(32), + /*input_shape=*/{1, 2, 2, 8}, + /*input_value=*/CreateVectorIota(32), /*block_size=*/2, /*data_format=*/"NHWC", - /*expected_output_dims=*/{2, 2, 8}, - /*expected_output=*/CastTestVector({0, 1, 2, 3, 8, - 9, 10, 11, 4, 5, - 6, 7, 12, 13, 14, - 15, 16, 17, 18, 19, - 24, 25, 26, 27, 20, - 21, 22, 23, 28, 29, - 30, 31}), - }, - }; - - TestConvertDepthSpaceShuffle(test, params); -} - -TEST_F(OpConverterTest, ConvertSpaceToDepth) { - { - // Input is a weight, should fail. - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestWeights("input", {4, 1, 1}, {1, 2, 3, 4}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"input\" for SpaceToDepth must be a " - "tensor, at my_shuffle"); - } - { - // Input rank != 4 - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 2, "NCHW"); - AddTestTensor("input", {16, 32}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "The input to SpaceToDepth must be rank 4, at my_shuffle"); - } - { - // Width not divisble by block_size, should fail. 
- Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 3, "NCHW"); - AddTestTensor("input", {16, 9, 32}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Width and height must be divisible by " - "block_size, at my_shuffle"); - } - { - // Height not divisble by block_size, should fail. - Reset(); - NodeDef node_def = - GetDepthSpaceShuffleNodeDef(DT_FLOAT, 3, "NCHW"); - AddTestTensor("input", {16, 32, 9}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Width and height must be divisible by " - "block_size, at my_shuffle"); - } - { - // Unsupported format, should fail. - Reset(); - NodeDef node_def = GetDepthSpaceShuffleNodeDef( - DT_FLOAT, 2, "NCHW_VECT_C"); - AddTestTensor("input", {16, 32, 32}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "Data format NCHW_VECT_C is not supported, at my_shuffle"); - } - - TestConvertSpaceToDepth(this); - TestConvertSpaceToDepth(this); - TestConvertSpaceToDepth(this); -} - -#if IS_TRT_VERSION_GE(5, 1, 2, 0) -// Get the NodeDef for ClipByValue. -NodeDef GetClipByValueNodeDef(DataType dtype) { - Scope s = Scope::NewRootScope(); - auto t = ops::Placeholder(s.WithOpName("t"), dtype); - auto clip_value_min = ops::Placeholder(s.WithOpName("clip_value_min"), dtype); - auto clip_value_max = ops::Placeholder(s.WithOpName("clip_value_max"), dtype); - auto clip = ops::ClipByValue(s.WithOpName("my_clip"), t, clip_value_min, - clip_value_max); - return clip.operation.node()->def(); -} - -template -void TestConvertClipByValue(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - struct TestParams { - std::vector dims; - std::vector input_value; - CType clip_value_min; - CType clip_value_max; - std::vector expected_output; - }; + /*expected_output_dims=*/{1, 4, 4, 2}, + /*expected_output=*/{0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, + 7, 12, 13, 14, 15, 16, 17, 18, 19, 24, 25, + 26, 27, 20, 21, 22, 23, 28, 29, 30, 31}, + }}; - const std::vector common_input = InitTestVector(6); - std::vector params = { + TestConvertDepthSpaceShuffle(this, params); +} + +TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertSpaceToDepth) { + const std::vector common_input = CreateVectorIota(16); + std::vector params = { + { + /*input_shape=*/{1, 1, 4, 4}, + /*input_value=*/common_input, + /*block_size=*/2, + /*data_format=*/"NCHW", + /*expected_output_dims=*/{1, 4, 2, 2}, + /*expected_output=*/ + {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15}, + }, { - /*dims=*/{1, 2, 3}, + /*input_shape=*/{1, 4, 4, 1}, /*input_value=*/common_input, - /*clip_value_min=*/CType(2), - /*clip_value_max=*/CType(5), + /*block_size=*/2, + /*data_format=*/"NHWC", + /*expected_output_dims=*/{1, 2, 2, 4}, /*expected_output=*/ - {CType(2), CType(2), CType(2), CType(3), CType(4), CType(5)}, + {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}, }, { - /*dims=*/{2, 1, 3}, + /*input_shape=*/{1, 1, 4, 4}, /*input_value=*/common_input, - /*clip_value_min=*/CType(-1), - /*clip_value_max=*/CType(8), - /*expected_output=*/common_input, + /*block_size=*/4, + /*data_format=*/"NCHW", + /*expected_output_dims=*/{1, 16, 1, 1}, + /*expected_output=*/CreateVectorIota(16), + }, + { + /*input_shape=*/{1, 4, 4, 2}, + /*input_value=*/CreateVectorIota(32), + /*block_size=*/2, + /*data_format=*/"NHWC", + /*expected_output_dims=*/{1, 2, 2, 8}, + /*expected_output=*/{0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, + 7, 12, 13, 14, 15, 16, 17, 18, 19, 24, 25, + 26, 27, 20, 21, 22, 23, 28, 29, 30, 31}, }, }; + TestConvertDepthSpaceShuffle(this, params); +} 
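The DepthToSpace and SpaceToDepth expectations above are inverse permutations of each other: because the input is an iota vector, each expected_output is literally the gather permutation the op applies, so composing the two NCHW block_size=2 vectors gives the identity, and each output index follows the DCR interpretation of the depth dimension (output position (c', y, x) reads input channel ((y % b) * b + x % b) * oc + c'). A small standalone check of both properties, independent of TensorRT and not part of the patch:

  #include <cassert>
  #include <vector>

  int main() {
    // Expected outputs of the NCHW, block_size=2 cases above (input = iota 0..15).
    std::vector<int> d2s = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15};
    std::vector<int> s2d = {0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15};
    // SpaceToDepth undoes DepthToSpace: composing the permutations is the identity.
    for (int k = 0; k < 16; ++k) assert(d2s[s2d[k]] == k);
    // DepthToSpace (DCR): input shape (C=4, H=2, W=2) maps to output shape (1, 4, 4).
    const int b = 2, H = 2, W = 2, oc = 1;
    for (int y = 0; y < H * b; ++y) {
      for (int x = 0; x < W * b; ++x) {
        const int ic = ((y % b) * b + x % b) * oc;        // source channel (c' = 0)
        const int in_idx = (ic * H + y / b) * W + x / b;  // flat NCHW input index
        assert(d2s[y * (W * b) + x] == in_idx);
      }
    }
    return 0;
  }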
- for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetClipByValueNodeDef(dtype); - test->AddTestTensor("t", params[i].dims, 1, TfDataTypeToTrt(dtype)); - test->AddTestWeights("clip_value_min", {1}, - {params[i].clip_value_min}); - test->AddTestWeights("clip_value_max", {1}, - {params[i].clip_value_max}); - test->RunValidationAndConversion(node_def); +TEST_P(OpConverter_FP32_FP16_Test, ConvertClipByValue) { + Scope s = Scope::NewRootScope(); + auto t = ops::Placeholder(s.WithOpName("t"), tf_type_); + auto clip_value_min = + ops::Placeholder(s.WithOpName("clip_value_min"), tf_type_); + auto clip_value_max = + ops::Placeholder(s.WithOpName("clip_value_max"), tf_type_); + auto clip = ops::ClipByValue(s.WithOpName("my_clip"), t, clip_value_min, + clip_value_max); + const NodeDef& node_def = clip.operation.node()->def(); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_clip", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].dims, output.tensor()->getDimensions()); - - DataVec input_data{{"t", test::AsTensor(params[i].input_value)}}; - DataVec output_data{ - {"my_clip", ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} + nvinfer1::DataType trt_type_; + TF_ASSERT_OK(TfTypeToTrtType(tf_type_, &trt_type_)); -TEST_F(OpConverterTest, ConvertClipByValue) { { // Input is a weight, should fail. Reset(); - NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT); - AddTestWeights("t", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestWeights("clip_value_min", {1}, {1}); - AddTestWeights("clip_value_max", {1}, {5}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + AddTestWeights("t", {1, 2, 3}, {1, 2, 3, 4, 5, 6}, tf_type_); + AddTestWeights("clip_value_min", {1}, {1}, tf_type_); + AddTestWeights("clip_value_max", {1}, {5}, tf_type_); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"t\" for ClipByValue must be a " - "tensor, at my_clip"); + "tensor"); } { // Clip min is a tensor, should fail. Reset(); - NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT); AddTestTensor("t", {1, 2, 3}); AddTestTensor("clip_value_min", {1}); - AddTestWeights("clip_value_max", {1}, {1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + AddTestWeights("clip_value_max", {1}, {1}, tf_type_); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"clip_value_min\" for ClipByValue " - "must be a constant, at my_clip"); + "must be a constant"); } { // Clip max is a tensor, should fail. 
Reset(); - NodeDef node_def = GetClipByValueNodeDef(DT_FLOAT); AddTestTensor("t", {1, 2, 3}); - AddTestWeights("clip_value_min", {1}, {1}); + AddTestWeights("clip_value_min", {1}, {1}, tf_type_); AddTestTensor("clip_value_max", {1}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"clip_value_max\" for ClipByValue " - "must be a constant, at my_clip"); + "must be a constant"); } - TestConvertClipByValue(this); - TestConvertClipByValue(this); + struct TestParams { + std::vector dims; + int clip_value_min; + int clip_value_max; + std::vector expected_output; + }; + + const std::vector common_input = CreateVectorIota(6); + + std::vector params = {{ + /*dims=*/{6}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 6}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 2, 3}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 2, 3, 1}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 1, 3, 1, 2}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{1, 1, 3, 1, 2, 1}, + /*clip_value_min=*/2, + /*clip_value_max=*/4, + /*expected_output=*/{2, 2, 2, 3, 4, 4}, + }, + { + /*dims=*/{2, 1, 3}, + /*clip_value_min=*/-1, + /*clip_value_max=*/8, + /*expected_output=*/common_input, + }}; + + for (auto p : params) { + Reset(); + + AddTestTensor("t", p.dims, tf_type_, common_input); + AddTestWeights("clip_value_min", {1}, {p.clip_value_min}, tf_type_); + AddTestWeights("clip_value_max", {1}, {p.clip_value_max}, tf_type_); + + TestOpConverter(node_def, p.dims, + /*expected_conversion_status=*/Status::OK(), + /*expected_runtime_status=*/Status::OK(), + /*matcher=*/ElementsAreArray(p.expected_output)); + } } -#endif // IS_TRT_VERSION_GE(5, 1, 2, 0) // Get the NodeDef for SquaredDifference. NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { @@ -5994,222 +8707,197 @@ NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { return squared_diff.operation.node()->def(); } -template -void TestConvertSquaredDifference(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; +TEST_P(OpConverter_FP32_FP16_Test, ConvertSquaredDifference) { + { + // Input is a weight, should fail. + Reset(); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type_); + AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); + AddTestTensor("y", {1, 1, 2, 3}); + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, + "The input \"x\" for SquaredDifference must be " + "a tensor"); + } struct TestParams { std::vector dims_x; std::vector dims_y; - std::vector value_x; - std::vector value_y; + std::vector value_x; + std::vector value_y; std::vector expected_output_dims; - std::vector expected_output; + std::vector expected_output; + Status status; + Status runtime_status; }; - const std::vector common_input = InitTestVector(6); + const std::vector common_input = CreateVectorIota(6); std::vector params = { + {/*dims_x=*/{1, 2, 3}, + /*dims_y=*/{1, 7, 5}, + /*value_x=*/common_input, + /*value_y=*/std::vector(7 * 5, 0), + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/common_input, + trt_mode_ == TrtTestMode::kDynamicShape + ? 
Status::OK() + : errors::InvalidArgument("Infeasible broadcast scheme"), + errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile.")}, { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 2, 3}, + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 2, 3}, /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, -1, 3, 0, 10, -7}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 4, 1, 9, 36, 144}), + /*value_y=*/{0, -1, 3, 0, 10, -7}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 4, 1, 9, 36, 144}, }, { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 1, 3}, + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 1, 3}, /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, 1, 2}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 0, 0, 9, 9, 9}), + /*value_y=*/{0, 1, 2}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 0, 0, 9, 9, 9}, }, }; - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetSquaredDifferenceNodeDef(dtype); - test->AddTestTensor("x", params[i].dims_x, 1, TfDataTypeToTrt(dtype)); - test->AddTestTensor("y", params[i].dims_y, 1, TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_squared_diff", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - - DataVec input_data{{"x", test::AsTensor(params[i].value_x)}, - {"y", test::AsTensor(params[i].value_y)}}; - DataVec output_data{ - {"my_squared_diff", - ConstructTensor(params[i].expected_output.size())}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -TEST_F(OpConverterTest, ConvertSquaredDifference) { - { - // Input is a weight, should fail. - Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); - AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestTensor("y", {1, 2, 3}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, - "The input \"x\" for SquaredDifference must be " - "a tensor, at my_squared_diff"); - } - { - // Shapes are not broadcastable, should fail. 
+ for (auto p : params) { Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); - AddTestTensor("x", {2, 3}); - AddTestTensor("y", {7, 5}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Infeasible broadcast scheme"); + const NodeDef node = GetSquaredDifferenceNodeDef(tf_type_); + AddTestTensor("x", p.dims_x, p.value_x); + AddTestTensor("y", p.dims_y, p.value_y); + TestOpConverter(node, p.expected_output_dims, p.status, p.runtime_status, + ElementsAreArray(p.expected_output)); } - - TestConvertSquaredDifference(this); - TestConvertSquaredDifference(this); } -#if IS_TRT_VERSION_GE(6, 0, 0, 0) -// TODO: @mconley @jdekhtiar - Reactivate when fixed -#ifndef TF2TENSORRT_BYPASS_NMS_RESIZE_OPS template -NodeDef MakeResizeNodeDef(std::string name, DataType dtype, - bool align_corners) { +NodeDef MakeResizeNodeDef(DataType dtype, bool align_corners) { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), dtype); auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32); auto attrs = typename OpType::Attrs().AlignCorners(align_corners); - auto resize = OpType(s.WithOpName(name), input, size, attrs); + auto resize = OpType(s.WithOpName("my_resize"), input, size, attrs); return resize.operation.node()->def(); } -template struct ResizeTestParams { std::vector input_dims; std::vector output_resize_dims; - std::vector input_values; + std::vector input_value; + bool size_as_tensor; bool align_corners; std::vector expected_output_dims; - std::vector expected_nearest_output_values; - std::vector expected_bilinear_output_values; + std::vector expected_nearest_output_values; + std::vector expected_bilinear_output_values; + Status status; }; -template -void TestConvertResize(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - std::vector> params{ - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*output_resize_dims=*/{2, 3}, // H_out, W_out - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*align_corners=*/false, - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_nearest_output_values=*/ - CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), - /*expected_bilinear_output_values=*/ - CastTestVector({2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}), - }, - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*output_resize_dims=*/{2, 3}, // H_out, W_out - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*align_corners=*/true, - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_nearest_output_values=*/ - CastTestVector({2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}), - /*expected_bilinear_output_values=*/ - CastTestVector({2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}), - }}; - -// This use case is not supported as of TRT version 7.1 -#if IS_TRT_VERSION_GE(7, 1, 0, 0) - if (OpType == ops::ResizeBilinear) { - params.erase(params.begin()); +template +void TestConvertResize(ParameterizedOpConverterTestBase* test, + ResizeTestParams& p) { + test->Reset(); + // Create resize node. + NodeDef node_def = + MakeResizeNodeDef(test->get_tf_type(), p.align_corners); + + test->AddTestTensor("input", p.input_dims, test->get_tf_type(), + p.input_value); + // Create output size. + if (p.size_as_tensor) { + std::vector size_dims{2}; + std::vector size_values{p.output_resize_dims}; + test->AddTestTensor("size", size_dims, DT_INT32, size_values, size_dims); + } else { + test->AddTestWeights("size", {2}, p.output_resize_dims, DT_INT32); } -#endif - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - // Create resize node. 
- NodeDef node_def = - MakeResizeNodeDef("my_resize", dtype, params[i].align_corners); - // Create input tensor - test->AddTestTensor("input", params[i].input_dims, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - // Create output size. - test->AddTestWeights("size", {2}, params[i].output_resize_dims); - - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_resize", &output)); + std::vector expected_out; - // Create input data for tensors. - const DataVec input_data{ - {"input", test::AsTensor(params[i].input_values)}}; - DataVec output_data{ - {"my_resize", ConstructTensor( - params[i].expected_nearest_output_values.size())}}; - - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - - if (node_def.op() == "ResizeBilinear") { - ExpectArrayAlmostEqual(params[i].expected_bilinear_output_values, - GetSpanForData(output_data[0]), - CType(1e-3)); - } else if (node_def.op() == "ResizeNearestNeighbor") { - ExpectArrayAlmostEqual(params[i].expected_nearest_output_values, - GetSpanForData(output_data[0]), - CType(1e-3)); - } + if (node_def.op() == "ResizeBilinear") { + expected_out = p.expected_bilinear_output_values; + } else if (node_def.op() == "ResizeNearestNeighbor") { + expected_out = p.expected_nearest_output_values; + } else { + ASSERT_TRUE(false); } + + test->TestOpConverter(node_def, p.expected_output_dims, + /*expected_conversion_status=*/p.status, + /*expected_runtime_status=*/p.status, + /*matcher=*/ElementsAreArray(expected_out), + /*out_tf_types=*/{DT_FLOAT}); } -TEST_F(OpConverterTest, ConvertResize) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertResize) { { // First input is weight, should fail. Reset(); - NodeDef node_def = - MakeResizeNodeDef("my_resize", DT_FLOAT, true); + NodeDef node_def = MakeResizeNodeDef(tf_type_, + /*align_corners=*/ + true); AddTestWeights("input", {1, 2}, {1, 2}); AddTestWeights("size", {1, 2}, {1, 2}); RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, + node_def, absl::StatusCode::kUnimplemented, "The input \"input\" for ResizeBilinear must be a " - "tensor, at my_resize"); + "tensor"); + } + + std::vector params{ + {/*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/{2.0f, -1.0f}, + /*size_as_tensor=*/false, + /*align_corners=*/false, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_nearest_output_values=*/ + {2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}, + /*expected_bilinear_output_values=*/ + {2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}, + /*status=*/Status::OK()}, + {/*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/{2.0f, -1.0f}, + /*size_as_tensor=*/false, + /*align_corners=*/true, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_nearest_output_values=*/ + {2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}, + /*expected_bilinear_output_values=*/ + {2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}, + /*status=*/Status::OK()}}; + + if (trt_mode_ != TrtTestMode::kImplicitBatch) { + // Size as a tensor is not supported in implicit batch mode. 
+ params.push_back({/*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*output_resize_dims=*/{2, 3}, // H_out, W_out + /*input_values=*/{2.0f, -1.0f}, + /*size_as_tensor=*/true, + /*align_corners=*/true, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_nearest_output_values=*/ + {2.0f, 2.0f, -1.0f, 2.0f, 2.0f, -1.0f}, + /*expected_bilinear_output_values=*/ + {2.0f, 0.5f, -1.0f, 2.0f, 0.5f, -1.0f}, + /*status=*/Status::OK()}); + } + + for (auto p : params) { + TestConvertResize(this, p); + +// This use case is not supported as of TRT version 7.1 +#if IS_TRT_VERSION_GE(7, 1, 0, 0) + if (!p.align_corners) { + p.status = errors::InvalidArgument( + "Cannot Convert Bilinear Resize when align_corners=False"); + } +#endif + + TestConvertResize(this, p); } - { - // output dimension is a tensor, should fail. - Reset(); - NodeDef node_def = - MakeResizeNodeDef("my_resize", DT_FLOAT, true); - AddTestTensor("input", {1, 2}); - AddTestTensor("size", {1, 2}); - RunValidationAndConversion( - node_def, error::UNIMPLEMENTED, - "The input \"size\" for ResizeBilinear must be a " - "constant, at my_resize"); - } - TestConvertResize(this); - TestConvertResize(this); - TestConvertResize(this); - TestConvertResize(this); } -#endif // TF2TENSORRT_BYPASS_NMS_RESIZE_OPS -#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) NodeDef MakePadNodeDef(std::string name, DataType dtype) { Scope s = Scope::NewRootScope(); @@ -6219,88 +8907,42 @@ NodeDef MakePadNodeDef(std::string name, DataType dtype) { return pad.operation.node()->def(); } -template struct PadTestParams { std::vector input_dims; std::vector pad_dims; - std::vector input_values; + std::vector pad_values; + std::vector input_values; std::vector expected_output_dims; - std::vector expected_output_values; + std::vector expected_output_values; + Status status; }; -template -void TestConvertPad(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - std::vector> params{ - { - /*input_dims=*/{1, 2, 1}, // H, W, C - /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} - /*input_values=*/CastTestVector({2.0f, -1.0f}), - /*expected_output_dims=*/{2, 3, 1}, // H, W, C - /*expected_output_values=*/ - CastTestVector({0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}), - }, - }; - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - // Create pad node. - NodeDef node_def = MakePadNodeDef("my_pad", dtype); - // Create input tensor - test->AddTestTensor("input", params[i].input_dims, /*batch_size=*/1, - /*trt_dtype=*/TfDataTypeToTrt(dtype)); - // Create output size. - test->AddTestWeights("padding", params[i].pad_dims, - {0, 0, 1, 0, 0, 1, 0, 0}); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("padding", &output)); - - // Create input data for tensors. - const DataVec input_data{ - {"input", test::AsTensor(params[i].input_values)}}; - DataVec output_data{ - {"my_pad", - ConstructTensor(params[i].expected_output_values.size())}}; - - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - ExpectArrayAlmostEqual(params[i].expected_output_values, - GetSpanForData(output_data[0]), CType(1e-5)); - } -} - -TEST_F(OpConverterTest, ConvertPad) { +TEST_P(OpConverter_FP32_FP16_Test, ConvertPad) { { // First input is weight, should fail. 
Reset(); - NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); - AddTestWeights("input", {1, 2}, {1, 2}); + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); + AddTestWeights("input", {1, 2}, {1, 2}, tf_type_); AddTestWeights("padding", {1, 2}, {1, 2}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"tensor\" for Pad must be a " "tensor"); } { // padding is a tensor, should fail. Reset(); - NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); AddTestTensor("input", {1, 2}); AddTestTensor("padding", {1, 2}); - RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + RunValidationAndConversion(node_def, absl::StatusCode::kUnimplemented, "The input \"paddings\" for Pad must be a " "constant"); } - TestConvertPad(this); - TestConvertPad(this); { // Make sure that ranges are inferred across a Pad. Reset(); - NodeDef node_def = MakePadNodeDef("my_pad", DT_FLOAT); - AddTestTensor("input", {1, 2, 1}); + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); + AddTestTensor("input", {1, 1, 2, 1}); AddTestWeights("padding", {4, 2}, {0, 0, 1, 0, 0, 1, 0, 0}); TRT_TensorOrWeights input; TRT_TensorOrWeights output; @@ -6309,17 +8951,758 @@ TEST_F(OpConverterTest, ConvertPad) { TF_EXPECT_OK(GetTensorOrWeights("my_pad", &output)); ITensorProxyPtr input_tensor = input.tensor(); converter_->ProvideQuantizationRange(&input_tensor, -5.0f, 5.0f); - // Input range should be inferred across pad. - PropagateQuantizationRanges(); auto ranges = quantization_ranges(); EXPECT_EQ(5.0f, ranges[input.tensor()->trt_tensor()]); - EXPECT_EQ(5.0f, ranges[output.tensor()->trt_tensor()]); } + + std::vector params{ + // 1 padding dim + { + /*input_dims=*/{1, 1, 3, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 0, 0, 0, 1, 0, 0}, + /*input_values=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_dims=*/{1, 1, 4, 2}, // N, H, W, C + /*expected_output_values=*/ + {1, 2, 3, 4, 5, 6, 0, 0}, + }, + { + /*input_dims=*/{1, 1, 3, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 0, 0, 0, 0, 0, 1}, + /*input_values=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_dims=*/{1, 1, 3, 3}, // N, H, W, C + /*expected_output_values=*/ + {1, 2, 0, 3, 4, 0, 5, 6, 0}, + }, + { + /*input_dims=*/{1, 1, 3, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 0, 0, 0}, + /*input_values=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_dims=*/{1, 2, 3, 2}, // N, H, W, C + /*expected_output_values=*/ + {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6}, + }, + // 2 padding dims + { + /*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 0, 0}, + /*input_values=*/{2.0f, -1.0f}, + /*expected_output_dims=*/{1, 2, 3, 1}, // N, H, W, C + /*expected_output_values=*/ + {0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 2}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 0, 0}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 2, 3, 2}, // N, H, W, C + /*expected_output_values=*/ + {0, 0, 0, 0, 0, 0, 2, -1, 3, 4, 0, 0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1, 2}, // N, C, H, W, D + /*pad_dims=*/{5, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 0, 0, 
0, 0}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 2, 3, 1, 2}, // N, H, W, C + /*expected_output_values=*/ + {0, 0, 0, 0, 0, 0, 2, -1, 3, 4, 0, 0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1, 2}, // N, C, H, W, D + /*pad_dims=*/{5, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 0, 1, 0, 0, 1, 1, 0, 0}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 2, 2, 3, 2}, // N, H, W, C + /*expected_output_values=*/ + {0., 0., 2., -1., 0., 0., 0., 0., 3., 4., 0., 0., + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0}, + }, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {1, 0, 0, 0, 0, 1, 0, 0}, + /*input_values=*/{2.0f, -1.0f}, + /*expected_output_dims=*/{2, 1, 3, 1}, // N, H, W, C + /*expected_output_values=*/{0.0, 0.0, 0.0, 2.0f, -1.0f, 0.0}, + trt_mode_ == TrtTestMode::kImplicitBatch + ? errors::InvalidArgument("Padding layer does not support " + "padding on batch dimension") + : Status::OK()}, + PadTestParams{ + /*input_dims=*/{1, 1, 2, 1}, // N, H, W, C + /*pad_dims=*/{4, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1, 1, 1}, + /*input_values=*/{2.0f, -1.0f}, + /*expected_output_dims=*/{}, // N, H, W, C + /*expected_output_values=*/{}, + errors::InvalidArgument("Padding layer does not support padding on " + "> 2")}, + PadTestParams{ + /*input_dims=*/{1, 2, 2}, // N, H, W + /*pad_dims=*/{3, 2}, // #dims, {pad_before, pad_after} + /*pad_values*/ {0, 0, 1, 0, 0, 1}, + /*input_values=*/{2, -1, 3., 4}, + /*expected_output_dims=*/{1, 3, 3}, // N, H, W, C + /*expected_output_values=*/ + {0., 0., 0., 2., -1., 0., 3., 4., 0.}, + errors::InvalidArgument("Convertpad requires at least 4D input")}}; + + for (auto p : params) { + Reset(); + // Create pad node. + NodeDef node_def = MakePadNodeDef("my_pad", tf_type_); + // Create input tensor. + AddTestTensor("input", p.input_dims, p.input_values); + // Create output size. 
+ AddTestWeights("padding", p.pad_dims, p.pad_values); + TestOpConverter(node_def, p.expected_output_dims, p.status, p.status, + ElementsAreArray(p.expected_output_values)); + } +} + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +class OpConverter_Select : public ParameterizedOpConverterTestBase { + public: + void RunTest(const string& opName); +}; + +void OpConverter_Select::RunTest(const string& opName) { + const auto testing_SelectV2 = opName == "SelectV2"; + const int maxVal = 32; + const std::array par_name = {"cond", "then", "else"}; + std::array par_type = {DT_BOOL, tf_type_, tf_type_}; + std::vector config(3, 0); + std::array*, 3> par_dims; + std::vector data_then(1, 0), data_else(1, maxVal), + expected_output(1, maxVal); + std::array*, 3> par_value = {nullptr, &data_then, + &data_else}; + std::vector data_cond(1, 0); + + auto set_parameters = [&](DataType cond_type = DT_BOOL) { + Reset(); + if (config[0]) { + AddTestTensor(par_name[0], *par_dims[0], cond_type, data_cond); + } else { + AddTestWeights(par_name[0], {1}, data_cond, cond_type); + } + for (int i = 1; i < 3; i++) { + if (config[i]) { + AddTestTensor(par_name[i], *par_dims[i], par_type[i], *par_value[i]); + } else { + AddTestWeights(par_name[i], {1}, *par_value[i], par_type[i]); + } + } + }; + + auto set_dimension = [this](const nvinfer1::Dims* dims, + std::vector& dims_param, + std::string* comment = nullptr) { + const auto nbDims = dims->nbDims; + if (comment) { + *comment = "batch_dim: " + std::to_string(nbDims + 1) + ", " + + DebugString(*dims); + } + + dims_param.resize(nbDims); + for (int i = 0; i < nbDims; i++) dims_param[i] = dims->d[i]; + }; + + auto adjust_comments = [this](const nvinfer1::Dims* p_dims, + std::string* p_comment) { + if (p_dims[0].nbDims == p_dims[1].nbDims) return; + + const int idx = p_dims[0].nbDims < p_dims[1].nbDims ? 0 : 1; + + nvinfer1::Dims dims; + dims.nbDims = p_dims[1 - idx].nbDims; + int i = 0; + for (; i < dims.nbDims - p_dims[idx].nbDims; i++) dims.d[i] = 1; + + for (int j = i; i < dims.nbDims; i++) dims.d[i] = p_dims[idx].d[i - j]; + + *(p_comment + idx) = + "batch_dim: " + std::to_string(1) + ", " + DebugString(dims); + *(p_comment + 1 - idx) = + "batch_dim: " + std::to_string(p_dims[idx].nbDims + 1) + ", " + + DebugString(p_dims[1 - idx]); + }; + + auto assign_values = [this]( + const std::array*, 3>& dims, + std::array*, 3> par_value, + std::vector& data_cond, int use_indices = 0, + const std::vector* expected_out = nullptr, + std::vector* expect_dims_pntr = nullptr) { + size_t rank[3]; + const auto dim_len = + dims[0]->size() > dims[1]->size() ? dims[0]->size() : dims[1]->size(); + std::vector exp_dims; + if (!expect_dims_pntr) expect_dims_pntr = &exp_dims; + + auto& expect_dims = *expect_dims_pntr; + expect_dims.resize(dim_len); + expect_dims.assign(dim_len, 0); + for (int i = 0; i < 3; i++) { + if (dims[i]) { + const auto& dim = *dims[i]; + for (auto j = 0; j < dims[i]->size(); j++) { + if (expect_dims[j] < dim[j]) expect_dims[j] = dim[j]; + } + + rank[i] = std::accumulate(std::begin(dim), std::end(dim), 1, + std::multiplies()); + } else { + assert(i >= 2); + rank[i] = rank[i - 1]; + } + } + + // Create data for ConvertSelectV2 testing. + for (int k = 1; k <= 2; k++) { + auto& data = *par_value[k]; + data.resize(rank[k]); + if (use_indices) { + const int mult = k == 1 ? 1 : -1; + for (int i = 0; i < rank[k]; i++) { + data[i] = mult * (i + 1); + } + } else { + for (int i = 0; i < rank[k]; i++) { + data[i] = k == 1 ? 
data[i >> 1] + i % 2 : maxVal - (*par_value[1])[i]; + } + } + } + + data_cond.resize(rank[0]); + data_cond[0] = 0; + for (int i = 0; i < rank[0]; i++) { + data_cond[i] = i % 2 ? 1 - data_cond[i >> 1] : data_cond[i >> 1]; + } + + if (!expected_out || expected_out->size() > 0) { + auto& expected_output = *par_value[0]; + const auto rank_out = + std::accumulate(std::begin(expect_dims), std::end(expect_dims), 1, + std::multiplies()); + + assert(rank_out == (expected_out ? expected_out->size() + : rank[use_indices >= 0 ? 0 : 1])); + + expected_output.resize(rank_out); + const auto& data_then = *par_value[1]; + const auto& data_else = *par_value[2]; + const auto div = use_indices >= 0 ? 1 : rank_out / rank[0]; + for (int i = 0; i < rank_out; i++) { + expected_output[i] = + expected_out ? (*expected_out)[i] + : data_cond[i / div] ? data_then[i] : data_else[i]; + } + } + }; + + auto shape_error_msg = [&](const NodeDef& node, bool same_then_else = true) { + nvinfer1::Dims shape[3]; + const auto j = same_then_else ? 0 : 1; + if (trt_mode_ == TrtTestMode::kDynamicShape) { + // Creating dynamic shapes corresponding to 'cond' and 'then' parameters. + for (int i = 0; i < 2; i++) { + for (int j = shape[i].nbDims = par_dims[i]->size(); j--;) { + shape[i].d[j] = -1; + } + } + } else { + for (int i = 0; i < 2; i++) { + DimsAdapter(*par_dims[i + j]).TrtDims(&shape[i + j]); + } + } + + return input_shapes_error_msg(shape[j], shape[j + 1], node, + !same_then_else); + }; + + auto run_test = [&](const NodeDef& node, const std::vector& exp_dims) { + const bool same_then_else_shapes = *par_dims[1] == *par_dims[2]; + const bool same_cond_chape = *par_dims[0] == *par_dims[1]; + const auto nMax = testing_SelectV2 ? 2 : 1; + for (int n = 0; n < nMax; n++) { + set_parameters(); + if (testing_SelectV2 || (same_then_else_shapes && same_cond_chape)) { + TestOpConverter(node, exp_dims, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + const auto err_msg = shape_error_msg(node, same_then_else_shapes); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + + if (!n) { + // Changing the condition and expected_output. + for (auto idx = data_cond.size(); idx--;) + data_cond[idx] = 1 - data_cond[idx]; + + // Compare of the shapes if the tensors "then" and "else". + if (!same_then_else_shapes) { + // Shapes are different: + // assigning +1's and -1's to the elements + // of the tensors "then" and "else", respectively + for (int p = 1; p <= 2; p++) { + auto& values = *par_value[p]; + const auto val = p == 1 ? 1 : -1; + for (auto idx = values.size(); idx--;) values[idx] = val; + } + // and set the appropriate expected values. + for (auto idx = expected_output.size(); idx--;) + expected_output[idx] = expected_output[idx] > 0 ? -1 : 1; + } else { + // Shapes are the same: + // just change the signs of the expected values. + for (auto idx = expected_output.size(); idx--;) + expected_output[idx] = -expected_output[idx]; + } + } + } + }; + + std::array data_types = {DT_FLOAT, DT_HALF, DT_INT32}; + NodeDef node; + TF_CHECK_OK(NodeDefBuilder("op", opName) + .Input("cond", 0, DT_BOOL) + .Input("then", 0, tf_type_) + .Input("else", 0, tf_type_) + .Finalize(&node)); + + const std::vector> dims_params = { + {8}, {8, 2, 4}, {32, 32, 3200}}; + + // All parameters passed as the weights OR 1-element tensors. 
+ par_dims = {&dims_params[0], &dims_params[0], &dims_params[0]}; + if (trt_mode_ == TrtTestMode::kImplicitBatch) { + const auto& err = convert_not_supported_implicit(node.op(), node.name()); + do { + set_parameters(); + RunValidationAndConversion(node, absl::StatusCode::kUnimplemented, err); + } while (nextTensorWeightConfiguration(config)); + return; + } + + // Parameter 'cond' can only be of type DT_BOOL. + do { + for (auto cond_type : {DT_INT32, DT_FLOAT, DT_HALF}) { + nvinfer1::DataType trt_type; + TF_ASSERT_OK(TfTypeToTrtType(cond_type, &trt_type)); + const auto error_msg = + unexpected_type_error_msg(trt_type, nvinfer1::DataType::kBOOL, node); + set_parameters(cond_type); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + error_msg); + } + } while (nextTensorWeightConfiguration(config)); + + std::string err_msg = bool_weight_error_msg(node); + + std::vector dims_const = {1}; + par_dims = {&dims_const, &dims_const, &dims_const}; + // Loop when condition is reversed and the expected_output + // should change from 'else' to 'then'. + for (int i = 0; i < 2; i++) { + do { + set_parameters(); + if (config[0]) { + TestOpConverter(node, {1}, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + } while (nextTensorWeightConfiguration(config)); + + // Changing the condition and expected_output. + data_cond[0] = 1 - data_cond[0]; + expected_output[0] = (*par_value[1 + i])[0]; + } + + // All parameters passed as the tensors. + for (int i = 0; i < 3; i++) { + config[i] = 1; + } + + par_value[0] = &expected_output; + if (trt_mode_ == TrtTestMode::kExplicitBatch) { + // Testing infeasible broadcast schemes. + // For that subtest dims('then') will be equal to dims('else'). + std::string bc_comment[2]; + std::vector dims[4]; + par_dims = {dims, dims + 1, dims + 1}; + const nvinfer1::Dims infeasible_dims[] = { + {3, {4, 3, 2}}, {4, {4, 3, 2, 5}}, {3, {4, 1, 3}}, + {3, {4, 3, 2}}, {3, {4, 3, 2}}, {5, {4, 3, 2, 5, 2}}}; + + auto iMax = sizeof(infeasible_dims) / sizeof(infeasible_dims[0]); + // Loop for all pairs of nvinfer1::Dims from infeasible_dims. + for (int i = 0; i < iMax; i += 2) { + // Loop for all permutations on 2 elements which will assign + // each pairs of nvinfer1::Dims from infeasible_dims to + // (dims('cond'), dims('then')) and (dims('then'), dims('cond')), + // respectively. + for (int k = 0; k < 2; k++) { + for (int j = 0; j < 2; j++) { + set_dimension(infeasible_dims + i + (j + k) % 2, dims[j], + bc_comment + (j + k) % 2); + } + + if (testing_SelectV2) { + adjust_comments(infeasible_dims + i, bc_comment); + err_msg = "Infeasible broadcast scheme (" + bc_comment[k] + " vs " + + bc_comment[1 - k]; + } else { + err_msg = shape_error_msg(node); + } + + set_parameters(); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + } + + // Tests for exactly two identical dims for any two out of 3 tensors. + const nvinfer1::Dims feasible_dims_2[] = { + {3, {1, 3, 2}}, {3, {4, 3, 2}}, {3, {4, 1, 2}}, {3, {4, 3, 2}}, + {3, {4, 3, 1}}, {3, {4, 3, 2}}, {3, {1, 1, 2}}, {3, {4, 3, 2}}, + {3, {1, 3, 1}}, {3, {4, 3, 2}}, {3, {4, 1, 1}}, {3, {4, 3, 2}}, + {3, {1, 1, 1}}, {3, {4, 3, 2}}, {3, {1, 3, 2}}, {3, {4, 1, 2}}, + }; + + // Expected values will be definded directly. + const std::vector expected_val_2[] = { + // Expected values for all feasible ordered pairs of dims + // for dims('then') == dims('else'), dims('then') != dims('cond'). 
+ {-1, 2, 3, -4, 5, -6, -7, 8, 9, -10, 11, -12, + -13, 14, 15, -16, 17, -18, -19, 20, 21, -22, 23, -24}, + {-1, 2, 3, -4, 5, -6, -1, 2, 3, -4, -5, 6, + -1, 2, 3, -4, 5, -6, -1, 2, -3, 4, 5, -6}, + {-1, 2, -3, 4, -5, 6, 7, -8, 9, -10, 11, -12, + 13, -14, 15, -16, 17, -18, -19, 20, -21, 22, -23, 24}, + {-1, 2, 1, -2, 1, -2, -3, 4, 3, -4, -3, 4, + -5, 6, 5, -6, 5, -6, -7, 8, -7, 8, 7, -8}, + {-1, -2, 3, 4, 5, 6, -7, -8, 9, 10, -11, -12, + -13, -14, 15, 16, 17, 18, -19, -20, -21, -22, 23, 24}, + {-1, 1, 2, -2, 3, -3, -4, 4, 5, -5, -6, 6, + -7, 7, 8, -8, 9, -9, -10, 10, -11, 11, 12, -12}, + {-1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, + -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24}, + {-1, 2, 1, -2, 1, -2, -1, 2, 1, -2, -1, 2, + -1, 2, 1, -2, 1, -2, -1, 2, -1, 2, 1, -2}, + {-1, -2, 3, 4, 5, 6, -7, -8, 9, 10, 11, 12, + -13, -14, 15, 16, 17, 18, -19, -20, 21, 22, 23, 24}, + {-1, 1, 2, -2, 3, -3, -1, 1, 2, -2, -3, 3, + -1, 1, 2, -2, 3, -3, -1, 1, -2, 2, 3, -3}, + {-1, -2, -3, -4, -5, -6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, -19, -20, -21, -22, -23, -24}, + {-1, 1, 1, -1, 1, -1, -2, 2, 2, -2, -2, 2, + -3, 3, 3, -3, 3, -3, -4, 4, -4, 4, 4, -4}, + {-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, + -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24}, + {-1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, 1, + -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1}, + {-1, 2, 1, -2, 1, -2, -3, 4, 3, -4, 3, -4, + -5, 6, 5, -6, 5, -6, -7, 8, 7, -8, 7, -8}, + {-1, 2, -3, 4, -5, 6, 1, -2, 3, -4, 5, -6, + 1, -2, 3, -4, 5, -6, -1, 2, -3, 4, -5, 6}, + // Expected values for all feasible ordered pairs of dims + // for dims('cond') == dims('else'), dims('then') != dims('else'). + {-1, 2, 3, -4, 5, -6, -7, 2, 3, -10, -11, 6, + -13, 2, 3, -16, 5, -18, -19, 2, -21, 4, 5, -24}, + {-1, 2, 3, -4, 5, -6, -1, 8, 9, -4, 11, -6, + -1, 14, 15, -4, 17, -6, -1, 20, 21, -4, 23, -6}, + {-1, 2, 1, -4, 1, -6, -7, 4, 3, -10, -11, 4, + -13, 6, 5, -16, 5, -18, -19, 8, -21, 8, 7, -24}, + {-1, 2, -1, 4, -1, 6, 7, -4, 9, -4, 11, -4, + 13, -6, 15, -6, 17, -6, -7, 20, -7, 22, -7, 24}, + {-1, 1, 2, -4, 3, -6, -7, 4, 5, -10, -11, 6, + -13, 7, 8, -16, 9, -18, -19, 10, -21, 11, 12, -24}, + {-1, -1, 3, 4, 5, 6, -4, -4, 9, 10, -6, -6, + -7, -7, 15, 16, 17, 18, -10, -10, -11, -11, 23, 24}, + {-1, 2, 1, -4, 1, -6, -7, 2, 1, -10, -11, 2, + -13, 2, 1, -16, 1, -18, -19, 2, -21, 2, 1, -24}, + {-1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, + -1, 14, -1, 16, -1, 18, -1, 20, -1, 22, -1, 24}, + {-1, 1, 2, -4, 3, -6, -7, 1, 2, -10, -11, 3, + -13, 1, 2, -16, 3, -18, -19, 1, -21, 2, 3, -24}, + {-1, -1, 3, 4, 5, 6, -1, -1, 9, 10, 11, 12, + -1, -1, 15, 16, 17, 18, -1, -1, 21, 22, 23, 24}, + {-1, 1, 1, -4, 1, -6, -7, 2, 2, -10, -11, 2, + -13, 3, 3, -16, 3, -18, -19, 4, -21, 4, 4, -24}, + {-1, -1, -1, -1, -1, -1, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, -4, -4, -4, -4, -4, -4}, + {-1, 1, 1, -4, 1, -6, -7, 1, 1, -10, -11, 1, + -13, 1, 1, -16, 1, -18, -19, 1, -21, 1, 1, -24}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {-1, 2, -1, 4, -1, 6, 1, -4, 3, -4, 5, -4, + 1, -6, 3, -6, 5, -6, -7, 2, -7, 4, -7, 6}, + {-1, 2, 1, -4, 1, -6, -1, 4, 3, -4, 3, -6, + -1, 6, 5, -4, 5, -6, -1, 8, 7, -4, 7, -6}}; + + const auto exp_dims = dims + 3; + const int kMax2 = 2; // number of permutations on 2 elements + iMax = sizeof(feasible_dims_2) / sizeof(feasible_dims_2[0]); + assert(kMax2 * iMax / 3 == + sizeof(expected_val_2) / sizeof(expected_val_2[0])); + // Broadcast shapes defined for `cond` OR for `then` and 
`else`. + // Loop for all pairs of nvinfer1::Dims from feasible_dims_2. + for (int i = 0; i < iMax; i += 2) { + // Loop for all permutations on 2 elements. + for (int k = 0; k < kMax2; k++) { + // Constructing dims for tensors 'cond' and 'then'. + // NOTE: dims('else') will be the same as dims('then'). + for (int j = 0; j < 2; j++) + set_dimension(feasible_dims_2 + i + (j + k) % 2, dims[j]); + + const std::vector* expect = expected_val_2 + i + k; + // Loop where the tensor shapes for 'cond' and 'then' are swapping. + for (int m = 0; m < 2; m++) { + assign_values(par_dims, par_value, data_cond, 1, expect, exp_dims); + run_test(node, *exp_dims); + + // Swapping dims for 'cond' and 'then' tensors. + const auto tmp = par_dims[0]; + par_dims[0] = par_dims[1]; + par_dims[1] = tmp; + expect += iMax; + } + } + } + + // Tests for pairwise different dims('cond'), dims('then'), dims('else'). + const nvinfer1::Dims feasible_dims_3[] = { + {2, {3, 2}}, {2, {3, 1}}, {2, {1, 1}}, {3, {2, 2, 1}}, + {3, {2, 1, 2}}, {3, {1, 2, 2}}, {3, {2, 1, 1}}, {3, {2, 1, 2}}, + {3, {1, 2, 2}}, {3, {2, 1, 1}}, {3, {1, 1, 2}}, {3, {1, 2, 1}}, + }; + + const std::vector expected_val_3[] = { + {-1, 1, 2, -1, 3, -1}, {-1, 1, 1, -2, 1, -3}, + {-1, -1, 3, 4, 5, 6}, {-1, -2, 1, 1, 1, 1}, + {-1, -1, -2, -2, -3, -3}, {-1, -2, -3, -4, -5, -6}, + {-1, -2, 1, 2, 3, 4, -3, -4}, {-1, -2, 3, 4, 1, 2, -3, -4}, + {-1, 1, -3, 2, 3, -2, 4, -4}, {-1, 2, -2, 4, 1, -3, 3, -4}, + {-1, 1, 2, -2, -3, 3, 4, -4}, {-1, 2, 1, -2, -3, 4, 3, -4}, + {-1, -2, -3, -4, 3, 4, 3, 4}, {-1, -2, -1, -2, 1, 2, 3, 4}, + {-1, 1, -3, 1, 2, -2, 2, -4}, {-1, 2, -1, 4, 1, -2, 3, -2}, + {-1, 1, 1, -2, -3, 2, 2, -4}, {-1, 2, 1, -1, -2, 4, 3, -2}, + {-1, -1, -2, -2, 1, 2, 1, 2}, {-1, -2, -1, -2, 1, 1, 2, 2}, + {-1, 1, -2, 1, -1, 2, -2, 2}, {-1, 1, -1, 2, -2, 1, -2, 2}, + {-1, -2, 1, 1, -1, -2, 2, 2}, {-1, -1, 1, 2, -2, -2, 1, 2}, + }; + + const int kMax3 = 6; // number of permutations on 3 elements + const std::array perm[kMax3] = {{0, 1, 2}, {0, 2, 1}, {1, 0, 2}, + {1, 2, 0}, {2, 0, 1}, {2, 1, 0}}; + par_dims = {dims, dims + 1, dims + 2}; + iMax = sizeof(feasible_dims_3) / sizeof(feasible_dims_3[0]); + assert(kMax3 * iMax / 3 == + sizeof(expected_val_3) / sizeof(expected_val_3[0])); + // Loop for all triples of nvinfer1::Dims from feasible_dims_3. + for (int i = 0; i < iMax; i += 3) { + // Loop for all permutations on 3 elements. + for (int k = 0; k < kMax3; k++) { + // Constructing dims for tensors 'cond', 'then' and 'else`. + for (int j = 0; j < 3; j++) + set_dimension(feasible_dims_3 + i + perm[k][j], dims[j]); + + const auto* expect = expected_val_3 + kMax3 * (i / 3) + k; + assign_values(par_dims, par_value, data_cond, 1, expect, exp_dims); + run_test(node, *exp_dims); + } + } + + if (!testing_SelectV2) { + // Tests for `cond` passed as a vector with N elements, where N is a batch + // size. The subtest should not pass a ConvertSelect::Validate() when one + // of following is true: + // (a) N is NOT equal to the first dimention of dims('then'); + // (b dims('cond').nbDims > 1. + // + // For all these subtest dims('then') == dims('else'). + const nvinfer1::Dims vect_dim[] = { + {1, {4}}, {3, {5, 2, 3}}, {2, {5, 2}}, {3, {5, 2, 3}}, + {1, {5}}, {3, {5, 2, 3}}, {1, {4}}, {4, {4, 3, 5, 2}}, + }; + + std::vector dims[4]; + par_dims = {dims, dims + 1, dims + 1}; + auto iMax = sizeof(vect_dim) / sizeof(vect_dim[0]); + // Loop for all pairs of nvinfer1::Dims from vector_dims. 
+ for (int i = 0; i < iMax; i += 2) { + err_msg = + vect_dim[i].nbDims != 1 || vect_dim[i].d[0] != vect_dim[i + 1].d[0] + ? input_shapes_error_msg(vect_dim[i], vect_dim[i + 1], node) + : ""; + + for (int j = 0; j < 2; j++) { + set_dimension(vect_dim + i + j, dims[j]); + } + + assign_values(par_dims, par_value, data_cond, -1); + set_parameters(); + if (err_msg.empty()) { + TestOpConverter(node, dims[1], Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + } + } + } // trt_mode_ == TrtTestMode::kExplicitBatch + + // Tests for dims('cond') == dims('then') == dims('else'). + for (auto dims : dims_params) { + par_dims = {&dims, &dims, &dims}; + assign_values(par_dims, par_value, data_cond); + + // Loop over all possible values of type_else (type_then = tf_type_). + for (const auto type_else : data_types) { + par_type[2] = type_else; + set_parameters(); + if ((par_type[1] == DT_INT32 || par_type[2] == DT_INT32) && + par_type[1] != par_type[2]) { + // ConvertSelectV2::Validation() should fail when exactly one of + // (type_then, type_else) is equal to nvinfer1::DataType::kINT32. + nvinfer1::DataType trt_type[2]; + for (int i = 0; i < 2; i++) { + TF_ASSERT_OK(TfTypeToTrtType(par_type[i + 1], trt_type + i)); + } + + err_msg = then_else_dtypes_error_msg(trt_type[0], trt_type[1], node); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } else { + TestOpConverter(node, dims, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } + } + + // Restoring the original value. + par_type[2] = tf_type_; + } + + if (trt_mode_ == TrtTestMode::kDynamicShape) { + std::vector values_then{1, 2, 3, 4, 5, 6}; + std::vector values_else{-1, -2, -3, -4, -5, -6}; + std::vector expected_output{1, -2, 3, 4, -5, 6}; + data_cond = std::vector{1, 0, 1}; + const std::vector cond_dims{1, 3}, input_dims{1, 2, 3}; + par_dims = {&cond_dims, &input_dims, &input_dims}; + // Loop when condition is reversed and the expected_output + // should change from 'else' to 'then'. + const auto len_cond = data_cond.size(); + for (int i = 0; i < 2; i++) { + par_value[i + 1] = &values_then; + par_value[2 - i] = &values_else; + for (int j = 0; j < values_then.size(); j++) { + expected_output[j] = par_value[2 - data_cond[j % len_cond]]->at(j); + } + + set_parameters(); + if (testing_SelectV2) { + TestOpConverter(node, input_dims, Status::OK(), Status::OK(), + ElementsAreArray(expected_output)); + } else { + const auto err_msg = shape_error_msg(node); + RunValidationAndConversion(node, absl::StatusCode::kInvalidArgument, + err_msg); + } + // Changing the condition and expected_output. + for (int j = len_cond; j--;) { + data_cond[j] = 1 - data_cond[j]; + } + } + } +} + +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverter_Select, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF, DT_INT32), + ::testing::Values(TrtPrecisionMode::FP32))); + +TEST_P(OpConverter_Select, ConvertSelectV2) { RunTest("SelectV2"); } + +TEST_P(OpConverter_Select, Convert_Select) { RunTest("Select"); } + +TEST_F(OpConverterTest, DuplicateSqueeze) { + // Define a custom converter which performs multiple squeezes. + auto op_converter = [](const OpConverterParams* params) -> Status { + if (params->validation_only) return Status::OK(); + auto input = params->inputs.at(0).tensor(); + ITensorProxyPtr output; + // Squeeze the first dimension. 
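    // (Interpretation, not verified against the SqueezeTensor API: a zero entry
    // in the dims vector below appears to mark a dimension to be dropped, with
    // the remaining entries giving the surviving sizes, so {0, 1, 2, 3} removes
    // the leading size-1 dim of the {1, 1, 2, 3} input and {0, 2, 3} then
    // removes the next one.)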
+ std::vector new_dims = {0, 1, 2, 3}; + TF_EXPECT_OK(params->converter->SqueezeTensor( + /*input=*/input, /*input_dims=*/&new_dims, /*params=*/params, + /*output=*/&output, /*op_instance=*/0)); + // Squeeze the second dimension. + new_dims = {0, 2, 3}; + TF_EXPECT_OK(params->converter->SqueezeTensor( + /*input=*/output, /*input_dims=*/&new_dims, /*params=*/params, + /*output=*/&output, /*op_instance=*/1)); + params->outputs->push_back(TRT_TensorOrWeights(output)); + return Status::OK(); + }; + // Use a simple unary op for the custom converter and add an input. + NodeDef node_def = CreateUnaryOp(DataType::DT_FLOAT); + AddTestTensor("input", {1, 1, 2, 3}); + // Override the converter for Abs to use the custom converter for this test + // only, and run conversion. + GetOpConverterRegistry()->Register("Abs", kDefaultConverterPriority + 1, + op_converter); + RunValidationAndConversion(node_def); + // Set up the inputs and outputs. + DataVec input_data; + DataVec output_data; + InputOutputData abs_input{ + "input", ConstructTensor(/*data_size=*/6, /*value=*/0, + /*tf_type=*/DataType::DT_FLOAT)}; + InputOutputData abs_output{ + "my_unary", ConstructTensor(/*data_size=*/6, /*value=*/0, + /*tf_type=*/DataType::DT_FLOAT)}; + input_data.push_back(abs_input); + output_data.push_back(abs_output); + // Build and run the cuda engine. + TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); } +#endif + } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +int main(int argc, char** argv) { +// TRT >= 8.2 optimizes memory management in the builder. When all builders +// are destroyed, it unloads many resources. This test fixture will create and +// destroy hundreds of builders when run sequentially for parameterized +// tests. We can hold open an IBuilder in order to prevent TRT from unloading +// shared resources between engine builds when using TRT shared library. This +// greatly speeds up unit tests and is safe to do. +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + // This builder holds a copy of cask::KernelLibrary, which is shared with + // other builders. Other builders used during testing won't trigger costly + // loading of cask::KernelLibrary. + std::unique_ptr const holder{ + nvinfer1::createInferBuilder(*tensorflow::tensorrt::Logger::GetLogger())}; +#endif + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#else +int main(int, char**) { return 0; } +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc new file mode 100644 index 00000000000..07c9c2f1ea0 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc @@ -0,0 +1,60 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" + +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace tensorrt { + +class LoggerRegistryImpl : public LoggerRegistry { + Status Register(const string& name, nvinfer1::ILogger* logger) override { + mutex_lock lock(mu_); + if (!registry_.emplace(name, std::unique_ptr(logger)) + .second) { + return errors::AlreadyExists("Logger ", name, " already registered"); + } + return Status::OK(); + } + + nvinfer1::ILogger* LookUp(const string& name) override { + mutex_lock lock(mu_); + const auto found = registry_.find(name); + if (found == registry_.end()) { + return nullptr; + } + return found->second.get(); + } + + private: + mutable mutex mu_; + mutable std::unordered_map> + registry_ TF_GUARDED_BY(mu_); +}; + +LoggerRegistry* GetLoggerRegistry() { + static LoggerRegistryImpl* registry = new LoggerRegistryImpl; + return registry; +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h new file mode 100644 index 00000000000..2a265cf7caa --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +class LoggerRegistry { + public: + virtual Status Register(const string& name, nvinfer1::ILogger* logger) = 0; + virtual nvinfer1::ILogger* LookUp(const string& name) = 0; + virtual ~LoggerRegistry() {} +}; + +LoggerRegistry* GetLoggerRegistry(); + +class RegisterLogger { + public: + RegisterLogger(const string& name, nvinfer1::ILogger* logger) { + TF_CHECK_OK(GetLoggerRegistry()->Register(name, logger)); + } +}; + +#define REGISTER_TENSORRT_LOGGER(name, logger) \ + REGISTER_TENSORRT_LOGGER_UNIQ_HELPER(__COUNTER__, name, logger) +#define REGISTER_TENSORRT_LOGGER_UNIQ_HELPER(ctr, name, logger) \ + REGISTER_TENSORRT_LOGGER_UNIQ(ctr, name, logger) +#define REGISTER_TENSORRT_LOGGER_UNIQ(ctr, name, logger) \ + static ::tensorflow::tensorrt::RegisterLogger register_trt_logger##ctr \ + TF_ATTRIBUTE_UNUSED = \ + ::tensorflow::tensorrt::RegisterLogger(name, logger) + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc new file mode 100644 index 00000000000..01921297b98 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry_test.cc @@ -0,0 +1,34 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +namespace { + +class TestLogger : public nvinfer1::ILogger { + void log(nvinfer1::ILogger::Severity severity, const char* msg) override {} +}; + +TestLogger test_logger; + +REGISTER_TENSORRT_LOGGER("test_logger", &test_logger); + +TEST(LoggerRegistryTest, RegistersCorrectly) { + auto registered_logger = GetLoggerRegistry()->LookUp("test_logger"); + EXPECT_THAT(registered_logger, Eq(&test_logger)); +} + +} // namespace diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter.h b/tensorflow/compiler/tf2tensorrt/convert/op_converter.h new file mode 100644 index 00000000000..e6f21cbed1d --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter.h @@ -0,0 +1,225 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include + +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class Converter; + +// Specifies the expected type taken by a TRT_TensorOrWeights input during op +// conversion. +// kResource is only used for resource variable ops. For an operation like +// Add(tensor, ReadVariableOp(...)), the second operand of Add is the result of +// the ReadVariableOp, which is a kWeight. +enum class TrtInputArg { kTensor = 1, kWeight = 2, kBoth = 3, kResource = 4 }; + +// Parameters for each op converter. +struct OpConverterParams { + // Constructor used for validation only. + OpConverterParams(const NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs, + TrtWeightStore* weight_store, + TrtPrecisionMode precision_mode, bool use_calibration, + bool use_implicit_batch, bool use_explicit_precision); + + // Constructor used for conversion. + OpConverterParams(Converter* converter, const NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs, + TrtWeightStore* weight_store); + + Converter* converter = nullptr; + const NodeDef& node_def; + const std::vector& inputs; + std::vector* outputs; + const bool validation_only; + TrtWeightStore* weight_store; + const TrtPrecisionMode precision_mode; + const bool use_calibration; + const bool use_implicit_batch; + const bool use_explicit_precision; +}; + +// Operation converter function specification. +using OpConverter = std::function; + +struct InputArgSpec { + absl::string_view name; + TrtInputArg allowed_roles; + + static constexpr InputArgSpec Create(absl::string_view n, TrtInputArg role) { + return InputArgSpec{n, role}; + } +}; + +template +std::string convert_not_supported_dtype_msg(const T& allowed_types, + DataType tf_type, + const NodeDef& node) { + string allowed_types_string = + absl::StrJoin(allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + + return absl::StrCat("Data type ", DataTypeString(tf_type), + " is not supported for ", node.op(), ", must be one of [", + allowed_types_string, "]"); +} + +std::string convert_not_supported_implicit(const std::string& pOpName, + const std::string& pNodeName, + const char* pOpType = NULL); + +// A Curiously recurring template pattern (CRTP) template class for operation +// converters. +template +class OpConverterBase { + public: + explicit OpConverterBase(const OpConverterParams* params, + const std::vector& data_types = + {DataType::DT_FLOAT, DataType::DT_HALF}) + : params_(params), + node_def_attrs_(params->node_def), + allowed_dtypes_(data_types) {} + + // Default NodeDef attribute name to inspect in order to determine node data + // type. 
The Impl class can override this by implementing the same function. + static constexpr const char* NodeDefDataTypeAttributeName() { return "T"; } + + // Validate data type of the given NodeDef against allowed types. + Status ValidateNodeDefDataType() { + // If the attribute name is empty, we should skip this check. + if (absl::string_view(Impl::NodeDefDataTypeAttributeName()).empty()) { + return Status::OK(); + } + + // Get the NodeDef data type. + auto dtype = GetAttrValue(Impl::NodeDefDataTypeAttributeName()); + if (!dtype.ok()) { + return errors::InvalidArgument("Attribute with name ", + Impl::NodeDefDataTypeAttributeName(), + " not found."); + } + + // Check allowed data types.; + if (std::find(allowed_dtypes_.begin(), allowed_dtypes_.end(), + dtype.ValueOrDie()) == allowed_dtypes_.end()) { + return errors::Unimplemented(convert_not_supported_dtype_msg( + allowed_dtypes_, dtype.ValueOrDie(), params_->node_def)); + } + return Status::OK(); + } + + static constexpr bool HasFixNumberOfInputs() { return true; } + + // Validates input argument roles and data types. + Status ValidateInputs() { + const NodeDef& node_def = params_->node_def; + const auto& inputs = params_->inputs; + if (Impl::HasFixNumberOfInputs()) { + TRT_ENSURE(inputs.size() == Impl::InputSpec().size()); + } else { + TRT_ENSURE(inputs.size() <= Impl::InputSpec().size()); + } + for (int i = 0; i < inputs.size(); i++) { + const InputArgSpec arg_spec = Impl::InputSpec()[i]; + if (arg_spec.allowed_roles == TrtInputArg::kWeight && + inputs.at(i).is_tensor()) { + return errors::Unimplemented("The input \"", arg_spec.name, "\" for ", + node_def.op(), " must be a constant, at ", + node_def.name()); + } + if (arg_spec.allowed_roles == TrtInputArg::kTensor && + inputs.at(i).is_weights()) { + return errors::Unimplemented("The input \"", arg_spec.name, "\" for ", + node_def.op(), " must be a tensor, at ", + node_def.name()); + } + } + return Status::OK(); + } + + Status operator()() { + // Validate data type and inputs. + TF_RETURN_IF_ERROR(this->ValidateNodeDefDataType()); + TF_RETURN_IF_ERROR(this->ValidateInputs()); + + // Perform op-level validation. + TF_RETURN_IF_ERROR(reinterpret_cast(this)->Validate()); + if (params_->validation_only) { + return Status::OK(); + } + + // Perform conversion. + return reinterpret_cast(this)->Convert(); + } + + protected: + Status NotSupportedInImplicitBatch(const char* pOpType = nullptr) { + if (params_->use_implicit_batch) { + const auto& op = params_->node_def.op(); + const auto& nodeName = params_->node_def.name(); + const auto& error = convert_not_supported_implicit(op, nodeName, pOpType); + return errors::Unimplemented(error); + } + return Status::OK(); + } + + void AddOutput(const TRT_TensorOrWeights& out) { + params_->outputs->push_back(out); + } + + template + ::stream_executor::port::StatusOr GetAttrValue( + absl::string_view key) const { + T result; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def_attrs_, key, &result)); + return result; + } + + const OpConverterParams* const params_; + const AttrSlice node_def_attrs_; + const std::vector allowed_dtypes_; +}; + +// Constructs and returns a converter function for a given operation converter +// class T. This requires T to be a derived class of StructuredOpConverter. 
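// Illustrative sketch (not part of this patch): a converter class written
// against OpConverterBase is typically wired up by handing
// MakeConverterFunction<T>() to the registry, mirroring the registration call
// made in the DuplicateSqueeze test. ConvertMyOp and "MyOp" below are
// hypothetical names used only for this example.
//
//   class ConvertMyOp : public OpConverterBase<ConvertMyOp> {
//    public:
//     explicit ConvertMyOp(const OpConverterParams* params)
//         : OpConverterBase<ConvertMyOp>(params) {}
//     static std::vector<InputArgSpec> InputSpec() {
//       return {InputArgSpec::Create("input", TrtInputArg::kTensor)};
//     }
//     Status Validate() { return Status::OK(); }
//     Status Convert() {
//       // Pass the (validated) input straight through, for illustration only.
//       AddOutput(params_->inputs.at(0));
//       return Status::OK();
//     }
//   };
//
//   GetOpConverterRegistry()->Register("MyOp", kDefaultConverterPriority,
//                                      MakeConverterFunction<ConvertMyOp>());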
+template +OpConverter MakeConverterFunction() { + return [](const OpConverterParams* params) -> Status { + T converter(params); + return converter(); + }; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc new file mode 100644 index 00000000000..6c0ea1e3e00 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.cc @@ -0,0 +1,158 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +struct OpConverterRegistration { + OpConverter converter; + int priority; +}; +class OpConverterRegistry::Impl { + public: + ~Impl() = default; + + InitOnStartupMarker Register(const string& name, const int priority, + OpConverter converter) { + mutex_lock lock(mu_); + auto item = registry_.find(name); + if (item != registry_.end()) { + const int existing_priority = item->second.priority; + if (priority <= existing_priority) { + LOG(WARNING) << absl::StrCat( + "Ignoring TF->TRT ", name, " op converter with priority ", + existing_priority, " due to another converter with priority ", + priority); + return {}; + } else { + LOG(WARNING) << absl::StrCat( + "Overwriting TF->TRT ", name, " op converter with priority ", + existing_priority, " using another converter with priority ", + priority); + registry_.erase(item); + } + } + registry_.insert({name, OpConverterRegistration{converter, priority}}); + return {}; + } + + ::stream_executor::port::StatusOr LookUp(string name) { + // Fetch the user-provide TF operations denylisted for conversion by TF-TRT. + static const absl::flat_hash_set tftrt_op_fakelist = [] { + string tftrt_op_fakelist_str; + TF_CHECK_OK(ReadStringFromEnvVar("TF_TRT_OP_FAKELIST", + /*default_value=*/"", + &tftrt_op_fakelist_str)); + absl::flat_hash_set tftrt_op_fakelist{}; + for (const auto& x : str_util::Split(tftrt_op_fakelist_str, ",")) { + tftrt_op_fakelist.insert(x); + } + // Force a rehash of the flat hash set + tftrt_op_fakelist.rehash(0); + return tftrt_op_fakelist; + }(); + + // In case the TensorFlow OP `name` matches any of the names passed to + // TF_TRT_OP_FAKELIST environment variable, force ::LookUp to resolves to + // ConvertFake OP converter. 
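The TF_TRT_OP_FAKELIST value is parsed once into a static hash set and then consulted on every lookup. A standalone sketch of that one-time, comma-separated parse, with std::getenv and std::unordered_set standing in for ReadStringFromEnvVar and absl::flat_hash_set (function name FakelistedOps is hypothetical):

// Sketch only: parse a comma-separated op list from an env var exactly once.
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>

const std::unordered_set<std::string>& FakelistedOps() {
  // Initialized on first use, like the static lambda in the registry above.
  static const std::unordered_set<std::string>* ops = [] {
    auto* set = new std::unordered_set<std::string>();
    const char* raw = std::getenv("TF_TRT_OP_FAKELIST");
    std::stringstream ss(raw ? raw : "");
    std::string op;
    while (std::getline(ss, op, ',')) {
      if (!op.empty()) set->insert(op);
    }
    return set;
  }();
  return *ops;
}

int main() {
  // e.g. TF_TRT_OP_FAKELIST=Conv2D,Relu ./a.out
  std::cout << "Conv2D fakelisted: " << FakelistedOps().count("Conv2D") << "\n";
  return 0;
}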
+ if (tftrt_op_fakelist.contains(name)) { + LOG_FIRST_N(INFO, 2) << "Emulating OP Converter: `" << name << "`. It " + << "will cause TRT engine building to fail. This " + << "feature is only intended to be used for " + << "TF-TRT graph segmentation experiments. This " + << "feature is controlled using: " + << "`TF_TRT_OP_FAKELIST=OpName1,OpName2`."; + // Forces ::LookUp to resolve to `ConvertFake` registred to `FakeOp`. + mutex_lock lock(mu_); + return registry_.find("FakeOp")->second.converter; + } + + mutex_lock lock(mu_); + auto found = registry_.find(name); + if (found != registry_.end()) { + return found->second.converter; + } + return errors::NotFound("No converter for op ", name); + } + + void Clear(const std::string& name) { + mutex_lock lock(mu_); + auto itr = registry_.find(name); + if (itr == registry_.end()) { + return; + } + registry_.erase(itr); + } + + std::vector ListRegisteredOps() const { + mutex_lock lock(mu_); + std::vector result; + result.reserve(registry_.size()); + for (const auto& item : registry_) { + result.push_back(item.first); + } + return result; + } + + private: + mutable mutex mu_; + mutable std::unordered_map registry_ + TF_GUARDED_BY(mu_); +}; + +OpConverterRegistry::OpConverterRegistry() : impl_(std::make_unique()) {} + +::stream_executor::port::StatusOr OpConverterRegistry::LookUp( + const string& name) { + return impl_->LookUp(name); +} + +InitOnStartupMarker OpConverterRegistry::Register(const string& name, + const int priority, + OpConverter converter) { + return impl_->Register(name, priority, converter); +} + +std::vector OpConverterRegistry::ListRegisteredOps() const { + return impl_->ListRegisteredOps(); +} + +void OpConverterRegistry::Clear(const std::string& name) { impl_->Clear(name); } + +OpConverterRegistry* GetOpConverterRegistry() { + static OpConverterRegistry* registry = new OpConverterRegistry(); + return registry; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h new file mode 100644 index 00000000000..cba4e907a39 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ + +#include +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class OpConverterRegistry { + public: + OpConverterRegistry(); + ~OpConverterRegistry() = default; + + InitOnStartupMarker Register(const string& name, const int priority, + OpConverter converter); + + InitOnStartupMarker Register(const std::initializer_list& names, + const int priority, OpConverter converter) { + for (const auto& name : names) { + Register(name, priority, converter); + } + return {}; + } + + template ::value>::type* = nullptr> + InitOnStartupMarker Register(const T& names, const int priority, + OpConverter converter) { + for (const auto& name : names) { + Register(name, priority, converter); + } + return {}; + } + + // Clear all registered converters for the given Tensorflow operation name. + void Clear(const std::string& name); + + ::stream_executor::port::StatusOr LookUp(const string& name); + + std::vector ListRegisteredOps() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +OpConverterRegistry* GetOpConverterRegistry(); + +class RegisterOpConverter { + public: + RegisterOpConverter(const string& name, const int priority, + OpConverter converter) { + GetOpConverterRegistry()->Register(name, priority, converter); + } +}; + +constexpr int kDefaultConverterPriority = 1; + +} // namespace convert +} // namespace tensorrt + +#define REGISTER_TRT_OP_CONVERTER_IMPL(ctr, func, priority, ...) \ + static ::tensorflow::InitOnStartupMarker const \ + register_trt_op_converter##ctr TF_ATTRIBUTE_UNUSED = \ + TF_INIT_ON_STARTUP_IF(true) \ + << tensorrt::convert::GetOpConverterRegistry()->Register( \ + __VA_ARGS__, priority, func) + +#define REGISTER_TRT_OP_CONVERTER(func, priority, ...) \ + TF_NEW_ID_FOR_INIT(REGISTER_TRT_OP_CONVERTER_IMPL, func, priority, \ + __VA_ARGS__) + +#define REGISTER_DEFAULT_TRT_OP_CONVERTER(func, ...) \ + REGISTER_TRT_OP_CONVERTER( \ + func, tensorrt::convert::kDefaultConverterPriority, __VA_ARGS__) + +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OP_CONVERTER_REGISTRY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc new file mode 100644 index 00000000000..af3f8d7b6cc --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_registry_test.cc @@ -0,0 +1,67 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" + +#include +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +TEST(TestOpConverterRegistry, TestOpConverterRegistry) { + bool flag{false}; + + auto set_true_func = [&flag](const OpConverterParams*) -> Status { + flag = true; + return Status::OK(); + }; + + auto set_false_func = [&flag](const OpConverterParams*) -> Status { + flag = false; + return Status::OK(); + }; + + GetOpConverterRegistry()->Register("FakeFunc", kDefaultConverterPriority, + set_true_func); + + // Lower priority fails to override. + GetOpConverterRegistry()->Register("FakeFunc", kDefaultConverterPriority - 1, + set_false_func); + + // The lookup should return set_true_func (default). + auto func = GetOpConverterRegistry()->LookUp("FakeFunc"); + EXPECT_TRUE(func.ok()); + EXPECT_TRUE(((*func)(nullptr)).ok()); + EXPECT_TRUE(flag); + + // Override with higher priority. + GetOpConverterRegistry()->Register("FakeFunc", kDefaultConverterPriority + 1, + set_false_func); + func = GetOpConverterRegistry()->LookUp("FakeFunc"); + EXPECT_TRUE(func.ok()); + EXPECT_TRUE((*func)(nullptr).ok()); + EXPECT_FALSE(flag); + + // After clearing the op, lookup should return an error. + GetOpConverterRegistry()->Clear("FakeFunc"); + EXPECT_FALSE(GetOpConverterRegistry()->LookUp("FakeFunc").ok()); +} +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif diff --git a/tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc b/tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc new file mode 100644 index 00000000000..3d09ea00b7c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/op_converter_test.cc @@ -0,0 +1,123 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" + +#include +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/status_matchers.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +using ::tensorflow::testing::IsOk; +using ::tensorflow::testing::StatusIs; +using ::testing::HasSubstr; + +class ExampleOpConverter : public OpConverterBase { + public: + explicit ExampleOpConverter(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_FLOAT}) {} + + static constexpr const char* NodeDefDataTypeAttributeName() { + return "data_type"; + } + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input_tensor", TrtInputArg::kTensor), + InputArgSpec::Create("weight", TrtInputArg::kWeight)}; + } + + Status Validate() { return Status::OK(); } + + Status Convert() { + AddOutput(TRT_TensorOrWeights(nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{1, {1, 1, 1}}, 1)); + return Status::OK(); + } +}; + +TEST(TestOpConverterBase, TestOpConverterBase) { + // Register a converter which uses the base converter class. + GetOpConverterRegistry()->Register( + "FakeFunc", 1, MakeConverterFunction()); + + NodeDef def; + def.set_op("FakeFunc"); + auto converter = Converter::Create(TrtPrecisionMode::FP32, false, + Logger::GetLogger(), false, "test_engine"); + EXPECT_THAT(converter, IsOk()); + + // Base class should check attribute with key given by + // Impl::NodeDefDataTypeAttributeName(). + Status conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT(conversion_status, + StatusIs(error::INVALID_ARGUMENT, + HasSubstr("Attribute with name data_type not found"))); + + // Add partial inputs to the node and make the converter aware. + def.mutable_input()->Add("input1"); + conversion_status = (*converter) + ->AddInputTensor("input1", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{4, {1, 1, 1, 1}}, 1); + EXPECT_THAT(conversion_status, IsOk()); + + // Base class method should check number of inputs. + AddNodeAttr("data_type", DT_FLOAT, &def); + conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT(conversion_status, StatusIs(error::INTERNAL)); + + // Add second input to the node and make the converter aware. + def.mutable_input()->Add("input2"); + conversion_status = (*converter) + ->AddInputTensor("input2", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims{4, {1, 1, 1, 1}}, 1); + EXPECT_THAT(conversion_status, IsOk()); + + // Base class validation should check the type (Constant or Tensor) of the + // inputs. + conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT( + conversion_status, + StatusIs(error::UNIMPLEMENTED, + HasSubstr("input \"weight\" for FakeFunc must be a constant"))); + + // Correct input2 so that it is a weight. + (*converter)->TensorsMap().erase("input2"); + (*converter) + ->TensorsMap() + .insert(std::make_pair("input2", TRT_TensorOrWeights(TRT_ShapedWeights( + nvinfer1::DataType::kFLOAT)))); + + // With the correct input types, check that the converter is called and sets + // one output tensor. 
+ conversion_status = (*converter)->ConvertNode(def); + EXPECT_THAT(conversion_status, IsOk()); + EXPECT_EQ((*converter)->TensorsMap().size(), 3U); + + GetOpConverterRegistry()->Clear("FakeFunc"); +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc new file mode 100644 index 00000000000..d611920717c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/binary_ops.cc @@ -0,0 +1,235 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +const BinaryOperationMapType* BinaryOperationMap() { + static const auto* map = new BinaryOperationMapType({ + {"Add", nvinfer1::ElementWiseOperation::kSUM}, + {"AddV2", nvinfer1::ElementWiseOperation::kSUM}, + {"Mul", nvinfer1::ElementWiseOperation::kPROD}, + {"Sub", nvinfer1::ElementWiseOperation::kSUB}, + {"Div", nvinfer1::ElementWiseOperation::kDIV}, + {"FloorDiv", nvinfer1::ElementWiseOperation::kFLOOR_DIV}, + {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, + {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, + {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, + {"Pow", nvinfer1::ElementWiseOperation::kPOW}, +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + {"Greater", nvinfer1::ElementWiseOperation::kGREATER}, + {"Less", nvinfer1::ElementWiseOperation::kLESS}, + {"Equal", nvinfer1::ElementWiseOperation::kEQUAL}, + // Operators are implemented as NOT Less and NOT Greater, respectively. + {"GreaterEqual", nvinfer1::ElementWiseOperation::kLESS}, + {"LessEqual", nvinfer1::ElementWiseOperation::kGREATER}, +#endif + }); + return map; +} + +const BinaryOperationMapType* BinaryBooleanOperationMap() { + static const auto* map = new BinaryOperationMapType({ + {"LogicalOr", nvinfer1::ElementWiseOperation::kOR}, + {"LogicalAnd", nvinfer1::ElementWiseOperation::kAND}, + }); + return map; +} + +namespace { +class ConvertBinaryImpl { + protected: + ConvertBinaryImpl(const BinaryOperationMapType* pOperMap) + : pOperMap_(pOperMap) {} + + Status ValidateImpl( + const OpConverterParams& params, + const std::vector& implicit_batch_not_supported_ops = {}, + bool both_tensors = false) { + const auto& node_def = params.node_def; + const auto& op = node_def.op(); + const auto op_pair = pOperMap_->find(op); + if (op_pair == pOperMap_->end()) { + return errors::Unimplemented("Binary op: ", op, " not supported"); + } + + // Constant folding should have been done by TensorFlow. 
+ const auto& inputs = params.inputs; + if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { + return errors::Unimplemented( + "Constant folding is falled back to TensorFlow, binary op '", op, + "' received both input as constant"); + } + + if ((convertToBool_ = find_name(op, implicit_batch_not_supported_ops))) { + if (params.use_implicit_batch) { + return errors::Unimplemented( + convert_not_supported_implicit(op, node_def.name(), "Binary")); + } + } + + if (both_tensors) { + if (inputs.at(0).is_weights() || inputs.at(1).is_weights()) { + return errors::InvalidArgument("Both inputs of '", op, + "' are expected to be tensors"); + } + // No need to convert the output of "LogicalOr" and "LogicalAnd" + convertToBool_ = false; + } + + nvinfer1::Dims broadcasted_dims[2]; + TF_RETURN_IF_ERROR(GetTrtBroadcastShape( + inputs.at(0), inputs.at(1), true, params.use_implicit_batch, + broadcasted_dims, broadcasted_dims + 1)); + + for (int i = 0; i < tensor_.size(); i++) { + // This will also convert constants to tensors. + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params.converter, inputs.at(i), broadcasted_dims[i], + params.validation_only, &tensor_[i], node_def, i)); + } + operation_ = op_pair->second; + return Status::OK(); + } + + Status ConvertImpl(const OpConverterParams& params, + const std::vector& revert_bool_ops = {}) { + const auto& node_def = params.node_def; + // Add ElementWise layer. + auto* network = params.converter->network(); + nvinfer1::ILayer* layer = network->addElementWise( + *tensor_[0]->trt_tensor(), *tensor_[1]->trt_tensor(), operation_); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + + if (params.use_explicit_precision) { + layer->setPrecision(nvinfer1::DataType::kFLOAT); + } + + params.converter->SetLayerName(layer, node_def); + const auto& output = layer->getOutput(0); + if (convertToBool_) { + output->setType(nvinfer1::DataType::kBOOL); + if (find_name(node_def.op(), revert_bool_ops)) { + nvinfer1::IUnaryLayer* unary_layer = + network->addUnary(*output, nvinfer1::UnaryOperation::kNOT); + TFTRT_RETURN_ERROR_IF_NULLPTR(unary_layer, node_def.name()); + params.outputs->push_back( + TRT_TensorOrWeights(unary_layer->getOutput(0))); + return Status::OK(); + } + } + + params.outputs->push_back(TRT_TensorOrWeights(output)); + return Status::OK(); + } + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("x", TrtInputArg::kBoth), + InputArgSpec::Create("y", TrtInputArg::kBoth)}; + } + + private: + const BinaryOperationMapType* pOperMap_; + std::array tensor_{nullptr, nullptr}; + nvinfer1::ElementWiseOperation operation_; + bool convertToBool_; +}; + +class ConvertBinary : public OpConverterBase, + protected ConvertBinaryImpl { + public: + explicit ConvertBinary(const OpConverterParams* params) + : OpConverterBase( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}), + ConvertBinaryImpl(BinaryOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertBinaryImpl::InputSpec(); + } + + Status Validate() { + const std::vector implicit_batch_not_supported_ops { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + "Greater", "Less", "Equal", "GreaterEqual", "LessEqual" +#endif + }; + return ValidateImpl(*params_, implicit_batch_not_supported_ops); + } + Status Convert() { + const std::vector implemented_with_reverted_ops { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + "GreaterEqual", "LessEqual" +#endif + }; + return ConvertImpl(*params_, implemented_with_reverted_ops); + } +}; + +class ConvertBooleanBinary : 
public OpConverterBase, + public ConvertBinaryImpl { + public: + explicit ConvertBooleanBinary(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_BOOL}), + ConvertBinaryImpl(BinaryBooleanOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertBinaryImpl::InputSpec(); + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "LogicalOr" + input: "..." + input: "..." + attr { + key: "_output_shapes" + ... + } + } + */ + return ""; + } + Status Validate() { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + return ValidateImpl(*params_, {"LogicalOr", "LogicalAnd"}, true); +#else + return errors::Unimplemented("Boolean op: ", params_->node_def.op(), + " is not supported in TRT version < 8.2"); +#endif + } + Status Convert() { return ConvertImpl(*params_); } +}; +} // namespace + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + GetOperationNames(*BinaryOperationMap())); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), + GetOperationNames(*BinaryBooleanOperationMap())); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc new file mode 100644 index 00000000000..348d478aaeb --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc @@ -0,0 +1,179 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +int get_spatial_dim_count(string format) { + // Spatial dimensions are the dimensions besides NC, and here we assume NC + // always appear in the format string. 
+ return format.size() - 2; +} + +class ConvertDataFormatVecPermute + : public OpConverterBase { + public: + ConvertDataFormatVecPermute(const OpConverterParams* params) + : OpConverterBase(params, + {DataType::DT_INT32}) {} + + struct DataFormatVecPermuteAttributes { + string dst_format; + string src_format; + int x_dim_count; + }; + + static constexpr std::array InputSpec() { + return {InputArgSpec::Create("x", TrtInputArg::kBoth)}; + } + + Status Validate() { + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + const auto& inputs = params_->inputs; + const auto& nodeName = params_->node_def.name(); + + x_input_ = inputs.at(0); + + // Check input rank. + const auto x_dims = x_input_.GetTrtDims(); + int input_rank = x_dims.nbDims; + if (input_rank != 1 && input_rank != 2) { + return errors::InvalidArgument( + "Input must be a vector or matrix, but got rank ", input_rank, + ", at ", nodeName); + } + + // Verify and consume node attributes. + ::stream_executor::port::StatusOr dst_format = + GetAttrValue("dst_format"); + ::stream_executor::port::StatusOr src_format = + GetAttrValue("src_format"); + TRT_ENSURE_OK(dst_format); + TRT_ENSURE_OK(src_format); + + // Check input dims. + const int full_dim_count = src_format.ValueOrDie().size(); + const int spatial_dim_count = + get_spatial_dim_count(src_format.ValueOrDie()); + if (input_rank == 1) { + if (x_dims.d[0] != spatial_dim_count && x_dims.d[0] != full_dim_count) { + return errors::InvalidArgument( + "1D input must be of size ", spatial_dim_count, " or ", + full_dim_count, ", but got size ", x_dims.d[0], ", at ", nodeName); + } + } else if (input_rank == 2) { + if (x_dims.d[0] != spatial_dim_count && x_dims.d[0] != full_dim_count) { + return errors::InvalidArgument( + "First dimension of 2D input must be of size ", spatial_dim_count, + " or ", full_dim_count, ", but got shape (", x_dims.d[0], ", ", + x_dims.d[1], "), at ", nodeName); + } + if (x_dims.d[1] != 2) { + return errors::InvalidArgument( + "Second dimension of 2D input must be of size 2, but got shape (", + x_dims.d[0], ", ", x_dims.d[1], "), at ", nodeName); + } + } + + // Set custom attributes. + attrs_.x_dim_count = x_dims.d[0]; + attrs_.dst_format = dst_format.ValueOrDie(); + attrs_.src_format = src_format.ValueOrDie(); + + return Status::OK(); + } + + Status Convert() { + // Copy format strings in case they need to be modified. + string dst_format = attrs_.dst_format; + string src_format = attrs_.src_format; + const int& spatial_dim_count = get_spatial_dim_count(src_format); + + // If the input is a vector of size spatial_dim_count, treat the elements + // as spatial dimensions. + if (attrs_.x_dim_count == spatial_dim_count) { + auto keep_only_spatial_dimensions = + [spatial_dim_count](string* format_str) -> void { + auto new_end = std::remove_if(format_str->begin(), format_str->end(), + [spatial_dim_count](const char dim) { + return dim == 'N' || dim == 'C'; + }); + format_str->erase(new_end, format_str->end()); + }; + keep_only_spatial_dimensions(&src_format); + keep_only_spatial_dimensions(&dst_format); + } + + // Create indices for the gather layer and make weights out of them. 
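The Convert() body that follows builds gather indices so that output position j takes the element of x whose axis letter in src_format matches dst_format[j]. A small sketch of just that index computation, with a hypothetical helper PermutationIndices over plain strings:

// Sketch only: DataFormatVecPermute-style gather indices.
#include <iostream>
#include <string>
#include <vector>

std::vector<int> PermutationIndices(const std::string& src, const std::string& dst) {
  std::vector<int> idx(dst.size(), 0);
  for (size_t i = 0; i < src.size(); ++i) {
    for (size_t j = 0; j < dst.size(); ++j) {
      if (src[i] == dst[j]) {
        idx[j] = static_cast<int>(i);
        break;
      }
    }
  }
  return idx;
}

int main() {
  // Permuting an NHWC-ordered vector into NCHW order gathers indices [0, 3, 1, 2].
  for (int i : PermutationIndices("NHWC", "NCHW")) std::cout << i << " ";
  std::cout << "\n";
  return 0;
}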
+ std::vector dst_indices(attrs_.x_dim_count); + for (int i = 0; i < attrs_.x_dim_count; ++i) { + for (int j = 0; j < attrs_.x_dim_count; ++j) { + if (src_format[i] == dst_format[j]) { + dst_indices[j] = i; + break; + } + } + } + nvinfer1::Dims indices_dims = {1, {attrs_.x_dim_count}}; + ::stream_executor::port::StatusOr indices_weights = + params_->weight_store->GetTempWeights(nvinfer1::DataType::kINT32, + indices_dims); + TRT_ENSURE_OK(indices_weights); + int32* indices_ptr = indices_weights.ValueOrDie().GetPointer(); + std::copy(dst_indices.data(), dst_indices.data() + attrs_.x_dim_count, + indices_ptr); + ITensorProxyPtr x_tensor = + x_input_.is_weights() ? params_->converter->CreateConstantLayer( + x_input_.weights(), x_input_.GetTrtDims()) + : x_input_.tensor(); + ITensorProxyPtr indices_tensor = params_->converter->CreateConstantLayer( + indices_weights.ValueOrDie(), indices_dims); + + // Gather layer with 1D indices on axis 0, conserves shape. + nvinfer1::IGatherLayer* layer = params_->converter->network()->addGather( + *x_tensor->trt_tensor(), *indices_tensor->trt_tensor(), 0); + TRT_ENSURE(layer); + params_->converter->SetLayerName(layer, params_->node_def); + + ITensorProxyPtr output_tensor = layer->getOutput(0); + + params_->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } + + private: + TRT_TensorOrWeights x_input_; + DataFormatVecPermuteAttributes attrs_{}; +}; +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), + {"DataFormatVecPermute"}); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc new file mode 100644 index 00000000000..96e5558532e --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc @@ -0,0 +1,316 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +template +class ConvertFillBase : public OpConverterBase { + public: + explicit ConvertFillBase(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT32}) {} +}; + +class ConvertFill : public ConvertFillBase { + public: + explicit ConvertFill(const OpConverterParams* params) + : ConvertFillBase(params) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("dims", TrtInputArg::kBoth), + InputArgSpec::Create("value", TrtInputArg::kBoth)}; + } + + Status Validate() { + const auto& params = *this->params_; + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + + const auto& inputs = params.inputs; + const auto& node_def = params.node_def; + const TRT_TensorOrWeights& dims_input = inputs.at(0); + + const auto dims_type = dims_input.TrtDType(); + if (dims_type != nvinfer1::DataType::kINT32) { + return errors::InvalidArgument("The dims parameter of ", node_def.op(), + " operation in ", node_def.name(), + " is expected to be of type ", + DebugString(nvinfer1::DataType::kINT32), + " type, got ", DebugString(dims_type)); + } + + const auto nbDims = dims_input.GetTrtDims().nbDims; + if (nbDims < 0) { + return errors::InvalidArgument("The shape of parameter ", node_def.op(), + " operation in ", node_def.name(), + " cannot be partial."); + } + return Status::OK(); + } + + Status Convert() { + const auto& params = *this->params_; + auto* network = params.converter->network(); + const auto& inputs = params.inputs; + + const bool is_dims_static = inputs[0].is_weights(); + const bool is_value_static = inputs[1].is_weights(); + + const TRT_TensorOrWeights& dims_input = inputs.at(0); + const TRT_TensorOrWeights& value_input = inputs.at(1); + + int nbDims = dims_input.GetTrtDims().d[0]; + + nvinfer1::Dims trt_dims{0}; + if (is_dims_static) { + const auto dims_weights = dims_input.weights(); + DimsAdapter dims_adapter(dims_weights.GetSpan()); + dims_adapter.TrtDims(&trt_dims); + } + + auto builder = TRTNetworkBuilder::Create(network, params.weight_store); + ::stream_executor::port::StatusOr layer = + builder.ValueOrDie().AddFill(value_input, dims_input, is_value_static, + is_dims_static, nbDims, trt_dims); + ITensorProxyPtr output_tensor = layer.ValueOrDie()->getOutput(0); + this->AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +class ConvertRange : public ConvertFillBase { + public: + explicit ConvertRange(const OpConverterParams* params) + : ConvertFillBase(params) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("start", TrtInputArg::kBoth), + InputArgSpec::Create("limit", TrtInputArg::kBoth), + InputArgSpec::Create("delta", TrtInputArg::kBoth)}; + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "Range" + ... 
+ attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + } + */ + return "Tidx"; + } + Status Validate() { + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + const auto& params = *this->params_; + const auto& inputs = params.inputs; + const auto& node_def = params.node_def; + + float param[3]; + all_weights_ = all_integers_ = true; + for (int i = 0; i < 3; i++) { + const auto& input = inputs.at(i); + all_integers_ &= input.TrtDType() == nvinfer1::DataType::kINT32; + if (input.is_weights()) { + switch (input.TrtDType()) { + case nvinfer1::DataType::kFLOAT: + param[i] = get_input_param(input); + break; + case nvinfer1::DataType::kHALF: + param[i] = get_input_param(input); + break; + case nvinfer1::DataType::kINT32: + param[i] = get_input_param(input); + break; + default: + return errors::InvalidArgument( + "Unsupported data type ", DebugString(input.TrtDType()), + " used for '", InputSpec()[i].name, "'"); + } + } else { + all_weights_ = false; + } + } + + if (!(all_weights_ || all_integers_)) { + // As of 06/03/2022, when at least one of the (start, limit, delta) + // is passed as a tensor, they must all be of type kINT32 + return errors::Unimplemented(convert_range_expected_msg(node_def)); + } + + if (inputs.at(2).is_weights()) { + if ((delta_ = param[2]) == 0) { + return errors::InvalidArgument("The delta parameter of ", node_def.op(), + " operation cannot be equal to 0"); + } + + if (!all_weights_ && delta_ < 0) { + return errors::InvalidArgument( + "The delta parameter of Range operation " + "cannot be negative, when one of (start, limit) is passed as " + "a tensor, but got ", + delta_); + } + } + + for (int i = 0; i < 3; i++) { + const auto& input = inputs.at(i); + const auto& dims = input.GetTrtDims(); + if (dims.nbDims != 1 || dims.d[0] != 1) { + return errors::InvalidArgument("Dimension for '", InputSpec()[i].name, + "' of ", node_def.op(), " operator ", + "should be equal to 1"); + } + } + + if (all_weights_) { + const auto num_intervals_float = + (param[1] - (start_ = param[0])) / delta_; + if (num_intervals_float < 0) { + const auto error = convert_range_error_msg(start_, param[1], delta_); + return errors::InvalidArgument(error); + } + + num_values_ = static_cast(num_intervals_float); + if (start_ + delta_ * num_values_ != param[1]) { + num_values_++; + } + } + + return Status::OK(); + } + + Status Convert() { + const auto& params = *this->params_; + const auto& inputs = params.inputs; + const TRT_TensorOrWeights& input = inputs.at(0); + TRT_TensorOrWeights value_input; + nvinfer1::Dims trt_dims{1}; + auto builder = TRTNetworkBuilder::Create(params.converter->network(), + params.weight_store); + TRT_ENSURE_OK(builder); + ITensorProxyPtr dims_input_tensor = nullptr; + ITensorProxyPtr beta_tensor = nullptr; + ITensorProxyPtr scalar_tensor = nullptr; + if (!all_weights_) { + ITensorProxyPtr tensors[3]; + for (int i = 0; i < 3; i++) { + TF_RETURN_IF_ERROR(builder.ValueOrDie().get_tensor4TensorOrWeights( + inputs.at(i), tensors + i)); + } + + ::stream_executor::port::StatusOr num = + builder.ValueOrDie().Sub(/*limit*/ tensors[1]->trt_tensor(), + /*start*/ tensors[0]->trt_tensor()); + + TRT_ENSURE_PTR_OK(num); + ::stream_executor::port::StatusOr ceil_div = + builder.ValueOrDie().FloorDiv( + num.ValueOrDie()->getOutput(0), + (beta_tensor = tensors[2])->trt_tensor()); + TRT_ENSURE_PTR_OK(ceil_div); + dims_input_tensor = ceil_div.ValueOrDie()->getOutput(0); + dims_input_tensor->setType(nvinfer1::DataType::kINT32); + + nvinfer1::Dims scalar_dims{0}; + 
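When start, limit and delta are all weights, the validation above derives the output length the way tf.range does, ceil((limit - start) / delta), implemented as truncation plus a correction when delta does not divide the interval exactly. A standalone sketch of that arithmetic (RangeSize is a hypothetical name):

// Sketch only: element count of Range(start, limit, delta) via truncate-then-correct.
#include <cassert>

int RangeSize(float start, float limit, float delta) {
  const float num_intervals = (limit - start) / delta;  // assumed non-negative here
  int num_values = static_cast<int>(num_intervals);
  if (start + delta * num_values != limit) {
    ++num_values;  // a partial last step still produces one more element
  }
  return num_values;
}

int main() {
  assert(RangeSize(0.f, 5.f, 2.f) == 3);    // 0, 2, 4
  assert(RangeSize(0.f, 6.f, 2.f) == 3);    // 0, 2, 4
  assert(RangeSize(10.f, 4.f, -3.f) == 2);  // 10, 7
  return 0;
}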
TF_RETURN_IF_ERROR(PrepareTensorForShape( + params.converter, params.inputs.at(0), scalar_dims, false, + &scalar_tensor, params.node_def)); + } else { + DimsAdapter value_input_dims(std::vector{1}); + ::stream_executor::port::StatusOr value_weights = + params.weight_store->GetTempWeights(input.TrtDType(), + value_input_dims); + + TF_RETURN_IF_ERROR(value_weights.status()); + TF_RETURN_IF_ERROR(value_weights.ValueOrDie().SetValues(start_)); + value_input = TRT_TensorOrWeights(value_weights.ValueOrDie()); + + trt_dims.d[0] = num_values_; + ::stream_executor::port::StatusOr const_layer = + builder.ValueOrDie().ConstantShape(value_input_dims); + TRT_ENSURE_PTR_OK(const_layer); + dims_input_tensor = const_layer.ValueOrDie()->getOutput(0); + } + + TRT_TensorOrWeights dims_input(dims_input_tensor); + + ::stream_executor::port::StatusOr layer = + builder.ValueOrDie().AddFill(value_input, dims_input, all_weights_, + all_weights_, 1, trt_dims, scalar_tensor, + beta_tensor, delta_); + + ITensorProxyPtr output_tensor = layer.ValueOrDie()->getOutput(0); + if (all_integers_) { + output_tensor->setType(nvinfer1::DataType::kINT32); + } + + this->AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } + + private: + template + float get_input_param(const TRT_TensorOrWeights& input) { + return static_cast(*input.weights().GetPointer()); + } + + float start_; + float delta_; + int num_values_; + bool all_weights_; + bool all_integers_; +}; + +std::string convert_range_error_msg(float start, float limit, float delta) { + constexpr const char* format_string = + "For parameters (start, limit) = (%.2f, %.2f) " + "of the Range operation delta cannot be %s, got %.2f"; + return absl::StrFormat(format_string, start, limit, + start < limit ? "negative" : "positive", delta); +} + +std::string convert_range_expected_msg(const NodeDef& node_def) { + return "When at least one of parameters (start, limit, delta) of " + + node_def.op() + " operation in " + node_def.name() + + " is passed as a tensor, they must all be of type kINT32"; +} + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), "Fill"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "Range"); + +#endif // IS_TRT_VERSION_GE(8, 2, 0, 0) + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h new file mode 100644 index 00000000000..458b8e8191d --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h @@ -0,0 +1,736 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { + +namespace convert { + +// Facilitates the creation of TensorRT layers inside a network. The user +// provides a INetworkDefinition pointer during construction. They can then add +// operations to the network through the provided functions. Each function +// returns a struct which contains the symbolic result of the operation (ITensor +// pointer) as well as a pointer to the last TensorRT ILayer created. Some +// operations may create multiple layers in order to accomplish the desired +// result (e.g. Sign). +class TRTNetworkBuilder { + public: + static ::stream_executor::port::StatusOr Create( + nvinfer1::INetworkDefinition* network, TrtWeightStore* weight_store) { + TRT_ENSURE(network); + TRT_ENSURE(weight_store); + return TRTNetworkBuilder(network, weight_store); + } + + private: + TRTNetworkBuilder(nvinfer1::INetworkDefinition* network, + TrtWeightStore* weight_store) + : network_(network), weight_store_(weight_store) {} + + public: + // Adds an Add operation to the network. + ::stream_executor::port::StatusOr Add( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUM); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an elementwise min(lhs, rhs) operation to the network. The output has + // the same data type as the input. + ::stream_executor::port::StatusOr Min( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kMIN); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an elementwise max(lhs, rhs) operation to the network. The output has + // the same datatype as the input. + ::stream_executor::port::StatusOr Max( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kMAX); + TRT_ENSURE(layer); + return layer; + }; + + // Adds an absolute value operation to the network. Note that this unary + // operation will do an implict float conversion. For int32 tensors, use + // "AbsInt". + ::stream_executor::port::StatusOr AbsFloat( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + TRT_ENSURE(input->getType() != nvinfer1::DataType::kFLOAT && + input->getType() != nvinfer1::DataType::kHALF); + nvinfer1::IUnaryLayer* layer = + network_->addUnary(*input, nvinfer1::UnaryOperation::kABS); + TRT_ENSURE(layer); + return layer; + } + + // Performs Abs without implict float conversion. The input should be of type + // kInt32. For float datatypes, use "Abs". 
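The SignInt/AbsInt helpers below avoid the float-only unary kABS by clamping with elementwise max/min, sign(x) = min(max(x, -1), 1), and then multiplying, abs(x) = x * sign(x). The same trick on plain scalar ints, as a sketch:

// Sketch only: integer sign() from a max/min clamp, abs() as x * sign(x).
#include <algorithm>
#include <cassert>

int SignOf(int x) { return std::min(std::max(x, -1), 1); }
int AbsOf(int x) { return x * SignOf(x); }

int main() {
  assert(SignOf(-7) == -1 && SignOf(0) == 0 && SignOf(12) == 1);
  assert(AbsOf(-7) == 7 && AbsOf(0) == 0 && AbsOf(12) == 12);
  return 0;
}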
+ ::stream_executor::port::StatusOr AbsInt( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + TRT_ENSURE(input->getType() == nvinfer1::DataType::kINT32); + ::stream_executor::port::StatusOr sign = + this->SignInt(input); + return this->Mul(input, sign.ValueOrDie()->getOutput(0)); + } + + // Returns elementwise sign(x) for int32 input tensors where sign(x) is + // defined as 1 where x > 0, -1 where x < 0 and 0 where x == 0. + ::stream_executor::port::StatusOr SignInt( + nvinfer1::ITensor* input) noexcept { + TRT_ENSURE(input); + + // Create constants +1 and -1. + ::stream_executor::port::StatusOr one = + this->Constant(1, input->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(one); + + ::stream_executor::port::StatusOr neg_one = + this->Constant(-1, input->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(neg_one); + + // Turn all negaitve elements into -1, positive and zero elements + // unaffected. + ::stream_executor::port::StatusOr max = + this->Max(input, neg_one.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(max); + + // Turn all positive elements into +1, negative and zero elements + // unaffected. + ::stream_executor::port::StatusOr min = + this->Min(max.ValueOrDie()->getOutput(0), + one.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(min); + return min; + } + + // Adds a Sub operation to the network. + ::stream_executor::port::StatusOr Sub( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUB); + TRT_ENSURE(layer); + return layer; + } + + // Adds an Greater operation to the network. + ::stream_executor::port::StatusOr Greater( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kGREATER); + TRT_ENSURE(layer); + return layer; + } + + // Adds an Equal operation to the network. + ::stream_executor::port::StatusOr Equal( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kEQUAL); + TRT_ENSURE(layer); + return layer; + } + + // Adds a FloorDiv operation to the network. + ::stream_executor::port::StatusOr FloorDiv( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kFLOOR_DIV); + TRT_ENSURE(layer); + return layer; + } + + // Returns the equivalent of ceil_divide(abs(x)/abs(y))) operation. The inputs + // "lhs" and "rhs" should be int32 tensors. 
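AbsCeilDivInt below computes ceil(|lhs| / |rhs|) with integer layers only, relying on the identity ceil(a / b) = floor((a + b - 1) / b) for positive a and b. A scalar sketch of the same abs, add, subtract-one, floor-divide sequence (AbsCeilDiv is a hypothetical name):

// Sketch only: ceiling division built from abs/add/sub/floor-div.
#include <cassert>
#include <cstdlib>

int AbsCeilDiv(int a, int b) {
  const int abs_a = std::abs(a);
  const int abs_b = std::abs(b);            // assumed non-zero
  const int numerator = abs_a + abs_b - 1;  // the Add then Sub(1) steps
  return numerator / abs_b;                 // floor division on non-negative operands
}

int main() {
  assert(AbsCeilDiv(7, 2) == 4);    // ceil(3.5)
  assert(AbsCeilDiv(8, 2) == 4);    // exact division
  assert(AbsCeilDiv(-7, -2) == 4);  // signs stripped first
  return 0;
}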
+ ::stream_executor::port::StatusOr AbsCeilDivInt( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + TRT_ENSURE(lhs->getType() == nvinfer1::DataType::kINT32); + TRT_ENSURE(rhs->getType() == nvinfer1::DataType::kINT32); + + ::stream_executor::port::StatusOr rhs_abs = + this->AbsInt(rhs); + TRT_ENSURE_PTR_OK(rhs_abs); + ::stream_executor::port::StatusOr lhs_abs = + this->AbsInt(lhs); + TRT_ENSURE_PTR_OK(lhs_abs); + ::stream_executor::port::StatusOr add1 = + this->Add(lhs_abs.ValueOrDie()->getOutput(0), + rhs_abs.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(add1); + ::stream_executor::port::StatusOr one_const = + this->Constant(1, rhs->getDimensions().nbDims); + TRT_ENSURE_PTR_OK(one_const); + ::stream_executor::port::StatusOr numerator = + this->Sub(add1.ValueOrDie()->getOutput(0), + one_const.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(numerator); + return FloorDiv(numerator.ValueOrDie()->getOutput(0), + rhs_abs.ValueOrDie()->getOutput(0)); + } + + // Adds an elementwise multiplication operation to the network. + ::stream_executor::port::StatusOr Mul( + nvinfer1::ITensor* lhs, nvinfer1::ITensor* rhs) noexcept { + TRT_ENSURE(lhs); + TRT_ENSURE(rhs); + nvinfer1::IElementWiseLayer* layer = network_->addElementWise( + *lhs, *rhs, nvinfer1::ElementWiseOperation::kPROD); + TRT_ENSURE(layer); + return layer; + } + + // Adds a sequence of elementwise multiplication operations to the network. + // The returned layer's output contains the cumulative elementwise product of + // all tensors in the input. + ::stream_executor::port::StatusOr CumulativeProd( + absl::Span inputs) noexcept { + TRT_ENSURE(!absl::c_any_of( + inputs, [](nvinfer1::ITensor* x) { return x == nullptr; })); + nvinfer1::ILayer* out = nullptr; + if (inputs.size() == 1) { + out = network_->addIdentity(*inputs[0]); + TRT_ENSURE(out != nullptr); + return out; + } + nvinfer1::ITensor* last = inputs[0]; + for (int i = 1; i < inputs.size(); i++) { + ::stream_executor::port::StatusOr mul = + this->Mul(last, inputs[i]); + TRT_ENSURE_PTR_OK(mul); + out = mul.ValueOrDie(); + last = mul.ValueOrDie()->getOutput(0); + } + return out; + } + + // Adds a Constant layer whose output is a TensorRT shape tensor. The shape + // tensor's size and values correspond to dim's nbDims and d[], respectively. + ::stream_executor::port::StatusOr ConstantShape( + const DimsAdapter& shape_data) noexcept { + TRT_ENSURE(shape_data.NumDims() > 0); + nvinfer1::Dims shape_dims; + shape_dims.nbDims = 1; + shape_dims.d[0] = shape_data.NumDims(); + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(nvinfer1::DataType::kINT32, shape_dims); + TRT_ENSURE_OK(const_weights); + absl::c_copy(shape_data, const_weights.ValueOrDie().GetPointer()); + ::stream_executor::port::StatusOr trt_dims = + const_weights.ValueOrDie().Shape().AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = network_->addConstant( + trt_dims.ValueOrDie(), const_weights.ValueOrDie().GetTrtWeights()); + TRT_ENSURE(const_layer); + nvinfer1::ITensor* output = const_layer->getOutput(0); + TRT_ENSURE(output); + TRT_ENSURE(output->getType() == nvinfer1::DataType::kINT32); + return const_layer; + } + + // Adds a Constant layer whose output is a TensorRT shape tensor. The shape + // tensor's size and values correspond to dim's nbDims and d[], respectively. 
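CumulativeProd above chains Mul layers so the final output is the elementwise product of every input tensor (or an Identity when there is only one input). A sketch of the same fold over plain, equally sized vectors:

// Sketch only: cumulative elementwise product, the reduction expressed above
// with chained kPROD layers.
#include <cassert>
#include <vector>

std::vector<int> ElementwiseProduct(const std::vector<std::vector<int>>& inputs) {
  std::vector<int> out = inputs.at(0);  // single input: identity
  for (size_t i = 1; i < inputs.size(); ++i) {
    for (size_t j = 0; j < out.size(); ++j) out[j] *= inputs[i][j];
  }
  return out;
}

int main() {
  std::vector<int> r = ElementwiseProduct({{1, 2, 3}, {4, 5, 6}, {2, 2, 2}});
  assert(r == (std::vector<int>{8, 20, 36}));
  return 0;
}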
+ ::stream_executor::port::StatusOr Constant( + const std::vector& data) noexcept { + nvinfer1::Dims shape_dims; + shape_dims.nbDims = 1; + shape_dims.d[0] = data.size(); + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(nvinfer1::DataType::kINT32, shape_dims); + TRT_ENSURE_OK(const_weights); + int32* values = const_weights.ValueOrDie().GetPointer(); + for (int i = 0; i < data.size(); i++) { + values[i] = static_cast(data[i]); + } + ::stream_executor::port::StatusOr trt_dims = + const_weights.ValueOrDie().Shape().AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = network_->addConstant( + trt_dims.ValueOrDie(), const_weights.ValueOrDie().GetTrtWeights()); + TRT_ENSURE(const_layer); + nvinfer1::ITensor* output = const_layer->getOutput(0); + TRT_ENSURE(output); + TRT_ENSURE(output->getType() == nvinfer1::DataType::kINT32); + TRT_ENSURE(const_layer); + return const_layer; + } + + // Adds a Constant layer that produces a tensor of shape "shape", + // type "data_type" and filled with value "scalar". + template + ::stream_executor::port::StatusOr Constant( + const T value, nvinfer1::Dims shape, + nvinfer1::DataType data_type) noexcept { + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(data_type, shape); + TRT_ENSURE_OK(const_weights); + TRT_ENSURE(const_weights.ValueOrDie().SetValues(value).ok()); + nvinfer1::IConstantLayer* const_layer = network_->addConstant( + shape, const_weights.ValueOrDie().GetTrtWeights()); + TRT_ENSURE(const_layer); + return const_layer; + } + + // Adds a Constant layer that produces a tensor with a single value "scalar". + // The tensor has "nb_dims" dimensions and each dimension has only one + // element. The data type of the tensor is determined by the data type of + // "scalar". + template ::value>::type* = nullptr> + ::stream_executor::port::StatusOr Constant( + const T scalar, const int nb_dims) noexcept { + TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); + auto data_type = nvinfer1::DataType::kINT32; + if (std::is_floating_point::value) { + data_type = nvinfer1::DataType::kFLOAT; + } + nvinfer1::Dims zero_shape; + zero_shape.nbDims = nb_dims; + std::fill_n(zero_shape.d, nb_dims, 1); + return Constant(scalar, zero_shape, data_type); + } + + // Adds a Constant layer from a TRT_ShapedWeights object. + ::stream_executor::port::StatusOr + WeightsToConstant(const nvinfer1::Weights& weights, + const DimsAdapter& dims) noexcept { + ::stream_executor::port::StatusOr vol = dims.Volume(); + TRT_ENSURE_OK(vol); + TRT_ENSURE(vol.ValueOrDie() == weights.count); + ::stream_executor::port::StatusOr trt_dims = + dims.AsTrtDims(); + TRT_ENSURE_OK(trt_dims); + nvinfer1::IConstantLayer* const_layer = + network_->addConstant(trt_dims.ValueOrDie(), weights); + TRT_ENSURE(const_layer); + return const_layer; + } + + Status get_tensor4TensorOrWeights(const TRT_TensorOrWeights& input, + ITensorProxyPtr* pTensor) { + if (input.is_weights()) { + ::stream_executor::port::StatusOr const_layer = + WeightsToConstant(input.weights().GetTrtWeights(), + input.GetTrtDims()); + if (!const_layer.status().ok()) return const_layer.status(); + *pTensor = const_layer.ValueOrDie()->getOutput(0); + } else { + *pTensor = input.tensor(); + } + return Status::OK(); + } + + // Creates a nvinfer1::Weights object containing a single scalar. 
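The scalar Constant overload above picks kFLOAT versus kINT32 from std::is_floating_point<T>, so call sites can pass an int or a float and get a matching constant. A tiny sketch of that compile-time dispatch with a stand-in DType enum (names DType/DTypeFor are hypothetical):

// Sketch only: map a C++ scalar type to a data-type tag at compile time.
#include <iostream>
#include <type_traits>

enum class DType { kINT32, kFLOAT };

template <typename T,
          typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr>
constexpr DType DTypeFor() {
  return std::is_floating_point<T>::value ? DType::kFLOAT : DType::kINT32;
}

int main() {
  static_assert(DTypeFor<int>() == DType::kINT32, "int maps to kINT32");
  static_assert(DTypeFor<float>() == DType::kFLOAT, "float maps to kFLOAT");
  std::cout << "ok\n";
  return 0;
}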
+ template ::value>::type* = nullptr> + ::stream_executor::port::StatusOr ScalarWeights( + const T scalar, const int nb_dims) noexcept { + TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); + auto data_type = nvinfer1::DataType::kINT32; + if (std::is_floating_point::value) { + data_type = nvinfer1::DataType::kFLOAT; + } + nvinfer1::Dims weights_shape; + weights_shape.nbDims = nb_dims; + std::fill_n(weights_shape.d, nb_dims, 1); + ::stream_executor::port::StatusOr const_weights = + weight_store_->GetTempWeights(data_type, weights_shape); + TRT_ENSURE_OK(const_weights); + const_weights.ValueOrDie().GetPointer()[0] = scalar; + return const_weights.ValueOrDie().GetTrtWeights(); + } + + // Adds a TensorRT Slice operation to the network. + ::stream_executor::port::StatusOr Slice( + nvinfer1::ITensor* input, const nvinfer1::Dims& begin, + const nvinfer1::Dims& size, const nvinfer1::Dims& stride) noexcept { + nvinfer1::ISliceLayer* layer = + network_->addSlice(*input, begin, size, stride); + TRT_ENSURE(layer); + return layer; + } + + // Adds a TensorRT Concatenate operation to the network. + ::stream_executor::port::StatusOr Concat( + absl::Span inputs, const int axis) { + for (nvinfer1::ITensor* input : inputs) { + TRT_ENSURE(input); + } + nvinfer1::IConcatenationLayer* layer = network_->addConcatenation( + inputs.data(), static_cast(inputs.size())); + TRT_ENSURE(layer); + layer->setAxis(axis); + return layer; + } + + // Adds a TensorRT Concatenate operation to the network. + ::stream_executor::port::StatusOr Concat( + const std::vector& inputs, const int axis) { + return this->Concat(absl::MakeSpan(inputs), axis); + } + + // Adds a TensorRT Shape operation, which determines the runtime shape of the + // input tensor, to the network. + ::stream_executor::port::StatusOr Shape( + nvinfer1::ITensor* input) { + TRT_ENSURE(input); + nvinfer1::IShapeLayer* layer = network_->addShape(*input); + TRT_ENSURE(layer); + return layer; + } + + // Creates a Gather operation on the shape of the input tensor. The output of + // the gather operation is a 1D shape tensor where output[i] = (!sub_one ? + // input_shape[i] : input_shape[i] -1) if i is in "indices", otherwise zero. + ::stream_executor::port::StatusOr GetPartialShapeOf( + nvinfer1::ITensor* input, absl::InlinedVector indices, + bool sub_one = false) { + TRT_ENSURE(input); + TRT_ENSURE(indices.size() <= nvinfer1::Dims::MAX_DIMS); + + // Get the runtime shape of input; + ::stream_executor::port::StatusOr shape_layer = + this->Shape(input); + TRT_ENSURE_PTR_OK(shape_layer); + nvinfer1::ITensor* runtime_shape = shape_layer.ValueOrDie()->getOutput(0); + + if (sub_one) { + ::stream_executor::port::StatusOr ones = + this->Constant(1, 1); + TRT_ENSURE_PTR_OK(ones); + ::stream_executor::port::StatusOr sub = + this->Sub(runtime_shape, ones.ValueOrDie()->getOutput(0)); + TRT_ENSURE_PTR_OK(sub); + runtime_shape = sub.ValueOrDie()->getOutput(0); + } + + // Create a constant tensor containing the gather indices. + // For any dim not in "indices", we mark it size to gather a zero. 
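GetPartialShapeOf keeps only the requested axes of the runtime shape: every axis not listed in "indices" is pointed at a zero appended to the shape tensor, so the gather returns 0 there. A sketch of that sentinel-index trick on a plain shape vector (PartialShape is a hypothetical name):

// Sketch only: keep shape[i] for i in `indices`, zero elsewhere, by gathering
// from the shape with an appended 0 as the sentinel target.
#include <cassert>
#include <vector>

std::vector<int> PartialShape(std::vector<int> shape, const std::vector<int>& indices) {
  const int rank = static_cast<int>(shape.size());
  std::vector<int> gather_idx(rank, rank);  // default: point at the sentinel
  for (int i : indices) gather_idx[i] = i;  // keep the requested axes
  shape.push_back(0);                       // sentinel zero at position `rank`
  std::vector<int> out(rank);
  for (int i = 0; i < rank; ++i) out[i] = shape[gather_idx[i]];
  return out;
}

int main() {
  // Keep axes 1 and 3 of shape [2, 3, 5, 7] -> [0, 3, 0, 7].
  assert(PartialShape({2, 3, 5, 7}, {1, 3}) == (std::vector<int>{0, 3, 0, 7}));
  return 0;
}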
+ const int input_nb_dims = input->getDimensions().nbDims; + std::vector indices_all(input_nb_dims, input_nb_dims); + for (auto idx : indices) { + TRT_ENSURE(idx < input_nb_dims); + indices_all[idx] = idx; + } + + ::stream_executor::port::StatusOr + indices_result = this->Constant(indices_all); + TRT_ENSURE_PTR_OK(indices_result); + nvinfer1::ITensor* gather_indices = + indices_result.ValueOrDie()->getOutput(0); + TRT_ENSURE(gather_indices->getDimensions().nbDims == 1); + TRT_ENSURE(gather_indices->getType() == nvinfer1::DataType::kINT32); + + // Append a zero to the shape tensor. + ::stream_executor::port::StatusOr zero_result = + this->Constant(std::vector{0}); + TRT_ENSURE_PTR_OK(zero_result); + std::array cat_inputs = { + runtime_shape, zero_result.ValueOrDie()->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat_layer = + network_->addConcatenation(cat_inputs.data(), cat_inputs.size()); + TRT_ENSURE(cat_layer); + nvinfer1::ITensor* gather_input = cat_layer->getOutput(0); + TRT_ENSURE(gather_input); + + // Finally, gather the indices from the input. + nvinfer1::IGatherLayer* gather = + network_->addGather(*gather_input, *gather_indices, 0); + TRT_ENSURE(gather); + return gather; + } + + // Adds a scale layer that uniformly scales the input tensor by the specified + // amount. + ::stream_executor::port::StatusOr AddUniformScale( + nvinfer1::ITensor* input, float scale, const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); + ::stream_executor::port::StatusOr weight = + this->ScalarWeights(scale, 1); + TRT_ENSURE_OK(weight); + const nvinfer1::Weights empty_weights = + nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IScaleLayer* scale_layer = + network_->addScale(*input, nvinfer1::ScaleMode::kUNIFORM, empty_weights, + weight.ValueOrDie(), empty_weights); + TRT_ENSURE(scale_layer != nullptr); + scale_layer->setName(name.c_str()); + TRT_ENSURE((*scale_layer).getPower().count == 0); + TRT_ENSURE((*scale_layer).getShift().count == 0); + TRT_ENSURE((*scale_layer).getScale().count == 1); + return scale_layer; + } + + ::stream_executor::port::StatusOr AddFill( + const TRT_TensorOrWeights& value_input, + const TRT_TensorOrWeights& dims_input, bool is_value_static, + bool is_dims_static, int nbDims, const nvinfer1::Dims& trt_dims, + ITensorProxyPtr scalar_tensor = nullptr, + ITensorProxyPtr beta_tensor = nullptr, const float delta = 0) { + // TensorRT IFillLayer requires a rank 0 scalar. 
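+    // With FillOperation::kLINSPACE the layer consumes an output shape
+    // (input 0, or trt_dims when static), a rank-0 start value (input 1) and
+    // a per-dimension delta (input 2); the code below prepares the latter two
+    // as "scalar_tensor" and "beta_tensor".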
+ nvinfer1::Dims scalar_dims; + scalar_dims.nbDims = 0; + if (is_value_static) { + ::stream_executor::port::StatusOr const_layer = + WeightsToConstant(value_input.weights().GetTrtWeights(), scalar_dims); + if (!const_layer.status().ok()) return const_layer.status(); + scalar_tensor = const_layer.ValueOrDie()->getOutput(0); + } else { + if (scalar_tensor == nullptr) { + ::stream_executor::port::StatusOr + shuffler_layer = + Reshape(value_input.tensor()->trt_tensor(), scalar_dims); + if (!shuffler_layer.status().ok()) return shuffler_layer.status(); + scalar_tensor = shuffler_layer.ValueOrDie()->getOutput(0); + } + } + + if (beta_tensor == nullptr) { + nvinfer1::Dims beta_shape{1, {nbDims}}; + ::stream_executor::port::StatusOr const_layer = + Constant(delta, beta_shape, value_input.TrtDType()); + TF_RETURN_IF_ERROR(const_layer.status()); + beta_tensor = const_layer.ValueOrDie()->getOutput(0); + } + + nvinfer1::IFillLayer* layer = + network_->addFill(trt_dims, nvinfer1::FillOperation::kLINSPACE); + TRT_ENSURE(layer); + if (!is_dims_static) { + layer->setInput(0, *dims_input.tensor()->trt_tensor()); + } + layer->setInput(1, *scalar_tensor->trt_tensor()); + layer->setInput(2, *beta_tensor->trt_tensor()); + return layer; + } + + // Adds a quantization layer that uniformly scales the input tensor + // by the given multiplicative "scaling_factor", then rounds + // (round-to-nearest-ties-to-even) to the nearest integer and clamps in the + // range of [-128, 127]. + ::stream_executor::port::StatusOr Quantize( + nvinfer1::ITensor* input, const float scaling_factor, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); + // Preprocessor usage here is unavoidable because TRT8 API is new. +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + // The TensorRT IQuantizeLayer divides by the scale factor rather than + // multiplies. To be consistent, in this function we expect a multiplicative + // scale factor, so we take the reciprical. + ::stream_executor::port::StatusOr scaling_const = + this->Constant(1.0f / scaling_factor, 1); + TRT_ENSURE_PTR_OK(scaling_const); + scaling_const.ValueOrDie()->setDimensions(nvinfer1::Dims{0, {}}); + nvinfer1::IQuantizeLayer* quant_layer = network_->addQuantize( + *input, *scaling_const.ValueOrDie()->getOutput(0)); + TRT_ENSURE(quant_layer); + quant_layer->setAxis(1); + return quant_layer; +#else + ::stream_executor::port::StatusOr result = + this->AddUniformScale(input, scaling_factor, name); + TRT_ENSURE_PTR_OK(result); + (*result)->setOutputType(0, nvinfer1::DataType::kINT8); + (*result)->setPrecision(nvinfer1::DataType::kFLOAT); + return result; +#endif + } + + // Adds a dequantize layer that casts the input tensor to TensorRT float type + // and scales it uniformly by the given multiplicative "scaling_factor". 
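+  // Together with Quantize() above, an (approximately) inverse pair is formed
+  // when the two multiplicative factors are reciprocals of each other, i.e.
+  // Dequantize(Quantize(x, s), 1/s) ~= x up to rounding and clamping.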
+ ::stream_executor::port::StatusOr Dequantize( + nvinfer1::ITensor* input, const float scaling_factor, + const std::string& name) { + TRT_ENSURE(input); + TRT_ENSURE(!name.empty()); +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + ::stream_executor::port::StatusOr scaling_const = + this->Constant(scaling_factor, 1); + TRT_ENSURE_PTR_OK(scaling_const); + scaling_const.ValueOrDie()->setDimensions(nvinfer1::Dims{0, {}}); + nvinfer1::IDequantizeLayer* dequant_layer = network_->addDequantize( + *input, *scaling_const.ValueOrDie()->getOutput(0)); + dequant_layer->setAxis(1); + TRT_ENSURE(dequant_layer); + return dequant_layer; +#else + ::stream_executor::port::StatusOr result = + this->AddUniformScale(input, scaling_factor, name); + TRT_ENSURE_PTR_OK(result); + (*result)->setOutputType(0, nvinfer1::DataType::kFLOAT); + (*result)->setPrecision(nvinfer1::DataType::kINT8); + return result; +#endif + } + + // Adds TensorRT Q/DQ operations. This is for explicit precision mode. + ::stream_executor::port::StatusOr + UniformQuantizeDequantizeExplicit(nvinfer1::ITensor* input, + float quantize_scale, + float dequantize_scale, + const std::string& name) { + TRT_ENSURE(input); + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TRT_ENSURE(network_->hasExplicitPrecision()); + } + TRT_ENSURE(IS_TRT_VERSION_GE(7, 1, 0, 0)); + + static int count = 0; + TRT_ENSURE(input->getType() == nvinfer1::DataType::kFLOAT); + std::string quant_name = absl::StrCat(input->getName(), "_quant_", count); + + ::stream_executor::port::StatusOr quant = + this->Quantize(input, quantize_scale, quant_name); + TRT_ENSURE_PTR_OK(quant); + + std::string dequant_name = + absl::StrCat(input->getName(), "_dequant_", count); + ::stream_executor::port::StatusOr dequant = + this->Dequantize(quant.ValueOrDie()->getOutput(0), dequantize_scale, + dequant_name); + TRT_ENSURE_PTR_OK(dequant); + + count++; + return dequant; + } + + ::stream_executor::port::StatusOr Reshape( + nvinfer1::ITensor* input, const nvinfer1::Dims& new_shape) { + TRT_ENSURE(input); + nvinfer1::IShuffleLayer* layer = network_->addShuffle(*input); + TRT_ENSURE(layer); + layer->setReshapeDimensions(new_shape); + return layer; + } + + ::stream_executor::port::StatusOr FindProducerOf( + const nvinfer1::ITensor* tensor) { + const char* name = tensor->getName(); + const int num_layers = network_->getNbLayers(); + for (int i = 0; i < num_layers; i++) { + nvinfer1::ILayer* layer = network_->getLayer(i); + const int num_outputs = layer->getNbOutputs(); + for (int j = 0; j < num_outputs; j++) { + nvinfer1::ITensor* t = layer->getOutput(j); + if (std::string(t->getName()) == name) { + return layer; + } + } + } + return errors::NotFound("could not find producing layer of ", name); + } + + ::stream_executor::port::StatusOr UniqueParentOf( + const nvinfer1::ILayer* layer, int input_idx = 0) { + return FindProducerOf(layer->getInput(input_idx)); + } + + nvinfer1::INetworkDefinition* Network() { return network_; } + + private: + nvinfer1::INetworkDefinition* network_; + TrtWeightStore* weight_store_; +}; + +class ShuffleBuilder { + private: + explicit ShuffleBuilder(TRTNetworkBuilder* builder, nvinfer1::ITensor* input) + : builder_(builder) { + layer_ = builder->Network()->addShuffle(*input); + } + + public: + static ::stream_executor::port::StatusOr Create( + TRTNetworkBuilder* builder, nvinfer1::ITensor* input) { + TRT_ENSURE(builder != nullptr); + TRT_ENSURE(input != nullptr); + return ShuffleBuilder(builder, input); + } + + ShuffleBuilder& SetReshape(const nvinfer1::Dims& dims) { + 
layer_->setReshapeDimensions(dims); + return *this; + } + + ShuffleBuilder& SetReshape(nvinfer1::ITensor* shape) { + layer_->setInput(1, *shape); + return *this; + } + + ShuffleBuilder& SetFirstTranspose(const nvinfer1::Permutation& perm) { + layer_->setFirstTranspose(perm); + return *this; + } + + ShuffleBuilder& SetSecondTranspose(const nvinfer1::Permutation& perm) { + layer_->setSecondTranspose(perm); + return *this; + } + + ::stream_executor::port::StatusOr Output() { + TRT_ENSURE(layer_ != nullptr); + TRT_ENSURE(layer_->getOutput(0) != nullptr); + return layer_->getOutput(0); + } + + private: + TRTNetworkBuilder* builder_; + nvinfer1::IShuffleLayer* layer_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_LAYER_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc new file mode 100644 index 00000000000..7a40d9aa9b1 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/like_ops.cc @@ -0,0 +1,95 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + +template +class ConvertLikeOps : public OpConverterBase> { + public: + explicit ConvertLikeOps(const OpConverterParams *params) + : OpConverterBase>( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input", TrtInputArg::kBoth), + }; + } + Status Validate() { return ConvertLikeOps::NotSupportedInImplicitBatch(); } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + auto *network = params.converter->network(); + + const TRT_TensorOrWeights &input = inputs.at(0); + nvinfer1::Dims dims(input.GetTrtDims()); + + const std::vector value_input_dims_data = {1}; + const DimsAdapter value_input_dims(value_input_dims_data); + ::stream_executor::port::StatusOr value_weights = + params.weight_store->GetTempWeights(input.TrtDType(), value_input_dims); + TF_RETURN_IF_ERROR(value_weights.status()); + TF_RETURN_IF_ERROR(value_weights.ValueOrDie().SetValues(V)); + TRT_TensorOrWeights value_input(value_weights.ValueOrDie()); + + const auto is_dims_static = HasStaticShape(dims); + auto builder = TRTNetworkBuilder::Create(network, params.weight_store); + ITensorProxyPtr dims_input_tensor; + if (!is_dims_static) { + ::stream_executor::port::StatusOr shape_layer = + 
builder.ValueOrDie().Shape(input.tensor()->trt_tensor()); + TF_RETURN_IF_ERROR(shape_layer.status()); + dims_input_tensor = shape_layer.ValueOrDie()->getOutput(0); + dims.nbDims = 0; + } + + TRT_TensorOrWeights dims_input(dims_input_tensor); + ::stream_executor::port::StatusOr layer = + builder.ValueOrDie().AddFill(value_input, dims_input, true, + is_dims_static, input.GetTrtDims().nbDims, + dims); + ITensorProxyPtr output_tensor = layer.ValueOrDie()->getOutput(0); + this->AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "zeros_like"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "ones_like"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "ZerosLike"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction>(), + "OnesLike"); + +#endif // IS_TRT_VERSION_GE(8, 2, 0, 0) + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc new file mode 100644 index 00000000000..d29b5481643 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/log_softmax.cc @@ -0,0 +1,104 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class ConvertLogSoftmax : public OpConverterBase { + public: + explicit ConvertLogSoftmax(const OpConverterParams *params) + : OpConverterBase(params) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("logits", TrtInputArg::kTensor)}; + } + + Status Validate() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + if (!num_trt_dims && params.use_implicit_batch) { + return errors::InvalidArgument( + "TensorRT LogSoftmax cannot apply on the batch dimension"); + } + + return Status::OK(); + } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + const auto &node_def = params.node_def; + + // Perform LogSoftmax operation: + // `logsoftmax = logits - log(reduce_sum(exp(logits), axis))` + + // Get the logits tensor. + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + + // Exponent of logits. 
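+    // (Numeric sanity check: for logits = [1, 2, 3] the formula above yields
+    // approximately [-2.41, -1.41, -0.41].)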
+ nvinfer1::IUnaryLayer *exp = params.converter->network()->addUnary( + *logits_tensor->trt_tensor(), nvinfer1::UnaryOperation::kEXP); + TFTRT_RETURN_ERROR_IF_NULLPTR(exp, node_def.name()); + params.converter->SetLayerName(exp, node_def, "exp"); + + // Reduce-sum operation across the final dimension. + nvinfer1::IReduceLayer *reduced_sum = + params.converter->network()->addReduce( + *exp->getOutput(0), nvinfer1::ReduceOperation::kSUM, + (1 << (num_trt_dims - 1)), /*Reduce across final dimension*/ + true /*Keep reduced dims*/); + params.converter->SetLayerName(reduced_sum, node_def, "reduced_sum"); + + // Logarithm of reduced_sum. + nvinfer1::IUnaryLayer *log_reduced_sum = + params.converter->network()->addUnary(*reduced_sum->getOutput(0), + nvinfer1::UnaryOperation::kLOG); + TFTRT_RETURN_ERROR_IF_NULLPTR(log_reduced_sum, node_def.name()); + params.converter->SetLayerName(log_reduced_sum, node_def, + "log_reduced_sum"); + + // Finally, get the output by subtracting log_reduced_sum from logits. + nvinfer1::IElementWiseLayer *sub = + params.converter->network()->addElementWise( + *logits_tensor->trt_tensor(), *log_reduced_sum->getOutput(0), + nvinfer1::ElementWiseOperation::kSUB); + TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name()); + params.converter->SetLayerName(sub, node_def, "sub"); + + params.outputs->push_back(TRT_TensorOrWeights(sub->getOutput(0))); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "LogSoftmax"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc new file mode 100644 index 00000000000..c6622f88345 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc @@ -0,0 +1,426 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" + +#include "absl/strings/str_format.h" +#include "tensorflow/cc/ops//array_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +bool IsQuantizeAndDequantizeOp(const Node* node) { + return absl::c_find(kQuantizationOpNames, node->def().op()) != + kQuantizationOpNames.end(); +} + +namespace { + +// Provides quantizing and dequantizing tensor scales for a given dynamic range. 
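+// As a worked example: for signed input with num_bits = 8, narrow_range =
+// false and the range [-1, 1], the quantized range is [-128, 127]; the max
+// side determines the scale, so quantize_scale = 127, dequantize_scale =
+// 1/127, and min_range is widened to -128/127.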
+// Borrowed from TF quantization kernel logic. +template +QuantizationScales ComputeQuantizationRange(bool signed_input, + int num_bits, + bool narrow_range, + T* min_range, T* max_range) { + // Calculate the range for the simulated integer quantization: + // e.g. [-127,127] for signed = true, narrow_range = true, num_bits = 8, + // or [-128,127] for signed = true, narrow_range = false, num_bits = 8, + // or [0, 255] for signed = false, num_bits = 8. + const int64_t min_quantized = + signed_input ? narrow_range ? -(1ULL << (num_bits - 1)) + 1 + : -(1ULL << (num_bits - 1)) + : 0; + const int64_t max_quantized = + signed_input ? (1ULL << (num_bits - 1)) - 1 : (1ULL << num_bits) - 1; + // Determine the maximum scaling factor that would scale + // [min_range, max_range] to not exceed [min_quantized, max_quantized], + // while keeping 0 unchanged. + const T scale_from_min_side = (min_quantized * *min_range > 0) + ? min_quantized / *min_range + : std::numeric_limits::max(); + const T scale_from_max_side = (max_quantized * *max_range > 0) + ? max_quantized / *max_range + : std::numeric_limits::max(); + + QuantizationScales scales; + // Note: Avoids changing the side of the range that determines scale. + if (scale_from_min_side < scale_from_max_side) { + scales.quantize_scale[0] = scale_from_min_side; + scales.dequantize_scale[0] = *min_range / min_quantized; + *max_range = max_quantized * scales.dequantize_scale[0]; + } else { + scales.quantize_scale[0] = scale_from_max_side; + scales.dequantize_scale[0] = *max_range / max_quantized; + *min_range = min_quantized * scales.dequantize_scale[0]; + } + return scales; +} + +// Prepares the input for a QDQ node in explicit precision mode, returning a +// ITensor pointer. If the input is weights, we convert it to a ITensor by +// adding a constant layer. +::stream_executor::port::StatusOr ExlicitQDQInputToTensor( + TRTNetworkBuilder* builder, const OpConverterParams* params, + const TRT_TensorOrWeights& input) { + if (input.is_tensor()) { + return input.tensor()->trt_tensor(); + } + if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && input.weights().count() > 1) { + LOG(WARNING) << absl::StrCat( + "QDQ per-channel for weights not " + "implemented, assuming uniform scaling"); + } + TRT_ShapedWeights trt_weights = input.weights(); + ::stream_executor::port::StatusOr weights_const = + builder->WeightsToConstant(trt_weights.GetTrtWeights(), + trt_weights.Shape()); + TRT_ENSURE_PTR_OK(weights_const); + params->converter->SetLayerName(weights_const.ValueOrDie(), params->node_def, + "const"); + nvinfer1::ITensor* qdq_input = weights_const.ValueOrDie()->getOutput(0); + std::string name = + absl::StrCat(weights_const.ValueOrDie()->getName(), "_output"); + qdq_input->setName(name.c_str()); + return qdq_input; +} + +} // namespace + +// Carries traits for each specific quantization op type for conversion. +// Specialization for template parameter T should be given for each TF C++ +// quantization op. 
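+// Each specialization supplies an InputSpec() describing the op's inputs, an
+// Attrs struct for the parsed attributes and ranges, and the
+// ValidateQDQForExplicitPrecision()/ConvertExplicit() entry points used by
+// ConvertQDQ further below.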
+template +struct QDQOpSpec {}; + +template <> +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + InputArgSpec::Create("input_min", TrtInputArg::kWeight), + InputArgSpec::Create("input_max", TrtInputArg::kWeight), + }; + } + + struct Attrs { + float min_range; + float max_range; + bool narrow_range; + std::string round_mode; + UniformQuantizationScales scales; + }; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + AttrSlice attrs(node_def); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "round_mode", &args->round_mode)); + if (args->round_mode != "HALF_TO_EVEN") { + LOG(WARNING) << node_def.op() << ": " << node_def.name() + << " has round_mode=" << args->round_mode + << ", but for TensorRT conversion, " + "round_mode=HALF_TO_EVEN is recommended."; + } + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "narrow_range", &args->narrow_range)); + if (args->narrow_range) { + LOG(WARNING) << node_def.op() << ": " << node_def.name() + << " has narrow_range=true, but for TensorRT conversion, " + "narrow_range=false is recommended."; + } + args->min_range = inputs.at(1).weights().template GetPointer()[0]; + args->max_range = inputs.at(2).weights().template GetPointer()[0]; + const int num_bits = 8; + args->scales = ComputeQuantizationRange( + /*signed_input=*/true, num_bits, args->narrow_range, &args->min_range, + &args->max_range); + TRT_ENSURE(args->scales.dequantize_scale[0] != 0); + TRT_ENSURE(args->scales.quantize_scale[0] != 0); + return Status::OK(); + } + + // Converts in explicit precision mode. In this mode, QDQ operations are + // directly converted into TensorRT quantizing and dequantizing scale + // operations. + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + const auto& node_def = params->node_def; + + ::stream_executor::port::StatusOr builder = + TRTNetworkBuilder::Create(params->converter->network(), + params->weight_store); + + ::stream_executor::port::StatusOr qdq_input = + ExlicitQDQInputToTensor(&builder.ValueOrDie(), params, + params->inputs.at(0)); + TRT_ENSURE_PTR_OK(qdq_input); + + // TODO(cbate): check this condition exists for TRT8? Outline this block to + // a "reshape policy". + const int required_dims = params->use_implicit_batch ? 
3 : 4; + const nvinfer1::Dims idims = qdq_input.ValueOrDie()->getDimensions(); + nvinfer1::Dims intermediate_dims = idims; + TRT_ENSURE(idims.nbDims > 0); + if (idims.nbDims < required_dims) { + const int nb_extra_dims = required_dims - idims.nbDims; + intermediate_dims.nbDims = required_dims; + std::vector ones(nb_extra_dims, 1); + TRT_ENSURE(ones.size() == nb_extra_dims && nb_extra_dims > 0); + + if (!params->use_implicit_batch) { + intermediate_dims.d[0] = idims.d[0]; + std::copy(ones.begin(), ones.end(), intermediate_dims.d + 1); + std::copy_n(idims.d + 1, idims.nbDims - 1, + intermediate_dims.d + ones.size() + 1); + } else { + std::copy(ones.begin(), ones.end(), intermediate_dims.d); + std::copy_n(idims.d, idims.nbDims, intermediate_dims.d + ones.size()); + } + + LOG(WARNING) << absl::StrCat( + node_def.name(), ":", node_def.op(), ": tensor ", + qdq_input.ValueOrDie()->getName(), " has shape ", DebugString(idims), + " but TRT scale layer requires at least 3 dims excluding batch dim, " + "trying to recover by inserting 1's to create shape ", + DebugString(intermediate_dims)); + ::stream_executor::port::StatusOr reshape = + builder.ValueOrDie().Reshape(qdq_input.ValueOrDie(), + intermediate_dims); + TRT_ENSURE_PTR_OK(reshape); + qdq_input.ValueOrDie() = reshape.ValueOrDie()->getOutput(0); + } + + VLOG(1) << "[ExplicitPrecision]" << node_def.op() << ": " << node_def.name() + << " computed scales: " << args.scales << " from min/max ranges " + << args.min_range << "/" << args.max_range; + + ::stream_executor::port::StatusOr qdq = + builder.ValueOrDie().UniformQuantizeDequantizeExplicit( + qdq_input.ValueOrDie(), args.scales.quantize_scale[0], + args.scales.dequantize_scale[0], node_def.name()); + TRT_ENSURE_PTR_OK(qdq); + ITensorProxyPtr final_output = qdq.ValueOrDie()->getOutput(0); + if (idims.nbDims != intermediate_dims.nbDims) { + ::stream_executor::port::StatusOr undo_reshape = + builder.ValueOrDie().Reshape(qdq_input.ValueOrDie(), idims); + TRT_ENSURE_PTR_OK(undo_reshape); + final_output = undo_reshape.ValueOrDie()->getOutput(0); + } + params->outputs->push_back(final_output); + return Status::OK(); + } +}; + +template <> + +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + InputArgSpec::Create("min", TrtInputArg::kWeight), + InputArgSpec::Create("max", TrtInputArg::kWeight), + InputArgSpec::Create("num_bits", TrtInputArg::kWeight), + }; + } + // Use same attributes and conversion functions as QDQV2. 
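+  // (At this level the only difference is that V3 receives num_bits as a
+  // fourth weight input; the shared validation does not inspect it and
+  // assumes 8 bits.)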
+ using Attrs = QDQOpSpec::Attrs; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + return QDQOpSpec< + ops::QuantizeAndDequantizeV2>::ValidateQDQForExplicitPrecision(inputs, + node_def, + args); + } + + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + return QDQOpSpec::ConvertExplicit(params, + args); + } +}; + +template <> + +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + InputArgSpec::Create("min", TrtInputArg::kWeight), + InputArgSpec::Create("max", TrtInputArg::kWeight), + }; + } + struct Attrs { + int num_bits; + bool narrow_range; + }; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + return errors::Unimplemented(""); + } + + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + return errors::Unimplemented(""); + } +}; + +template <> + +struct QDQOpSpec { + static constexpr std::array InputSpec() { + return { + InputArgSpec::Create("input", TrtInputArg::kBoth), + }; + } + + struct Attrs { + float min; + float max; + int num_bits; + bool narrow_range; + }; + + static Status ValidateQDQForExplicitPrecision( + const std::vector& inputs, const NodeDef& node_def, + Attrs* args) { + return errors::Unimplemented(""); + } + + static Status ConvertExplicit(const OpConverterParams* params, + const Attrs& args) { + return errors::Unimplemented(""); + } +}; + +// Converts QDQ operations in non-explicit precision mode. This is the original +// "ConvertQuantize" function. In this mode, Q/DQ operations are no-ops and are +// instead used to set the dynamic range of the input tensor. +Status ConvertDynamicRangeMode(const OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + float min_range = 0.0f; + float max_range = 0.0f; + const auto& op_name = node_def.op(); + if (op_name == "FakeQuantWithMinMaxArgs") { + AttrSlice attrs(node_def); + // Get ranges via node attributes. + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "min", &min_range)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "max", &max_range)); + } else if (op_name == "FakeQuantWithMinMaxVars" || + op_name == "QuantizeAndDequantizeV2" || + op_name == "QuantizeAndDequantizeV3") { + // Get ranges via inputs. + auto get_weights_value = [&inputs](int index) { + const auto* raw_weights = inputs.at(index).weights().GetPointer(); + return raw_weights[0]; + }; + min_range = get_weights_value(1); + max_range = get_weights_value(2); + } else { + return errors::InvalidArgument("Unknown quantization op ", op_name, ", at ", + node_def.name()); + } + if (params->validation_only) { + return Status::OK(); + } + + // Store ranges for tensor + ITensorProxyPtr input0 = inputs.at(0).tensor(); + params->converter->ProvideQuantizationRange(&input0, min_range, max_range); + // Sometimes, TRT may not quantize a tensor, either because it chooses to + // execute a higher precision kernel or because of op fusion. In these + // cases, accuracy will suffer if the model was trained to expect + // quantization at that tensor. We should consider adding a clip(tensor, + // min_range, max_range) operation here to ensure that any arbitrarily + // placed quantize node will execute as expected. However, this will + // negatively affect performance. 
If users train their models in a way which + // models inference as close as possible (i.e. not quantizing in place where + // fusion will occur), then there is no problem with the current + // implementation. + params->outputs->push_back(inputs.at(0)); + return Status::OK(); +} + +template +class ConvertQDQ : public OpConverterBase> { + public: + explicit ConvertQDQ(const OpConverterParams* params) + : OpConverterBase>(params) {} + + static constexpr auto InputSpec() { return QDQOpSpec::InputSpec(); } + + // Disable the non-applicable data type check by providing empty string. + static constexpr const char* NodeDefDataTypeAttributeName() { return ""; } + + Status ValidateDynamicRangeINT8Mode() { + // The condition ensures we only call the conversion once. We should break + // this function up into validation and conversion. + if (this->params_->validation_only) { + return ConvertDynamicRangeMode(this->params_); + } + return Status::OK(); + } + + Status Validate() { + if (!this->params_->use_explicit_precision) { + return ValidateDynamicRangeINT8Mode(); + } + return OpSpec::ValidateQDQForExplicitPrecision( + this->params_->inputs, this->params_->node_def, &attrs_); + } + + Status Convert() { + if (!this->params_->use_explicit_precision) { + return ConvertDynamicRangeMode(this->params_); + } + return OpSpec::ConvertExplicit(this->params_, attrs_); + } + + using OpSpec = QDQOpSpec; + using OpSpecAttrs = typename QDQOpSpec::Attrs; + OpSpecAttrs attrs_; +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "QuantizeAndDequantizeV2"); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "QuantizeAndDequantizeV3"); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "FakeQuantWithMinMaxVars"); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction>(), + "FakeQuantWithMinMaxArgs"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h new file mode 100644 index 00000000000..280dc1e79f5 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+constexpr std::array<const char*, 4> kQuantizationOpNames = {
+    "QuantizeAndDequantizeV2",
+    "QuantizeAndDequantizeV3",
+    "FakeQuantWithMinMaxVars",
+    "FakeQuantWithMinMaxArgs",
+};
+
+// Operations with supported conversion to Q/DQ ops in TensorRT explicit
+// precision mode.
+constexpr std::array<const char*, 1> kExplicitQuantizationOpNames = {
+    "QuantizeAndDequantizeV2",
+};
+
+// Contains two scaling factors for quantization and dequantization
+// respectively. A shift factor is omitted as TensorRT only supports symmetric
+// quantization.
+template <typename T, size_t N>
+struct QuantizationScales {
+  std::array<T, N> quantize_scale;
+  std::array<T, N> dequantize_scale;
+};
+
+// In TensorRT 7 and 8, only uniform tensor scaling is supported for
+// activations.
+using UniformQuantizationScales = QuantizationScales<float, 1>;
+
+// Per-channel scaling is supported for weights in TensorRT version >= 8.0.
+template <int ChannelDimSize>
+using PerChannelQuantizationScales = QuantizationScales<float, ChannelDimSize>;
+
+template <typename T, size_t N>
+std::ostream& operator<<(std::ostream& os,
+                         const QuantizationScales<T, N>& scales) {
+  os << absl::StrFormat("QuantizationScales[quantize={%s},dequantize={%s}]",
+                        absl::StrJoin(scales.quantize_scale, ","),
+                        absl::StrJoin(scales.dequantize_scale, ","));
+  return os;
+}
+
+// Returns true if the Tensorflow node is a quantize and dequantize operation.
+bool IsQuantizeAndDequantizeOp(const Node*);
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_OPS_QUANTIZATION_OPS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc
new file mode 100644
index 00000000000..578fae3577b
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops_test.cc
@@ -0,0 +1,619 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" + +#include +#include +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/linalg_ops.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/status_matchers.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +namespace ops = ::tensorflow::ops; +using ::tensorflow::testing::StatusIs; + +// This anonymous namespace contains helper functions for instatiating small TF +// building blocks. These are used below to construct specific graph patterns +// which test end-to-end conversion of the TF graph to an explciit-precision +// enabled TensorRT network. +namespace { + +enum class ConvEpilogueType { + kNone, + kReLU, + kBatchNorm, + kReLUBatchnorm, + kBatchnormReLU +}; + +std::ostream& operator<<(std::ostream& os, ConvEpilogueType epilogue) { + switch (epilogue) { + case ConvEpilogueType::kNone: + return os << "None"; + case ConvEpilogueType::kReLU: + return os << "ReLU only"; + case ConvEpilogueType::kBatchNorm: + return os << "BatchNorm Only"; + case ConvEpilogueType::kReLUBatchnorm: + return os << "ReLU+Batchnorm"; + case ConvEpilogueType::kBatchnormReLU: + return os << "BatchNorm+ReLU"; + } +} + +std::string DebugString(ConvEpilogueType epilogue) { + std::stringstream ss; + ss << epilogue; + return ss.str(); +} + +// Adds a 2D 3x3, single channel input with specified data_format. data_format +// must be NHWC,NCHW or NHW. +ops::Placeholder AddInput(Scope scope, int input_idx, + const std::string data_format, + std::array size_chw = {1, 3, 3}) { + PartialTensorShape input_shape; + if (data_format == "NCHW") { + input_shape = + PartialTensorShape({1, size_chw[0], size_chw[1], size_chw[2]}); + } else if (data_format == "NHWC") { + input_shape = + PartialTensorShape({1, size_chw[1], size_chw[2], size_chw[0]}); + } else if (data_format == "NHW") { + input_shape = PartialTensorShape({1, size_chw[1], size_chw[2]}); + } else { + LOG(FATAL) << "Unknown input shape type " << data_format; + } + auto input_attrs = ops::Placeholder::Attrs().Shape(input_shape); + return ops::Placeholder(scope.WithOpName(absl::StrCat("input_", input_idx)), + DT_FLOAT, input_attrs); +} + +// Adds QDQ op with min = -1.0f, max = 1.0f. +Output AddQDQV2(Scope scope, Input input) { + // Create scaling factors. 
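+  // The symmetric range [-1, 1] below maps, for 8 signed bits, to
+  // quantize_scale = 127 and dequantize_scale = 1/127 in explicit-precision
+  // conversion.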
+ auto input_min = + ops::Const(scope.WithOpName("in_min"), -1.0f, TensorShape{}); + auto input_max = + ops::Const(scope.WithOpName("in_max"), 1.0f, TensorShape{}); + return ops::QuantizeAndDequantizeV2(scope.WithOpName("qdq"), input, input_min, + input_max); +} + +Output AddOutput(Scope scope, Output input, int idx, bool add_qdq) { + Output out = input; + if (add_qdq) { + out = AddQDQV2(scope, input); + } + return ops::Identity(scope.WithOpName(StrCat("output_", idx)), out); +} + +// Adds a 3x3x1x1 Conv2D op and optional bias weights, followed by ReLU +// activation. Puts QDQ between (weights, op). Puts QDQ between (input, op) +// when qdq_on_output=false. Otherwise, puts QDQ between (op, output). +Output AddConv2D(Scope scope, Input input, int in_channels, int out_channels, + std::array filter_size = {1, 1}, + std::array stride = {1, 1}, + const std::string& data_format = "NCHW", bool with_bias = true, + ConvEpilogueType epilogue = ConvEpilogueType::kBatchnormReLU, + bool qdq_on_output = false) { + // Create 3x3 non-quantized weights weights. + auto weights_const = ops::Const( + scope.WithOpName("weights"), 1.0f, + TensorShape({filter_size[0], filter_size[1], in_channels, out_channels})); + + // Add QDQ to input if we don't add QDQ to output. + auto conv_input = + !qdq_on_output ? AddQDQV2(scope.WithOpName("qdq_input"), input) : input; + + Output result = ops::Conv2D( + scope.WithOpName("conv2d"), conv_input, AddQDQV2(scope, weights_const), + /*strides=*/{1, 1, 1, 1}, + /*padding=*/"SAME", ops::Conv2D::Attrs().DataFormat(data_format)); + + if (with_bias) { + auto bias_const = ops::Const(scope.WithOpName("bias_weights"), 1.0f, + TensorShape({ + out_channels, + })); + result = ops::BiasAdd(scope.WithOpName("bias"), result, bias_const, + ops::BiasAdd::Attrs().DataFormat(data_format)); + } + + auto add_bn = [scope, data_format](Input input, + const int channels) -> Output { + TensorShape constant_shape = TensorShape({channels}); + auto bn_scale = + ops::Const(scope.WithOpName("bn_scale"), 1.0f, constant_shape); + auto bn_offset = + ops::Const(scope.WithOpName("bn_offset"), 1.0f, constant_shape); + auto bn_mean = + ops::Const(scope.WithOpName("bn_mean"), 0.1f, TensorShape({channels})); + auto bn_var = + ops::Const(scope.WithOpName("bn_var"), 1.0f, TensorShape({channels})); + Input conv_bn_input = IS_TRT_VERSION_GE(8, 0, 1, 0) + ? input + : AddQDQV2(scope.WithOpName("qdq_input"), input); + return ops::FusedBatchNormV3( + scope.WithOpName("bn"), conv_bn_input, bn_scale, bn_offset, + bn_mean, bn_var, + ops::FusedBatchNormV3::Attrs().IsTraining(false).DataFormat( + data_format)) + .y; + }; + + switch (epilogue) { + case ConvEpilogueType::kBatchNorm: { + result = add_bn(result, out_channels); + break; + } + case ConvEpilogueType::kReLU: { + result = ops::Relu(scope.WithOpName("relu"), result); + break; + } + case ConvEpilogueType::kReLUBatchnorm: { + result = ops::Relu(scope.WithOpName("relu"), result); + result = add_bn(result, out_channels); + break; + } + case ConvEpilogueType::kBatchnormReLU: { + result = add_bn(result, out_channels); + result = ops::Relu(scope.WithOpName("relu"), result); + break; + } + case ConvEpilogueType::kNone: + break; + } + + if (qdq_on_output) { + result = AddQDQV2(scope.WithOpName("qdq_out"), result); + } + return result; +} + +// Adds a batch matrix multiplication V2 operation, which commonly appears in +// fully connected layers. Puts QDQ between (input, op) as well as between +// (weights, op). 
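+// The resulting pattern is BatchMatMulV2(QDQ(input), QDQ(weights)) with a
+// constant 3x3 all-ones weights matrix, matching the {1, 3, 3} "NHW" test
+// input used by TestMatMulBasic.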
+ops::BatchMatMulV2 AddMatMul(Scope scope, const std::string& name, + Input input) { + // Add QDQ to input. + auto input_qdq = AddQDQV2(scope, input); + + // Add 3x3 weights with QDQ. + auto weights_const = + ops::Const(scope.WithOpName(name + "_weights"), + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, + TensorShape({3, 3})); + auto weights_qdq = AddQDQV2(scope.WithOpName("weights_qdq"), weights_const); + return ops::BatchMatMulV2(scope.WithOpName(name), input_qdq, weights_qdq); +} +} // namespace + +struct QDQTestOptions { + bool conv_has_bias{true}; + + // TRT7 may have issues with optimizing redundant transpose operations between + // QDQ and Op introduced by TF-TRT when format is not "NCHW". This allows to + // test both cases as well as WAR feasibility. + std::string data_format{"NCHW"}; + + // Tests whether placing QDQ on outputs rather than inputs is handled + // correctly. + bool qdq_on_output{false}; + + // Option for testing whether TRT build succeeds without a final QDQ before + // the output. + bool final_qdq{true}; + + // Whether to add activations (relu) to conv operations + ConvEpilogueType conv_epilogue; + + // TF-TRT API Options + TfTrtConversionParams conversion_params{}; +}; + +std::ostream& operator<<(std::ostream& os, const QDQTestOptions opts) { + return os << absl::StrCat( + "QDQTestOptions(conv_has_bias=", + static_cast(opts.conv_has_bias), + ", qdq_on_output=", static_cast(opts.qdq_on_output), + ", data_format=", opts.data_format, + ", conv_epilogue=", DebugString(opts.conv_epilogue), + ", final_qdq=", opts.final_qdq, ")"); +} + +std::vector EnumerateQDQTestOptions() { + std::vector result; + for (const absl::string_view data_format : {"NCHW", "NHWC"}) { + for (auto use_bias : {true, false}) { + for (auto qdq_on_output : {false, true}) { + // For now, always append a QDQ before output. For small single-op tests + // (besides QDQ), TensorRT7 sometimes has trouble. + for (auto final_qdq : {true, false}) { + for (auto conv_epilogue : + {ConvEpilogueType::kReLU, ConvEpilogueType::kNone, + ConvEpilogueType::kBatchnormReLU}) { + // Currently batch norm converter only supports NHWC. + if (data_format == "NHWC" && + (conv_epilogue == ConvEpilogueType::kBatchnormReLU || + conv_epilogue == ConvEpilogueType::kBatchNorm || + conv_epilogue == ConvEpilogueType::kBatchnormReLU)) { + continue; + } + QDQTestOptions opts{}; + opts.conv_has_bias = use_bias; + opts.data_format = data_format; + opts.qdq_on_output = qdq_on_output; + opts.final_qdq = final_qdq; + opts.conv_epilogue = conv_epilogue; + result.push_back(opts); + } + } + } + } + } + return result; +} + +// This class is a test fixture for running graph conversion and evaluating +// numerical results. 
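+// It wraps the graph in a MetaGraphDef with a "serving_default" signature,
+// runs shape inference, converts with ConvertAndBuild at INT8 precision
+// without calibration, and checks that exactly one static TRTEngineOp with a
+// non-empty serialized segment is produced.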
+class QDQExplicitTest : public ::testing::Test, + public ::testing::WithParamInterface { + public: + static ::stream_executor::port::StatusOr GetShape( + const std::string& name, const GraphShapeInfo& shapes) { + TRT_ENSURE(shapes.find(name) != shapes.end()); + TRT_ENSURE(shapes.at(name).size() == 1); + return shapes.at(name)[0].shape; + } + + ::stream_executor::port::StatusOr GetModel( + const GraphDef& graph_def, const std::vector& inputs, + const std::vector& outputs, + const GraphShapeInfo& shapes) { + TRT_ENSURE(!inputs.empty()); + TRT_ENSURE(!outputs.empty()); + + MetaGraphDef out; + out.mutable_graph_def()->CopyFrom(graph_def); + + SignatureDef signature_def; + auto& mutable_inputs = *signature_def.mutable_inputs(); + for (int i = 0; i < inputs.size(); i++) { + std::string input_name = inputs[i]->name(); + auto& input = mutable_inputs[input_name]; + input.set_name(input_name); + input.set_dtype(DT_FLOAT); + TRT_ENSURE(shapes.find(input_name) != shapes.end()); + TRT_ENSURE(shapes.at(input_name).size() == 1); + PartialTensorShape input_shape = shapes.at(input_name)[0].shape; + input_shape.AsProto(input.mutable_tensor_shape()); + } + + auto& mutable_outputs = *signature_def.mutable_outputs(); + for (int i = 0; i < outputs.size(); i++) { + std::string output_name = outputs[i]->name(); + auto& output = mutable_outputs[output_name]; + output.set_name(output_name); + output.set_dtype(DT_FLOAT); + TRT_ENSURE(shapes.find(output_name) != shapes.end()); + TRT_ENSURE(shapes.at(output_name).size() == 1); + PartialTensorShape output_shape = shapes.at(output_name)[0].shape; + output_shape.AsProto(output.mutable_tensor_shape()); + } + + (*out.mutable_signature_def())["serving_default"] = signature_def; + return out; + } + + // Confirms that we have a TRT node with the correct attributes. + static Status CheckTrtNode(const GraphDef& converted_graph_def) { + int n_trt_ops = 0; + string op_name{"TRTEngineOp"}; + for (const auto& node : converted_graph_def.node()) { + if (op_name == node.op()) { + n_trt_ops++; + const auto& attr = node.attr(); + TRT_ENSURE(attr.at("static_engine").b()); + VLOG(2) << "Found serialized segment with size " + << attr.at("serialized_segment").s().size(); + TRT_ENSURE(!attr.at("serialized_segment").s().empty()); + } + } + TRT_ENSURE(n_trt_ops == 1); + return Status::OK(); + } + + Status ConvertAndRun(Scope* scope) { + std::vector inputs; + std::vector outputs; + + GraphDef gdef; + TF_RETURN_IF_ERROR(scope->ToGraphDef(&gdef)); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_RETURN_IF_ERROR(scope->ToGraph(graph.get())); + + GraphShapeInfo shape_info; + TF_RETURN_IF_ERROR(InferShapes(graph.get(), /*arg_shapes=*/{}, + /*fnlib_def=*/nullptr, &shape_info)); + + for (const NodeDef& node : gdef.node()) { + if (absl::StartsWith(node.name(), "input_")) { + inputs.push_back(&node); + } else if (absl::StartsWith(node.name(), "output_")) { + outputs.push_back(&node); + } + } + + ::stream_executor::port::StatusOr meta_graph_def = + GetModel(gdef, inputs, outputs, shape_info); + TRT_ENSURE_OK(meta_graph_def); + + // Create a list of input tensors, they will be used to build the engines. 
+ std::vector input_tensors; + std::vector input_names; + for (const auto& input : inputs) { + input_names.push_back(input->name()); + + ::stream_executor::port::StatusOr input_shape = + GetShape(input->name(), shape_info); + TRT_ENSURE_OK(input_shape); + + TensorShape shape; + input_shape.ValueOrDie().AsTensorShape(&shape); + Tensor tensor(DT_FLOAT, shape); + test::FillIota(&tensor, 1.0f); + input_tensors.push_back(tensor); + } + + std::vector output_names; + for (const auto& output : outputs) { + output_names.push_back(output->name()); + } + + TfTrtConversionParams conversion_params; + conversion_params.allow_build_at_runtime = true; + conversion_params.precision_mode = TrtPrecisionMode::INT8; + conversion_params.use_calibration = false; + conversion_params.convert_to_static_engine = true; + TRT_ENSURE(input_names.size() == input_tensors.size()); + ::stream_executor::port::StatusOr converted_gdef = + tensorrt::ConvertAndBuild(meta_graph_def.ValueOrDie().graph_def(), + input_names, output_names, {input_tensors}, + conversion_params); + TRT_ENSURE_OK(converted_gdef); + return CheckTrtNode(converted_gdef.ValueOrDie()); + } + + protected: + TfTrtConversionParams params_; + TrtUniquePtrType engine_; +}; + +class TestQDQSuite : public QDQExplicitTest {}; + +#define EXPECT_QDQ_ON_OUTPUT_FAILURE(params, scope) \ + if ((params).qdq_on_output) { \ + EXPECT_THAT(ConvertAndRun(&(scope)), StatusIs(error::INTERNAL)); \ + return; \ + } +#define EXPECT_NO_FINAL_QDQ_FAILURE(params, scope) \ + if (!(params).final_qdq) { \ + EXPECT_THAT(ConvertAndRun(&(scope)), StatusIs(error::INTERNAL)); \ + return; \ + } + +#define EXPECT_BUILD_OK(scope) TF_EXPECT_OK(ConvertAndRun(&(scope))) + +#define POLICY_TRT7(params, scope) \ + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { \ + EXPECT_QDQ_ON_OUTPUT_FAILURE(params, scope); \ + EXPECT_NO_FINAL_QDQ_FAILURE(params, scope); \ + EXPECT_BUILD_OK(scope); \ + } + +#define POLICY_TRT8(params, scope) \ + if (IS_TRT_VERSION_GE(8, 0, 0, 0)) { \ + if (((params).conv_epilogue == ConvEpilogueType::kBatchNorm || \ + (params).conv_epilogue == ConvEpilogueType::kBatchnormReLU || \ + (params).conv_epilogue == ConvEpilogueType::kReLUBatchnorm) && \ + (params).data_format == "NHWC") { \ + EXPECT_THAT(ConvertAndRun(&(scope)), StatusIs(error::UNIMPLEMENTED)); \ + return; \ + } \ + EXPECT_BUILD_OK(scope); \ + } + +#define SKIP_TRT7(x) \ + if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && (x)) { \ + GTEST_SKIP(); \ + } + +// Tests single convolution operation conversion. +TEST_P(TestQDQSuite, TestConv2DBasic) { + SKIP_TRT7(GetParam().qdq_on_output); + SKIP_TRT7(GetParam().data_format != "NCHW"); + SKIP_TRT7(!GetParam().final_qdq); + + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, GetParam().data_format, {3, 28, 28}); + + Output out = input; + const int num_conv = 1; + std::array in_channels = {3, 16}; + std::array out_channels = {16, 32}; + for (int i = 0; i < num_conv; i++) { + out = AddConv2D(scope.WithOpName(absl::StrCat("conv_", i)), out, + in_channels[i], out_channels[i], /*filter_size=*/{3, 3}, + /*stride=*/{1, 1}, GetParam().data_format, + GetParam().conv_has_bias, GetParam().conv_epilogue, + GetParam().qdq_on_output); + } + out = AddOutput(scope, out, 0, GetParam().final_qdq); + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +// Tests single convolution operation conversion. +TEST_P(TestQDQSuite, TestMatMulBasic) { + // Some param's don't apply, so pick one combination and skip otherwise. 
+ if (GetParam().data_format != "NCHW" || !GetParam().conv_has_bias || + GetParam().qdq_on_output || + GetParam().conv_epilogue != ConvEpilogueType::kReLU) { + GTEST_SKIP(); + } + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, "NHW"); + auto matmul_op = AddMatMul(scope, "matmul", input); + auto out = AddOutput(scope, matmul_op, 0, GetParam().final_qdq); + + TF_EXPECT_OK(ConvertAndRun(&scope)); +} + +// A single input goes through two different Conv2D. Outputs of Conv2D are +// added together, with QQQ on both branches of ADD. +TEST_P(TestQDQSuite, AddBothBranchesQDQConvSingleInput) { + SKIP_TRT7(!GetParam().final_qdq); + SKIP_TRT7(GetParam().data_format != "NCHW"); + + Scope scope = Scope::NewRootScope(); + auto input1 = AddInput(scope, 0, GetParam().data_format, + /*size_chw=*/{3, 28, 28}); + + auto conv1 = + AddConv2D(scope, input1, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/{1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + + auto conv2 = + AddConv2D(scope, input1, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/ + {1, 1}, GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + + // In the case of "qdq on output", we don't need to add QDQ. + auto add = + ops::Add(scope.WithOpName("add"), + !GetParam().qdq_on_output ? AddQDQV2(scope, conv1) : conv1, + !GetParam().qdq_on_output ? AddQDQV2(scope, conv2) : conv2); + + auto conv3 = + AddConv2D(scope.WithOpName("conv3"), conv2, 16, 16, {1, 1}, {1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + + auto out = + AddOutput(scope.WithOpName("output"), conv3, 0, GetParam().final_qdq); + + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +// Tests adding a single tensor to itself, with QQQ on both branches of ADD. +TEST_P(TestQDQSuite, AddBothBranchesQDQMultipleInput) { + // TRT7 QDQ optimizer makes single-input restriction. + SKIP_TRT7(true); + + Scope scope = Scope::NewRootScope(); + auto input1 = AddInput(scope, 0, GetParam().data_format); + auto input2 = AddInput(scope, 1, GetParam().data_format); + auto add = + ops::Add(scope.WithOpName("add"), + !GetParam().qdq_on_output ? AddQDQV2(scope, input1) : input1, + !GetParam().qdq_on_output ? 
AddQDQV2(scope, input2) : input2); + auto output = AddOutput(scope, add, 0, true); + TF_EXPECT_OK(ConvertAndRun(&scope)); +} + +// Tests Conv-MaxPool combination +TEST_P(TestQDQSuite, TestConvMaxpool) { + SKIP_TRT7(!GetParam().final_qdq); + SKIP_TRT7(GetParam().data_format != "NCHW"); + + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, GetParam().data_format, + /*size_chw=*/{3, 28, 28}); + auto conv1 = + AddConv2D(scope, input, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/{1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + ops::MaxPool maxpool = + ops::MaxPool(scope.WithOpName("maxpool"), + AddQDQV2(scope.WithOpName("mp_qdq_in"), conv1), {1, 1, 1, 1}, + {1, 1, 1, 1}, "SAME", + ops::MaxPool::Attrs().DataFormat(GetParam().data_format)); + auto output = + AddOutput(scope.WithOpName("output"), maxpool, 0, GetParam().final_qdq); + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +// Tests QDQ(Conv(QDQ(MaxPool(Conv(QDQ(x)))))) +TEST_P(TestQDQSuite, TestConvMaxpoolConv) { + SKIP_TRT7(!GetParam().final_qdq); + SKIP_TRT7(GetParam().data_format != "NCHW"); + + Scope scope = Scope::NewRootScope(); + auto input = AddInput(scope, 0, GetParam().data_format, + /*size_chw=*/{3, 28, 28}); + auto conv1 = + AddConv2D(scope, input, 3, 16, /*filter_size=*/{3, 3}, /*stride=*/{1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + ops::MaxPool maxpool = + ops::MaxPool(scope.WithOpName("maxpool"), + AddQDQV2(scope.WithOpName("mp_qdq_in"), conv1), {1, 1, 1, 1}, + {1, 1, 1, 1}, "SAME", + ops::MaxPool::Attrs().DataFormat(GetParam().data_format)); + auto conv2 = AddConv2D(scope, maxpool, 16, 16, {3, 3}, {1, 1}, + GetParam().data_format, GetParam().conv_has_bias, + GetParam().conv_epilogue, GetParam().qdq_on_output); + auto output = + AddOutput(scope.WithOpName("out"), conv2, 0, GetParam().final_qdq); + POLICY_TRT7(GetParam(), scope); + POLICY_TRT8(GetParam(), scope); +} + +INSTANTIATE_TEST_SUITE_P(TestQDQSuiteInst, TestQDQSuite, + ::testing::ValuesIn(EnumerateQDQTestOptions())); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // IS_TRT_VERSION_GE(8, 0, 0, 0) +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc new file mode 100644 index 00000000000..0e6736f3cf9 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc @@ -0,0 +1,220 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +#if IS_TRT_VERSION_GE(8, 2, 0, 0) +/* The ConvertSelectV2 is working only for cond_input passed as a boolean + * tensor, which could be created only for TRT >= 8.2. + */ +class ConvertSelectBase : public OpConverterBase { + public: + explicit ConvertSelectBase(const OpConverterParams* params, + const std::string& layer_name) + : OpConverterBase( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}), + layer_name_(layer_name) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("cond", TrtInputArg::kBoth), + InputArgSpec::Create("then", TrtInputArg::kBoth), + InputArgSpec::Create("else", TrtInputArg::kBoth)}; + } + + Status Validate() { + TF_RETURN_IF_ERROR(NotSupportedInImplicitBatch()); + + const auto& params = *this->params_; + const auto& inputs = params.inputs; + const auto& i_cond = inputs.at(0); + const auto& node = params.node_def; + TF_RETURN_IF_ERROR( + check_type(i_cond.TrtDType(), nvinfer1::DataType::kBOOL, node)); + + if (i_cond.is_weights()) { + // According to + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#constant-layer + // Boolean weights are not supported in TRT version 8.4. + return errors::InvalidArgument(bool_weight_error_msg(node)); + } + + const auto& i_then = inputs.at(1); + const auto& i_else = inputs.at(2); + const auto type_then = i_then.TrtDType(); + const auto type_else = i_else.TrtDType(); + if (type_then != type_else && (type_then == nvinfer1::DataType::kINT32 || + type_else == nvinfer1::DataType::kINT32)) { + // Both or none of (type_then, type_else) should be equal to kINT32. + return errors::InvalidArgument( + then_else_dtypes_error_msg(type_then, type_else, node)); + } + + bool cond_is_vector = false; + const auto& shape_cond = i_cond.GetTrtDims(); + if (layer_name_ == "select") { + const auto& shape_then = i_then.GetTrtDims(); + const auto& shape_else = i_else.GetTrtDims(); + TF_RETURN_IF_ERROR(compare_shapes(shape_then, shape_else)); + TF_RETURN_IF_ERROR( + compare_shapes(shape_cond, shape_then, &cond_is_vector)); + } + + nvinfer1::Dims cond_dims(shape_cond); + if (cond_is_vector) { + cond_dims.nbDims = i_then.GetTrtDims().nbDims; + const std::vector ones(cond_dims.d[0], 1); + std::copy(ones.begin(), ones.end(), cond_dims.d + 1); + } + + const TRT_TensorOrWeights new_cond(nvinfer1::DataType::kBOOL, cond_dims, + i_cond.batch_size()); + nvinfer1::Dims broadcasted_dims[3]; + for (int i = 1; i < 3; i++) { + TF_RETURN_IF_ERROR(GetTrtBroadcastShape(new_cond, inputs.at(i), true, + false, broadcasted_dims, + broadcasted_dims + i)); + } + + for (int i = 0; i < tensor_.size(); i++) { + // This will also convert constants to tensors. 
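+      // (ApplyBroadcast reshapes each wrapped input to the common rank in
+      // broadcasted_dims computed above; the assumption is that TensorRT's
+      // ISelectLayer, added later in Convert(), requires rank-aligned
+      // cond/then/else inputs and handles the remaining unit-dimension
+      // broadcasting itself.)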
+ tensor_[i] = std::make_unique(inputs.at(i)); + TF_RETURN_IF_ERROR( + ApplyBroadcast(tensor_[i], broadcasted_dims[i], this->params_, 0)); + } + + return Status::OK(); + } + + Status Convert() { + const auto& params = *this->params_; + auto* converter = params.converter; + + nvinfer1::ISelectLayer* select_layer = converter->network()->addSelect( + *tensor_[0].get()->as_tensor(params_)->trt_tensor(), // cond_tensor + *tensor_[1].get()->as_tensor(params_)->trt_tensor(), // then_tensor + *tensor_[2].get()->as_tensor(params_)->trt_tensor() // else_tensor + ); + + converter->SetLayerName(select_layer, params.node_def.name(), layer_name_); + AddOutput(TRT_TensorOrWeights(select_layer->getOutput(0))); + return Status::OK(); + } + + private: + Status compare_shapes(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + bool* cond_is_vector = nullptr) const { + const bool then_vs_else = cond_is_vector == nullptr; + bool same_shapes = shape1 == shape2; + if (!same_shapes && shape1.nbDims == shape2.nbDims) { + // We can't check size equivalent when dynamic shapes are involved. + // In this case, the two shapes should be equal at runtime. Therefore, + // the shapes still should be considered as equal if at least one of + // them is a tensor with dynamic shape, + same_shapes = DynamicShapeInput(this->params_->inputs, then_vs_else); + } + if (!same_shapes) { + if (then_vs_else || !(*cond_is_vector = (shape1.nbDims == 1 && + shape1.d[0] == shape2.d[0]))) { + const auto err = input_shapes_error_msg( + shape1, shape2, this->params_->node_def, then_vs_else); + return errors::InvalidArgument(err); + } + } + return Status::OK(); + } + + bool DynamicShapeInput(const std::vector& inputs, + bool then_vs_else) const { + const int idx = then_vs_else ? 1 : 0; + for (int i = 0; i < 2; ++i) { + const auto& input = inputs.at(i + idx); + if (input.is_tensor() && !HasStaticShape(input.GetTrtDims())) { + return true; + } + } + return false; + } + + std::array, 3> tensor_; + const std::string layer_name_; +}; + +class ConvertSelect : public ConvertSelectBase { + public: + explicit ConvertSelect(const OpConverterParams* params) + : ConvertSelectBase(params, "select") {} +}; + +class ConvertSelectV2 : public ConvertSelectBase { + public: + explicit ConvertSelectV2(const OpConverterParams* params) + : ConvertSelectBase(params, "selectv2") {} +}; + +std::string op_node_info(const NodeDef& node) { + return " of the '" + node.op() + "' operation at the node '" + node.name() + + "' "; +} + +std::string bool_weight_error_msg(const NodeDef& node) { + return "The boolean parameter '" + node.input(0) + "'" + op_node_info(node) + + "cannot be passed as a weight in TRT version 8.4."; +} + +std::string then_else_dtypes_error_msg(nvinfer1::DataType type_then, + nvinfer1::DataType type_else, + const NodeDef& node) { + return "DataTypes (" + DebugString(type_then) + ", " + + DebugString(type_else) + ") of parameters (" + node.input(1) + ", " + + node.input(2) + ")" + op_node_info(node) + "are incompatible."; +} + +std::string input_shapes_error_msg(const nvinfer1::Dims& shape1, + const nvinfer1::Dims& shape2, + const NodeDef& node, bool then_vs_else) { + const std::string& param_names = + then_vs_else ? 
"'then' and 'else'" : "'cond' and 'then'"; + std::string error_msg = "The shapes of the " + param_names + " parameters" + + op_node_info(node) + "must be the same"; + if (!then_vs_else) { + error_msg += + " OR 'cond' must be a vector with N elements, " + "where N is a batch size (the first shape dimension for 'then')"; + } + return error_msg + ", got " + DebugString(shape1) + " vs. " + + DebugString(shape2) + "."; +} + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "Select"); +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "SelectV2"); +#endif + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc new file mode 100644 index 00000000000..dcbf992b08e --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/softmax.cc @@ -0,0 +1,81 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class ConvertSoftmax : public OpConverterBase { + public: + explicit ConvertSoftmax(const OpConverterParams *params) + : OpConverterBase(params) {} + + static constexpr std::array AllowedDataTypes() { + return {DataType::DT_FLOAT, DataType::DT_HALF}; + } + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("logits", TrtInputArg::kTensor)}; + } + + Status Validate() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + if (!num_trt_dims && params.use_implicit_batch) { + return errors::InvalidArgument( + "TensorRT Softmax cannot apply on the batch dimension"); + } + return Status::OK(); + } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + const auto &node_def = params.node_def; + + ITensorProxyPtr logits_tensor = inputs.at(0).tensor(); + const int num_trt_dims = logits_tensor->getDimensions().nbDims; + + // Perform Softmax operation: + nvinfer1::ISoftMaxLayer *layer = + params.converter->network()->addSoftMax(*logits_tensor->trt_tensor()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params.converter->SetLayerName(layer, node_def); + // Tensorflow SoftMax applies softmax operation over the last dimension. 
+ layer->setAxes(1 << (num_trt_dims - 1)); + + ITensorProxyPtr output_tensor = layer->getOutput(0); + params.outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + "Softmax"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc new file mode 100644 index 00000000000..84961670b33 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/tile.cc @@ -0,0 +1,208 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +class ConvertTile : public OpConverterBase { + public: + explicit ConvertTile(const OpConverterParams *params) + : OpConverterBase( + params, + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input_tensor", TrtInputArg::kBoth), + InputArgSpec::Create("weight", TrtInputArg::kBoth)}; + } + + Status Validate() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + + const auto &repl = inputs.at(1); + if (params.use_implicit_batch && repl.is_tensor()) { + return errors::InvalidArgument( + "Conversion for Tile is not implemented for multipliers " + "passed as a tensor in implicit batch mode."); + } + + nvinfer1::DataType dtype; + const int *multiplies; + if (repl.is_weights()) { + TFTRT_CHECK_SHAPE_TENSOR(repl.weights().GetTensor()); + dtype = repl.weights().TrtDType(); + multiplies = repl.weights().GetPointer(); + } else { + dtype = repl.tensor()->getType(); + multiplies = nullptr; + } + + const auto &node = params.node_def; + TF_RETURN_IF_ERROR(check_type(dtype, nvinfer1::DataType::kINT32, node, 1)); + + const auto dims = inputs.at(0).GetTrtDims(); + const auto nb_dims = + dims.nbDims + + (params.use_implicit_batch && inputs.at(0).is_tensor() ? 
1 : 0); + if (multiplies) { + const int mult_numb = repl.weights().count(); + if (mult_numb != nb_dims) { + return errors::InvalidArgument( + "The length of the replication vector (", mult_numb, + ") of the Tile operation in '", node.name(), + "' is expected to be equal to the rank of the input vector (", + nb_dims, ")."); + } + + if (std::any_of(multiplies, multiplies + nb_dims, + [](int i) { return i <= 0; })) { + const auto &mul = absl::StrJoin(multiplies, multiplies + nb_dims, ", "); + return errors::InvalidArgument( + "All replications of the Tile operation in '", node.name(), + "' should be positive, got (", mul, ")."); + } + + if (params.use_implicit_batch && multiplies[0] > 1) { + return errors::Unimplemented( + "The Tile operation along the batch dimension in '", node.name(), + "' is not implemented."); + } + } else { + const auto &repl_dims = repl.GetTrtDims(); + if (repl_dims.nbDims != 1) { + return errors::InvalidArgument( + "When replications are defined as a tensor, that tensor must be " + "1-dimensional. Got ", + repl_dims.nbDims, "-dimensional tensor."); + } + + // Check the number of elements in multiplyer for tensors with non-dynamic + // shape + if (repl_dims.d[0] >= 0 && repl_dims.d[0] != nb_dims) { + return errors::InvalidArgument( + "When replications are defined as a tensor, " + "the number of its elements (", + repl_dims.d[0], ") must be equal to the rank of the input tensor (", + nb_dims, ")."); + } + } + + return Status::OK(); + } + + Status Convert() { + const auto ¶ms = *this->params_; + const auto &inputs = params.inputs; + auto *converter = params.converter; + auto *network = converter->network(); + const auto &tensor = inputs.at(0); + const auto &replics = inputs.at(1); + const auto dims = tensor.GetTrtDims(); + const auto nb_dims = dims.nbDims; + + nvinfer1::Dims output_size{nb_dims, {1}}; + bool dynamic_flag = replics.is_tensor() || !HasStaticShape(dims); + + if (!dynamic_flag) { + // If input0 is a tensor, and we're in implicit batch mode, then we need + // dim_offset. + const auto dim_offset = + params.use_implicit_batch && tensor.is_tensor() ? 
1 : 0; + const auto *input_size = dims.d; + const int *pReplics = replics.weights().GetPointer() + dim_offset; + for (int i = 0; i < nb_dims; i++) + output_size.d[i] = pReplics[i] * input_size[i]; + } + + ::stream_executor::port::StatusOr builder; + if (tensor.is_weights() || (dynamic_flag && replics.is_weights())) { + builder = + TRTNetworkBuilder::Create(converter->network(), params.weight_store); + TRT_ENSURE_OK(builder); + } + + ITensorProxyPtr input_tensor; + if (tensor.is_weights()) { + ::stream_executor::port::StatusOr + weights_const = builder.ValueOrDie().WeightsToConstant( + tensor.weights().GetTrtWeights(), dims); + TRT_ENSURE_PTR_OK(weights_const); + input_tensor = weights_const.ValueOrDie()->getOutput(0); + } else { + input_tensor = tensor.tensor(); + } + + auto &input_trt_tensor = *input_tensor->trt_tensor(); + nvinfer1::ITensor *target_shape = nullptr; + if (dynamic_flag) { + nvinfer1::ITensor *mult; + if (replics.is_weights()) { + ::stream_executor::port::StatusOr + weights_const = builder.ValueOrDie().WeightsToConstant( + replics.weights().GetTrtWeights(), replics.GetTrtDims()); + TRT_ENSURE_PTR_OK(weights_const); + mult = weights_const.ValueOrDie()->getOutput(0); + } else { + const ITensorProxyPtr multiplies = replics.tensor()->trt_tensor(); + mult = multiplies->trt_tensor(); + } + + nvinfer1::ITensor *shape = + network->addShape(input_trt_tensor)->getOutput(0); + target_shape = network + ->addElementWise(*shape, *mult, + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + } + + nvinfer1::Dims start{nb_dims, {}}; + DimsAdapter stride(std::vector(nb_dims, 1)); + auto layer = network->addSlice(input_trt_tensor, start, output_size, + stride.AsTrtDims()); + layer->setMode(nvinfer1::SliceMode::kWRAP); + if (target_shape) layer->setInput(2, *target_shape); + + converter->SetLayerName(layer, params.node_def.name(), "to_tile"); + ITensorProxyPtr output_tensor = layer->getOutput(0); + if (tensor.is_weights() && params.use_implicit_batch) { + // Reshape output tensor by removing first dimension. + DimsAdapter adap(output_tensor->getDimensions()); + TF_RETURN_IF_ERROR(adap.RemoveBatchDimension()); + + TF_RETURN_IF_ERROR(PrepareTensorForShape( + params.converter, TRT_TensorOrWeights(output_tensor), + adap.AsTrtDims(), false, &output_tensor, params.node_def)); + } + + AddOutput(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), "Tile"); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc new file mode 100644 index 00000000000..45bade296f6 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/unary_ops.cc @@ -0,0 +1,251 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +const UnaryOperationMapType* UnaryOperationMap() { + static auto* const m = new UnaryOperationMapType({ + {"Exp", nvinfer1::UnaryOperation::kEXP}, + {"Log", nvinfer1::UnaryOperation::kLOG}, + {"Sqrt", nvinfer1::UnaryOperation::kSQRT}, + {"Rsqrt", nvinfer1::UnaryOperation::kSQRT}, + {"Reciprocal", nvinfer1::UnaryOperation::kRECIP}, + {"Abs", nvinfer1::UnaryOperation::kABS}, + {"Neg", nvinfer1::UnaryOperation::kNEG}, + {"Sin", nvinfer1::UnaryOperation::kSIN}, + {"Cos", nvinfer1::UnaryOperation::kCOS}, + {"Tan", nvinfer1::UnaryOperation::kTAN}, + {"Sinh", nvinfer1::UnaryOperation::kSINH}, + {"Cosh", nvinfer1::UnaryOperation::kCOSH}, + {"Asin", nvinfer1::UnaryOperation::kASIN}, + {"Acos", nvinfer1::UnaryOperation::kACOS}, + {"Atan", nvinfer1::UnaryOperation::kATAN}, + {"Asinh", nvinfer1::UnaryOperation::kASINH}, + {"Acosh", nvinfer1::UnaryOperation::kACOSH}, + {"Atanh", nvinfer1::UnaryOperation::kATANH}, + {"Ceil", nvinfer1::UnaryOperation::kCEIL}, + {"Floor", nvinfer1::UnaryOperation::kFLOOR}, + {"Erf", nvinfer1::UnaryOperation::kERF}, +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + {"Round", nvinfer1::UnaryOperation::kROUND}, + {"Sign", nvinfer1::UnaryOperation::kSIGN}, +#endif + }); + return m; +} + +const UnaryOperationMapType* UnaryBooleanOperationMap() { + static auto* const m = new UnaryOperationMapType({ + {"LogicalNot", nvinfer1::UnaryOperation::kNOT}, + }); + return m; +} + +const ActivationTypeMapType* ActivationTypeMap() { + static auto* const m = new ActivationTypeMapType({ + {"LeakyRelu", nvinfer1::ActivationType::kLEAKY_RELU}, + {"Relu", nvinfer1::ActivationType::kRELU}, + {"Relu6", nvinfer1::ActivationType::kCLIP}, + {"Sigmoid", nvinfer1::ActivationType::kSIGMOID}, + {"Tanh", nvinfer1::ActivationType::kTANH}, + {"Elu", nvinfer1::ActivationType::kELU}, + {"Selu", nvinfer1::ActivationType::kSELU}, + {"Softsign", nvinfer1::ActivationType::kSOFTSIGN}, + {"Softplus", nvinfer1::ActivationType::kSOFTPLUS}, + }); + return m; +} + +template +class ConvertUnaryImpl { + protected: + ConvertUnaryImpl(const OperationMap* pOperMap) : pOperMap_(pOperMap) {} + + Status ValidateImpl(const OpConverterParams& params, + const std::vector& not_supported_ops = {}) { + const auto& node = params.node_def; + const auto& op = node.op(); + if (pOperMap_->find(op) == pOperMap_->end()) { + return errors::Unimplemented("Unary op: ", op, " not supported"); + } + DimsAdapter input_dims(params.inputs.at(0).GetTrtDims()); + if (!input_dims.NumDims()) { + return errors::InvalidArgument( + "At least 1 dimension is required for UNARY operation '", op, "'"); + } + + if (!not_supported_ops.empty() && params.use_implicit_batch) { + const auto& end = not_supported_ops.end(); + if (std::find(not_supported_ops.begin(), end, op) != end) { + const auto& err = + convert_not_supported_implicit(op, node.name(), "Unary"); + return errors::Unimplemented(err); + } + } + + return Status::OK(); + } + + Status ConvertImpl(const OpConverterParams& params) { + const auto& node_def = params.node_def; + auto* converter = params.converter; + const auto op_pair = pOperMap_->find(node_def.op()); + ITensorProxyPtr tensor = params.inputs.at(0).tensor(); + nvinfer1::IUnaryLayer* layer = + 
converter->network()->addUnary(*tensor->trt_tensor(), op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + converter->SetLayerName(layer, node_def); + if (node_def.op() == "Rsqrt") { + layer = converter->network()->addUnary(*layer->getOutput(0), + nvinfer1::UnaryOperation::kRECIP); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + converter->SetLayerName(layer, node_def, "recip"); + } + params.outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); + } + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("x", TrtInputArg::kTensor)}; + } + + protected: + const OperationMap* pOperMap_; +}; + +class ConvertUnary : public OpConverterBase, + protected ConvertUnaryImpl { + public: + explicit ConvertUnary(const OpConverterParams* params) + : OpConverterBase( + params, + params->node_def.op() == "Sign" + ? std::vector{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8, DT_INT32} + : std::vector{DataType::DT_FLOAT, DataType::DT_HALF, + DataType::DT_INT8}), + ConvertUnaryImpl(UnaryOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertUnaryImpl::InputSpec(); + } + + Status Validate() { return ValidateImpl(*params_, {"Sign", "Round"}); } + Status Convert() { return ConvertImpl(*params_); } +}; + +class ConvertBooleanUnary : public OpConverterBase, + public ConvertUnaryImpl { + public: + explicit ConvertBooleanUnary(const OpConverterParams* params) + : OpConverterBase(params, {DataType::DT_BOOL}), + ConvertUnaryImpl(UnaryBooleanOperationMap()) {} + + static constexpr std::array InputSpec() { + return ConvertUnaryImpl::InputSpec(); + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "LogicalNot" + input: "..." + } + */ + return ""; + } + Status Validate() { +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + return ValidateImpl(*params_, {"LogicalNot"}); +#else + return errors::Unimplemented("Boolean op: ", params_->node_def.op(), + " is not supported in TRT version < 8.2"); +#endif + } + Status Convert() { return ConvertImpl(*params_); } +}; + +class ConvertActivation : public OpConverterBase, + protected ConvertUnaryImpl { + public: + explicit ConvertActivation(const OpConverterParams* params) + : OpConverterBase(params), + ConvertUnaryImpl(ActivationTypeMap()) {} + + static constexpr std::array InputSpec() { + return std::array{ + InputArgSpec::Create("input", TrtInputArg::kTensor)}; + } + + Status Validate() { + TF_RETURN_IF_ERROR(ValidateImpl(*params_)); + const auto& node_def = params_->node_def; + if (node_def.op() == "LeakyRelu") { + return GetNodeAttr(AttrSlice(node_def), "alpha", &alpha_); + } + alpha_ = 1.0f; + return Status::OK(); + } + Status Convert() { + auto* converter = params_->converter; + const auto& inputs = params_->inputs; + const auto& node_def = params_->node_def; + const auto& op = node_def.op(); + const auto op_pair = pOperMap_->find(op); + nvinfer1::IActivationLayer* layer = converter->network()->addActivation( + *inputs.at(0).tensor()->trt_tensor(), op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + converter->SetLayerName(layer, node_def, "activation"); + ITensorProxyPtr output_tensor = layer->getOutput(0); + // Set parameters. 
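+    // (Background: IActivationLayer interprets alpha/beta per activation
+    // type. kCLIP, used here for Relu6, clamps to [alpha, beta]; kSELU
+    // applies the standard SELU alpha/scale constants; kSOFTPLUS computes
+    // alpha * log(exp(beta * x) + 1). The values set below are therefore
+    // chosen to match the TensorFlow definitions of these ops.)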
+ if (op == "Selu") { + // From tensorflow/core/kernels/relu_op_functor.h + alpha_ = 1.7580993408473768599402175208123f; + layer->setBeta(1.0507009873554804934193349852946f); + } else if (op == "Softplus") { + layer->setBeta(1.0f); + } else if (op == "Relu6") { + layer->setBeta(6.0f); + converter->ProvideQuantizationRange(&output_tensor, alpha_ = 0.0f, 6.0f); + } + layer->setAlpha(alpha_); + params_->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); + } + + private: + float alpha_ = 0.f; +}; + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + GetOperationNames(*UnaryOperationMap())); +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), + GetOperationNames(*UnaryBooleanOperationMap())); + +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + GetOperationNames(*ActivationTypeMap())); +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc new file mode 100644 index 00000000000..3df027e803f --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/variable_ops.cc @@ -0,0 +1,370 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +struct VarAttributes { + TensorShapeProto shape_proto; + TensorShape shape; + string name; + DataType dtype; + string shared_name; + string container; +}; + +template +Status ReadVariableHelper(const OpConverterParams* params, + const VarAttributes& attrs, + TRT_ShapedWeights* weights) { + Tensor tensor(attrs.dtype, attrs.shape); + auto ctx = params->converter->context(); + TRT_ENSURE(ctx != nullptr); + auto tensor_flat = tensor.flat(); + + // Clone function library runtime in order to get a mutable library + // definition to add and run a function with the variable operation. + auto lib = ctx->function_library(); + std::unique_ptr lib_def; + std::unique_ptr lib_pflr; + FunctionLibraryRuntime* lib_clone; // Not owned. 
+ TF_RETURN_IF_ERROR(lib->Clone(&lib_def, &lib_pflr, &lib_clone)); + + // Create function definition. + FunctionDef fdef; + std::vector args; + string func_name = attrs.name + "/func"; + if (is_resource) { + // Create input tensor with the resource handle. + const auto& inputs = params->inputs; + const TRT_TensorOrWeights& handle = inputs.at(0); + args.emplace_back(handle.resource()); + + fdef = FunctionDefHelper::Define( + func_name, // Name + {"in: resource"}, // Args + {absl::StrCat("out: ", DataTypeString(attrs.dtype))}, // Returns + {}, // Attr def + // Nodes + {{{attrs.name}, + "ReadVariableOp", + {"in"}, // Name of the Placeholder or VarHandleOp + {{"dtype", attrs.dtype}}}, + {{"out"}, "Identity", {attrs.name}, {{"T", attrs.dtype}}}}); + } else { + fdef = FunctionDefHelper::Define( + func_name, // Name + {}, // Args + {absl::StrCat("out: ", DataTypeString(attrs.dtype))}, // Returns + {}, // Attr def + // Nodes + {{{attrs.name}, + "VariableV2", + {}, + {{"dtype", attrs.dtype}, + {"shape", attrs.shape_proto}, + {"container", attrs.container}, + {"shared_name", attrs.shared_name}}}, + {{"out"}, "Identity", {attrs.name}, {{"T", attrs.dtype}}}}); + } + + // Add function definition to the library. + TF_RETURN_IF_ERROR(lib_def->AddFunctionDef(fdef)); + + // Instantiate function. + FunctionLibraryRuntime::Handle func_handle; + FunctionLibraryRuntime::InstantiateOptions inst_ops; + inst_ops.state_handle = ""; + inst_ops.target = ctx->device()->name(); + AttrValueMap attr_list; + TF_RETURN_IF_ERROR(lib_clone->Instantiate(func_name, AttrSlice(&attr_list), + inst_ops, &func_handle)); + + FunctionLibraryRuntime::Options opts; + opts.rendezvous = ctx->rendezvous(); + opts.cancellation_manager = ctx->cancellation_manager(); + opts.runner = ctx->runner(); + + std::vector* rets = new std::vector(); + std::unique_ptr> outputs_wrapper(rets); + + // Run the new function synchronously. + Status s_dry_run; + Notification done_dry_run; + lib_clone->Run(opts, func_handle, args, rets, + [&s_dry_run, &done_dry_run](const Status& s) { + s_dry_run = s; + done_dry_run.Notify(); + }); + done_dry_run.WaitForNotification(); + TF_RETURN_IF_ERROR(s_dry_run); + TRT_ENSURE(ctx->op_device_context() != nullptr); + TRT_ENSURE(ctx->op_device_context()->stream() != nullptr); + + // Copy tensor. + cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); + + auto ret = cudaMemcpyAsync(tensor_flat.data(), rets->at(0).flat().data(), + rets->at(0).NumElements() * sizeof(T), + cudaMemcpyDeviceToHost, stream); + if (ret != 0) { + return errors::Internal("Could not copy the variable ", attrs.name); + } + cudaStreamSynchronize(stream); + + TF_RETURN_IF_ERROR( + TfTensorToTrtWeights(tensor, params->weight_store, weights)); + + return Status::OK(); +} + +class ConvertVariableV2 : public OpConverterBase { + public: + ConvertVariableV2(const OpConverterParams* params) + : OpConverterBase(params) {} + + static constexpr std::array InputSpec() { return {}; } + + static constexpr const char* NodeDefDataTypeAttributeName() { + /* + node { + name: "..." + op: "VariableV2" + ... + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + ... + } + */ + return "dtype"; + } + + template + Status ValidateImpl() { + const auto& node_def = params_->node_def; + + // Verify and consume node attributes. 
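+    // (Overview: validation never reads the variable's actual value. The
+    // attributes below only describe its shape/dtype; a zero-filled tensor of
+    // that shape is converted to placeholder TRT weights so that downstream
+    // validation sees an output of the right type, while the real value is
+    // fetched later in Convert() through ReadVariableHelper.)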
+ ::stream_executor::port::StatusOr shape_proto = + GetAttrValue("shape"); + ::stream_executor::port::StatusOr shared_name = + GetAttrValue("shared_name"); + ::stream_executor::port::StatusOr container = + GetAttrValue("container"); + TRT_ENSURE_OK(shape_proto); + TRT_ENSURE_OK(shared_name); + TRT_ENSURE_OK(container); + + attrs_.shape_proto = shape_proto.ValueOrDie(); + attrs_.shape = TensorShape(shape_proto.ValueOrDie()); + attrs_.name = node_def.name(); + attrs_.shared_name = shared_name.ValueOrDie(); + attrs_.container = container.ValueOrDie(); + + Tensor tensor(attrs_.dtype, attrs_.shape); + auto tensor_flat = tensor.flat(); + for (int64_t i = 0; i < tensor_flat.size(); i++) { + tensor_flat(i) = T(0.0f); + } + + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR( + TfTensorToTrtWeights(tensor, params_->weight_store, &weights)); + + // Only push outputs during validation and when outputs are expected. + if (params_->validation_only && params_->outputs != nullptr) { + AddOutput(TRT_TensorOrWeights(weights)); + } + return Status::OK(); + } + + Status Validate() { + const auto& node_def = params_->node_def; + ::stream_executor::port::StatusOr dtype = + GetAttrValue("dtype"); + TRT_ENSURE_OK(dtype); + attrs_.dtype = dtype.ValueOrDie(); + + switch (attrs_.dtype) { + case DT_FLOAT: + return ValidateImpl(); + case DT_HALF: + return ValidateImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + template + Status ConvertImpl() { + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR(ReadVariableHelper(params_, attrs_, &weights)); + AddOutput(TRT_TensorOrWeights(weights)); + return Status::OK(); + } + + Status Convert() { + const auto& node_def = params_->node_def; + + switch (attrs_.dtype) { + case DT_FLOAT: + return ConvertImpl(); + case DT_HALF: + return ConvertImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + private: + VarAttributes attrs_{}; +}; +REGISTER_DEFAULT_TRT_OP_CONVERTER(MakeConverterFunction(), + {"VariableV2"}); + +class ConvertReadVariableOp : public OpConverterBase { + public: + ConvertReadVariableOp(const OpConverterParams* params) + : OpConverterBase(params) {} + + static constexpr std::array InputSpec() { + return {InputArgSpec::Create("resource", TrtInputArg::kResource)}; + } + + static constexpr const char* NodeDefDataTypeAttributeName() { + return "dtype"; + } + + template + Status ValidateImpl() { + const auto& node_def = params_->node_def; + + // Verify and consume node attributes. 
+ ::stream_executor::port::StatusOr shape_proto = + GetAttrValue("_shape"); + TRT_ENSURE_OK(shape_proto); + + attrs_.shape_proto = shape_proto.ValueOrDie(); + attrs_.shape = TensorShape(shape_proto.ValueOrDie()); + attrs_.name = node_def.name(); + + Tensor tensor(attrs_.dtype, attrs_.shape); + auto tensor_flat = tensor.flat(); + for (int64_t i = 0; i < tensor_flat.size(); i++) { + tensor_flat(i) = T(0.0f); + } + + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR( + TfTensorToTrtWeights(tensor, params_->weight_store, &weights)); + + // Only push outputs during validation and when outputs are expected. + if (params_->validation_only && params_->outputs != nullptr) { + AddOutput(TRT_TensorOrWeights(weights)); + } + return Status::OK(); + } + + Status Validate() { + const auto& node_def = params_->node_def; + if (params_->use_implicit_batch) { + return errors::Unimplemented("Implicit batch mode not supported, at ", + node_def.name()); + } + + ::stream_executor::port::StatusOr dtype = + GetAttrValue("dtype"); + TRT_ENSURE_OK(dtype); + attrs_.dtype = dtype.ValueOrDie(); + + switch (attrs_.dtype) { + case DT_FLOAT: + return ValidateImpl(); + case DT_HALF: + return ValidateImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + template + Status ConvertImpl() { + TRT_ShapedWeights weights; + TF_RETURN_IF_ERROR(ReadVariableHelper(params_, attrs_, &weights)); + AddOutput(TRT_TensorOrWeights(weights)); + return Status::OK(); + } + + Status Convert() { + const auto& node_def = params_->node_def; + + switch (attrs_.dtype) { + case DT_FLOAT: + return ConvertImpl(); + case DT_HALF: + return ConvertImpl(); + default: + // Note: this should have been caught by ValidateNodeDefDataType, but + // the compiler expects that all paths be handled in switch. + return errors::Unimplemented("Data type ", DataTypeString(attrs_.dtype), + " is not supported for ", node_def.op(), + ", at ", node_def.name()); + } + } + + private: + VarAttributes attrs_{}; +}; +REGISTER_DEFAULT_TRT_OP_CONVERTER( + MakeConverterFunction(), {"ReadVariableOp"}); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc new file mode 100644 index 00000000000..423e5eb6c17 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.cc @@ -0,0 +1,87 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#include "tensorflow/compiler/tf2tensorrt/convert/timing_cache.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
+#include "tensorflow/core/platform/errors.h"
+#include "third_party/tensorrt/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+::stream_executor::port::StatusOr<TimingCacheRegistry::TimingCachePtr>
+TimingCacheRegistry::LookUp(const string& name,
+                            nvinfer1::IBuilderConfig* builder_config) {
+#if IS_TRT_VERSION_GE(8, 0, 0, 0)
+  TRT_ENSURE(builder_config != nullptr);
+  mutex_lock scoped_lock(mu_);
+  if (map_.find(name) != map_.end()) {
+    const std::vector<uint8_t>& data = map_[name];
+    return std::unique_ptr<TimingCache>(
+        builder_config->createTimingCache(data.data(), data.size()));
+  }
+
+  // If no such timing cache exists, create a new timing cache.
+  return std::unique_ptr<TimingCache>(
+      builder_config->createTimingCache(nullptr, 0));
+#endif  // IS_TRT_VERSION_GE(8, 0, 0, 0)
+  return errors::Unimplemented(
+      "serializable timing cache does not exist in TensorRT versions < 8.0");
+}
+
+void TimingCacheRegistry::Upsert(const string& name, TimingCache* cache) {
+#if IS_TRT_VERSION_GE(8, 0, 0, 0)
+  nvinfer1::IHostMemory* memory = cache->serialize();
+  if (memory == nullptr) {
+    return;
+  }
+
+  if (map_.find(name) == map_.end()) {
+    // If the timing cache with the given name does not exist, emplace the
+    // serialized buffer.
+    std::vector<uint8_t> mem(memory->size());
+    std::copy_n(static_cast<const uint8_t*>(memory->data()), memory->size(),
+                mem.begin());
+    {
+      mutex_lock scoped_lock(mu_);
+      map_.emplace(name, std::move(mem));
+    }
+  } else {
+    // If the timing cache does exist, use the existing buffer.
+    mutex_lock scoped_lock(mu_);
+    std::vector<uint8_t>& mem = map_[name];
+    mem.resize(memory->size());
+    std::copy_n(static_cast<const uint8_t*>(memory->data()), memory->size(),
+                mem.begin());
+  }
+  memory->destroy();
+#endif  // IS_TRT_VERSION_GE(8, 0, 0, 0)
+}
+
+TimingCacheRegistry* GetTimingCacheRegistry() {
+  static TimingCacheRegistry* registry = new TimingCacheRegistry();
+  return registry;
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h
new file mode 100644
index 00000000000..27992dd5fe0
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/timing_cache.h
@@ -0,0 +1,70 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// A registry for holding serialized TensorRT autotuner timing caches. +// For TensorRT versions < 8.0, the timing cache is not serializable, so these +// operations become no-ops. +class TimingCacheRegistry { + public: + TimingCacheRegistry() = default; + ~TimingCacheRegistry() = default; + +#if IS_TRT_VERSION_GE(8, 0, 0, 0) + using TimingCache = nvinfer1::ITimingCache; + using TimingCachePtr = std::unique_ptr; +#else + struct TimingCache {}; + using TimingCachePtr = std::unique_ptr; +#endif + + // Insert or update a registry into the map using the given name. The cache + // will be serialized before being placed into the map. + void Upsert(const string& name, TimingCache* cache); + + // Find a timing cache using the given name. The provided BuilderConfig is + // used to deserialize the cache. If no timing cache is found, a new timing + // cache is returned. + ::stream_executor::port::StatusOr LookUp( + const string& name, nvinfer1::IBuilderConfig* builder_config); + + private: + using SerializedTimingCache = std::vector; + + mutex mu_; + std::unordered_map map_; +}; + +TimingCacheRegistry* GetTimingCacheRegistry(); + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TIMING_CACHE_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc new file mode 100644 index 00000000000..eda360da7a3 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.cc @@ -0,0 +1,97 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h"
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/casts.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stacktrace.h"
+
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+using absl::AsciiStrToUpper;
+using absl::StrAppend;
+using absl::StrCat;
+
+TRTLayoutOptimizationPass::TRTLayoutOptimizationPass(const string& name)
+    : name_(name),
+      trt_logger_name_("DefaultLogger"),
+      minimum_segment_size_(3),
+      is_dynamic_op_(false),
+      max_cached_batches_(1),
+      max_workspace_size_bytes_(256LL << 20) {
+  VLOG(1) << "Constructing " << name_;
+}
+
+Status TRTLayoutOptimizationPass::Optimize(grappler::Cluster* cluster,
+                                           const grappler::GrapplerItem& item,
+                                           GraphDef* optimized_graph) {
+  GraphDef modified_graph_def = item.graph;
+
+  // Construct a GrapplerItem using the modified graph_def and the input item.
+  grappler::GrapplerItem grappler_item =
+      item.WithGraph(std::move(modified_graph_def));
+  const GraphDef& graph_def = grappler_item.graph;
+
+  // Convert graphdef to graph.
+  FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library());
+  Graph graph(flib);
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph));
+
+  // Algorithm steps:
+  // 1. We iterate over the graph to find any Conv (or other layout sensitive
+  //    op)
+  // 2. If found, we continue, else we return
+  // 3. We iterate over the nodes and replace the layout-sensitive params
+  // 4. We add Transpose before the inputs and after the outputs
+
+  grappler::GraphProperties static_graph_properties(grappler_item);
+
+  VLOG(2) << "TRTLayoutOptimizationPass: reading nodes...";
+  for (Node* node : graph.nodes()) {
+    VLOG(2) << node->name();
+  }
+
+  // TODO: apply the layout transformation and assign the transformed graph.
+  // For now the pass is a no-op and forwards the graph unchanged.
+  *optimized_graph = graph_def;
+  return Status::OK();
+}
+
+Status TRTLayoutOptimizationPass::Init(
+    const RewriterConfig_CustomGraphOptimizer* config) {
+  VLOG(1) << "Do nothing for now";
+  return Status::OK();
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h
new file mode 100644
index 00000000000..e91b3cd8e5f
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_layout_optimization_pass.h
@@ -0,0 +1,69 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/platform/logging.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#if !IS_TRT_VERSION_GE(7, 0, 0, 0) +#error From version 2.6, we only support NVIDIA TensorRT version 7 or newer. +#error Please update your environment and relaunch the compilation. +#endif + +namespace tensorflow { +namespace tensorrt { +namespace convert { +class TRTLayoutOptimizationPass : public grappler::CustomGraphOptimizer { + public: + TRTLayoutOptimizationPass(const string& name = "TRTLayoutOptimizationPass"); + + string name() const override { return name_; }; + + bool UsesFunctionLibrary() const override { return true; } + + Status Init( + const RewriterConfig_CustomGraphOptimizer* config = nullptr) override; + + Status Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* optimized_graph) override; + + /* void PrintDebugInfo(grappler::Cluster* cluster, + const grappler::GrapplerItem& item); + */ + + private: + const string name_; + string trt_logger_name_; + int minimum_segment_size_; + bool is_dynamic_op_; + int max_cached_batches_; + int64_t max_workspace_size_bytes_; +}; + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_LAYOUT_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 35a8c6340f8..3ee9e5d98e1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -14,221 +14,203 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h" +#include + #include "absl/strings/ascii.h" #include "absl/strings/escaping.h" +#include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" +#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" +#include "tensorflow/core/grappler/utils/functions.h" +#include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stacktrace.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -// TODO(sami): Remove VLOG messages once the code matures using absl::AsciiStrToUpper; using absl::StrAppend; using absl::StrCat; +namespace { + +bool ShouldUseExplicitPrecision(const GraphDef& gdef) { + if (!IS_TRT_VERSION_GE(8, 0, 0, 0)) { + return false; + } + return absl::c_any_of(gdef.node(), [](const auto& node) { + return (absl::c_find(kExplicitQuantizationOpNames, node.op()) != + kExplicitQuantizationOpNames.end()); + }); +} + +::stream_executor::port::StatusOr ShouldConvertFunction( + const grappler::GrapplerItem& item) { + if (item.id == "tf_graph") { + return false; + } + const auto& func_item = + static_cast(item); + const AttrSlice& attr = func_item.func_attr(); + const AttrValue* attr_value = attr.Find("_tftrt_convert_function"); + if (attr_value != nullptr) { + bool result = false; + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_convert_function", &result)); + return result; + } + VLOG(1) << "Attribute _tftrt_convert_function was not found."; + return false; +} + +// Converts function conversion attributes to conversion parameters. 
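+// (Note: the `_tftrt_*` attributes read below are assumed to be attached to
+// the function by the TF-TRT converter when per-function conversion is
+// requested; each attribute overrides the corresponding ConversionParams
+// field for that function only.)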
+Status UpdateFunctionSpecificConversionParams( + TRTOptimizationPass::ConversionParams& cp, + const tensorflow::AttrSlice& attr) { + auto get_size_attr = [](const AttrSlice& attr, absl::string_view name, + size_t* dst) -> Status { + int tmp = 0; + TF_RETURN_IF_ERROR(GetNodeAttr(attr, name, &tmp)); + *dst = static_cast(tmp); + return Status::OK(); + }; + + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_trt_logger_name", &cp.trt_logger_name)); + TF_RETURN_IF_ERROR( + get_size_attr(attr, "_tftrt_max_batch_size", &cp.max_batch_size)); + TF_RETURN_IF_ERROR(get_size_attr(attr, "_tftrt_max_workspace_size_bytes", + &cp.max_workspace_size_bytes)); + std::string precision_mode; + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_precision_mode", &precision_mode)); + TF_RETURN_IF_ERROR( + TrtPrecisionModeFromName(precision_mode, &cp.precision_mode)); + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_minimum_segment_size", + &cp.minimum_segment_size)); + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_is_dyn_op", &cp.is_dynamic_op)); + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_max_cached_engines", &cp.max_cached_engines)); + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_use_calibration", &cp.use_calibration)); + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_use_implicit_batch", &cp.use_implicit_batch)); + std::string profile_strategy; + TF_RETURN_IF_ERROR( + GetNodeAttr(attr, "_tftrt_profile_strategy", &profile_strategy)); + TF_RETURN_IF_ERROR( + ProfileStrategyFromName(profile_strategy, &cp.profile_strategy)); + TF_RETURN_IF_ERROR(GetNodeAttr(attr, "_tftrt_allow_build_at_runtime", + &cp.allow_build_at_runtime)); + return Status::OK(); +} +} // namespace + Status TRTOptimizationPass::Init( const RewriterConfig_CustomGraphOptimizer* config) { - VLOG(1) << "Called INIT for " << name_ << " with config = " << config; if (config == nullptr) { return Status::OK(); } const auto params = config->parameter_map(); if (params.count("minimum_segment_size")) { - minimum_segment_size_ = params.at("minimum_segment_size").i(); + params_.minimum_segment_size = params.at("minimum_segment_size").i(); } if (params.count("max_batch_size")) { - maximum_batch_size_ = params.at("max_batch_size").i(); + params_.max_batch_size = params.at("max_batch_size").i(); } if (params.count("is_dynamic_op")) { - is_dynamic_op_ = params.at("is_dynamic_op").b(); + params_.is_dynamic_op = params.at("is_dynamic_op").b(); } if (params.count("maximum_cached_engines")) { - max_cached_batches_ = params.at("maximum_cached_engines").i(); + params_.max_cached_engines = params.at("maximum_cached_engines").i(); } if (params.count("max_workspace_size_bytes")) { - max_workspace_size_bytes_ = params.at("max_workspace_size_bytes").i(); + params_.max_workspace_size_bytes = + params.at("max_workspace_size_bytes").i(); } if (params.count("precision_mode")) { TF_RETURN_IF_ERROR(TrtPrecisionModeFromName( - AsciiStrToUpper(params.at("precision_mode").s()), &precision_mode_)); + AsciiStrToUpper(params.at("precision_mode").s()), + ¶ms_.precision_mode)); } if (params.count("use_calibration")) { - use_calibration_ = params.at("use_calibration").b(); - } - return Status::OK(); -} - -void TRTOptimizationPass::PrintDebugInfo(grappler::Cluster* cluster, - const grappler::GrapplerItem& item) { - LOG(INFO) << "Cluster = " << cluster; - string offset(" "); - string offset2 = StrCat(offset, offset); - string offset3 = StrCat(offset2, offset); - string offset4 = StrCat(offset2, offset2); - if (cluster) { - LOG(INFO) << offset << "type = " << cluster->type(); - 
LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps(); - const auto dev_names = cluster->GetDeviceNames(); - if (!dev_names.empty()) { - LOG(INFO) << offset << " Device names:"; - for (const auto s : dev_names) { - LOG(INFO) << offset2 << s; - } - } - std::unordered_map peak_mem; - auto status = cluster->GetPeakMemoryUsage(&peak_mem); - if (status == Status::OK()) { - LOG(INFO) << offset << "Peak Memory Usage :"; - for (auto s : peak_mem) { - LOG(INFO) << offset2 << s.first << " = " << s.second; - } - } - - const auto dev_props = cluster->GetDevices(); - if (!dev_props.empty()) { - LOG(INFO) << offset << "Device properties:"; - for (auto k : dev_props) { - LOG(INFO) << offset2 << k.first; - const auto& dt = k.second; - LOG(INFO) << offset3 << "type = " << dt.type(); - LOG(INFO) << offset3 << "vendor = " << dt.vendor(); - LOG(INFO) << offset3 << "model = " << dt.model(); - LOG(INFO) << offset3 << "frequency = " << dt.frequency(); - LOG(INFO) << offset3 << "num cores = " << dt.num_cores(); - LOG(INFO) << offset3 << "num registers = " << dt.num_registers(); - LOG(INFO) << offset3 << "L1 cache size = " << dt.l1_cache_size(); - LOG(INFO) << offset3 << "L2 cache size = " << dt.l2_cache_size(); - LOG(INFO) << offset3 << "L3 cache size = " << dt.l3_cache_size(); - LOG(INFO) << offset3 << "SHMem per SMP = " - << dt.shared_memory_size_per_multiprocessor(); - LOG(INFO) << offset3 << "memory size = " << dt.memory_size(); - LOG(INFO) << offset3 << "bandwidth = " << dt.bandwidth(); - if (dt.environment_size()) { - LOG(INFO) << offset3 << "environment :"; - for (const auto e : dt.environment()) { - LOG(INFO) << offset4 << e.first << " = " << e.second; - } - } - } - } + params_.use_calibration = params.at("use_calibration").b(); } - LOG(INFO) << "item: " << item.id; - if (!item.feed.empty()) { - LOG(INFO) << offset << "Feeds :"; - for (const auto& f : item.feed) { - const auto& shape = f.second.shape(); - LOG(INFO) << offset2 << f.first << " = shaped " << shape.DebugString(); - } - } else { - LOG(INFO) << offset << "No Feeds"; - } - if (!item.fetch.empty()) { - LOG(INFO) << offset << "Fetches :"; - for (const auto& f : item.fetch) { - LOG(INFO) << offset2 << f; - } - } else { - LOG(INFO) << offset << "No Fetches"; + if (params.count("trt_logger")) { + params_.trt_logger_name = params.at("trt_logger").s(); } - - if (!item.init_ops.empty()) { - LOG(INFO) << offset << "init ops :"; - for (const auto& f : item.init_ops) { - LOG(INFO) << offset2 << f; - } - } else { - LOG(INFO) << offset << "No init ops"; + if (params.count("allow_build_at_runtime")) { + params_.allow_build_at_runtime = params.at("allow_build_at_runtime").b(); } - LOG(INFO) << "Save Op = " << item.save_op; - LOG(INFO) << "Restore Op = " << item.restore_op; - LOG(INFO) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor; - if (!item.keep_ops.empty()) { - LOG(INFO) << offset << "keep ops :"; - for (const auto& f : item.keep_ops) { - LOG(INFO) << offset2 << f; - } - } else { - LOG(INFO) << offset << "No keep ops"; + if (params.count("use_implicit_batch")) { + params_.use_implicit_batch = params.at("use_implicit_batch").b(); } - for (const auto dev : cluster->GetDeviceSet()->devices()) { - const auto& pname = dev->parsed_name(); - LOG(INFO) << "Device name= " << dev->name() - << " parsedname job= " << pname.job << " id= " << pname.id - << " has_id: " << pname.has_id << " has_job: " << pname.has_job - << "has_type: " << pname.has_type << " type =" << pname.type; + if (params.count("profile_strategy")) { + 
TF_RETURN_IF_ERROR(ProfileStrategyFromName( + params.at("profile_strategy").s(), &params_.profile_strategy)); } + return Status::OK(); +} + +static bool ExplicitPrecisionModePolicy() { + return IS_TRT_VERSION_GE(8, 0, 0, 0); } Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, const grappler::GrapplerItem& item, GraphDef* optimized_graph) { - VLOG(1) << "Called TRTOptimization Pass " << name_; - // This is a hack to workaround optimizer issue. MetaOptimizer calls - // optimization passes on function objects as well, we should not modify - // generated funcdefs! This is fragile but we don't have any other option - // until framework fixes it. - if (item.id != "tf_graph") { - LOG(WARNING) << name_ - << " is probably called on funcdef! This optimizer must *NOT* " "be called on function objects."; + VLOG(1) << "Called TRTOptimization Pass " << name_ + << " on a grappler item with id=" << item.id; + // TF_ASSIGN_OR_RETURN(bool do_function_conversion, + // ShouldConvertFunction(item)); + // Optimizing the main graph (identified with `item.id == "tf_graph"`) with + // `minimum_segment_size == -1` indicates skipping main graph conversion. + if ((params_.minimum_segment_size == -1 && item.id == "tf_graph") || + (item.id != "tf_graph")) { + VLOG(1) << "Not optimizing this grappler item: " << item.id; *optimized_graph = item.graph; return Status::OK(); } - if (VLOG_IS_ON(3)) { - LOG(INFO) << CurrentStackTrace(); - PrintDebugInfo(cluster, item); + + if (params_.use_calibration && + params_.precision_mode != TrtPrecisionMode::INT8) { + LOG(WARNING) << "Calibration with FP32 or FP16 is not implemented. " + << "Falling back to use_calibration = False. " + << "Note that the default value of use_calibration is True."; + params_.use_calibration = false; } - if (!is_dynamic_op_) { - int max_batch_dim = -1; - if (!item.feed.empty()) { - for (const auto& f : item.feed) { - const auto& shape = f.second.shape(); - if (shape.dims() > 0) { - if (shape.dim_size(0) > max_batch_dim) - max_batch_dim = shape.dim_size(0); - VLOG(2) << "Setting max_batch_dim to " << max_batch_dim - << " using batch dimension of " << f.first << " with shape " - << shape; - } - } - } - if (max_batch_dim > maximum_batch_size_) { - return errors::InvalidArgument( - "Specified max_batch_size=", maximum_batch_size_, - " is less than maximum batch dimension of inputs (", max_batch_dim, - "). ", "To continue, set max_batch_size to >= ", max_batch_dim); - } else if (max_batch_dim < maximum_batch_size_) { - LOG(INFO) << "Specified max_batch_size=" << maximum_batch_size_ - << " is larger than maximum batch dimension of inputs (" - << max_batch_dim << "). " - << "This can result in poor performance."; + + params_.use_explicit_precision = ShouldUseExplicitPrecision(item.graph); + if (params_.use_explicit_precision) { + LOG(INFO) << "[TF-TRT] Using explicit QDQ mode"; + if (params_.precision_mode != TrtPrecisionMode::INT8 || + params_.use_calibration) { + LOG(WARNING) + << "Explicit precision mode with calibration or FP32/FP16 mode is " "not supported." + << " Setting precision mode to INT8 and calibration to false."; + params_.precision_mode = TrtPrecisionMode::INT8; + params_.use_calibration = false; } } - grappler::GraphProperties static_graph_properties(item); - TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - ConversionParams cp; - if (use_calibration_ && precision_mode_ != TrtPrecisionMode::INT8) { - VLOG(1) << "Calibration with FP32 or FP16 is not implemented. " - << "Falling back to use_calibration = False."
- << "Note that the default value of use_calibration is True."; - use_calibration_ = false; - } + // Create a copy of the graph to optimize. + grappler::GrapplerItem optimized_item(item); std::vector nodes_to_preserve; - for (const auto& n : item.NodesToPreserve()) { + const auto& old_nodes_to_preserve = item.NodesToPreserve(); + nodes_to_preserve.reserve(old_nodes_to_preserve.size()); + for (const auto& n : old_nodes_to_preserve) { auto tokens = str_util::Split(n, ":"); string s = tokens.at(0); for (int i = 1; i < tokens.size() - 1; ++i) { @@ -243,21 +225,16 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, } nodes_to_preserve.push_back(s); } - cp.input_graph_def = &item.graph; - cp.output_names = &nodes_to_preserve; - cp.max_batch_size = maximum_batch_size_; - cp.max_workspace_size_bytes = max_workspace_size_bytes_; - cp.output_graph_def = optimized_graph; - cp.precision_mode = precision_mode_; - cp.minimum_segment_size = minimum_segment_size_; - cp.graph_properties = &static_graph_properties; - cp.cluster = cluster; - cp.is_dyn_op = is_dynamic_op_; - cp.max_cached_engines = max_cached_batches_; - cp.use_calibration = use_calibration_; - auto status = ConvertAfterShapes(cp); - VLOG(1) << "Returning from " << name_; - return status; + + if (item.id != "tf_graph") { + const grappler::GrapplerFunctionItem& func_item = + static_cast(item); + TF_RETURN_IF_ERROR( + UpdateFunctionSpecificConversionParams(params_, func_item.func_attr())); + } + + return ConvertGraph(params_, optimized_item, nodes_to_preserve, cluster, + optimized_graph); } void TRTOptimizationPass::Feedback(grappler::Cluster* cluster, @@ -289,5 +266,4 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar( } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index 35a92341ee9..0976dd157d8 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -16,15 +16,23 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ +#include #include +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" +#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#if !IS_TRT_VERSION_GE(7, 0, 0, 0) +#error From version 2.6, we only support NVIDIA TensorRT version 7 or newer. +#error Please update your environment and relaunch the compilation. 
+#endif namespace tensorflow { namespace tensorrt { @@ -32,17 +40,25 @@ namespace convert { class TRTOptimizationPass : public grappler::CustomGraphOptimizer { public: + struct ConversionParams { + string trt_logger_name = "DefaultLogger"; + size_t max_batch_size = -1; + size_t max_workspace_size_bytes = 1 << 30; + TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; + int minimum_segment_size = 3; + // Whether to create engine on conversion or execution time + bool is_dynamic_op = false; + // maximum number of cached engines + int max_cached_engines = 1; + bool use_calibration = true; + bool use_implicit_batch = true; + ProfileStrategy profile_strategy = ProfileStrategy::kRange; + bool allow_build_at_runtime = true; + bool use_explicit_precision = false; + }; + TRTOptimizationPass(const string& name = "TRTOptimizationPass") - : name_(name), - minimum_segment_size_(3), - precision_mode_(TrtPrecisionMode::FP32), - maximum_batch_size_(-1), - is_dynamic_op_(false), - max_cached_batches_(1), - max_workspace_size_bytes_(256LL << 20), - use_calibration_(true) { - VLOG(1) << "Constructing " << name_; - } + : name_(name) {} string name() const override { return name_; }; @@ -58,26 +74,17 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { void Feedback(grappler::Cluster* cluster, const grappler::GrapplerItem& item, const GraphDef& optimized_graph, double result) override; - void PrintDebugInfo(grappler::Cluster* cluster, - const grappler::GrapplerItem& item); - private: const string name_; - int minimum_segment_size_; - TrtPrecisionMode precision_mode_; - int maximum_batch_size_; - bool is_dynamic_op_; - std::vector batches_; - int max_cached_batches_; - int64_t max_workspace_size_bytes_; - bool use_calibration_; + ConversionParams params_; + + std::vector batches_; }; } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc new file mode 100644 index 00000000000..c85d119cc81 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc @@ -0,0 +1,104 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace tensorrt { + +Status TrtPrecisionModeToName(const TrtPrecisionMode mode, string* name) { + const char* kUnknown = "UNKNOWN"; + *name = *kUnknown; + switch (mode) { + case TrtPrecisionMode::FP32: + *name = "FP32"; + break; + case TrtPrecisionMode::FP16: + *name = "FP16"; + break; + case TrtPrecisionMode::INT8: + *name = "INT8"; + break; + } + if (name->compare(kUnknown) == 0) + return errors::OutOfRange("Unknown precision mode"); + return Status::OK(); +} + +Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode) { + if (name == "FP32") { + *mode = TrtPrecisionMode::FP32; + } else if (name == "FP16") { + *mode = TrtPrecisionMode::FP16; + } else if (name == "INT8") { + *mode = TrtPrecisionMode::INT8; + } else { + return errors::InvalidArgument("Invalid precision mode name: ", name); + } + return Status::OK(); +} + +string DebugString(const TrtPrecisionMode mode) { + string mode_str; + TF_CHECK_OK(TrtPrecisionModeToName(mode, &mode_str)); + return absl::StrCat("TrtPrecisionMode::", mode_str); +} + +string ProfileStrategyToName(const ProfileStrategy strategy) { + switch (strategy) { + case ProfileStrategy::kRange: + return "Range"; + case ProfileStrategy::kOptimal: + return "Optimal"; + case ProfileStrategy::kRangeOptimal: + return "Range+Optimal"; + case ProfileStrategy::kImplicitBatchModeCompatible: + return "ImplicitBatchModeCompatible"; + } + return "Unknown"; +} + +Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy) { + string name_lowercase(name); + std::transform(name.begin(), name.end(), name_lowercase.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (name_lowercase == "range") { + *strategy = ProfileStrategy::kRange; + } else if (name_lowercase == "optimal") { + *strategy = ProfileStrategy::kOptimal; + } else if (name_lowercase == "range+optimal") { + *strategy = ProfileStrategy::kRangeOptimal; + } else if (name_lowercase == "implicitbatchmodecompatible") { + *strategy = ProfileStrategy::kImplicitBatchModeCompatible; + } else { + return errors::InvalidArgument("Invalid profile strategy: ", name); + } + return Status::OK(); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h new file mode 100644 index 00000000000..3f44bb5f199 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h @@ -0,0 +1,72 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tensorrt { + +// The PrecisionMode controls the precision used in TRT converted parts of the +// model. Setting PrecisionMode other than FP32 enables TensorRT to select +// lower-precision implementations when searching for the fastest kernels. +// +// For regularized models whose input dynamic range is approximately one, this +// typically produces significant speedups with negligible change in accuracy. +// There is additional complexity when working with INT8, see Calibration. +// +// - FP32 +// - FP16 Enable FP16 layer selection, with FP32 fallback. +// - INT8 Enable Int8 layer selection, with FP32 and FP16 fallback. +// +// Note that TensorRT will still choose a higher-precision kernel if it results +// in overall lower runtime, or if no low-precision implementation exists. +enum class TrtPrecisionMode { FP32, FP16, INT8 }; + +Status TrtPrecisionModeToName(const TrtPrecisionMode mode, string* name); + +Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode); + +string DebugString(const TrtPrecisionMode mode); + +// Optimization profile generation strategies. +// - `kRange`: create one profile that works for inputs with dimension values +// in the range of [min_dims, max_dims] where min_dims and max_dims are +// derived from the provided inputs. +// - `kOptimal`: create one profile for each input. The profile only works for +// inputs with the same dimensions as the input it is created for. The GPU +// engine will be run with optimal performance with such inputs. +// - `kRangeOptimal`: create the profiles for both `Range` and `Optimal`. +// - `kImplicitBatchModeCompatible`: create the profiles that will produce the +// same GPU engines as the implicit_batch_mode would produce. +enum class ProfileStrategy { + kRange, + kOptimal, + kRangeOptimal, + kImplicitBatchModeCompatible, +}; + +string ProfileStrategyToName(const ProfileStrategy strategy); +Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_PARAMETERS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index ca21c193d63..18bfb2997df 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -15,41 +15,266 @@ limitations under the License. 
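Editor's note: a minimal usage sketch of the trt_parameters helpers declared above (illustrative only, not part of the patch; the wrapper function name and the literal strings are made up for the example):

#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace tensorrt {

// Hypothetical helper showing how the string <-> enum conversions compose.
Status ParseTrtParamsExample() {
  TrtPrecisionMode precision;
  // TrtPrecisionModeFromName expects the upper-case spellings "FP32", "FP16"
  // or "INT8"; TRTOptimizationPass::Init applies AsciiStrToUpper first.
  TF_RETURN_IF_ERROR(TrtPrecisionModeFromName("INT8", &precision));

  ProfileStrategy strategy;
  // ProfileStrategyFromName lower-cases its input, so matching is
  // case-insensitive ("Range+Optimal" and "range+optimal" both work).
  TF_RETURN_IF_ERROR(ProfileStrategyFromName("range+optimal", &strategy));

  // Logs "TrtPrecisionMode::INT8 / Range+Optimal".
  VLOG(1) << DebugString(precision) << " / " << ProfileStrategyToName(strategy);
  return Status::OK();
}

}  // namespace tensorrt
}  // namespace tensorflow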
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "absl/strings/ascii.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { -Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name) { - switch (mode) { - case TrtPrecisionMode::FP32: - *name = "FP32"; +string DebugString(const nvinfer1::Dims& dims) { + string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d="); + for (int i = 0; i < std::max(dims.nbDims, 0); ++i) { + StrAppend(&out, dims.d[i]); + StrAppend(&out, ","); + } + StrAppend(&out, ")"); + return out; +} + +string DebugString(const DataType tf_type) { + switch (tf_type) { + case DT_FLOAT: + return "DT_FLOAT"; + case DT_HALF: + return "DT_HALF"; + case DT_INT32: + return "DT_INT32"; + case DT_INT8: + return "DT_INT8"; + case DT_BOOL: + return "DT_BOOL"; + case DT_UINT8: + return "DT_UINT8"; + default: + return "Unknow TF DataType"; + } +} + +string DebugString(const nvinfer1::DataType trt_dtype) { + switch (trt_dtype) { + case nvinfer1::DataType::kFLOAT: + return "kFLOAT"; + case nvinfer1::DataType::kHALF: + return "kHALF"; + case nvinfer1::DataType::kINT8: + return "kINT8"; + case nvinfer1::DataType::kINT32: + return "kINT32"; + case nvinfer1::DataType::kBOOL: + return "kBOOL"; +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + return "kUINT8"; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + return "kFP8"; +#endif + default: + return "Invalid TRT data type"; + } +} + +string DebugString(const nvinfer1::Permutation& permutation, int len) { + string out = "nvinfer1::Permutation("; + for (int i = 0; i < len; ++i) { + StrAppend(&out, permutation.order[i], ","); + } + StrAppend(&out, ")"); + return out; +} + +string DebugString(const ITensorProxyPtr& tensor) { + return StrCat( + tensor->is_trt_tensor() ? "nvinfer1::ITensor(@" : "SimpleItensor(@", + reinterpret_cast(&tensor), ", name=", tensor->getName(), + ", dtype=", DebugString(tensor->getType()), + ", dims=", DebugString(tensor->getDimensions()), ")"); +} + +string DebugString(const nvinfer1::ITensor& tensor) { + return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), + ", name=", tensor.getName(), + ", dtype=", DebugString(tensor.getType()), + ", dims=", DebugString(tensor.getDimensions()), ")"); +} + +string DebugString(const std::vector& dimvec) { + return absl::StrCat("[", + absl::StrJoin(dimvec, ",", + [](std::string* out, nvinfer1::Dims in) { + out->append(DebugString(in)); + }), + "]"); +} + +string DebugString(const std::vector& shapes) { + return TensorShapeUtils::ShapeListString(shapes); +} + +string DebugString(const std::vector& shapes) { + return PartialTensorShapeUtils::PartialShapeListString(shapes); +} + +// Checks whether actual_shapes are compatible with cached_shapes. This should +// only be used in implicit batch mode (in explicit batch mode one needs to +// check the profile ranges). Therefore implicit batch mode is assumed. +// It is also assumed that both actual_shapes and cached_shapes have been +// verified by TRTEngineOp::VerifyInputShapes, which ensures that the batch size +// for all tensors are the same. +bool AreShapesCompatible(const std::vector& actual_shapes, + const std::vector& cached_shapes) { + auto match_shape = [](const TensorShape& actual_shape, + const TensorShape& cached_shape) { + // Match the rank. 
+ if (actual_shape.dims() != cached_shape.dims()) return false; + // Match the batch size. In implicit batch mode cached_shape.dim_size(0) is + // the max batch size, which can be larger than the actual batch size. + if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false; + // Match remaining dimensions. + for (int i = 1; i < actual_shape.dims(); ++i) { + if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false; + } + return true; + }; + for (int i = 0; i < actual_shapes.size(); ++i) { + if (!match_shape(actual_shapes[i], cached_shapes[i])) { + return false; + } + } + return true; +} +Status GetNetworkInputShapes(const nvinfer1::INetworkDefinition* network, + std::vector* input_shapes) { + const int n_inputs = network->getNbInputs(); + input_shapes->resize(n_inputs); + for (int i = 0; i < n_inputs; i++) { + const ITensorProxyPtr input = network->getInput(i); + TF_RETURN_IF_ERROR(DimsAdapter(input->getDimensions()) + .PartialTensorShape(&input_shapes->at(i))); + } + return Status::OK(); +} + +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; break; - case TrtPrecisionMode::FP16: - *name = "FP16"; +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + case DT_BOOL: + *trt_type = nvinfer1::DataType::kBOOL; break; - case TrtPrecisionMode::INT8: - *name = "INT8"; +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case DT_UINT8: + *trt_type = nvinfer1::DataType::kUINT8; break; +#endif default: - return errors::OutOfRange("Unknown precision mode"); + return errors::InvalidArgument("Unsupported tensorflow data type ", + DataTypeString(tf_type)); } return Status::OK(); } -Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode) { - if (name == "FP32") { - *mode = TrtPrecisionMode::FP32; - } else if (name == "FP16") { - *mode = TrtPrecisionMode::FP16; - } else if (name == "INT8") { - *mode = TrtPrecisionMode::INT8; - } else { - return errors::InvalidArgument("Invalid precision mode name: ", name); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + case nvinfer1::DataType::kBOOL: + *tf_type = DT_BOOL; + break; +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + *tf_type = DT_UINT8; + break; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + *tf_type = DT_FLOAT8_E4M3FN; + break; +#endif + default: + return errors::InvalidArgument("Invalid TRT data type"); } return Status::OK(); } +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { + int n_bindings = engine->getNbBindings(); + int n_input = 0; + for (int i = 0; i < n_bindings; i++) { + if (engine->bindingIsInput(i)) n_input++; + } + // According to TensorRT 7 doc: "If the engine has been built for K profiles, + // the first getNbBindings() / K bindings are used by profile number 0, the + // following getNbBindings() / K bindings are used by profile number 1 etc." + // Therefore, to get the number of input tensors, we need to divide by the + // the number of profiles. 
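// Editorial note (not part of the patch), a worked example of the binding
// arithmetic described above: an engine built with 2 optimization profiles for
// a network with 2 inputs and 1 output exposes 2 * (2 + 1) = 6 bindings; the
// loop above counts 2 * 2 = 4 input bindings, and dividing by the 2 profiles
// below recovers the 2 network inputs.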
+ int n_profiles = engine->getNbOptimizationProfiles(); + return n_input / n_profiles; +} + +absl::string_view GetDeviceName(const Node* node) { + if (node->has_assigned_device_name()) { + return node->assigned_device_name(); + } + return node->requested_device(); +} + +absl::optional GetDeviceParsedName( + const Node* node) { + absl::string_view device_name = GetDeviceName(node); + DeviceNameUtils::ParsedName parsed_name; + if (!DeviceNameUtils::ParseFullName(device_name, &parsed_name)) { + return absl::nullopt; + } + return parsed_name; +} + +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, + const DeviceNameUtils::ParsedName& b) { + DeviceNameUtils::ParsedName merged_name = a; + if (!DeviceNameUtils::MergeDevNames(&merged_name, b, + /*allow_soft_placement=*/false) + .ok()) { + return absl::nullopt; + } + return merged_name; +} + +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, absl::string_view b) { + DeviceNameUtils::ParsedName b_parsed_name; + if (!DeviceNameUtils::ParseFullName(b, &b_parsed_name)) { + return absl::nullopt; + } + + return MergeIfCompatible(a, b_parsed_name); +} + } // namespace tensorrt } // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index eb60829d31d..cd701ed0066 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -16,36 +16,387 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ +#include +#include #include +#include +#include +#include "absl/algorithm/container.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +#define TFTRT_ERROR(func, ...) \ + do { \ + return func("TFTRT::", __FUNCTION__, ":", __LINE__, ": ", __VA_ARGS__); \ + } while (0) + +#define TFTRT_CHECK_SHAPE_TENSOR(tensor) \ + if (!IsTrtShapeTensorCompatible(tensor)) { \ + TFTRT_ERROR(errors::InvalidArgument, "Tensor of type ", \ + DebugString(tensor.dtype()), " having shape ", \ + tensor.shape().DebugString(), " is not TRT compatible"); \ + } namespace tensorflow { namespace tensorrt { -class IONamePrefixes { - public: - static constexpr const char* const kInputPHName = "TensorRTInputPH_"; - static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; -}; +static constexpr char kCastOutputTypeAttrName[] = "DstT"; +#if !IS_TRT_VERSION_GE(8, 2, 0, 0) template struct TrtDestroyer { void operator()(T* t) { if (t) t->destroy(); } }; - template using TrtUniquePtrType = std::unique_ptr>; +#else +template +using TrtUniquePtrType = std::unique_ptr; +#endif + +// Define a hash function for vector because it is used as the key +// for the engine cache. 
+struct VectorTensorShapeHasher { + std::size_t operator()(const std::vector& key) const { + return std::hash()(TensorShapeUtils::ShapeListString(key)); + } +}; + +using absl::StrAppend; +using absl::StrCat; + +// This utility template converts an arithmetic type to a string. This function +// is necessary to allow the following function to behave recursively: +// `string DebugString(const std::vector&)`. +template ::value, CType>::type> +string DebugString(const CType& el) { + string el_str = std::to_string(el); + // Prettify std::to_string which can sometimes returns 1.50000 instead of 1.5. + // In short it removes trailing 0s in a string-formatted number. + el_str.erase(el_str.find_last_not_of('0') + 1, std::string::npos); + return el_str; +} +// This utility template converts nested vectors to a string for debug purposes. +template +string DebugString(const std::vector& vector) { + string tmp_s = ""; + for (const auto el : vector) { + StrAppend(&tmp_s, StrCat(DebugString(el), ", ")); + } + return StrCat("{", tmp_s.substr(0, tmp_s.length() - 2), "}"); +} +string DebugString(const nvinfer1::Dims& dims); +string DebugString(const nvinfer1::DataType trt_dtype); +string DebugString(const DataType tf_type); +string DebugString(const nvinfer1::Permutation& permutation, int len); +string DebugString(const ITensorProxyPtr& tensor); +string DebugString(const nvinfer1::ITensor& tensor); +string DebugString(const std::vector& dimvec); +string DebugString(const std::vector& shapes); +string DebugString(const std::vector& shapes); + +template +string DebugString(const absl::InlinedVector& data) { + return absl::StrCat("[", absl::StrJoin(data, ","), "]"); +} + +inline bool HasStaticShape(const nvinfer1::Dims& dims) { + if (dims.nbDims < 0) return false; + for (int d = 0; d < dims.nbDims; ++d) { + if (dims.d[d] < 0) return false; + } + return true; +} + +template +bool HasStaticShape(const T& dims) { + return !absl::c_any_of(dims, [](int i) { return i < 0; }); +} + +// Returns whether a shape is compatible with a TRT shape tensor. +template +inline bool IsTrtShapeTensorCompatible(const TensorShapeType& shape) { + return ( + shape.dims() == 0 || + (shape.dims() == 1 && shape.num_elements() <= nvinfer1::Dims::MAX_DIMS)); +} + +// Returns whether a TF tensor could be interpreted as a TRT shape tensor. +inline bool IsTrtShapeTensorCompatible(const Tensor& tensor) { + return tensor.dtype() == DT_INT32 && + IsTrtShapeTensorCompatible(tensor.shape()); +} + +// Adapts various representations of shape (TF Shape, TRT Dims, plain +// containers) and provides methods for properties (length, volume) and +// conversion between types. Note that unlike TF's TensorShape, the underlying +// storage will only contain active dimensions. In the case of scalar shapes, +// `NumDims` is allowed to return 0 or 1, but the `storage_` vector will contain +// 1 element in both cases. In the non-scalar case, `NumDims() == +// storage_.size()`. +class DimsAdapter { + public: + using StorageType = absl::InlinedVector; + + private: + template + using EnableIfNotTensorShapeType = + std::enable_if_t, T>::value>; + + template + using EnableIfInt = std::enable_if_t::value && + std::is_integral::value>; + + public: + //----- Constructors ------ + + // Constructs from an absl::Span. + template + explicit DimsAdapter(absl::Span shape) + : num_dims_(static_cast(shape.size())) { + absl::c_copy(shape, std::back_inserter(storage_)); + } + + // Constructs from an absl::Span. 
+ template + explicit DimsAdapter(const std::vector& shape) + : num_dims_(static_cast(shape.size())) { + absl::c_copy(shape, std::back_inserter(storage_)); + } + + // Constructs from a TRT dims object. + DimsAdapter(const nvinfer1::Dims& dims) : num_dims_(dims.nbDims) { + absl::c_copy(absl::MakeSpan(dims.d, dims.d + std::max(dims.nbDims, 0)), + std::back_inserter(storage_)); + } + + // Constructs explicitly specifing num_dims and storage data. + DimsAdapter(int32_t num_dims, StorageType data) + : num_dims_(num_dims), storage_(std::forward(data)) {} + + // Constructs from a TensorShape or PartialTensorShape. + template + static ::stream_executor::port::StatusOr Create( + const TensorShapeBase& shape, bool ignore_first_dim = false) { + if (shape.dims() > nvinfer1::Dims::MAX_DIMS) + return errors::InvalidArgument("dims of TensorShape exceed MAX_DIMS"); + if (ignore_first_dim && shape.dims() <= 0) + return errors::InvalidArgument( + "removing first dim requires explicit batch dimension"); + if (shape.dims() == -1) { + return DimsAdapter(-1, StorageType{}); + } + if (shape.dims() == 0) { + return DimsAdapter(0, StorageType{1}); + } + auto offt = (ignore_first_dim ? 1 : 0); + return DimsAdapter( + absl::MakeSpan(shape.dim_sizes().begin() + offt, shape.dims() - offt)); + } + + // Constructs from a container. + template > + static ::stream_executor::port::StatusOr Create( + const InputSequence& shape, bool ignore_first_dim = false) { + if (ignore_first_dim && shape.size() <= 0) { + return errors::InvalidArgument( + "removing first dim requires explicit batch dimension"); + } + return DimsAdapter( + absl::MakeSpan(shape).subspan(ignore_first_dim ? 1 : 0, shape.size())); + } + + //----- Conversion Utilities ------ + + // Converts to an nvinfers::Dims and assign the result to the object passed + // in via the result pointer. + void TrtDims(nvinfer1::Dims* result) const { + result->nbDims = num_dims_; + absl::c_copy(storage_, static_cast(result->d)); + } + + // Converts to an nvinfer1::Dims and return by value. + nvinfer1::Dims AsTrtDims() const { + nvinfer1::Dims result; + TrtDims(&result); + return result; + } + + // Converts to a TensorShape and assigns the result to the object passed in + // via the shape pointer. + Status TensorShape(TensorShape* shape, + absl::optional batch_size = absl::nullopt) const { + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + reinterpret_cast(storage_.data()), storage_.size(), + shape)); + if (batch_size) shape->InsertDim(0, *batch_size); + return Status::OK(); + } + + // Converts to a PartialTensorShape and assigns the result to the object + // passed in via the shape pointer. + Status PartialTensorShape( + PartialTensorShape* shape, + absl::optional batch_size = absl::nullopt) const { + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + reinterpret_cast(storage_.data()), storage_.size(), + shape)); + if (batch_size) shape->InsertDim(0, *batch_size); + return Status::OK(); + } + + // Copies the dimension values to the vector passed in via the shape pointer. + template > + Status Vector(std::vector* shape) const { + shape->clear(); + absl::c_copy(storage_, std::back_inserter(*shape)); + return Status::OK(); + } + + //----- Property Accessors ------ + + // Returns true if the shape has no dynamic dimensions. + bool IsStatic() const { + return !absl::c_any_of(storage_, [](auto i) { return i < 0; }); + } + + // Returns product of all dimensions. 
+ int64_t Volume() const { + return absl::c_accumulate(storage_, static_cast(1), + std::multiplies<>()); + } + + int32_t NumDims() const { return num_dims_; } + + // Returns true if the shape should be interpreted as a scalar. This follows + // TensorRT conversions: a scalar shape can have NumDims()==1 or NumDims()==0, + // but the underlying storage_ container has a single dimension of size 1. + bool IsScalar() const { + return (num_dims_ == 0 || num_dims_ == 1) && storage_.size() == 1 && + storage_[0] == 1; + } + + // Returns true if the dimension storage is empty. This indicates an empty + // shape in both the scalar and non-scalar case. + bool IsEmpty() const { return storage_.empty(); } + + string DebugString() const { + auto vol = absl::c_accumulate(storage_, static_cast(1), + std::multiplies<>()); + return absl::StrCat("DimsAdapter(num_dims=", num_dims_, ",shape=[", + absl::StrJoin(storage_, ","), "],", "vol=", vol, ")"); + } + + // Returns beginning iterator for the underlying storage. + StorageType::const_iterator begin() const { return storage_.begin(); } + + // Returns ending iterator for the underlying storage. + StorageType::const_iterator end() const { return storage_.end(); } + + // Returns the size of the dimension at `idx`. + StorageType::value_type dim(size_t idx) const { return storage_[idx]; } + + // Returns a references to the dimension at `idx`. + StorageType::value_type& dim(size_t idx) { return storage_[idx]; } + + //----- Non-Const Operators ------ + + DimsAdapter& Append(int32_t dim) { + ::stream_executor::port::StatusOr is_scalar = IsScalar(); + if (!is_scalar.ok()) return *this; + num_dims_ = is_scalar.ValueOrDie() ? 2 : num_dims_ + 1; + storage_.push_back(dim); + return *this; + } + + DimsAdapter& Prepend(absl::optional dim) { + if (dim) { + num_dims_ = IsScalar() ? 2 : num_dims_ + 1; + storage_.insert(storage_.begin(), *dim); + } + return *this; + } + + Status RemoveBatchDimension() { + if (storage_.empty()) + return errors::InvalidArgument( + "attempted to remove batch dim from scalar"); + num_dims_ -= 1; + storage_.erase(storage_.begin()); + return Status::OK(); + } + + //----- Comparison Operators ------ + + bool operator==(const DimsAdapter& rhs) const { + if (rhs.num_dims_ != num_dims_) return false; + for (int i = 0; i < num_dims_; i++) { + if (rhs.storage_[i] != storage_[i]) return false; + } + return true; + } + + bool operator!=(const DimsAdapter& rhs) const { return !(*this == rhs); } + + private: + int32_t num_dims_{0}; + StorageType storage_{}; +}; + +Status GetNetworkInputShapes(const nvinfer1::INetworkDefinition* network, + std::vector* input_shapes); + +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + +// Returns true if an engine built for cached_shapes can also run actual_shapes. +bool AreShapesCompatible(const std::vector& actual_shapes, + const std::vector& cached_shapes); + +// Returns the number of inputs for the engine, which also correspends to the +// number of input tensors for the network. This can differ from the number of +// input bindings, because the number of total input bindings equals the number +// of profiles times the number of engine inputs. +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine); -enum class TrtPrecisionMode { FP32, FP16, INT8 }; +// Returns the string representation for the assigned device or the requested +// device of the given node. 
+absl::string_view GetDeviceName(const Node* node); -Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name); +// Returns the ParsedName representation for the assigned device or the +// requested device string of the given node. If the device string is invalid, +// returns absl::nullopt. +absl::optional GetDeviceParsedName( + const Node* node); -Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode); +// If the given two device assignments as compatible, returns the merge of the +// two assignments. Otherwise, returns absl::nullopt. +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, const DeviceNameUtils::ParsedName& b); +// Similar to the above, except that the second device assignment is represented +// by a string_view. +absl::optional MergeIfCompatible( + const DeviceNameUtils::ParsedName& a, absl::string_view b); } // namespace tensorrt } // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/weights.cc b/tensorflow/compiler/tf2tensorrt/convert/weights.cc new file mode 100644 index 00000000000..eb15351134d --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/weights.cc @@ -0,0 +1,216 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/tf2tensorrt/convert/weights.h" + +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +namespace convert { + +TRT_ShapedWeights::TRT_ShapedWeights(nvinfer1::DataType type) + : shape_(0, DimsAdapter::StorageType{}), type_(type), volume_(0) {} + +::stream_executor::port::StatusOr +TRT_ShapedWeights::CreateWithTensor(nvinfer1::DataType type, DimsAdapter dims, + Tensor tensor) { + TRT_ShapedWeights weights(type); + weights.shape_ = dims; + weights.tensor_ = std::forward(tensor); + weights.volume_ = weights.shape_.Volume(); + if (weights.shape_.NumDims() == 0) { + DCHECK(weights.shape_.IsEmpty() || weights.shape_.IsScalar()); + } + return weights; +} + +nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const { + return nvinfer1::Weights{type_, GetPointer(), volume_}; +} + +Status TRT_ShapedWeights::SetShape(DimsAdapter dims) { + if (volume_ != dims.Volume()) { + VLOG(2) << "Changing shape from " << shape_.DebugString() << ", to " + << dims.DebugString(); + return errors::Internal("SetShape would change number of elements"); + } + shape_ = std::move(dims); + return Status::OK(); +} + +size_t TRT_ShapedWeights::size_bytes() const { + size_t data_type_size = -1; + switch (type_) { + case nvinfer1::DataType::kFLOAT: + case nvinfer1::DataType::kINT32: + data_type_size = 4; + break; + case nvinfer1::DataType::kHALF: + data_type_size = 2; + break; +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: +#endif + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kBOOL: + data_type_size = 1; + break; + } + return volume_ * data_type_size; +} + +string TRT_ShapedWeights::DebugString() const { + return absl::StrCat( + "TRT_ShapedWeights(shape=", shape_.DebugString(), + ", type=", tensorflow::tensorrt::DebugString(type_), + ", values=", reinterpret_cast(GetPointer()), ")"); +} + +TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor) + : tensor_proxy_ptr_(tensor), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size) + : tensor_proxy_ptr_(tensor), + batch_size_(batch_size), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor, + int batch_size) + : tensor_proxy_ptr_(tensor), + batch_size_(batch_size), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, + const nvinfer1::Dims& trt_dims, + int batch_size) + : tensor_proxy_ptr_(new SimpleITensor(trt_dtype, trt_dims)), + batch_size_(batch_size), + initialized_(true), + arg_type_(TRT_ArgumentType::TENSOR) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights) + : weights_(weights), + initialized_(true), + arg_type_(TRT_ArgumentType::WEIGHTS) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(const ResourceHandle& resource) + : resource_(resource), + initialized_(true), + arg_type_(TRT_ArgumentType::RESOURCE) {} + +TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs) + : tensor_proxy_ptr_(rhs.tensor_proxy_ptr_), + batch_size_(rhs.batch_size_), + resource_(rhs.resource_), + weights_(rhs.weights_), + 
initialized_(rhs.initialized_), + arg_type_(rhs.arg_type_) {} + +void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) { + tensor_proxy_ptr_ = rhs.tensor_proxy_ptr_; + batch_size_ = rhs.batch_size_; + weights_ = rhs.weights_; + resource_ = rhs.resource_; + initialized_ = rhs.initialized_; + arg_type_ = rhs.arg_type_; +} + +ITensorProxyPtr TRT_TensorOrWeights::tensor() const { + DCHECK(is_tensor()); + return tensor_proxy_ptr_; +} + +ResourceHandle TRT_TensorOrWeights::resource() const { + DCHECK(is_resource()); + return resource_; +} + +nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { + switch (arg_type_) { + case TRT_ArgumentType::TENSOR: + return tensor()->getDimensions(); + case TRT_ArgumentType::WEIGHTS: + return weights().Shape().AsTrtDims(); + case TRT_ArgumentType::RESOURCE: + return {0, {}}; // Scalar. + } +} + +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (!initialized_) { + return errors::Internal("The object is not initialized"); + } + switch (arg_type_) { + case TRT_ArgumentType::TENSOR: { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + case TRT_ArgumentType::WEIGHTS: + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + case TRT_ArgumentType::RESOURCE: + *tf_type = DataType::DT_RESOURCE; + return Status::OK(); + } +} + +string TRT_TensorOrWeights::DebugString() const { + string output = "TRT_TensorOrWeights(type="; + if (is_tensor()) { + absl::StrAppend(&output, + "tensor=", tensorflow::tensorrt::DebugString(tensor()), + ", batch_size=", batch_size_); + } else { + absl::StrAppend(&output, "weights=", weights_.DebugString()); + } + absl::StrAppend(&output, ")"); + return output; +} + +::stream_executor::port::StatusOr +TrtWeightStore::GetTempWeights(nvinfer1::DataType trt_dtype, + const DimsAdapter& dims) { + DataType tf_dtype; + TF_RETURN_IF_ERROR(TrtTypeToTfType(trt_dtype, &tf_dtype)); + TensorShape shape; + TF_RETURN_IF_ERROR(dims.TensorShape(&shape)); + // TODO(jie): check weights size_bytes. 0 means type error + Tensor tensor(tf_dtype, shape); + ::stream_executor::port::StatusOr weights = + TRT_ShapedWeights::CreateWithTensor(trt_dtype, dims, tensor); + TRT_ENSURE_OK(weights); + store_.emplace_back(std::move(tensor)); + return weights; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/weights.h b/tensorflow/compiler/tf2tensorrt/convert/weights.h new file mode 100644 index 00000000000..02c26e711df --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/convert/weights.h @@ -0,0 +1,295 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { + +// Class to convert TF compile-time constants (e.g. Const nodes) to TRT weight. +class TRT_ShapedWeights { + public: + explicit TRT_ShapedWeights( + nvinfer1::DataType type = nvinfer1::DataType::kFLOAT); + + // Constructs a weights from another weights. + // + // NOTE: this does not copy the underlying buffer but only increase its + // reference count. + TRT_ShapedWeights(const TRT_ShapedWeights& rhs) = default; + + nvinfer1::Weights GetTrtWeights() const; + + const Tensor& GetTensor() const { return tensor_; } + + // Returns a pointer of type const T to the underlying buffer of the tensor. + template + const T* GetPointer() const { + int64 num_elem = + (tensor_.NumElements() * DataTypeSize(tensor_.dtype())) / sizeof(T); + return tensor_.bit_casted_shaped({num_elem}).data(); + } + + // Returns a pointer of type T to the underlying buffer of the tensor. + template + T* GetPointer() { + int64 num_elem = + (tensor_.NumElements() * DataTypeSize(tensor_.dtype())) / sizeof(T); + return tensor_.bit_casted_shaped({num_elem}).data(); + } + + // Fills all the weight values with value. + template + Status SetValues(T value) { + switch (type_) { + case nvinfer1::DataType::kFLOAT: { + float* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, value); + break; + } + case nvinfer1::DataType::kHALF: { + Eigen::half* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, Eigen::half(value)); + break; + } + case nvinfer1::DataType::kINT32: { + int32* ptr = tensor_.flat().data(); + std::fill(ptr, ptr + volume_, value); + break; + } + default: + return errors::InvalidArgument( + "Unsupported data type ", tensorflow::tensorrt::DebugString(type_)); + } + return Status::OK(); + } + + Status SetShape(DimsAdapter dims); + void SetShapeUnsafe(DimsAdapter dims) { shape_ = std::move(dims); } + + // Returns total number of elements. Returning 0 means either some dim is 0 + // or the number of dims is 0. Note that a TF scalar constant is marked as + // Dims{0, {1}}, and has a count() == 1. + int64_t count() const { return volume_; } + + size_t size_bytes() const; + + string DebugString() const; + + template + absl::Span GetSpan() const { + return absl::Span(tensor_.flat().data(), volume_); + } + + template + std::vector ToVector() const { + auto span = GetSpan(); + return std::vector(span.data(), span.data() + span.size()); + } + + nvinfer1::DataType TrtDType() const { return type_; } + + const DimsAdapter& Shape() const { return shape_; } + DimsAdapter& Shape() { return shape_; } + + private: + // The shape of the weights. Defaults to the empty shape. + DimsAdapter shape_; + + // This creation method is only used by TrtWeightStore, which creates the + // underlying buffer. 
+ static ::stream_executor::port::StatusOr CreateWithTensor( + nvinfer1::DataType type, DimsAdapter dims, Tensor tensor); + + nvinfer1::DataType type_; + + // All weights should be stored inside TrtWeightStore to make sure lifetime of + // all the underlying tensors are available until the engine is built. For + // this reason, tensor_ should never be reassigned to a different value that + // is not already present in the TrtWeightStore. + Tensor tensor_; + // Contains the volume of the weight's shape. + int64_t volume_; + + friend class TrtWeightStore; +}; + +// Container for TRT_ShapedWeights. We need this container because TRT does not +// manage the lifetime of the weights buffer, it only keeps a pointer to it and +// requires that the data referenced by the pointer be available until the +// building of engine is complete. For more information see +// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html +// +// TODO(laigd): consider adding garbage collection to the unused weights. +class TrtWeightStore { + public: + // Gets a TRT_ShapedWeights with 'type' and 'dims'. + ::stream_executor::port::StatusOr GetTempWeights( + nvinfer1::DataType trt_type, const DimsAdapter& dims); + + // Gets a TRT_ShapedWeights with the same data type and dimensions as + // 'weights'. + ::stream_executor::port::StatusOr GetTempWeights( + const TRT_ShapedWeights& weights) { + return GetTempWeights(weights.TrtDType(), weights.Shape()); + } + + private: + // The backend storage of the TRT_ShapedWeights. + std::vector store_; +}; + +// Enumerates the possible types of arguments of a converter. This determines +// what object is contained in TRT_TensorOrWeights, and converters can require +// a specific type for each of their arguments. +enum class TRT_ArgumentType { + TENSOR = 0, + WEIGHTS = 1, + RESOURCE = 2, +}; + +struct OpConverterParams; + +// Represents a TRT-style input to a TF node, it can be either a +// ITensorProxyPtr (representing nvinfer1::ITensor* or SimpleITensor), +// or TRT_ShapedWeights which is compile-time constant. +// +// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument. +class TRT_TensorOrWeights { + public: + TRT_TensorOrWeights() {} + TRT_TensorOrWeights(ITensorProxyPtr); + TRT_TensorOrWeights(ITensorProxyPtr tensor, int batch_size); + + // Constructs a wrapper for the given ITensor. + // This is used by Converter when building the TRT network, where the ITensor + // is owned by the TRT network being built. See comment for 'trt_tensor_' + // in trt_proxy_tensor.h. + explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1); + + // Creates a SimpleITensor for trt_dtype and trt_dims and takes ownership of + // the object. Constructs a wrapper for the SimpleITensor. This is used by + // TrtNodeValidator to encapsulate the type and shape information for + // validation of graph nodes, and the created ITensor is fake and temporary, + // and should not be used to build any TRT network. See comment for + // 'simple_tensor_' in trt_proxy_tensor.h. + explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype, + const nvinfer1::Dims& trt_dims, int batch_size); + + // Constructs a wrapper for the given weights. + explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights); + + // Constructs a wrapper for the given resource handle. 
+ explicit TRT_TensorOrWeights(const ResourceHandle& resource); + + TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs); + + void operator=(const TRT_TensorOrWeights& rhs); + + bool is_tensor() const { + return initialized_ && arg_type_ == TRT_ArgumentType::TENSOR; + } + bool is_weights() const { + return initialized_ && arg_type_ == TRT_ArgumentType::WEIGHTS; + } + bool is_resource() const { + return initialized_ && arg_type_ == TRT_ArgumentType::RESOURCE; + } + + ITensorProxyPtr tensor() const; + + ResourceHandle resource() const; + + ITensorProxyPtr as_tensor(const OpConverterParams* params); + + TRT_ShapedWeights& weights() { + DCHECK(is_weights()); + return weights_; + } + + const TRT_ShapedWeights& weights() const { + DCHECK(is_weights()); + return weights_; + } + + nvinfer1::Dims GetTrtDims() const; + + Status GetTfType(DataType* tf_type) const; + + int batch_size() const { return batch_size_; } + + string DebugString() const; + + nvinfer1::DataType TrtDType() const { + if (arg_type_ == TRT_ArgumentType::RESOURCE) { + VLOG(0) << "Calling TrtDType() with a RESOURCE argument is undefined " + "behavior."; + } + return arg_type_ == TRT_ArgumentType::TENSOR ? tensor_proxy_ptr_->getType() + : weights_.TrtDType(); + } + + private: + void set_batch_size(int batch_size) { batch_size_ = batch_size; } + + // First dimension of the TF tensor (NOT tensor_) that is represented by + // tensor_ is treated as the "batch dimension" by TRT, and tensor_'s + // dimensions (obtained via tensor_->getDimensions()) do not contain the batch + // dimension. For example, when a TF tensor with shape (A,B,C) is represented + // in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A. + // + // This requires that all tensors in the subgraph that is converted to a TRT + // engine have the same batch size are represented by the first dimension of + // their shape, and Converter will verify this during conversion. The drawback + // is that currently it cannot convert a graph that doesn't have the batch + // size represented in the shapes or the batch sizes are different. See + // b/118387490 for more details. + // + // If use_implicit_batch is false, batch_size_ is unused and + // tensor_->getDimensions() will contain the entire shape (A,B,C). + // + // tensor_proxy_ptr_ is used when arg_type_ == TENSOR. + ITensorProxyPtr tensor_proxy_ptr_ = nullptr; + int batch_size_ = -1; + + // For DT_RESOURCE arguments (there is no corresponding type in TRT). + // resource_ is used when arg_type_ == RESOURCE. + ResourceHandle resource_; + + // weights_ is used when arg_type_ == WEIGHTS. + TRT_ShapedWeights weights_; + bool initialized_ = false; + TRT_ArgumentType arg_type_ = TRT_ArgumentType::WEIGHTS; + + friend class Converter; +}; +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_WEIGHTS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc index 3143b06817e..76fb40b9520 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc @@ -22,8 +22,7 @@ limitations under the License. 
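Editor's note: to make the weights abstractions above concrete, here is a minimal sketch (illustrative only, not part of the patch) of how a converter could materialize a small constant through TrtWeightStore and wrap it as a converter argument; the helper name AddUnitConstantExample and the 2x3 shape are invented for the example:

#include "tensorflow/compiler/tf2tensorrt/convert/weights.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {

// Hypothetical helper: builds a 2x3 FP32 constant filled with 1.0f.
Status AddUnitConstantExample(TrtWeightStore* store, TRT_TensorOrWeights* out) {
  // Describe the shape; DimsAdapter can be constructed directly from
  // an nvinfer1::Dims.
  nvinfer1::Dims dims;
  dims.nbDims = 2;
  dims.d[0] = 2;
  dims.d[1] = 3;

  // The store owns the backing Tensor until the engine is built.
  auto weights_or =
      store->GetTempWeights(nvinfer1::DataType::kFLOAT, DimsAdapter(dims));
  TF_RETURN_IF_ERROR(weights_or.status());
  TRT_ShapedWeights weights = weights_or.ValueOrDie();

  // Fill the six floats with 1.0f; SetValues dispatches on the TRT dtype.
  TF_RETURN_IF_ERROR(weights.SetValues(1.0f));

  // Wrap as a converter argument; is_weights() is true for this object.
  *out = TRT_TensorOrWeights(weights);
  VLOG(2) << out->DebugString();
  return Status::OK();
}

}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow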
#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/refcount.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -67,5 +66,4 @@ REGISTER_KERNEL_BUILDER(Name("GetCalibrationDataOp").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 7e0e40ceedc..4b53587e75b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -20,11 +20,14 @@ limitations under the License. #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/function.h" @@ -34,6 +37,8 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/clusters/utils.h" +#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -42,42 +47,93 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/env_var.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { +namespace { Logger& logger = *Logger::GetLogger(); using absl::StrAppend; using absl::StrCat; using ::nvinfer1::IRuntime; -using ::stream_executor::port::StatusOr; -// A helper class to call done() when destructed for asynchronous execution. -// Helps simultaneous execution of native and TRT engines. +#define LOG_FIRST_FEW_WARNING_WITH_PREFIX \ + LOG_FIRST_N(WARNING, 5) << "TF-TRT Warning: " -class AsyncHelper : public core::RefCounted { +// Allocates device memory for an execution context to execute a TensorRT +// engine and records the relevant information for deallocating the memory when +// the engine finishes execution. 
+class ContextDeviceMemory { public: - AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} - - ~AsyncHelper() override { this->operator()(); } + ContextDeviceMemory() + : execution_context_(nullptr), + device_memory_allocator_(nullptr), + device_memory_(nullptr) {} + + ~ContextDeviceMemory() { + if (device_memory_) { + device_memory_allocator_->free(device_memory_); + } + } - void operator()() { - if (!called_) { - done_(); - called_ = true; + Status AllocateDeviceMemory(nvinfer1::IExecutionContext* execution_context, + TRTBaseAllocator* device_memory_allocator, + size_t device_memory_size) { + execution_context_ = execution_context; + device_memory_allocator_ = device_memory_allocator; + device_memory_ = nullptr; + VLOG(2) << "Device memory size for TensorRT engine " << device_memory_size; + if (device_memory_size > 0) { + device_memory_ = device_memory_allocator_->allocate( + device_memory_size, + /*unused alignment=*/0, /*flags=*/0); + if (device_memory_ == nullptr) { + return errors::InvalidArgument( + "Out of GPU memory for execution context"); + } } + { + tensorflow::profiler::TraceMe activity( + "setDeviceMemory", tensorflow::profiler::TraceMeLevel::kInfo); + execution_context_->setDeviceMemory(device_memory_); + } + return Status::OK(); } + private: + nvinfer1::IExecutionContext* execution_context_; + TRTBaseAllocator* device_memory_allocator_; + void* device_memory_; +}; + +// Macros for asynchronous execution, such as OP_REQUIRES_OK_ASYNC, require an +// object with operator(). Provides such an object with a noop operator() +// because we don't need such macros to invoke the DoneCallback for the +// TRTEngineOp. +struct DummyAsyncHelper { + void operator()() {} +}; + +// A helper class to call the DoneCallback for the TRTEngineOp when the object +// is destructed, to support asynchronous execution of the native segment and +// TRT engines for the TRTEngineOp. +class AsyncHelper : public core::RefCounted { + public: + AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} + + ~AsyncHelper() override { done_(); } + private: AsyncOpKernel::DoneCallback done_; - bool called_ = false; // Has `done_` been called? }; +} // end anonymous namespace + // This OP can construct TRTEngine on the fly and if construction of engine // fails, executes equivalent subgraph as a TensorFlow function. class TRTEngineOp : public AsyncOpKernel { @@ -88,50 +144,63 @@ class TRTEngineOp : public AsyncOpKernel { AsyncOpKernel::DoneCallback done) override; private: - using CacheType = - LRUCache, std::unique_ptr, - VectorTensorShapeHasher>; - - // Execute calibration + // Executes calibration asynchronously. void ExecuteCalibration(OpKernelContext* ctx, TRTEngineCacheResource* cache_res, - AsyncHelper* helper); - - // Construct a function handle for executing native funcdef graph - // These are the exact same function. - - Status ConstructFunctionHandle(FunctionLibraryRuntime* lib, - const string& device_name); - - // Execute replaced native segment as function Op. - void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); - - // Execute the tensorrt engine. Returns whether we need to retry by running - // the native segment. - bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context); - - // Allocate necessary resources for calibration + AsyncHelper* async_helper); + + // Constructs a function handle for the segment of the TRTEngineOp.
+ ::stream_executor::port::StatusOr + ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name, + bool allow_soft_placement = false, + size_t num_inputs = 0, size_t num_outputs = 0); + + // Imports the GraphDef for the segment of the TRTEngineOp to + // segment_graph_def_. + Status ImportSegmentGraphDef(FunctionLibraryRuntime* lib, + const string& device_name); + + // Executes the native segment as function Op asynchronously. + void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* async_helper); + + // Allocates the device memory for the execution context and enqueues the + // TensorRT engine for execution. Also deallocates the device memory. Returns + // a non-OK status if execution fails, in which case the caller may fall back + // to running the native segment. + Status ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context, + int trt_context_idx, + const TrtShapeOptimizationProfile& profiles, + TRTBaseAllocator* allocator); + + // Allocates necessary resources for calibration. Status AllocateCalibrationResources(OpKernelContext* ctx, TRTEngineCacheResource* cache_res); Status GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res); - // Get engine for the input shape - StatusOr GetEngine( - const std::vector& input_shapes, OpKernelContext* ctx, - TRTEngineCacheResource* cache_res); + // Returns a pair of 1) An EngineContext object that is compatible with the + // input and 2) The index of the IExecutionContext compatible with the input. + // If a cuda engine for the given input shapes can't be found, returns + // (nullptr, 0) to allow native engine execution. Returns an error code for + // any problem that would prevent both TensorRT engine execution and native + // segment execution. + ::stream_executor::port::StatusOr> GetEngine( + const std::vector& input_concrete_shapes, + OpKernelContext* ctx, TRTEngineCacheResource* cache_resource); + + // Builds and returns a cuda engine for the input shapes. If building the + // engine fails, enters a dummy entry into the cache_resource cache so we + // don't continually try to build the same failing engine. + ::stream_executor::port::StatusOr> + BuildEngine(const std::vector& input_concrete_shapes, + int batch_size, bool use_calibration, + TRTInt8Calibrator* calibrator, + TRTEngineCacheResource* cache_resource, OpKernelContext* ctx); // Verify that the input shapes are consistent and can be handled by this op. Status VerifyInputShapes(const std::vector& shapes); - // Return engine batch in cached_engne_batch_sizes_ which is closest to input - // batch. - Status GetEngineInputShapes( - const CacheType& cache, - const std::vector& actual_input_shapes, - std::vector* engine_input_shapes); - std::vector input_nodes_; std::vector output_nodes_; @@ -142,7 +211,7 @@ class TRTEngineOp : public AsyncOpKernel { NameAttrList func_; // GraphDef representation of the segment. - GraphDef segment_graph_; + GraphDef segment_graph_def_; // Engine Precision mode. TrtPrecisionMode precision_mode_; @@ -154,12 +223,35 @@ class TRTEngineOp : public AsyncOpKernel { // Whether to calibrate INT8 engine. bool calibration_mode_; - // Maximum number of cached engines + // Whether to use implicit batch dimension for TensorRT. + bool use_implicit_batch_; + + // Whether to collect optimization profiles for TensorRT, only used when + // use_implicit_batch_=false. + bool profile_generation_mode_; + + // Optimization profile generation strategy. + ProfileStrategy profile_strategy_; + + // Whether the TRTEngineOp has any input with unknown dimensions.
+ bool has_dynamic_shape_input_; + + // Whether to build TensorRT engines at runtime. + bool allow_build_at_runtime_; + + // Whether to allow soft placement when the graph is executed with native + // TensorFlow. + bool allow_soft_placement_; + + // Maximum number of cached engines. int max_cached_engines_; + // Flag to detect whether native segment nodes have been deleted from graph + bool native_segment_absent_; + int64 workspace_size_; mutex engine_mutex_; - FunctionLibraryRuntime::Handle func_handle_; + FunctionLibraryRuntime::Handle native_execution_func_handle_; // The finalized calibrator for inference. std::unique_ptr calibrator_; @@ -167,20 +259,45 @@ class TRTEngineOp : public AsyncOpKernel { // If true, create calibration graph for INT8 mode. Otherwise, we are using // user-provided quantization ranges. bool use_calibration_; + + tensorflow::grappler::Cluster* cluster_; + + // Array of all input shapes, collected from the input_shapes attribute when + // constructing the TRTEngineOp. The input_shapes attribute is set during + // graph conversion time. This data is used to retrieve which input dimensions + // could be unknown. During inference time this information is not available + // otherwise (all shapes are known (concrete) shapes when we run inference). + std::vector input_partial_shapes_; + // Shapes, excluding resource inputs. + std::vector input_partial_shapes_filtered_; + + // The TF node can have more inputs than the TRT engine: resource inputs are + // saved as weight in the engine, instead of passing that as engine input. + // Input mask is true for those TF input that are TRT engine inputs. + std::vector input_mask_; + + // Whether to use explicit precision (QDQ) mode. + bool use_explicit_precision_; }; -#define TYPECASE(dt, X, Y) \ +#define TYPECASE(dt, X) \ case dt: { \ return (void*)X->flat::Type>().data(); \ } void* GetTensorAddress(const Tensor* tensor_ptr) { - auto tensor_type = tensor_ptr->dtype(); + const auto tensor_type = tensor_ptr->dtype(); switch (tensor_type) { - TYPECASE(DT_FLOAT, tensor_ptr, dest_ptr); - TYPECASE(DT_HALF, tensor_ptr, dest_ptr); - TYPECASE(DT_INT8, tensor_ptr, dest_ptr); - TYPECASE(DT_INT32, tensor_ptr, dest_ptr); + TYPECASE(DT_FLOAT, tensor_ptr); + TYPECASE(DT_HALF, tensor_ptr); + TYPECASE(DT_INT8, tensor_ptr); + TYPECASE(DT_INT32, tensor_ptr); +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + TYPECASE(DT_BOOL, tensor_ptr); +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + TYPECASE(DT_UINT8, tensor_ptr); +#endif default: { LOG(ERROR) << "Unsupported Data type " << DataTypeString(tensor_type); return nullptr; @@ -232,8 +349,14 @@ static Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, return Status::OK(); } -Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, - const string& device_name) { +::stream_executor::port::StatusOr +TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name, + bool allow_soft_placement, + size_t num_inputs, size_t num_outputs) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ConstructFunctionHandle", + tensorflow::profiler::TraceMeLevel::kInfo); VLOG(1) << "Constructing function handle"; if (lib == nullptr) { return errors::Internal("Context function library is null"); @@ -241,12 +364,55 @@ Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.state_handle = ""; inst_ops.target = device_name; - return lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), 
inst_ops, - &func_handle_); + if (!native_segment_absent_ && allow_soft_placement) { + const FunctionDef* fdef = + lib->GetFunctionLibraryDefinition()->Find(func_.name()); + if (!fdef) { + return errors::Internal( + StrCat("Can't find FunctionDef for ", func_.name())); + } + bool ints_on_device = + fdef->attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 && + fdef->attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b(); + // kIntsOnDeviceAttr is not compatible with is_multi_device_function which + // is needed to support allow_soft_placement. + if (ints_on_device) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Function " << name() + << " has attribute kIntsOnDeviceAttr=true " + "and will be executed natively with allow_soft_placement=false. " + "If this is a problem, please re-generate your SavedModel with " + "the TF-TRT runtime you are using."; + } else { + inst_ops.is_multi_device_function = true; + inst_ops.input_devices.resize(num_inputs, device_name); + inst_ops.output_devices.resize(num_outputs, device_name); + inst_ops.config_proto.set_allow_soft_placement(true); + } + } + FunctionLibraryRuntime::Handle func_handle; + Status status = lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), + inst_ops, &func_handle); + if (status.ok()) { + return func_handle; + } + return status; +} + +Status TRTEngineOp::ImportSegmentGraphDef(FunctionLibraryRuntime* lib, + const string& device_name) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ImportSegmentGraphDef", + tensorflow::profiler::TraceMeLevel::kInfo); + TF_ASSIGN_OR_RETURN(FunctionLibraryRuntime::Handle func_handle, + ConstructFunctionHandle(lib, device_name)); + return FunctionDefToGraphDef(func_handle, lib, &segment_graph_def_); } TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : AsyncOpKernel(context) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::TRTEngineOp", tensorflow::profiler::TraceMeLevel::kInfo); // read serialized_engine OP_REQUIRES_OK(context, context->GetAttr("serialized_segment", &serialized_segment_)); @@ -262,21 +428,61 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("calibration_data", &calibration_data)); OP_REQUIRES_OK(context, context->GetAttr("segment_func", &func_)); - OP_REQUIRES(context, !func_.name().empty(), - errors::InvalidArgument( - "The TF function for the TRT segment could not be empty")); OP_REQUIRES_OK(context, TrtPrecisionModeFromName(precision_string, &precision_mode_)); OP_REQUIRES_OK(context, context->GetAttr("use_calibration", &use_calibration_)); - func_handle_ = kInvalidHandle; - if (!static_engine_) { - FunctionLibraryRuntime* lib = context->function_library(); - OP_REQUIRES_OK(context, - ConstructFunctionHandle(lib, context->device()->name())); - OP_REQUIRES_OK(context, - FunctionDefToGraphDef(func_handle_, lib, &segment_graph_)); + OP_REQUIRES_OK(context, + context->GetAttr("input_shapes", &input_partial_shapes_)); + auto status = + context->GetAttr("_allow_build_at_runtime", &allow_build_at_runtime_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _allow_build_at_runtime in " + << context->device()->name() + << ", thus setting _allow_build_at_runtime=true"; + allow_build_at_runtime_ = true; + } else { + OP_REQUIRES_OK(context, status); + } + + // Get a mask of non-resource inputs. 
+ std::vector in_types; + input_mask_.resize(input_partial_shapes_.size()); + OP_REQUIRES_OK(context, context->GetAttr("InT", &in_types)); + for (int i = 0; i < input_mask_.size(); i++) { + input_mask_[i] = (in_types[i] != DataType::DT_RESOURCE); + } + + // Filter the shapes to exclude resources. + for (int i = 0; i < input_partial_shapes_.size(); i++) { + if (input_mask_[i]) { + input_partial_shapes_filtered_.push_back(input_partial_shapes_[i]); + } + } + + status = context->GetAttr("_allow_soft_placement", &allow_soft_placement_); + if (status.code() == tensorflow::error::NOT_FOUND) { + allow_soft_placement_ = true; + } else { + OP_REQUIRES_OK(context, status); + } + + status = context->GetAttr("use_explicit_precision", &use_explicit_precision_); + if (!status.ok()) { + use_explicit_precision_ = false; + } + + // When a TF-TRT converted model without native segments is loaded, + // func_ can be empty. + native_segment_absent_ = (func_.name() == ""); + native_execution_func_handle_ = kInvalidHandle; + if (!native_segment_absent_) { + if (!static_engine_) { + OP_REQUIRES_OK(context, ImportSegmentGraphDef(context->function_library(), + context->device()->name())); + } } + // TODO(laigd): calibration_data is used in TF v1.x and we keep it only for // backward compatibility reasons. Remove it once all known users switch to // 2.0. @@ -289,178 +495,423 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) } OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); + + status = context->GetAttr("_use_implicit_batch", &use_implicit_batch_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _use_implicit_batch in " << context->device()->name() + << ", thus setting _use_implicit_batch=true"; + use_implicit_batch_ = true; + } + + status = + context->GetAttr("_profile_generation_mode", &profile_generation_mode_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _profile_generation_mode in " + << context->device()->name() + << ", thus setting _profile_generation_mode=false"; + profile_generation_mode_ = false; + } + if (static_engine_) { + if (profile_generation_mode_) profile_generation_mode_ = false; + } + if (use_implicit_batch_) { + OP_REQUIRES(context, !profile_generation_mode_, + errors::InvalidArgument( + "profile_generation_mode_=true is only supported if " + "use_implicit_batch=false")); + if (input_partial_shapes_.empty()) { + VLOG(1) << "Attribute input_shapes is not set. This happens probably " + << "because you are using a model that is already converted " + << "to TensorRT with a previous version of TF-TRT (i.e. includes " + << "TRTEngineOp in graph). This is not an error. If you convert " + << "the original model again to TensorRT, the attributes " + << "input_shapes will be set automatically."; + } + } else { + OP_REQUIRES( + context, !input_partial_shapes_.empty(), + errors::InvalidArgument( + "Explicit batch mode requires attribute input_shapes to be set." + "If you are using a model that was converted to TensorRT by a " + "previous version of TF-TRT, (i.e. 
includes TRTEngineOp in graph " + "without the input_shapes attribute), then you need to convert the " + "original model again to TensorRT in order to set the attribute " + "input_shapes.")); + + string profile_strategy_name; + status = context->GetAttr("profile_strategy", &profile_strategy_name); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found strategy in " << context->device()->name() + << ", thus setting profile_strategy='Range'"; + profile_strategy_ = ProfileStrategy::kRange; + } else { + OP_REQUIRES_OK(context, ProfileStrategyFromName(profile_strategy_name, + &profile_strategy_)); + } + } + has_dynamic_shape_input_ = absl::c_any_of( + input_partial_shapes_filtered_, + [](PartialTensorShape shape) { return !shape.IsFullyDefined(); }); + VLOG(2) << "TRTEngineOp has_dynamic_shape_input_: " + << has_dynamic_shape_input_; +} + +// Copies input tensor ctx->input(i) (which is in device memory) to the host, +// and place the resulting host tensor to the back of native_inputs. +Status CopyToHostAsync(OpKernelContext* ctx, std::vector* native_inputs, + int i, const cudaStream_t stream) { + // The TRTEngineOp has all ctx->inputs on the device. In contrast, the + // native segment expects to find int32 inputs on the host. We copy int32 + // inputs from device to host. + + AllocatorAttributes allocator_attr; + allocator_attr.set_on_host(true); + Tensor t; + TF_RETURN_IF_ERROR(ctx->allocate_temp( + ctx->input_dtype(i), ctx->input(i).shape(), &t, allocator_attr)); + native_inputs->push_back(t); + const Tensor& gpu_tensor = ctx->input(i); + auto ret = cudaMemcpyAsync( + t.flat().data(), gpu_tensor.flat().data(), + t.NumElements() * sizeof(int32), cudaMemcpyDeviceToHost, stream); + if (ret != 0) { + return errors::Internal("Could not copy tensor for native segment input"); + } + return Status::OK(); +} + +// Copies native_tensor, which is in host memory to ctx->output(t), which is in +// device memory. 
+Status CopyToDeviceAsync(OpKernelContext* ctx, const Tensor& native_tensor, + int t, cudaStream_t stream) { + Tensor* gpu_tensor; + TF_RETURN_IF_ERROR( + ctx->allocate_output(t, native_tensor.shape(), &gpu_tensor)); + auto ret = cudaMemcpyAsync(gpu_tensor->flat().data(), + native_tensor.flat().data(), + native_tensor.NumElements() * sizeof(int32), + cudaMemcpyHostToDevice, stream); + if (ret != 0) { + return errors::Internal("Could not copy tensor for native segment output"); + } + return Status::OK(); } void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, - AsyncHelper* helper) { - std::vector inputs; - std::vector* outputs = new std::vector(); - if (func_handle_ == kInvalidHandle) { - OP_REQUIRES_OK_ASYNC( - ctx, - ConstructFunctionHandle(ctx->function_library(), ctx->device()->name()), - *helper); + AsyncHelper* async_helper) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ExecuteNativeSegment", + tensorflow::profiler::TraceMeLevel::kInfo); + std::vector native_inputs; + std::vector* native_outputs = new std::vector(); + DummyAsyncHelper dummy_async_helper; + if (native_execution_func_handle_ == kInvalidHandle) { + ::stream_executor::port::StatusOr + status_or_handle = ConstructFunctionHandle( + ctx->function_library(), ctx->device()->name(), + allow_soft_placement_, ctx->num_inputs(), ctx->num_outputs()); + OP_REQUIRES_OK_ASYNC(ctx, status_or_handle.status(), dummy_async_helper); + native_execution_func_handle_ = status_or_handle.ValueOrDie(); } + auto lib = ctx->function_library(); FunctionLibraryRuntime::Options opts; opts.rendezvous = ctx->rendezvous(); opts.cancellation_manager = ctx->cancellation_manager(); opts.runner = ctx->runner(); - inputs.reserve(ctx->num_inputs()); + native_inputs.reserve(ctx->num_inputs()); + int n_copies = 0; + cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); for (int i = 0; i < ctx->num_inputs(); i++) { - inputs.push_back(ctx->input(i)); + if (ctx->input_dtype(i) != DT_INT32) { + native_inputs.push_back(ctx->input(i)); + } else { + OP_REQUIRES_OK_ASYNC(ctx, CopyToHostAsync(ctx, &native_inputs, i, stream), + dummy_async_helper); + n_copies++; + } + } + if (n_copies > 0) { + // If we have any int32 tensors, then wait until data is copied to host. + cudaStreamSynchronize(stream); } - helper->Ref(); // Increment count for calculating native graph VLOG(1) << "Executing native segment: " << name(); - lib->Run(opts, func_handle_, inputs, outputs, - [this, ctx, outputs, helper](const Status& s) { - core::ScopedUnref sc(helper); - OP_REQUIRES_OK_ASYNC(ctx, s, *helper); - VLOG(1) << "Native Segment completed"; - for (size_t t = 0; t < outputs->size(); ++t) { - ctx->set_output(t, outputs->at(t)); - } - delete outputs; - }); + // Increment the reference count of the async_helper by 1. When the native + // segment finishes execution asynchronously, we decrement the reference + // count of the object. 
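The Ref()/ScopedUnref pattern described just above amounts to shared ownership of the done callback: done() fires only when the last owner, either the enclosing call or the asynchronous native-segment callback, releases its reference. A rough standalone equivalent built on std::shared_ptr (illustrative only; core::RefCounted is not implemented this way, and the names below are made up):

#include <functional>
#include <memory>
#include <thread>

// Fires the callback in its destructor, mirroring AsyncHelper's
// "call done() when destructed" behavior.
struct DoneHolder {
  explicit DoneHolder(std::function<void()> done) : done_(std::move(done)) {}
  ~DoneHolder() { done_(); }
  std::function<void()> done_;
};

void RunAsync(std::function<void()> work, std::function<void()> done) {
  auto holder = std::make_shared<DoneHolder>(std::move(done));
  // The lambda holds a copy of the shared_ptr, so done() runs only after both
  // this function has returned and the asynchronous work has completed.
  std::thread([holder, work]() { work(); }).detach();
}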
+ async_helper->Ref(); + lib->Run( + opts, native_execution_func_handle_, native_inputs, native_outputs, + [this, ctx, native_outputs, async_helper, stream](const Status& s) { + core::ScopedUnref sc(async_helper); + DummyAsyncHelper dummy_async_helper; + std::unique_ptr> outputs_wrapper(native_outputs); + OP_REQUIRES_OK_ASYNC(ctx, s, dummy_async_helper); + VLOG(1) << "Native Segment completed"; + int n_copies = 0; + for (size_t t = 0; t < native_outputs->size(); ++t) { + if (native_outputs->at(t).dtype() == DT_INT32) { + OP_REQUIRES_OK_ASYNC( + ctx, CopyToDeviceAsync(ctx, native_outputs->at(t), t, stream), + dummy_async_helper); + n_copies++; + } else { + ctx->set_output(t, native_outputs->at(t)); + } + } + if (n_copies > 0) { + cudaStreamSynchronize(stream); + } + }); } void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, TRTEngineCacheResource* cache_res, - AsyncHelper* helper) { + AsyncHelper* async_helper) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ExecuteCalibration", + tensorflow::profiler::TraceMeLevel::kInfo); VLOG(1) << "Executing TRT calibration: " << name(); - helper->Ref(); - core::ScopedUnref sc(helper); + DummyAsyncHelper dummy_async_helper; CalibrationContext* calib_ctx = cache_res->calib_ctx_.get(); const int num_inputs = ctx->num_inputs(); // TODO(laigd): need to check that input shape matches. // Pass input data to calibrator std::unordered_map input_data; + bool input_size_ok = true; for (int i = 0; i < num_inputs; i++) { const Tensor& t = ctx->input(i); void* data_address = GetTensorAddress(&t); OP_REQUIRES_ASYNC(ctx, data_address, errors::InvalidArgument( "Unsupported data type encountered in input ", i), - *helper); + dummy_async_helper); // Check the allocated buffer is sufficient for input - const auto device_tensor = - calib_ctx->device_tensors_.at(i).AccessTensor(ctx); - CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); + const auto device_tensor = &calib_ctx->device_tensors_.at(i); + if (t.TotalBytes() != device_tensor->TotalBytes()) { + // This can happen if the network has data dependent shapes. + input_size_ok = false; + VLOG(2) << "Size differs for input " << i + << ", skipping calibration for this input."; + break; + } input_data.emplace(StrCat(IONamePrefixes::kInputPHName, i), data_address); } - VLOG(2) << "Filled map for sending"; - // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files - const cudaStream_t* stream = CHECK_NOTNULL( - reinterpret_cast(ctx->op_device_context() - ->stream() - ->implementation() - ->GpuStreamMemberHack())); - // If calibrator is terminated before, it means an error has occurred. - // - // Note: setBatch() will wait until TRTInt8Calibrator::getBatch() is called - // the first time before proceeding, so if buildCudaEngine() returns an error, - // it means getBatch() is never called, and the setBatch() here will hang - // until setDone() is called later by the calibration thread in - // AllocateCalibrationResources(). In that case, this setBatch() will always - // be able to detect the error and return false. - OP_REQUIRES_ASYNC(ctx, calib_ctx->calibrator_->setBatch(input_data, *stream), - errors::Internal("Failed to feed calibration data"), - *helper); - VLOG(2) << "Passed calibration data"; - ExecuteNativeSegment(ctx, helper); + if (input_size_ok) { + VLOG(2) << "Filled map for sending"; + // Copied from gpu_kernel_helper.h as the header can only be used in *.cu.cc + // files. 
+ cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); + // TRTInt8Calibrator::setBatch will wait until TRTInt8Calibrator::getBatch + // is called before proceeding with feeding the calibration data to the + // calibrator. It returns true if the calibration data is accepted and + // returns false if calibration is terminated due to errors. + // + // If TRTInt8Calibrator::getBatch is never called, which could happen if + // there is any problem in building the cuda engine for calibration inside + // TensorRT, then the TRTInt8Calibrator::setBatch call here will hang until + // TRTInt8Calibrator::setDone is called by the calibration thread in + // AllocateCalibrationResources. + // + // In both of the above cases, setBatch here returns a boolean value to + // indicate the result of the calibration process. + if (!calib_ctx->calibrator_->setBatch(input_data, stream)) { + VLOG(2) << "Failed to feed calibration data"; + } else { + VLOG(2) << "Passed calibration data"; + } + } + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Calibration requires native segment, but is not found in " + "the graph."; + } } -Status TRTEngineOp::VerifyInputShapes(const std::vector& shapes) { - if (shapes.empty()) { +Status TRTEngineOp::VerifyInputShapes( + const std::vector& input_concrete_shapes) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::VerifyInputShapes", + tensorflow::profiler::TraceMeLevel::kInfo); + if (input_concrete_shapes.empty()) { return errors::InvalidArgument("Input shapes are empty, for ", name()); } - if (shapes[0].dims() < 1) { - return errors::InvalidArgument("Input shapes contain scalar, for ", name(), - ": ", - TensorShapeUtils::ShapeListString(shapes)); - } - const int batch_size = shapes[0].dim_size(0); - for (const TensorShape& shape : shapes) { - if (shape.dims() < 1 || batch_size != shape.dim_size(0)) { + if (input_partial_shapes_filtered_.empty()) { + if (!use_implicit_batch_) { return errors::InvalidArgument( - "Input shapes are inconsistent on the batch dimension, for ", name(), - ": ", TensorShapeUtils::ShapeListString(shapes)); + "Explicit batch mode requires input_partial_shapes_ ", + "to contain the dynamic input shapes to TRTEngineOp"); } - } - return Status::OK(); -} - -bool AreShapesCompatible(const std::vector& actual_shapes, - const std::vector& cached_shapes) { - auto match_shape = [](const TensorShape& actual_shape, - const TensorShape& cached_shape) { - // Match the rank. - if (actual_shape.dims() != cached_shape.dims()) return false; - // Match the batch size. - if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false; - // Match remaining dimensions. - for (int i = 1; i < actual_shape.dims(); ++i) { - if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false; + // If the graph was converted with an earlier version of TF-TRT, it can + // happen that the input_partial_shapes_ vector is not set (see + // input_shapes attribute handling in the TRTEngineOp constructor). + // In implicit batch mode it is allowed to have empty input_partial_shapes_, + // since it is only required in explicit batch mode (see the input_shapes + // attribute of ConvertGraphDefToEngine in TRTEngineOp::GetEngine. + } else { + // Additional consistency checks if input_partial_shapes_ is present. 
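The consistency checks below compare each concrete input shape against the partial shape recorded in the input_shapes attribute, where a dimension of -1 means unknown. A condensed standalone version of that rule, using plain integer vectors instead of TensorShape/PartialTensorShape and a hypothetical function name, would be:

#include <cstdint>
#include <vector>

// Returns true if `concrete` is an instance of `partial`, where -1 in
// `partial` means "any extent". The ranks must match exactly.
bool MatchesPartialShape(const std::vector<int64_t>& concrete,
                         const std::vector<int64_t>& partial) {
  if (concrete.size() != partial.size()) return false;
  for (size_t d = 0; d < concrete.size(); ++d) {
    if (partial[d] != -1 && concrete[d] != partial[d]) return false;
  }
  return true;
}

// e.g. MatchesPartialShape({8, 224, 224, 3}, {-1, 224, 224, 3}) is true, while
//      MatchesPartialShape({8, 224, 224, 3}, {-1, 112, 224, 3}) is false.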
+ const string error_msg = StrCat( + "Input shapes do not match input partial shapes stored in graph, for ", + name(), ": ", DebugString(input_concrete_shapes), + " != ", DebugString(input_partial_shapes_filtered_)); + if (input_concrete_shapes.size() != input_partial_shapes_filtered_.size()) { + return errors::InvalidArgument(error_msg); } - return true; - }; - for (int i = 0; i < actual_shapes.size(); ++i) { - if (!match_shape(actual_shapes[i], cached_shapes[i])) { - return false; + for (int i = 0; i < input_concrete_shapes.size(); i++) { + if (input_concrete_shapes[i].dims() != + input_partial_shapes_filtered_[i].dims()) { + return errors::InvalidArgument(error_msg); + } + } + for (int i = 0; i < input_concrete_shapes.size(); i++) { + for (int d = 0; d < input_concrete_shapes[i].dims(); d++) { + if (input_partial_shapes_filtered_[i].dim_size(d) != -1) { + if (input_concrete_shapes[i].dim_size(d) != + input_partial_shapes_filtered_[i].dim_size(d)) { + return errors::InvalidArgument(error_msg); + } + } + } } } - return true; -} -Status TRTEngineOp::GetEngineInputShapes( - const CacheType& cache, const std::vector& actual_input_shapes, - std::vector* engine_input_shapes) { - // VerifyInputShapes() already ensured that all input shapes have same - // batch size, and are not scalars. - *engine_input_shapes = actual_input_shapes; - int64 min_matched_batch_size = kint64max; - for (const auto& pair : cache) { - const std::vector& cached_input_shapes = pair.first; - // This should not happen, but just for safety. - if (actual_input_shapes.size() != cached_input_shapes.size()) { + if (use_implicit_batch_) { + if (input_concrete_shapes[0].dims() < 1) { return errors::InvalidArgument( - "Input shape list size mismatch for ", name(), - ", cached size: ", cached_input_shapes.size(), - " vs. actual size: ", actual_input_shapes.size()); - } - if (AreShapesCompatible(actual_input_shapes, cached_input_shapes)) { - const int cached_batch_size = cached_input_shapes[0].dim_size(0); - if (min_matched_batch_size > cached_batch_size) { - min_matched_batch_size = cached_batch_size; - *engine_input_shapes = cached_input_shapes; + "Input shapes contain scalar, for ", name(), ": ", + TensorShapeUtils::ShapeListString(input_concrete_shapes)); + } + const int batch_size = input_concrete_shapes[0].dim_size(0); + if (batch_size < 1) { + return errors::InvalidArgument( + "Incorrect batch dimension, for ", name(), ": ", + TensorShapeUtils::ShapeListString(input_concrete_shapes)); + } + for (const TensorShape& shape : input_concrete_shapes) { + if (batch_size != shape.dim_size(0)) { + return errors::InvalidArgument( + "Input shapes are inconsistent on the batch dimension, for ", + name(), ": ", + TensorShapeUtils::ShapeListString(input_concrete_shapes)); } } } return Status::OK(); } +static bool AllowEngineNativeSegmentExecution() { + bool value; + Status status = + ReadBoolFromEnvVar("TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION", + /*default_val=*/true, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; +} + void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, AsyncOpKernel::DoneCallback done) { - auto helper = new AsyncHelper(done); - core::ScopedUnref sc(helper); + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ComputeAsync", tensorflow::profiler::TraceMeLevel::kInfo); + + // Invoke DoneCallback when this object is destructed, which could be after + // this routine finishes execution, in particular, when native segment is + // executed. 
+ auto async_helper = new AsyncHelper(done); + core::ScopedUnref sc(async_helper); + + // For all async execution macros, use this object as there is no need to call + // DoneCallback from those macros. + DummyAsyncHelper dummy_async_helper; // Get TRT resource. TRTEngineCacheResource* cache_res = nullptr; - OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper); + OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), + dummy_async_helper); core::ScopedUnref unref_cache_res(cache_res); + // Get shapes of inputs to engine. + std::vector input_concrete_shapes; + input_concrete_shapes.reserve(ctx->num_inputs()); + std::vector input_concrete_shapes_filtered; + for (int i = 0; i < ctx->num_inputs(); ++i) { + input_concrete_shapes.push_back(ctx->input(i).shape()); + if (ctx->input(i).dtype() != DataType::DT_RESOURCE) { + input_concrete_shapes_filtered.push_back(ctx->input(i).shape()); + } + } + + /// TODO(lsugy): fix case of engine with only resource inputs. + Status verify_input_shape_status = + VerifyInputShapes(input_concrete_shapes_filtered); + // TODO(bixia): Fix the segmentation. + if (!verify_input_shape_status.ok() && !native_segment_absent_) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Running native segment for " << name() + << " due to failure in verifying input shapes: " + << verify_input_shape_status.error_message(); + ExecuteNativeSegment(ctx, async_helper); + return; + } + + if (!use_implicit_batch_ && + (has_dynamic_shape_input_ || cache_res->profiles_.HasShapeTensor())) { + OP_REQUIRES_OK_ASYNC(ctx, cache_res->profiles_.CollectShapeValues(ctx), + dummy_async_helper); + cache_res->profiles_.SetInputMask(input_mask_); + if (profile_generation_mode_) { + // Collecting new shapes for profiles can only be done once. After the + // shapes are converted to TRT profiles, no shapes can be collected + // anymore. + OP_REQUIRES_ASYNC(ctx, cache_res->profiles_.GetNumProfiles() == 0, + errors::Unimplemented("Cannot collect new shapes when " + "profiles are already created."), + dummy_async_helper); + // Just collect the input shape info and return. The shapes are used to + // generate optimization profiles during engine creation. + cache_res->profiles_.AddShape(input_concrete_shapes); + VLOG(1) + << "Native segment is used while collecting shapes for profiles."; + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Native segment is required for profile generation, " + "but it is not found in the graph."; + } + return; + } else if (cache_res->profiles_.GetNumProfiles() == 0 && !static_engine_) { + // Add current shape if we did not collect any shapes so far. + if (!cache_res->profiles_.HasShape()) { + cache_res->profiles_.AddShape(input_concrete_shapes); + } + // Create profiles out of collected shapes during profile generation. + cache_res->profiles_.InitProfiles(input_partial_shapes_, + profile_strategy_); + } + } + // Run calibration if in int8+calibration mode. // * Logic in TF 1.x: // - During conversion: calibration_mode_ is true and cache size is 0, so it // will run calibration. - // - During inference: calibration_data will be set, so calibration_mode_ is - // false and it won't trigger calibration. + // - During inference: calibration_data will be set, so calibration_mode_ + // is false and it won't trigger calibration. // * Logic in TF 2.0: // - During conversion: similar to 1.x. 
// - During inference: calibration_data will still be empty, but cache will - // contain the the calibrated engine, so it won't trigger calibration. + // contain the calibrated engine, so it won't trigger calibration. // // TODO(laigd): consider the following alternatives: // 1. Serialize the state (calibration or inference) using @@ -473,174 +924,158 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, // TODO(laigd): better encapsulation. mutex_lock lock(engine_mutex_); if (!cache_res->calib_ctx_) { + // Add profiles if we are in dynamic shape mode. + if (!use_implicit_batch_ && (has_dynamic_shape_input_ || + cache_res->profiles_.HasShapeTensor())) { + cache_res->profiles_.InitCalibProfile(input_concrete_shapes); + } OP_REQUIRES_OK_ASYNC(ctx, AllocateCalibrationResources(ctx, cache_res), - *helper); + dummy_async_helper); } } // TODO(laigd): check that the input shapes match the shapes of the // persistent tensor in the calibration resource. - ExecuteCalibration(ctx, cache_res, helper); + ExecuteCalibration(ctx, cache_res, async_helper); return; } - // Get shapes of inputs to engine. - std::vector input_shapes; - input_shapes.reserve(ctx->num_inputs()); - for (int i = 0; i < ctx->num_inputs(); ++i) { - input_shapes.push_back(ctx->input(i).shape()); - } - OP_REQUIRES_OK_ASYNC(ctx, VerifyInputShapes(input_shapes), *helper); - StatusOr status = GetEngine(input_shapes, ctx, cache_res); - OP_REQUIRES_OK_ASYNC(ctx, status.status(), *helper); - - EngineContext* engine_context = status.ValueOrDie(); - if (!engine_context->cuda_engine) { - VLOG(1) << "Engine retrieval for input shapes: " - << TensorShapeUtils::ShapeListString(input_shapes) - << " failed. Running native segment for " << name(); - ExecuteNativeSegment(ctx, helper); + ::stream_executor::port::StatusOr> status = + GetEngine(input_concrete_shapes, ctx, cache_res); + OP_REQUIRES_OK_ASYNC(ctx, status.status(), dummy_async_helper); + + EngineContext* engine_context = std::move(status.ValueOrDie()).first; + int trt_context_idx = std::move(status.ValueOrDie()).second; + auto may_execute_native_segment = [&] { + if (!native_segment_absent_ && !AllowEngineNativeSegmentExecution()) { + ctx->CtxFailure( + errors::Aborted("User disallowed engine native segment execution.")); + return false; + } else if (native_segment_absent_) { + ctx->CtxFailure( + errors::Aborted("Native segment execution is enabled but " + " native segment is not found in the graph.")); + return false; + } + return true; + }; + if (!engine_context->GetCudaEngine()) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Engine retrieval for input shapes: " + << TensorShapeUtils::ShapeListString(input_concrete_shapes) + << " failed. Running native segment for " << name(); + if (may_execute_native_segment()) { + ExecuteNativeSegment(ctx, async_helper); + } return; } - const bool retry = ExecuteTrtEngine(ctx, engine_context); - if (retry) { - LOG(WARNING) << "Failed to execute engine, " - << "retrying with native segment for " << name(); - ExecuteNativeSegment(ctx, helper); + Status stat = + ExecuteTrtEngine(ctx, engine_context, trt_context_idx, + cache_res->profiles_, cache_res->allocator_.get()); + if (stat.ok()) return; + + LOG_FIRST_FEW_WARNING_WITH_PREFIX << "Failed to execute engine: " << stat + << " Retrying with native segment for " + << name(); + if (!may_execute_native_segment()) { return; } + // When Native Segment execution is enabled, release any outputs that + // are allocated. ExecuteNativeSegment will re-allocate them and + // fail if they are currently allocated. 
+ // The Tensor pointer in the returned TensorValue must be explicitly + // deleted. + for (int i = 0; i < ctx->num_outputs(); i++) { + delete ctx->release_output(i).tensor; + } + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Native segment execution is enabled, " + "but native segment is not found in the graph."; + } } -bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, - EngineContext* engine_context) { +Status TRTEngineOp::ExecuteTrtEngine( + OpKernelContext* ctx, EngineContext* engine_context, int trt_context_idx, + const TrtShapeOptimizationProfile& profiles, TRTBaseAllocator* allocator) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::ExecuteTrtEngine", + tensorflow::profiler::TraceMeLevel::kInfo); VLOG(1) << "Executing TRT engine: " << name(); - auto& cuda_engine = engine_context->cuda_engine; - const bool kRetry = true; - // All inputs must have the same batch size, so just get it from the first - // input. - const int num_batch = ctx->input(0).shape().dim_size(0); - const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + nvinfer1::ICudaEngine* cuda_engine = engine_context->GetCudaEngine(); + + if (VLOG_IS_ON(2)) { + VLOG(2) << " Network name: " << cuda_engine->getName(); + VLOG(2) << " Activation size: " << engine_context->GetDeviceMemorySize() + << " bytes"; +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + // getWorkspaceSize() is deprecated as of TRT 8 + VLOG(2) << " Workspace size: " << cuda_engine->getWorkspaceSize() + << " bytes"; +#endif // #if !IS_TRT_VERSION_GE(8, 0, 0, 0) + VLOG(2) << " Datatype of " << cuda_engine->getNbBindings() + << " inputs/outputs"; + string binding_types = ""; + for (int i = 0; i < cuda_engine->getNbBindings(); i++) { + binding_types += " " + string(cuda_engine->getBindingName(i)) + ": " + + DebugString(cuda_engine->getBindingDataType(i)) + "\n"; + } + VLOG(2) << binding_types; + } + const int num_binding = cuda_engine->getNbBindings(); std::vector buffers(num_binding); - for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(IONamePrefixes::kInputPHName, i); - const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); - if (binding_index == -1) { - const string msg = - StrCat("Input node ", input_name, " not found, at ", name()); - LOG(ERROR) << msg; - ctx->SetStatus(errors::NotFound(msg)); - return !kRetry; - } - - const Tensor& input_tensor = ctx->input(i); - const TensorShape& input_shape = input_tensor.shape(); - if (num_batch != input_shape.dim_size(0)) { - LOG(ERROR) << "Input data has inconsistent batch size: " << num_batch - << " vs " << input_shape.dim_size(0); - return kRetry; - } - auto dtype = cuda_engine->getBindingDataType(binding_index); - switch (dtype) { - case nvinfer1::DataType::kFLOAT: - buffers[binding_index] = - const_cast(input_tensor.flat().data()); - break; - case nvinfer1::DataType::kHALF: - buffers[binding_index] = - const_cast(input_tensor.flat().data()); - break; - case nvinfer1::DataType::kINT8: - LOG(ERROR) << "INT8 inputs are not supported yet!"; - return kRetry; - case nvinfer1::DataType::kINT32: - buffers[binding_index] = - const_cast(input_tensor.flat().data()); - break; - default: - LOG(ERROR) << "Unknown TRT data type: " << static_cast(dtype); - return kRetry; - } - } + // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex + // for it. 
+ mutex_lock lock(engine_context->mu); + nvinfer1::IExecutionContext* execution_context; + bool has_device_memory; + TF_RETURN_IF_ERROR(engine_context->GetExecutionContext( + trt_context_idx, &execution_context, &has_device_memory)); - for (int i = 0; i < ctx->num_outputs(); i++) { - // Create an output tensor - const string output_name = StrCat(IONamePrefixes::kOutputPHName, i); - const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); - Tensor* output_tensor = nullptr; - - TensorShape output_shape; - if (binding_index != -1) { - auto dims = cuda_engine->getBindingDimensions(binding_index); - std::vector trt_shape(dims.nbDims + 1); - trt_shape[0] = num_batch; - for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j]; - auto status = TensorShapeUtils::MakeShape( - trt_shape.data(), trt_shape.size(), &output_shape); - if (!status.ok()) { - LOG(ERROR) << "Failed to get output shape: " << status; - return kRetry; - } - } else { - const string msg = - StrCat("Ouput node ", output_name, " not found, at ", name()); - LOG(ERROR) << msg; - ctx->SetStatus(errors::NotFound(msg)); - return !kRetry; - } - auto status = ctx->allocate_output(i, output_shape, &output_tensor); - if (!status.ok()) { - LOG(ERROR) << "Allocating output failed with " << status; - ctx->SetStatus(status); - // Do not retry since we cannot allocate the same output twice. - // TODO(aaroey): ideally we should retry, fix this. - return !kRetry; - } - auto dtype = cuda_engine->getBindingDataType(binding_index); - switch (dtype) { - case nvinfer1::DataType::kFLOAT: - buffers[binding_index] = - const_cast(output_tensor->flat().data()); - break; - case nvinfer1::DataType::kHALF: - buffers[binding_index] = - const_cast(output_tensor->flat().data()); - break; - case nvinfer1::DataType::kINT8: - LOG(WARNING) << "int8 is not supported yet!"; - return kRetry; - case nvinfer1::DataType::kINT32: - buffers[binding_index] = - const_cast(output_tensor->flat().data()); - break; - default: - LOG(WARNING) << "Unknown TRT data type: " << static_cast(dtype); - return kRetry; - } - } - // Copied from cuda_kernel_helper since it seems only valid in *.cu.cc files + if (VLOG_IS_ON(2)) { + VLOG(2) << "Selected execution context: " << trt_context_idx; + } + const int num_batch = + use_implicit_batch_ ? ctx->input(0).shape().dim_size(0) : 0; + + TF_RETURN_IF_ERROR(SetTrtEngineInputs( + cuda_engine, execution_context, trt_context_idx, buffers, + use_implicit_batch_, num_batch, profiles, ctx)); + + TF_RETURN_IF_ERROR(SetTrtEngineOutputs(cuda_engine, execution_context, + trt_context_idx, buffers, + use_implicit_batch_, num_batch, ctx)); + // Copied from gpu_kernel_helper.h as the header can only be used in *.cu.cc + // files. const cudaStream_t* stream = CHECK_NOTNULL( reinterpret_cast(ctx->op_device_context() ->stream() ->implementation() ->GpuStreamMemberHack())); - // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex - // for it. - mutex_lock lock(engine_context->mu); - // TODO(jie): trt enqueue does not return error - auto ret = engine_context->execution_context->enqueue(num_batch, &buffers[0], - *stream, nullptr); - if (!ret) { - LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name(); - return kRetry; - } - // Synchronization will be done by TF. 
- return !kRetry; + ContextDeviceMemory context_device_memory; + if (!has_device_memory) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::AllocateDeviceMemory", + tensorflow::profiler::TraceMeLevel::kInfo); + // Allocate device memory for the TensorRT engine execution. The device + // memory will be released when context_device_memory goes out of scope. + TF_RETURN_IF_ERROR(context_device_memory.AllocateDeviceMemory( + execution_context, allocator, engine_context->GetDeviceMemorySize())); + } + + // Enqueue the TensorRT engine for execution. + return TrtEnqueue(execution_context, buffers, stream, use_implicit_batch_, + num_batch); } Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::GetEngineCachResource", + tensorflow::profiler::TraceMeLevel::kInfo); // Canonicalize the op name by removing the scopes if any. This is mainly // because in TFv2, the function graph can be instantiated in various ways and // it'll insert scope names to the name of the TRTEngineOps, which will result @@ -662,116 +1097,253 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, }}); } -StatusOr TRTEngineOp::GetEngine( - const std::vector& input_shapes, OpKernelContext* ctx, - TRTEngineCacheResource* cache_res) { - static EngineContext empty_context; +::stream_executor::port::StatusOr> +TRTEngineOp::BuildEngine(const std::vector& input_concrete_shapes, + int batch_size, bool use_calibration, + TRTInt8Calibrator* calibrator, + TRTEngineCacheResource* cache_resource, + OpKernelContext* ctx) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::BuildEngine", tensorflow::profiler::TraceMeLevel::kInfo); + TRT_ENSURE(cache_resource); + TRT_ENSURE(ctx); + // Use concrete shapes for implicit batch mode and partial shapes for + // explicit batch mode. + bool use_concrete_shapes = + use_implicit_batch_ || cache_resource->profiles_.IsStaticCompatible(); + const std::vector& conversion_input_shapes = + use_concrete_shapes + ? std::vector(input_concrete_shapes.begin(), + input_concrete_shapes.end()) + : input_partial_shapes_; + + VLOG(1) << "Building a new TensorRT engine for " << name() + << " with input shapes: " << DebugString(conversion_input_shapes); + + std::unordered_map device_map; + DeviceNameUtils::ParsedName full_parsed_name; + DeviceNameUtils::ParseFullName(ctx->device()->name(), &full_parsed_name); + device_map.emplace(ctx->device()->name(), + grappler::GetDeviceInfo(full_parsed_name)); + tensorflow::grappler::VirtualCluster cluster(device_map); + + TrtUniquePtrType engine; + auto status = convert::ConvertGraphDefToEngine( + segment_graph_def_, ctx, precision_mode_, batch_size, workspace_size_, + conversion_input_shapes, &logger, cache_resource->allocator_.get(), + calibrator, &engine, use_calibration, use_implicit_batch_, nullptr, + &cache_resource->profiles_, name(), use_explicit_precision_, &cluster, + ctx->device()->name()); + if (!status.ok()) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Engine creation for " << name() << " failed. " + << "The native segment will be used instead. " + << "Reason: " << status; + // Store an empty engine in the cache for these input shapes so we don't try + // to build the same failing engine again. 
+ cache_resource->cache_.emplace(input_concrete_shapes, + std::make_unique()); + return status; + } + return engine; +} +::stream_executor::port::StatusOr> +TRTEngineOp::GetEngine(const std::vector& input_concrete_shapes, + OpKernelContext* ctx, + TRTEngineCacheResource* cache_res) { + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::GetEngine", tensorflow::profiler::TraceMeLevel::kInfo); + static EngineContext empty_context; mutex_lock lock(engine_mutex_); - // Using first input to get batch size is reliable - VerifyInputShapes() has - // verified that. - const int batch_size = input_shapes[0].dim_size(0); + // Using first input to get batch size is reliable - VerifyInputShapes() + // guarantees that the first input is not a scalar. As such we can always use + // the first input to get the batch size for implicit batch mode. For explicit + // batch mode, this value is not used. + const int batch_size = input_concrete_shapes[0].dim_size(0); + // TODO(Tamas): remove the need for batch_size in explicit_batch mode auto& cache = cache_res->cache_; auto allocator = cache_res->allocator_.get(); if (allocator == nullptr) { - return &empty_context; + return std::pair(&empty_context, 0); } // Handle the static engine case. For static engines, the cache will have a // single element containing the only engine. if (static_engine_) { if (cache.size()) { - if (AreShapesCompatible(input_shapes, cache.begin()->first)) { - return cache.begin()->second.get(); + // TODO(laigd): need a better shape compatibility check for the case where + // implicit batch is disabled. + if (!use_implicit_batch_ || + AreShapesCompatible(input_concrete_shapes, cache.begin()->first)) { + int profile_id = 0; + if (!use_implicit_batch_) + profile_id = + cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + if (profile_id != -1) { + return std::pair(cache.begin()->second.get(), + profile_id); + } } - return &empty_context; + return std::pair(&empty_context, 0); } TrtUniquePtrType infer(nvinfer1::createInferRuntime(logger)); infer->setGpuAllocator(allocator); + // Need to initialize plugins in order to deserialize engines that contain + // plugins. + MaybeInitializeTrtPlugins(&logger); TrtUniquePtrType static_engine( infer->deserializeCudaEngine(serialized_segment_.c_str(), serialized_segment_.size(), nullptr)); + int profile_id = 0; + if (static_engine && !use_implicit_batch_) { + // load profiles + std::vector exec_contexts; + TF_RETURN_IF_ERROR(cache_res->profiles_.RestoreProfiles( + static_engine.get(), ctx->num_inputs())); + TF_RETURN_IF_ERROR(cache_res->profiles_.CreateExecutionContexts( + static_engine.get(), &exec_contexts)); + cache.emplace(input_concrete_shapes, + std::make_unique(std::move(static_engine), + std::move(exec_contexts))); + VLOG(1) << "Added new engine to cache of " << name() + << ". Cache size: " << cache.size(); + // Query which profile of the new engine matches the actual input. + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + if (profile_id == -1) { + return std::pair(&empty_context, 0); + } + EngineContext* engine_context = cache_res->GetEngineContext(profile_id); + return std::pair(engine_context, profile_id); + } + + if (!static_engine) { + if (!allow_build_at_runtime_) { + // Store an empty engine in the cache so we don't try to load the same + // failing engine again. 
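Caching an empty entry for shapes whose build failed is a form of negative caching: the next lookup for the same shapes hits the sentinel and the op falls back to the native segment immediately instead of repeating an expensive, failing build. A simplified sketch of the pattern, with a hypothetical FakeEngine type and a plain std::map instead of the LRUCache used by TRTEngineCacheResource:

#include <cstdint>
#include <map>
#include <memory>
#include <vector>

struct FakeEngine {};  // Hypothetical stand-in for a built TensorRT engine.

using ShapeKey = std::vector<std::vector<int64_t>>;  // One shape per input.

std::map<ShapeKey, std::unique_ptr<FakeEngine>> engine_cache;

// Returns the cached engine for `shapes`, which may be null if an earlier
// build for these shapes failed.
FakeEngine* GetOrBuild(const ShapeKey& shapes,
                       std::unique_ptr<FakeEngine> (*build)(const ShapeKey&)) {
  auto it = engine_cache.find(shapes);
  if (it != engine_cache.end()) return it->second.get();  // Hit or sentinel.
  std::unique_ptr<FakeEngine> engine = build(shapes);
  // On failure `engine` is null; cache it anyway so the same failing build is
  // not retried on every call, matching the behavior of the op above.
  auto inserted = engine_cache.emplace(shapes, std::move(engine));
  return inserted.first->second.get();
}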
+ cache.emplace(input_concrete_shapes, std::make_unique()); + return std::pair(&empty_context, 0); + } + if (segment_graph_def_.node().empty()) { + Status status = ImportSegmentGraphDef(ctx->function_library(), + ctx->device()->name()); + if (!status.ok()) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX << "Getting segment graph for " + << name() << " failed. " + << "Reason: " << status; + } + } + auto result = BuildEngine(input_concrete_shapes, batch_size, + /*use_calibration=*/false, + /*calibrator=*/nullptr, cache_res, ctx); + if (!result.ok()) { + return std::pair(&empty_context, 0); + } + static_engine = std::move(result.ValueOrDie()); + } + auto raw_static_engine = static_engine.get(); - const auto max_batch_size = raw_static_engine->getMaxBatchSize(); - // Static engine will have max_batch_size for batch size so that all inputs - // will map to this single engine. - std::vector engine_input_shapes(input_shapes); - for (int i = 0; i < engine_input_shapes.size(); i++) { - // TODO(tmorris): will all inputs have batch size as first dimension?? - engine_input_shapes[i].set_dim(0, max_batch_size); + std::vector engine_input_shapes(input_concrete_shapes); + + int max_batch_size = 1; + if (use_implicit_batch_) { + max_batch_size = raw_static_engine->getMaxBatchSize(); + // Static engine will have max_batch_size for batch size so that all + // inputs will map to this single engine. + for (int i = 0; i < engine_input_shapes.size(); i++) { + engine_input_shapes[i].set_dim(0, max_batch_size); + } } + + ExecutionContext context = ExecutionContext::Create(raw_static_engine); // TODO(laigd): here we assume engine_input_shapes matches the actual input // shapes of the engine, we should verify that. cache.emplace(engine_input_shapes, - absl::make_unique( - std::move(static_engine), - TrtUniquePtrType( - raw_static_engine->createExecutionContext()))); + std::make_unique(std::move(static_engine), + std::move(context))); // Runtime is safe to delete after engine creation VLOG(1) << "Size of serialized TRT engine: " << serialized_segment_.capacity(); string tmp; // Swap with temporary empty string to deallocate the CPU memory. serialized_segment_.swap(tmp); - if (max_batch_size < batch_size) { - return &empty_context; + if (use_implicit_batch_ && (max_batch_size < batch_size)) { + return std::pair(&empty_context, 0); } - return cache.at(engine_input_shapes).get(); + return std::pair(cache.at(engine_input_shapes).get(), + 0); } // static_engine_ - // Handle the dynamic engine case. See if there is a compatible engine cached. - std::vector engine_input_shapes; - TF_RETURN_IF_ERROR( - GetEngineInputShapes(cache, input_shapes, &engine_input_shapes)); + int profile_id = -1; + if (!use_implicit_batch_) { + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + // Since all profiles are already created at this point, finding no + // compatible profiles results in falling back to native TF. + if (profile_id == -1) { + return std::pair(&empty_context, 0); + } + } - // If matched, use that engine. Otherwise, we will look in cache for that - // exact shape and possibly create a new engine if it is not in cache. 
- if (!cache.count(engine_input_shapes)) { - TrtUniquePtrType engine; - bool convert_successfully = false; - LOG(INFO) << "Building a new TensorRT engine for " << name() - << " input shapes: " - << TensorShapeUtils::ShapeListString(engine_input_shapes); + EngineContext* engine_contexts; + if (use_implicit_batch_) { + engine_contexts = cache_res->GetEngineContext(input_concrete_shapes); + } else { + engine_contexts = cache_res->GetEngineContext(profile_id); + } - // Convert to partial shapes - std::vector partial_shapes(engine_input_shapes.begin(), - engine_input_shapes.end()); + // If cache does not have a compatible engine then create a new engine. + if (engine_contexts == nullptr) { + if (!allow_build_at_runtime_) { + LOG_FIRST_FEW_WARNING_WITH_PREFIX + << "Found no engine in cache matching input shapes. " + << "Not building a new engine because " + << "allow_build_at_runtime=False. " + << "The native segment will be used instead."; + // Store an empty engine in the cache for these input shapes so we don't + // try to build the same failing engine again. + cache.emplace(input_concrete_shapes, std::make_unique()); + return std::pair(&empty_context, 0); + } // Up to this point, calibrator_ can never be empty, since otherwise it // means calibration_mode_ is true and this path won't get executed. - auto status = convert::ConvertGraphDefToEngine( - segment_graph_, precision_mode_, batch_size, workspace_size_, - partial_shapes, &logger, allocator, calibrator_.get(), &engine, - use_calibration_, &convert_successfully); - if (!status.ok()) { - LOG(WARNING) << "Engine creation for " << name() << " failed. " - << "The native segment will be used instead. " - << "Reason: " << status; - // Store an empty engine in the cache for these input shapes so we don't - // try to build the same failing engine again. - cache.emplace(engine_input_shapes, absl::make_unique()); - return &empty_context; + auto result = + BuildEngine(input_concrete_shapes, batch_size, use_calibration_, + calibrator_.get(), cache_res, ctx); + if (!result.ok()) { + return std::pair(&empty_context, 0); } - TrtUniquePtrType exec_context( - engine->createExecutionContext()); - cache.emplace(engine_input_shapes, - absl::make_unique(std::move(engine), - std::move(exec_context))); + TrtUniquePtrType engine = + std::move(result.ValueOrDie()); + std::vector exec_contexts; + TF_RETURN_IF_ERROR(cache_res->profiles_.CreateExecutionContexts( + engine.get(), &exec_contexts)); + cache.emplace(input_concrete_shapes, + std::make_unique(std::move(engine), + std::move(exec_contexts))); VLOG(1) << "Added new engine to cache of " << name() << ". Cache size: " << cache.size(); + engine_contexts = cache.at(input_concrete_shapes).get(); + // Query which profile of the new engine matches the actual input. + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); } - return cache.at(engine_input_shapes).get(); + return std::pair(engine_contexts, + use_implicit_batch_ ? 0 : profile_id); } // TODO(hinsu): Move this allocation to CalibrationContext constructor, if // possible. Status TRTEngineOp::AllocateCalibrationResources( OpKernelContext* ctx, TRTEngineCacheResource* cache_res) { - cache_res->calib_ctx_ = absl::make_unique(); + tensorflow::profiler::TraceMe activity( + "TRTEngineOp::AllocateCalibrationResources", + tensorflow::profiler::TraceMeLevel::kInfo); + cache_res->calib_ctx_ = std::make_unique(); auto* cres = cache_res->calib_ctx_.get(); // Get the input shapes. + /// TODO(lsugy): support INT8 calibration in non-frozen mode. 
const int batch_size = ctx->input(0).dim_size(0); const int num_inputs = ctx->num_inputs(); std::vector shapes; @@ -779,46 +1351,62 @@ Status TRTEngineOp::AllocateCalibrationResources( VLOG(1) << "Constructing calibrator"; for (int i = 0; i < num_inputs; i++) { // allocate workspace on device for inputs + auto* input = &cres->device_tensors_.at(i); const Tensor& t = ctx->input(i); shapes.emplace_back(t.shape()); - Tensor* device_tensor; - TF_RETURN_IF_ERROR(ctx->allocate_persistent( - t.dtype(), t.shape(), &cres->device_tensors_.at(i), &device_tensor)); - CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - void* device_address = GetTensorAddress(device_tensor); + TF_RETURN_IF_ERROR(ctx->allocate_temp(t.dtype(), t.shape(), input)); + CHECK_EQ(t.TotalBytes(), input->TotalBytes()); // Crash OK + + void* device_address = GetTensorAddress(input); if (device_address == nullptr) { - return errors::InvalidArgument( - "Unsupported data type encountered in input ", i); + return errors::InvalidArgument("Unsupported data type [", + DebugString(t.dtype()), + "] encountered in input ", i); } cres->device_buffers_.emplace( StrCat(IONamePrefixes::kInputPHName, i), - std::pair(device_address, device_tensor->TotalBytes())); + std::pair(device_address, input->TotalBytes())); } cres->calibrator_.reset( new TRTInt8Calibrator(cres->device_buffers_, batch_size, name())); - const int platform_gpu_id = + const int platform_device_id = ctx->device()->tensorflow_gpu_device_info()->gpu_id; - if (platform_gpu_id < 0) { + if (platform_device_id < 0) { LOG(ERROR) << "Can't get gpu_device_info from context->device()"; return errors::InvalidArgument( "Context->device doesn't contain device info!"); } + bool use_concrete_shapes = + use_implicit_batch_ || cache_res->profiles_.IsStaticCompatible(); + const std::vector& conversion_input_shapes = + use_concrete_shapes + ? std::vector(shapes.begin(), shapes.end()) + : input_partial_shapes_; + cache_res->Ref(); - cres->thr_.reset(new std::thread([this, cres, shapes, platform_gpu_id, - cache_res]() { + string platform_device_name = ctx->device()->name(); + cres->thr_.reset(new std::thread([this, cres, shapes, conversion_input_shapes, + platform_device_id, platform_device_name, + cache_res, ctx]() { core::ScopedUnref sc(cache_res); - LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id - << ", Calibration Resource @ " << cres; - auto err = cudaSetDevice(platform_gpu_id); + VLOG(1) << "Starting calibration thread on device " << platform_device_id + << ", Calibration Resource @ " << cres; + auto err = cudaSetDevice(platform_device_id); if (err != cudaSuccess) { // TODO(aaroey): should return error here. - LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id + LOG(ERROR) << "Couldn't set cuda device to " << platform_device_id << " in calibration thread"; } - std::vector partial_shapes(shapes.begin(), - shapes.end()); + + std::unordered_map device_map; + DeviceNameUtils::ParsedName full_parsed_name; + DeviceNameUtils::ParseFullName(platform_device_name, &full_parsed_name); + device_map.emplace(platform_device_name, + grappler::GetDeviceInfo(full_parsed_name)); + tensorflow::grappler::VirtualCluster cluster(device_map); + // ConvertGraphDefToEngine() will try to build the engine. 
This thread // will loop inside buildCudaEngine() consuming the calibration data // that is set by the TF op, and drive the builder until calibrator @@ -828,25 +1416,39 @@ Status TRTEngineOp::AllocateCalibrationResources( // TODO(aaroey): maybe setting the max batch size using the python // calibration wrapper class. auto s = convert::ConvertGraphDefToEngine( - this->segment_graph_, TrtPrecisionMode::INT8, + this->segment_graph_def_, ctx, TrtPrecisionMode::INT8, cres->calibrator_->getBatchSize(), this->workspace_size_, - partial_shapes, &cache_res->GetLogger(), cache_res->allocator_.get(), - cres->calibrator_.get(), &cres->engine_, - /*use_calibration=*/true, - /*convert_successfully=*/nullptr); + conversion_input_shapes, &cache_res->GetLogger(), + cache_res->allocator_.get(), cres->calibrator_.get(), &cres->engine_, + /*use_calibration=*/true, this->use_implicit_batch_, + /*convert_successfully=*/nullptr, + /*profiles=*/&cache_res->profiles_, name(), + /*use_explicit_precision=*/use_explicit_precision_, + /*cluster=*/&cluster, platform_device_name); if (!s.ok()) { LOG(ERROR) << "Calibration failed: " << s; cres->calibrator_->setDone(); // Ignore further pushes + cache_res->cache_.emplace(shapes, std::make_unique()); } else { // Transfer the ownership of the engine to the engine cache, so we can // dump it out during conversion for TF 2.0. mutex_lock lock(this->engine_mutex_); this->calibrator_ = std::move(cres->calibrator_); - TrtUniquePtrType exec_context( - cres->engine_->createExecutionContext()); - cache_res->cache_.emplace( - shapes, absl::make_unique(std::move(cres->engine_), - std::move(exec_context))); + if (!use_implicit_batch_ && + (has_dynamic_shape_input_ || cache_res->profiles_.HasShapeTensor())) { + std::vector exec_contexts; + auto calib_result = cache_res->profiles_.CreateExecutionContexts( + cres->engine_.get(), &exec_contexts); + cache_res->cache_.emplace( + shapes, std::make_unique(std::move(cres->engine_), + std::move(exec_contexts))); + } else { + ExecutionContext context = + ExecutionContext::Create(cres->engine_.get()); + cache_res->cache_.emplace( + shapes, std::make_unique(std::move(cres->engine_), + std::move(context))); + } } VLOG(1) << "Calibration loop terminated " << this->name(); @@ -860,5 +1462,4 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 497a2710c24..317f3a54357 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -13,44 +13,62 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include - -#include +#include #include +#include #include #include #include +#include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/function_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/math_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" -#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/function.h" -#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/gtl/stl_util.h" -#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/public/version.h" +#include "tsl/framework/fixedpoint/FixedPoint.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT -#include "third_party/gpus/cuda/include/cuda_runtime_api.h" +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { using ::absl::StrCat; using ::testing::ElementsAre; +struct TestParam { + bool static_engine; +}; + class TRTEngineOpTestBase : public OpsTestBase { public: - void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1) { + void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1, + PartialTensorShape shape = PartialTensorShape({-1, -1}), + bool use_implicit_batch = true, + bool allow_build_at_runtime = true, + bool static_engine = false) { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); @@ -59,40 +77,86 @@ class TRTEngineOpTestBase : public OpsTestBase { Scope s = Scope::NewRootScope(); auto feed = ops::_Arg(s.WithOpName("TensorRTInputPH_0"), dtype, 0); auto add = ops::Add(s.WithOpName("add"), feed, feed); - ops::_Retval(s.WithOpName("TensorRTOutputPH_0"), add, 0); + ops::_Retval give_me_a_name(s.WithOpName("TensorRTOutputPH_0"), add, 0); // Serialize the graph. TRTEngineOp will convert it using dynamic mode. 
GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); Graph* graph = s.graph(); - const char* op_name = "myop"; - TF_ASSERT_OK( - convert::RegisterGraphToFunctionLibrary(graph_def, graph, op_name)); + TF_ASSERT_OK(convert::RegisterGraphToFunctionLibrary(graph_def, graph, + std::string(kOpName))); TF_ASSERT_OK(flib_def_->AddLibrary(graph->flib_def())); - PartialTensorShape shape({-1, -1}); + string segment_string; + if (static_engine) { + convert::TRTOptimizationPass::ConversionParams params; + convert::EngineInfo info; + info.segment_graph_def.CopyFrom(graph_def); + info.precision_mode = TrtPrecisionMode::FP32; + info.max_workspace_size_bytes = 1 << 20; + info.engine_name = "TRTEngineOP_000_000"; + params.use_implicit_batch = use_implicit_batch; + params.trt_logger_name = "DefaultLogger"; + + TrtShapeOptimizationProfile profile; + // We set the input mask to true (no resource inputs) + std::vector input_mask = {true}; + profile.SetInputMask(input_mask); + // We set profile 0 to be incompatible with the input used in the test. + // This way we ensure that profile selection is tested. + TensorShape my_shape; + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{4, 2}, &my_shape)); + profile.AddShape({my_shape, {}}); + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{1, 2}, &my_shape)); + profile.AddShape({my_shape, {}}); + + profile.InitProfiles({shape}, ProfileStrategy::kOptimal); + std::vector shape_vec{shape, {}}; + TF_CHECK_OK(convert::CreateStaticEngine( + params, info, 1, shape_vec, &profile, &segment_string, nullptr)); + } // Create the op. + // In implicit batch mode, the input shapes that we specify here are not + // used for engine creation, we use the concrete shapes during inference + // time for creating the engine. + // In explicit batch mode, the input shapes attribute is used to define + // the network for the TensorRT engine. OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); NameAttrList function; - function.set_name(StrCat(op_name, "_native_segment")); - TF_ASSERT_OK(NodeDefBuilder(op_name, "TRTEngineOp") + function.set_name(StrCat(std::string(kOpName), "_native_segment")); + // We disable allow_soft_placement when executing the native segment of the + // TRTEngineOp for the following reasons: + // OpsTestBase only allow one device in the device manager. + // We need to define the GPU device to test TRTEngineOp. + // When allow_soft_placement is true, the TensorFlow runtime produces an + // error if a CPU device is not defined + // (see ProcessFunctionLibraryRuntime::InstantiateMultiDevice). 
+ TF_ASSERT_OK(NodeDefBuilder(std::string(kOpName), "TRTEngineOp") .Input(FakeInput(1, dtype)) .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) - .Attr("static_engine", false) + .Attr("static_engine", static_engine) .Attr("segment_func", function) - .Attr("serialized_segment", "") + .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) .Attr("precision_mode", "FP32") .Attr("use_calibration", false) + .Attr("profile_strategy", "optimal") + .Attr("_use_implicit_batch", use_implicit_batch) + .Attr("_allow_build_at_runtime", allow_build_at_runtime) + .Attr("_allow_soft_placement", false) .Attr("OutT", {dtype}) .Finalize(OpsTestBase::node_def())); TF_ASSERT_OK(InitOpWithFunctionLibrary()); } + static const absl::string_view kOpName; + template void AddSimpleInput(const TensorShape& shape) { std::vector input(shape.num_elements()); @@ -102,22 +166,49 @@ class TRTEngineOpTestBase : public OpsTestBase { void ResetInputs() { inputs_.clear(); - gtl::STLDeleteElements(&tensors_); + for (auto& temp : tensors_) { + delete temp; + } + tensors_.clear(); } private: Status InitOpWithFunctionLibrary() { OpKernel* kernel = nullptr; - Status status = CreateOpKernel(device_type_, device_, allocator(), - pflr_->GetFLR(device_->name()), node_def_, - TF_GRAPH_DEF_VERSION, &kernel); + auto flr = pflr_->GetFLR(device_->name()); + std::shared_ptr props; + Status status = NodeProperties::CreateFromNodeDef( + node_def_, flr->GetFunctionLibraryDefinition(), &props); + if (status.ok()) { + status.Update(CreateOpKernel(device_type_, device_, allocator(), flr, + props, TF_GRAPH_DEF_VERSION, &kernel)); + } kernel_ = std::unique_ptr(kernel); if (kernel_ != nullptr) input_types_ = kernel_->input_types(); return status; } }; -TEST_F(TRTEngineOpTestBase, DynamicShapes) { +class TRTEngineOpTestWithParam + : public TRTEngineOpTestBase, + public ::testing::WithParamInterface { + public: + TRTEngineOpTestWithParam() : param_(GetParam()) {} + + protected: + TestParam param_; +}; + +const absl::string_view TRTEngineOpTestBase::kOpName = "myop"; + +constexpr std::array TestParameters{TestParam{false}, + TestParam{true}}; + +INSTANTIATE_TEST_CASE_P(TRTEngineOpTestInstantiation, TRTEngineOpTestWithParam, + ::testing::ValuesIn(TestParameters)); + +TEST_F(TRTEngineOpTestBase, DynamicEngines) { + // Test dynamic engine creation during inference time TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/4); // Execute the op with batch size > 1. @@ -126,8 +217,8 @@ TEST_F(TRTEngineOpTestBase, DynamicShapes) { // Get the engine cache. TRTEngineCacheResource* cache_resource = nullptr; - TF_ASSERT_OK( - device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource)); + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); core::ScopedUnref sc(cache_resource); // It should contain only one engine. 
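The TrtShapeOptimizationProfile used in the static-engine setup above ultimately drives TensorRT's optimization-profile API. As a point of reference, a single profile expressed directly against the public nvinfer1 builder interface looks roughly like the sketch below; the input name matches the placeholder naming used by the test, while the concrete min/opt/max shapes are illustrative assumptions.

#include "NvInfer.h"

// Sketch: register one optimization profile for a 2-D input whose batch
// dimension is dynamic.
void AddExampleProfile(nvinfer1::IBuilder* builder,
                       nvinfer1::IBuilderConfig* config) {
  nvinfer1::IOptimizationProfile* profile =
      builder->createOptimizationProfile();
  // kMIN/kOPT/kMAX bound the shapes this profile (and the execution context
  // bound to it) will accept at run time.
  profile->setDimensions("TensorRTInputPH_0",
                         nvinfer1::OptProfileSelector::kMIN,
                         nvinfer1::Dims2{1, 2});
  profile->setDimensions("TensorRTInputPH_0",
                         nvinfer1::OptProfileSelector::kOPT,
                         nvinfer1::Dims2{4, 2});
  profile->setDimensions("TensorRTInputPH_0",
                         nvinfer1::OptProfileSelector::kMAX,
                         nvinfer1::Dims2{8, 2});
  config->addOptimizationProfile(profile);
  // Before inference, the context bound to this profile still needs the
  // concrete shape via IExecutionContext::setBindingDimensions().
}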
@@ -166,6 +257,98 @@ TEST_F(TRTEngineOpTestBase, DynamicShapes) { EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, AllowBuildAtRuntime) { + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + PartialTensorShape({-1, -1}), + /*use_implicit_batch=*/true, + /*allow_build_at_runtime=*/false); + + // Execute the op + TensorShape input_shape({2, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); + core::ScopedUnref sc(cache_resource); + + // It should contain a placeholder with an empty cuda_engine (to mark that + // engine creation was not successful for the given input shape). + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_EQ(ectx->GetCudaEngine(), nullptr); +} + +TEST_P(TRTEngineOpTestWithParam, ExplicitBatch) { + // Test inference in explicit batch mode with static input shapes. Static + // shapes in this context means that the TensorRT knows all the input shapes + // during engine creation time. + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + /*shape=*/PartialTensorShape({1, 2}), + /*use_implicit_batch=*/false, + /*allow_build_at_runtime=*/true, + /*static_engine=*/param_.static_engine); + + TensorShape input_shape({1, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); + core::ScopedUnref sc(cache_resource); + + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_NE(ectx->GetCudaEngine(), nullptr); +} + +TEST_P(TRTEngineOpTestWithParam, DynamicShapes) { + // Test inference in explicit batch mode with dynamic input shapes. Dynamic + // shapes in this context means that some input shapes for TensorRT are + // unknown during engine creation time. When we create the network, the + // unknow shapes are repsesented as -1. Before we run inference, these shapes + // have to be specified by calling setBindingDimensions. + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + /*shape=*/PartialTensorShape({-1, -1}), + /*use_implicit_batch=*/false, + /*allow_build_at_runtime=*/true, + param_.static_engine); + + TensorShape input_shape({1, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK(device_->resource_manager()->Lookup( + std::string(kTfTrtContainerName), std::string(kOpName), &cache_resource)); + core::ScopedUnref sc(cache_resource); + + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_NE(ectx->GetCudaEngine(), nullptr); + + // Execute the op with an incompatible shape. 
+ ResetInputs(); + TRTEngineOpTestBase::AddSimpleInput(TensorShape({1, 37})); + // Test that the op runs. This should fall back to native segment. + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + // We should still have a single engine that is not compatible with the input. + EXPECT_EQ(1, cache->size()); + EXPECT_EQ(0, cache->count({TensorShape({1, 37})})); +} + template class TRTEngineOpTest : public TRTEngineOpTestBase {}; @@ -191,5 +374,4 @@ TYPED_TEST(TRTEngineOpTest, Basic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 533dd02d460..6889b609d19 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -32,9 +32,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/lib/traceme.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -48,6 +48,9 @@ class CreateTRTResourceHandle : public OpKernel { } void Compute(OpKernelContext* ctx) override { + tensorflow::profiler::TraceMe activity( + "CreateTRTResourceHandle::Compute", + tensorflow::profiler::TraceMeLevel::kInfo); { mutex_lock l(mutex_); if (!initialized_) { @@ -71,9 +74,10 @@ class CreateTRTResourceHandle : public OpKernel { string resource_name_; Tensor handle_; mutex mutex_; - bool initialized_ GUARDED_BY(mutex_) = false; + bool initialized_ TF_GUARDED_BY(mutex_) = false; - TF_DISALLOW_COPY_AND_ASSIGN(CreateTRTResourceHandle); + CreateTRTResourceHandle(const CreateTRTResourceHandle&) = delete; + void operator=(const CreateTRTResourceHandle&) = delete; }; REGISTER_KERNEL_BUILDER(Name("CreateTRTResourceHandle") @@ -89,6 +93,9 @@ class InitializeTRTResource : public OpKernel { } void Compute(OpKernelContext* ctx) override { + tensorflow::profiler::TraceMe activity( + "InitializeTRTResource::Compute", + tensorflow::profiler::TraceMeLevel::kInfo); ResourceHandle handle = HandleFromInput(ctx, 0); core::RefCountPtr resource; OP_REQUIRES_OK( @@ -116,19 +123,21 @@ class InitializeTRTResource : public OpKernel { // Parse the serialized engines and add them to the cache. 
std::unique_ptr file; OP_REQUIRES_OK(ctx, ctx->env()->NewRandomAccessFile(filename, &file)); - auto reader = absl::make_unique(file.get()); + auto reader = std::make_unique(file.get()); uint64 offset = 0; int num_loaded_engine = 0; do { - string record; + tstring record; Status status = reader->ReadRecord(&offset, &record); if (errors::IsOutOfRange(status)) break; TRTEngineInstance engine_instance; engine_instance.ParseFromString(record); std::vector engine_input_shapes; - for (const TensorShapeProto& shape : engine_instance.input_shapes()) { + const auto& input_shapes = engine_instance.input_shapes(); + engine_input_shapes.reserve(input_shapes.size()); + for (const TensorShapeProto& shape : input_shapes) { engine_input_shapes.emplace_back(shape); } @@ -140,11 +149,23 @@ class InitializeTRTResource : public OpKernel { engine_instance.serialized_engine().c_str(), engine_instance.serialized_engine().size(), nullptr)); auto raw_engine = engine.get(); - resource->cache_.emplace( - engine_input_shapes, - absl::make_unique( - std::move(engine), TrtUniquePtrType( - raw_engine->createExecutionContext()))); + std::vector ctx_vec; + if (num_loaded_engine == 0) { + // Restore profiles if there are any. Currently only 1 engine is allowed + // in dynamic mode therefore we call this only for the 0th engine. + // it is a no-op in implicit batch mode. + OP_REQUIRES_OK(ctx, resource->profiles_.RestoreProfiles( + raw_engine, engine_input_shapes.size())); + OP_REQUIRES_OK(ctx, resource->profiles_.CreateExecutionContexts( + raw_engine, &ctx_vec)); + } else { + // Multiple engines are only available in static mode. For each engine + // we have only a single execution context. + ctx_vec.push_back(ExecutionContext::Create(raw_engine)); + } + resource->cache_.emplace(engine_input_shapes, + std::make_unique( + std::move(engine), std::move(ctx_vec))); ++num_loaded_engine; } while (1); VLOG(1) << "Loaded " << num_loaded_engine << " TRT engines for op " @@ -156,7 +177,8 @@ class InitializeTRTResource : public OpKernel { // Maximum number of cached engines int max_cached_engines_; - TF_DISALLOW_COPY_AND_ASSIGN(InitializeTRTResource); + InitializeTRTResource(const InitializeTRTResource&) = delete; + void operator=(const InitializeTRTResource&) = delete; }; REGISTER_KERNEL_BUILDER(Name("InitializeTRTResource") @@ -168,9 +190,14 @@ class SerializeTRTResource : public OpKernel { public: explicit SerializeTRTResource(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("delete_resource", &delete_resource_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("save_gpu_specific_engines", + &save_gpu_specific_engines_)); } void Compute(OpKernelContext* ctx) override { + tensorflow::profiler::TraceMe activity( + "SerializeTRTResource::Compute", + tensorflow::profiler::TraceMeLevel::kInfo); const string& resource_name = ctx->input(0).scalar()(); const string& filename = ctx->input(1).scalar()(); OP_REQUIRES(ctx, !filename.empty(), @@ -178,9 +205,12 @@ class SerializeTRTResource : public OpKernel { // Lookup engine cache resource. TRTEngineCacheResource* resource = nullptr; - OP_REQUIRES_OK( - ctx, ctx->resource_manager()->Lookup(std::string(kTfTrtContainerName), - resource_name, &resource)); + OP_REQUIRES( + ctx, + ctx->resource_manager() + ->Lookup(std::string(kTfTrtContainerName), resource_name, &resource) + .ok(), + errors::NotFound("TRTEngineCacheResource not yet created")); core::ScopedUnref unref_me(resource); // Terminate the calibration if any. 
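InitializeTRTResource above restores serialized engines from a record file and recreates one execution context per optimization profile for the first engine. Stripped down to the TensorRT calls involved, the restore path looks roughly like the sketch below; record parsing, ownership, and error handling are elided and the function name is illustrative.

#include <string>
#include <vector>

#include "NvInfer.h"

// Sketch: turn one serialized engine blob back into an engine plus one
// execution context per optimization profile. `logger` stands in for the
// TF-TRT logger; real code checks every returned pointer and manages
// ownership with smart pointers.
std::vector<nvinfer1::IExecutionContext*> RestoreEngineContexts(
    nvinfer1::ILogger& logger, const std::string& serialized_engine,
    nvinfer1::ICudaEngine** out_engine) {
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
  nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(
      serialized_engine.data(), serialized_engine.size(), nullptr);
  std::vector<nvinfer1::IExecutionContext*> contexts;
  for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) {
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    // Bind the i-th profile to the i-th context; context 0 uses profile 0 by
    // default, every other profile must be selected explicitly.
    if (i > 0) context->setOptimizationProfile(i);
    contexts.push_back(context);
  }
  *out_engine = engine;
  return contexts;
}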
@@ -189,29 +219,66 @@ class SerializeTRTResource : public OpKernel { // Serialize the engines and write them to file. std::unique_ptr file; OP_REQUIRES_OK(ctx, ctx->env()->NewWritableFile(filename, &file)); - auto writer = absl::make_unique(file.get()); + auto writer = std::make_unique(file.get()); int num_serialized_engines = 0; - for (const auto& pair : resource->cache_) { - // Ignore engines that failed to build. - const std::unique_ptr& engine = pair.second; - if (!engine || !engine->cuda_engine) continue; - - TRTEngineInstance engine_instance; - // Add input shapes. - const std::vector& engine_input_shapes = pair.first; - for (const TensorShape& shape : engine_input_shapes) { - shape.AsProto(engine_instance.add_input_shapes()); + if (save_gpu_specific_engines_) { + // If user requests TRT engines export, recursively create + // requisite directories. + const char* export_trt_engines_env = + getenv("TF_TRT_EXPORT_TRT_ENGINES_PATH"); + if (export_trt_engines_env) { + VLOG(1) << "Exporting TRT engines to directory: " + << export_trt_engines_env; + OP_REQUIRES_OK( + ctx, ctx->env()->RecursivelyCreateDir(export_trt_engines_env)); } - // Add the serialized engine. - TrtUniquePtrType engine_data( - engine->cuda_engine->serialize()); - engine_instance.set_serialized_engine(engine_data->data(), - engine_data->size()); - OP_REQUIRES_OK(ctx, - writer->WriteRecord(engine_instance.SerializeAsString())); - ++num_serialized_engines; + for (const auto& pair : resource->cache_) { + // Ignore engines that failed to build. + const std::unique_ptr& engine = pair.second; + if (!engine || !engine->GetCudaEngine()) continue; + + TRTEngineInstance engine_instance; + // Add input shapes. + const std::vector& engine_input_shapes = pair.first; + for (const TensorShape& shape : engine_input_shapes) { + shape.AsProto(engine_instance.add_input_shapes()); + } + // Add the serialized engine. 
+ TrtUniquePtrType engine_data( + engine->GetCudaEngine()->serialize()); + engine_instance.set_serialized_engine(engine_data->data(), + engine_data->size()); + + if (export_trt_engines_env) { + const std::string engine_filename = + std::string(export_trt_engines_env) + "/" + resource_name; + std::unique_ptr engine_file; + OP_REQUIRES_OK( + ctx, ctx->env()->NewWritableFile(engine_filename, &engine_file)); + OP_REQUIRES_OK(ctx, engine_file->Append(StringPiece( + static_cast(engine_data->data()), + engine_data->size()))); + + const std::string dims_filename = + std::string(export_trt_engines_env) + "/dims-" + resource_name; + std::unique_ptr dims_file; + OP_REQUIRES_OK( + ctx, ctx->env()->NewWritableFile(dims_filename, &dims_file)); + + for (const TensorShape& shape : engine_input_shapes) { + OP_REQUIRES_OK(ctx, + dims_file->Append(StringPiece(shape.DebugString()))); + } + } + + OP_REQUIRES_OK( + ctx, writer->WriteRecord(engine_instance.SerializeAsString())); + ++num_serialized_engines; + } + } else { + VLOG(1) << "TRT Engines are not serialized for op: " << resource_name; } VLOG(1) << "Serialized " << num_serialized_engines << " TRT engines for op " << resource_name << " on device " << ctx->device()->name() @@ -228,8 +295,10 @@ class SerializeTRTResource : public OpKernel { private: bool delete_resource_ = false; + bool save_gpu_specific_engines_ = true; - TF_DISALLOW_COPY_AND_ASSIGN(SerializeTRTResource); + SerializeTRTResource(const SerializeTRTResource&) = delete; + void operator=(const SerializeTRTResource&) = delete; }; REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), @@ -238,5 +307,4 @@ REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc index e82f89e9c2d..987b01eebcb 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc @@ -13,99 +13,231 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include - -#include +#include +#include +#include #include +#include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_instance.pb.h" // NOLINT #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/lib/gtl/stl_util.h" -#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/record_reader.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { -class TRTEngineResourceOpsTest : public OpsTestBase { +struct TestParam { + nvinfer1::Dims dims; + bool dynamic_shape; + int n_inputs; +}; + +class TRTEngineResourceOpsTest + : public OpsTestBase, + public ::testing::WithParamInterface { + public: + TRTEngineResourceOpsTest() : param_(GetParam()) {} + protected: void Reset() { + for (auto& temp : tensors_) { + delete temp; + } + for (auto& temp : managed_outputs_) { + delete temp; + } + tensors_.clear(); + managed_outputs_.clear(); inputs_.clear(); - gtl::STLDeleteElements(&tensors_); - gtl::STLDeleteElements(&managed_outputs_); + } + + ITensorProxyPtr NetworkWith1Input(nvinfer1::INetworkDefinition* network, + ITensorProxyPtr input) { + // Add a unary layer. + nvinfer1::IUnaryLayer* layer = + network->addUnary(*input->trt_tensor(), nvinfer1::UnaryOperation::kEXP); + EXPECT_NE(nullptr, layer); + return layer->getOutput(0); + } + + // Constructs a network with two inputs, where the second input is a shape + // tensor. We take a slice of the first input with the size of the slice + // specified by the second input, assuming the first input is a 2D tensor. + // We then add the slice to itself to produce the output of the network. 
+ ITensorProxyPtr NetworkWith2Inputs(nvinfer1::INetworkDefinition* network, + ITensorProxyPtr input) { + nvinfer1::Dims dims2{1, {2}}; + ITensorProxyPtr input2 = + network->addInput(absl::StrCat(IONamePrefixes::kInputPHName, 1).c_str(), + nvinfer1::DataType::kINT32, dims2); + EXPECT_NE(nullptr, input2->trt_tensor()); + + nvinfer1::Dims start{2, {0, 0}}; + nvinfer1::Dims stride{2, {1, 1}}; + auto slice_layer = + network->addSlice(*input->trt_tensor(), start, stride, stride); + EXPECT_NE(nullptr, slice_layer); + + slice_layer->setInput(2, *input2->trt_tensor()); + ITensorProxyPtr sliced_input = slice_layer->getOutput(0); + EXPECT_NE(nullptr, sliced_input->trt_tensor()); + + auto layer = network->addElementWise(*sliced_input->trt_tensor(), + *sliced_input->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); + EXPECT_NE(nullptr, layer); + return layer->getOutput(0); } TrtUniquePtrType CreateTRTEngine() { TrtUniquePtrType builder( nvinfer1::createInferBuilder(logger_)); TrtUniquePtrType network; -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - const uint32_t flags = 0U; +#if IS_TRT_VERSION_GE(8, 0, 0, 0) network = - TrtUniquePtrType(builder->createNetworkV2(flags)); + TrtUniquePtrType(builder->createNetworkV2( + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); #else - network = TrtUniquePtrType( - builder->createNetwork()); + network = + TrtUniquePtrType(builder->createNetworkV2( + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); #endif + // Add the input. - nvinfer1::Dims dims; - dims.nbDims = 1; - dims.d[0] = 1; + nvinfer1::Dims dims = this->param_.dims; + if (this->param_.dynamic_shape) { + std::fill(dims.d, dims.d + dims.nbDims, -1); + } + const std::string in_name = StrCat(IONamePrefixes::kInputPHName, 0); ITensorProxyPtr input = - network->addInput("input", nvinfer1::DataType::kFLOAT, dims); + network->addInput(in_name.c_str(), nvinfer1::DataType::kFLOAT, dims); EXPECT_NE(nullptr, input->trt_tensor()); - - // Add a unary layer. - nvinfer1::IUnaryLayer* layer = - network->addUnary(*input->trt_tensor(), nvinfer1::UnaryOperation::kEXP); - EXPECT_NE(nullptr, layer); - // Mark the output. - ITensorProxyPtr output = layer->getOutput(0); + ITensorProxyPtr output = + this->param_.n_inputs == 1 + ? this->NetworkWith1Input(network.get(), input) + : this->NetworkWith2Inputs(network.get(), input); output->setName("output"); network->markOutput(*output->trt_tensor()); // Build the engine + TrtUniquePtrType builder_config( + builder->createBuilderConfig()); + builder_config->setMaxWorkspaceSize(1 << 10); builder->setMaxBatchSize(1); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - TrtUniquePtrType builder_config(builder->createBuilderConfig()); - builder_config->setMaxWorkspaceSize(1 << 10); - TrtUniquePtrType engine( - builder->buildEngineWithConfig(*network, *builder_config)); -#else - builder->setMaxWorkspaceSize(1 << 10); + + if (this->param_.dynamic_shape) { + TrtShapeOptimizationProfile profile; + profile.SetShapeTensorMask(network.get()); + const int n_input = param_.n_inputs; + // Set the input mask to true (no resource input) + std::vector input_mask(n_input, true); + profile.SetInputMask(input_mask); + // The for loop defines three optimization profiles for the network. + for (int i = 1; i <= 3; i++) { + std::vector shape_vec(n_input); + // Define a shape with all dimensions set to 3*i. 
+ std::vector dimvec(this->param_.dims.nbDims, 3 * i); + TensorShape shape; + TF_CHECK_OK( + TensorShapeUtils::MakeShape(dimvec.data(), dimvec.size(), &shape)); + + const ITensorProxyPtr input = network->getInput(0); + const char* name = input->getName(); + VLOG(2) << "Defining profile for input " << name; + shape_vec[0] = shape; + if (this->param_.n_inputs == 2) { + // The shape of the shape tensor. + TF_CHECK_OK(TensorShapeUtils::MakeShape( + std::vector{param_.dims.nbDims}, &shape)); + shape_vec[1] = shape; + // Values of the shape tensor + Tensor shape_tensor(DT_INT32, shape); + // Define shape values {1, i}, where 1 is the value of the first dim, + // and i is the value of the second dimension. + std::vector vals{1, i}; + std::copy_n(vals.data(), vals.size(), + shape_tensor.flat().data()); + DataVec shape_values{{"one", {}}, {"two", shape_tensor}}; + TF_CHECK_OK(profile.CollectShapeValues(shape_values)); + } else { + TF_CHECK_OK(profile.CollectShapeValues({{"one", {}}})); + } + profile.AddShape(shape_vec); + } + std::vector input_partial_shapes; + TF_CHECK_OK(GetNetworkInputShapes(network.get(), &input_partial_shapes)); + profile.InitProfiles(input_partial_shapes, ProfileStrategy::kOptimal); + // Configure and build engine + TF_CHECK_OK(profile.ConfigureBuilder(builder.get(), builder_config.get(), + network.get())); + } + VLOG(2) << "ConfigureBuilder Finished"; TrtUniquePtrType engine( - builder->buildCudaEngine(*network)); -#endif + builder->buildEngineWithConfig(*network, *builder_config)); + VLOG(2) << "Engine constructed"; EXPECT_NE(nullptr, engine); return engine; } Logger& logger_ = *Logger::GetLogger(); + TestParam param_; }; -TEST_F(TRTEngineResourceOpsTest, Basic) { +#if IS_TRT_VERSION_GE(7, 1, 3, 0) +constexpr std::array TestParameters = { + TestParam{nvinfer1::Dims{1, {1}}, false, 1}, + TestParam{nvinfer1::Dims{1, {1}}, true, 1}, + TestParam{nvinfer1::Dims{2, {3, 3}}, true, 2}}; +#else +constexpr std::array TestParameters = { + TestParam{nvinfer1::Dims{1, {1}}, false, 1}, + TestParam{nvinfer1::Dims{1, {1}}, true, 1}}; +#endif + +INSTANTIATE_TEST_CASE_P(EngineResourceOpsTestInstantiation, + TRTEngineResourceOpsTest, + ::testing::ValuesIn(TestParameters)); + +TEST_P(TRTEngineResourceOpsTest, Basic) { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); ResourceMgr* rm = device->resource_manager(); SetDevice(DEVICE_GPU, std::move(device)); - // Create the resource handle. + // Create a resource handle. const string container(kTfTrtContainerName); const string resource_name = "myresource"; Reset(); @@ -117,11 +249,12 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { ResourceHandle handle = context_->mutable_output(0)->scalar()(); + // Check that a resource hasn't been created yet. TRTEngineCacheResource* resource = nullptr; EXPECT_TRUE( errors::IsNotFound(rm->Lookup(container, resource_name, &resource))); - // Create the resouce using an empty file with InitializeTRTResource. + // Create a resource and use an empty file to initialize the resource. 
Reset(); Env* env = Env::Default(); const string filename = io::JoinPath(testing::TmpDir(), "trt_engine_file"); @@ -136,21 +269,32 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { .Finalize(node_def())); TF_ASSERT_OK(InitOp()); AddInputFromArray(TensorShape({}), {handle}); - AddInputFromArray(TensorShape({}), {filename}); + AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); + + // Check that the resource is registered with the resource manager and the + // cache of the resource is empty. EXPECT_TRUE(rm->Lookup(container, resource_name, &resource).ok()); EXPECT_EQ(0, resource->cache_.size()); - // Create a serialized TRT engine file. + // Create an engine and add it to the cache of the resource. TrtUniquePtrType engine = CreateTRTEngine(); - TrtUniquePtrType context( - engine->createExecutionContext()); + ExecutionContext context = ExecutionContext::Create(engine.get()); + + std::vector engine_input_shape(1); + TF_ASSERT_OK(DimsAdapter(param_.dims).TensorShape(&(engine_input_shape[0]))); + if (param_.n_inputs > 1) { + engine_input_shape.push_back(TensorShape({1, 1})); + } resource->cache_.emplace( - std::vector{TensorShape({1, 1})}, - absl::make_unique(std::move(engine), std::move(context))); - resource->Unref(); + engine_input_shape, + std::make_unique(std::move(engine), std::move(context))); + // Check that the resource has multiple references before it is unregistered + // from the resource manager. + EXPECT_FALSE(resource->RefCountIsOne()); - // Serialize the engine using SerializeTRTResource op. + // Serialize the engine to a file and unregistered the resource from the + // resource manager. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "SerializeTRTResource") .Attr("delete_resource", true) @@ -161,8 +305,13 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {resource_name}); AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); + // Check that the resource now has only one reference. Detach the reference + // to the resource to destroy the resource. + EXPECT_TRUE(resource->RefCountIsOne()); + resource->Unref(); - // Make sure the cache is deleted. + // Check that unregistering the resource from the resource manager returns + // an error as the resource has already been unregistered. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "DestroyResourceOp") .Attr("ignore_lookup_error", false) @@ -172,22 +321,24 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {handle}); EXPECT_TRUE(errors::IsNotFound(RunOpKernel())); - // Verify the serialized engine file. + // Verify the file for the serialized engine. 
std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(filename, &file)); - auto reader = absl::make_unique(file.get()); + auto reader = std::make_unique(file.get()); uint64 offset = 0; - string record; + tstring record; TF_ASSERT_OK(reader->ReadRecord(&offset, &record)); TRTEngineInstance engine_instance; engine_instance.ParseFromString(record); - EXPECT_EQ(1, engine_instance.input_shapes_size()); - EXPECT_EQ(2, engine_instance.input_shapes(0).dim_size()); - EXPECT_EQ(1, engine_instance.input_shapes(0).dim(0).size()); - EXPECT_EQ(1, engine_instance.input_shapes(0).dim(1).size()); + EXPECT_EQ(param_.n_inputs, engine_instance.input_shapes_size()); + EXPECT_EQ(param_.dims.nbDims, engine_instance.input_shapes(0).dim_size()); + for (int i = 0; i < param_.dims.nbDims; i++) { + EXPECT_EQ(param_.dims.d[i], engine_instance.input_shapes(0).dim(i).size()); + } EXPECT_TRUE(errors::IsOutOfRange(reader->ReadRecord(&offset, &record))); - // Recreate the cache resource. + // Recreate the resource and use the file with the serialized engine to + // initialize the resource. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "InitializeTRTResource") .Input(FakeInput(DT_RESOURCE)) @@ -198,11 +349,47 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {handle}); AddInputFromArray(TensorShape({}), {filename}); TF_ASSERT_OK(RunOpKernel()); + + // Check that the resource is registered with the resource manager again and + // the cache of the resource is not empty. EXPECT_TRUE(rm->Lookup(container, resource_name, &resource).ok()); EXPECT_EQ(1, resource->cache_.size()); - resource->Unref(); + if (this->param_.dynamic_shape) { + EXPECT_EQ(3, resource->profiles_.GetNumProfiles()); + EXPECT_EQ(3, resource->cache_.begin()->second->GetNumContexts()); + + if (this->param_.n_inputs == 1) { + // Check if profiles are restored correctly. + std::vector shapes(1); + // We create a shape vector that matches only profile 1. + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{6}, &shapes[0])); + EXPECT_EQ(1, resource->profiles_.GetProfileNumber(shapes)); + } else { + // Check if shape values are restored corretly. + std::vector shapes(2); + // We create a shape vector that matches only profile 2. + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{9, 9}, &shapes[0])); + TF_CHECK_OK( + TensorShapeUtils::MakeShape(std::vector{2}, &shapes[1])); + Tensor shape_tensor(DT_INT32, shapes[1]); + std::vector vals{1, 3}; + std::copy_n(vals.data(), vals.size(), + shape_tensor.flat().data()); + // DataVec names are not in used CollectShapeValues, only the order + // matters. + DataVec shape_values{{"one", {}}, {"two", shape_tensor}}; + TF_CHECK_OK(resource->profiles_.CollectShapeValues(shape_values)); + EXPECT_EQ(2, resource->profiles_.GetProfileNumber(shapes)); + } + } + // Check that the resource has multiple references before it is unregistered + // from the resource manager. + EXPECT_FALSE(resource->RefCountIsOne()); - // Destroy the engine cache again. + // Unregister the resource from the resource manager two times, expect that + // the second time produces an error. Reset(); TF_ASSERT_OK(NodeDefBuilder("op", "DestroyResourceOp") .Attr("ignore_lookup_error", false) @@ -212,10 +399,14 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { AddInputFromArray(TensorShape({}), {handle}); TF_ASSERT_OK(RunOpKernel()); EXPECT_TRUE(errors::IsNotFound(RunOpKernel())); + + // Check that the resource now has only one reference. Detach the reference + // to the resource to destroy resource. 
+ EXPECT_TRUE(resource->RefCountIsOne()); + resource->Unref(); } } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc index 573172b92e6..2af3164c3e2 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -34,5 +33,4 @@ Returns calibration data for the given resource name } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc index 58cabbee53d..1d494bb44af 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -34,22 +33,23 @@ namespace tensorflow { REGISTER_OP("TRTEngineOp") .Attr("serialized_segment: string") .Attr("segment_func: func = {}") - .Attr("InT: list({int8,float16,float32,int32})") - .Attr("OutT: list({int8,float16,float32,int32})") + .Attr("InT: list({bool,int8,float16,float32,int32,resource})") + .Attr("OutT: list({bool,int8,float16,float32,int32})") + .Attr("input_shapes: list(shape) = []") + .Attr("output_shapes: list(shape) = []") .Attr("max_cached_engines_count: int = 1") + .Attr("max_batch_size: int = 1") .Attr("workspace_size_bytes: int") .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}") .Attr("calibration_data: string = ''") .Attr("use_calibration: bool = true") - .Attr("input_shapes: list(shape) = []") - .Attr("output_shapes: list(shape) = []") .Input("in_tensor: InT") .Output("out_tensor: OutT") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { std::vector output_shapes; TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes)); - for(int i=0; iMakeShapeFromPartialTensorShape( @@ -63,8 +63,9 @@ REGISTER_OP("TRTEngineOp") .Attr("segment_funcdef_name: string = ''") .Attr("cached_engine_batches: list(int) >= 0 = []") .Attr("fixed_input_size: bool = true") - .Attr("static_engine: bool = true"); + .Attr("static_engine: bool = true") + .Attr("profile_strategy: string = ''") + .Attr("use_explicit_precision: bool = false"); } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc index 01911de66ec..3f21a22136e 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc @@ 
-13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -39,6 +38,7 @@ REGISTER_OP("InitializeTRTResource") REGISTER_OP("SerializeTRTResource") .Attr("delete_resource: bool = false") + .Attr("save_gpu_specific_engines: bool = True") .Input("resource_name: string") .Input("filename: string") .SetIsStateful() @@ -46,5 +46,4 @@ REGISTER_OP("SerializeTRTResource") } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc index 563ce724f43..83d5f9b5965 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc @@ -17,8 +17,7 @@ limitations under the License. #include -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -30,5 +29,4 @@ const char* kTfTrtPluginNamespace = "TF"; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h index 46b35a24afc..8976cc6e862 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -30,7 +29,6 @@ namespace tensorrt { extern const char* kTfTrtPluginVersion; extern const char* kTfTrtPluginNamespace; -#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1) // A wrapper class for TensorRT plugin. User application should inherit from // this class to write custom kernels. class TrtPlugin : public nvinfer1::IPluginV2Ext { @@ -51,7 +49,9 @@ class TrtPlugin : public nvinfer1::IPluginV2Ext { namespace_ = plugin_namespace; } - const char* getPluginNamespace() const noexcept override { return namespace_.c_str(); } + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } protected: template @@ -70,7 +70,6 @@ class TrtPlugin : public nvinfer1::IPluginV2Ext { private: std::string namespace_; }; -#endif template class TrtPluginRegistrar { @@ -90,7 +89,6 @@ class TrtPluginRegistrar { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 1b46d54ab5f..7b5860b600e 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -15,54 +15,42 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/segment/segment.h" +#include +#include +#include +#include #include -#include +#include #include -#include +#include +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/graph_util.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { - -namespace { - -void GetLabeledNodes(gtl::FlatSet* node_set, Graph* g) { - std::unordered_set boundary_node_set; - graph_util::GetComputeGraphBoundaryNodes(g, boundary_node_set); - - auto label_node_func = [node_set](Node* n) { - node_set->insert(n->name()); - }; - - std::vector boundary_node_vec; - for (const auto node : boundary_node_set) { - boundary_node_vec.emplace_back(node); - } - ReverseDFSFrom(*g, boundary_node_vec, - std::move(label_node_func), nullptr); -} - -} // namespace - namespace segment { +namespace { using absl::StrAppend; +using absl::StrAppendFormat; using absl::StrCat; +using absl::StrJoin; // A simple graph representation to mirror Graph. This structure // helps saving memory since segmenter modifies the graph in place, preventing @@ -259,14 +247,6 @@ struct SimpleEdgePtrCompare { } }; -struct NodePtrCompare { - bool operator()(const Node* lhs, const Node* rhs) const { - return lhs->name() < rhs->name(); - } -}; - -namespace { - // Copied from TF ReverseDFS, which only works for Graph. void StableDFS(const SimpleGraph& g, bool reverse, const std::vector& start, @@ -282,8 +262,9 @@ void StableDFS(const SimpleGraph& g, bool reverse, stack[i] = Work{start[i], false}; } - auto get_nodes = reverse ? [](const SimpleNode* n) { return n->in_nodes(); } - : [](const SimpleNode* n) { return n->out_nodes(); }; + auto get_nodes = [reverse](const SimpleNode* n) { + return reverse ? n->in_nodes() : n->out_nodes(); + }; std::vector visited(g.num_node_ids(), false); while (!stack.empty()) { Work w = stack.back(); @@ -366,7 +347,236 @@ bool CanContractEdge(const SimpleEdge* edge, }); return !has_cycle; } -} // namespace + +// TODO(bixia): put this to a common utility file. 
+string TensorPropertiesToString(const OpInfo::TensorProperties& prop) { + string s = StrCat(DataTypeString(prop.dtype()), ": "); + StrAppend(&s, "["); + if (prop.shape().unknown_rank()) { + StrAppend(&s, "?"); + } else { + StrAppend(&s, StrJoin(prop.shape().dim(), ",", + [](string* out, const TensorShapeProto_Dim& d) { + StrAppendFormat(out, "%d", d.size()); + })); + } + StrAppend(&s, "]"); + return s; +} + +string TensorPropertiesToString( + const std::vector& properties) { + return StrJoin(properties, "; ", + [](string* out, const OpInfo::TensorProperties& prop) { + StrAppend(out, TensorPropertiesToString(prop)); + }); +} + +// From the given list of input properties, returns the leading shape, which is +// the shape that determines the batch size of the operation. The leading shape +// is selected from the group of input shapes with the highest rank as follows: +// . If all of those shapes have non-negative values for the batch dimension, +// the leading shape is the one with the largest value for the batch +// dimension. +// . If some or all of those shapes have negative values for the batch +// dimension, and the rest of those shapes have 1 for the batch dimension, +// the leading shape is the first of those shapes with a negative value for +// the batch dimension. +// . Otherwise, we can't determine the leading shape for the operation and +// have to exclude the operation from TRT. +// +// Examples: +// case-1: a[1,3,4] + b[2,3,4] => leading shape [2,3,4] +// case-2: a[2,3,4] + b[scalar] => leading shape [2,3,4] +// case-3: a[-1,3,4] + b[1,3,4] => leading shape [-1,3,4] +// case-4: a[-1,3,4] + b[2,3,4] => no leading shape +// +// We have to return "no leading shape" for case-4 to exclude such operation +// from being translated for this reason: +// The actually input for "a" have to be in the shape of [2,3,4] for the +// operation to be valid. On the other hand, if we translate the operation +// to implicit batch mode, it will becomes a[3,4]+b[3,4] which is valid for +// any input shape of "a". +// +// This routine assumes the input program is valid. For example, we shouldn't +// see invalid operation like a[2,3,4] + b[3,3,4]. It also assumes the input +// properties is not empty and all input have known shapes. +// +// TODO(bixia): find a way to share this knowledge with the converter. +// TODO(bixia): investigate the use of symbolic shape analysis to improve +// segmentation, such as by requiring the dynamic dimensions to have the same +// negative value. +absl::optional FindLeadingShape( + absl::Span properties) { + DCHECK(!properties.empty()); + const TensorShapeProto* result; + int max_batch_dim_value; + auto choose_shape_with_higher_rank = [&](const TensorShapeProto* s) { + result = s; + max_batch_dim_value = s->dim_size() < 1 ? 1 : s->dim(0).size(); + }; + + DCHECK(!properties[0].shape().unknown_rank()); + choose_shape_with_higher_rank(&properties[0].shape()); + + for (const OpInfo::TensorProperties& p : properties.subspan(1)) { + DCHECK(!p.shape().unknown_rank()); + if (p.shape().dim_size() < result->dim_size()) continue; + + if (p.shape().dim_size() > result->dim_size()) { + choose_shape_with_higher_rank(&p.shape()); + continue; + } + + // Among the shapes with the same rank, choose the one with a dynamic batch + // size. If no shapes have a dynamic batch size, choose the one with the + // largest size. 
+ if (result->dim_size() < 1) continue; + + if (p.shape().dim(0).size() < 0 || result->dim(0).size() < 0) { + if (p.shape().dim(0).size() < 0 && result->dim(0).size() >= 0) { + result = &p.shape(); + } else { + max_batch_dim_value = + std::max(max_batch_dim_value, p.shape().dim(0).size()); + } + + continue; + } + + if (p.shape().dim(0).size() > result->dim(0).size()) { + result = &p.shape(); + max_batch_dim_value = result->dim(0).size(); + } + } + + if (result->dim_size() > 0 && result->dim(0).size() < 0) { + // dynamic batch size + if (max_batch_dim_value <= 1) { + return result; + } else { + return absl::nullopt; + } + } + + return result; +} + +// Returns the inputs that are relevant to determine the batch size of the +// operation. This routine handles the following cases: +// . Operations that support implicit broadcasting, such as the Mul operation. +// In this case, we need to inspect all the inputs in order to determine the +// batch size of the operation. +// . Special cases. Such as "Conv2DBackpropInput", "Conv3DBackpropInputV2". +// . The batch size of an operation is determined by the first input of the +// operation. +absl::Span GetInputsToDeterminateBatchSize( + const Node* node, const std::vector& all_inputs) { + // TODO(bixia): Find a way to share this knowledge with the converter. + static std::set broadcast_supporting_ops = { + // ops corresponding to ConvertBinary in the converter + "Add", + "AddV2", + "Mul", + "Sub", + "Div", + "FloorDiv", + "RealDiv", + "Minimum", + "Maximum", + "Pow", + // other ops that need GetTrtBroadcastShape to convert + "BiasAdd", + "SquaredDifference", + "BatchMatMul", + "BatchMatMulV2", + }; + const string& op = node->def().op(); + + if (op == "Conv2DBackpropInput" || op == "Conv3DBackpropInputV2") { + DCHECK_EQ(all_inputs.size(), 3); + return absl::MakeSpan(all_inputs).subspan(2, 1); + } + + if (broadcast_supporting_ops.count(op)) { + return absl::MakeSpan(all_inputs); + } + + // This is the common case for the operations that don't support implicit + // broadcasting: the first operand determines its batch size. All other + // cases are handled before reaching here. + return absl::MakeSpan(all_inputs).subspan(0, 1); +} + +// Returns true if we can remove the implicit batch dimension of the +// operation. +// +// In particular, if the input shape has dynamic rank or the input shape rank +// is less than 2, we can't remove the implicit batch dimension and generate +// a new operation for TRT translation.
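To make the leading-shape rules above concrete, here is a small self-contained sketch that applies the same selection logic to plain std::vector<int> shapes (index 0 is the batch dimension, negative means dynamic). It is an illustration only, not the TensorShapeProto-based implementation, and it reproduces case-1 through case-4 from the comment.

#include <algorithm>
#include <optional>
#include <vector>

// Picks the leading shape from a non-empty list of known-rank shapes, or
// returns std::nullopt when no single batch size can represent them all.
std::optional<std::vector<int>> LeadingShapeSketch(
    const std::vector<std::vector<int>>& shapes) {
  const std::vector<int>* result = &shapes[0];
  int max_batch = result->empty() ? 1 : (*result)[0];
  for (const std::vector<int>& s : shapes) {
    if (s.size() < result->size()) continue;  // lower rank never leads
    if (s.size() > result->size()) {
      result = &s;
      max_batch = s.empty() ? 1 : s[0];
      continue;
    }
    if (s.empty()) continue;  // scalars carry no batch information
    if (s[0] < 0 || (*result)[0] < 0) {  // at least one dynamic batch dim
      if (s[0] < 0 && (*result)[0] >= 0) {
        result = &s;
      } else {
        max_batch = std::max(max_batch, s[0]);
      }
      continue;
    }
    if (s[0] > (*result)[0]) {
      result = &s;
      max_batch = s[0];
    }
  }
  // A dynamic leading batch dim is only acceptable when every static batch
  // dim seen at the same rank was 1 (case-3); otherwise give up (case-4).
  if (!result->empty() && (*result)[0] < 0 && max_batch > 1) return std::nullopt;
  return *result;
}
// {1,3,4} + {2,3,4}   -> {2,3,4}    (case-1)
// {2,3,4} + {}        -> {2,3,4}    (case-2, scalar operand)
// {-1,3,4} + {1,3,4}  -> {-1,3,4}   (case-3)
// {-1,3,4} + {2,3,4}  -> nullopt    (case-4)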
+bool OperationCanBeTranslatedToImplicitBatch( + const grappler::GraphProperties* graph_properties, const Node* node) { + VLOG(3) << "process node " << node->name(); + if (node->num_inputs() == 0) return true; + if (!graph_properties || !graph_properties->HasInputProperties(node->name())) + return false; + + VLOG(3) << "input shapes " + << TensorPropertiesToString( + graph_properties->GetInputProperties(node->name())); + + const std::vector& all_input_properties = + graph_properties->GetInputProperties(node->name()); + absl::Span input_properties = + GetInputsToDeterminateBatchSize(node, all_input_properties); + if (absl::c_any_of(input_properties, [](const OpInfo::TensorProperties& p) { + return p.shape().unknown_rank(); + })) { + return false; + } + + absl::optional leading_shape = + FindLeadingShape(input_properties); + return leading_shape.has_value() && leading_shape.value()->dim_size() >= 2; +} + +// Returns true if we can't be sure that the operand with the given properties +// won't have negative values for non-batch dimensions. +// +bool HasDynamicNonBatchDimension(const OpInfo::TensorProperties& prop) { + const TensorShapeProto& shape = prop.shape(); + if (shape.unknown_rank()) return true; + + // Scalar is a well specified shape, and TRT supports implicit broadcasting + // from scalar to other shapes. + if (shape.dim_size() == 0) return false; + for (int i = 1; i < shape.dim_size(); ++i) { + // The value of a dynamic dimension can be other negative values besides + // -1, representing the symbolic group of the dimension. + if (shape.dim(i).size() <= -1) { + return true; + } + } + return false; +} + +// Returns true if we can't be sure that the operation won't have dynamic +// non-batch dimension involved. We only check the shape of the first output +// assuming shape inference already propagates the shapes. +bool OperationHasDynamicNonBatchDimension( + const grappler::GraphProperties* graph_properties, const Node* node) { + VLOG(3) << "process node " << node->name(); + // If the node doesn't have any input or output, not computation is involved. + if (node->num_inputs() == 0 || node->num_outputs() == 0) return false; + + // If the node doesn't have output properties, return true to be conservative. + if (!graph_properties->HasOutputProperties(node->name())) return true; + VLOG(3) << "output shapes " + << TensorPropertiesToString( + graph_properties->GetOutputProperties(node->name())); + return HasDynamicNonBatchDimension( + graph_properties->GetOutputProperties(node->name()).at(0)); +} void ContractEdge(SimpleEdge* edge, SimpleGraph* graph, std::vector* remove_edges) { @@ -426,12 +636,246 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph, } } +// Returns a batch size representation for a segment that only contains the +// given node. +ClusterBatchSize GetClusterBatchSizeForNode( + const grappler::GraphProperties* graph_properties, const Node* node, + bool use_implicit_batch) { + ClusterBatchSize cluster_batch_size; + if (!use_implicit_batch || !node || node->num_inputs() == 0) { + return cluster_batch_size; + } + + const NodeDef& node_def = node->def(); + if (node_def.attr().count(kTftrtOpMaxBatchSizeAttr)) { + cluster_batch_size.SetMaxBatchSize( + node_def.attr().at(kTftrtOpMaxBatchSizeAttr).i()); + } + + // As shape inference cannot provide any useful information about the batch + // size, we keep it as missing. 
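The two dynamic-dimension helpers above reduce to a simple scan: skip dimension 0 (the batch dimension) and flag any other dimension whose size is negative. A compact sketch of that predicate over a plain dimension vector, assuming the rank is already known (the real code additionally treats unknown rank as dynamic):

#include <cstddef>
#include <vector>

// Negative sizes stand for dynamic/symbolic dimensions, as in the code above.
bool HasDynamicNonBatchDimSketch(const std::vector<long long>& dims) {
  if (dims.empty()) return false;  // scalar: nothing beyond the batch dim
  for (std::size_t i = 1; i < dims.size(); ++i) {
    if (dims[i] < 0) return true;
  }
  return false;
}
// {-1, 2, 3}  -> false: only the batch dimension is dynamic.
// {-1, -1, 3} -> true:  dim 1 is dynamic, so the op is excluded unless
//                       allow_dynamic_non_batch_dim is set.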
+ if (!graph_properties || + !graph_properties->HasInputProperties(node->name())) { + VLOG(3) << "doesn't have input property"; + return cluster_batch_size; + } + + const std::vector& input_properties = + graph_properties->GetInputProperties(node->name()); + absl::optional optional_leading_shape = + FindLeadingShape(GetInputsToDeterminateBatchSize(node, input_properties)); + DCHECK(optional_leading_shape.has_value()); + const TensorShapeProto* leading_shape = optional_leading_shape.value(); + DCHECK(!leading_shape->unknown_rank() && leading_shape->dim_size() >= 2); + VLOG(3) << "set batch size as " << leading_shape->dim(0).size(); + return cluster_batch_size.SetBatchSize(leading_shape->dim(0).size()); +} + +void AddSegmentForNode(const grappler::GraphProperties* graph_properties, + std::vector>* segments, + SimpleNode* node, + const DeviceNameUtils::ParsedName& device_name, + bool use_implicit_batch) { + tensorflow::profiler::TraceMe activity( + "AddSegmentForNode", tensorflow::profiler::TraceMeLevel::kInfo); + ClusterProperty property( + GetClusterBatchSizeForNode(graph_properties, + node == nullptr ? nullptr : node->tf_node(), + use_implicit_batch), + device_name); + segments->emplace_back(node, std::move(property)); +} + +} // namespace + +Status ExportNonConversionReportToCSV( + string filename, + std::map>& nonconverted_ops_map, + string sep = "|") { + tensorflow::profiler::TraceMe activity( + "ExportNonConversionReportToCSV", + tensorflow::profiler::TraceMeLevel::kInfo); + std::unique_ptr csv_file; + auto open_status = Env::Default()->NewWritableFile(filename, &csv_file); + + if (!open_status.ok()) { + return errors::Internal("Failed to open output file: `", filename, "`"); + } + + LOG(WARNING) << "TF-TRT Non-Conversion Report saved at: `" << filename << "`"; + + std::ostringstream sstream; + sstream << "OP Name" << sep << "Reason" << sep << "Count" << std::endl; + + for (auto& op_details : nonconverted_ops_map) { + auto op_name = op_details.first; + auto op_data = op_details.second; + + for (auto& reject_data : op_data) { + auto reason = reject_data.first; + auto count = reject_data.second; + sstream << op_name << sep << reason << sep << count << std::endl; + } + } + + auto append_status = csv_file->Append(sstream.str()); + + if (!append_status.ok()) { + return errors::Internal("Error writing to output file `", filename, "`."); + } + + auto close_status = csv_file->Close(); + + if (!close_status.ok()) { + return errors::Internal("Error closing the file `", filename, + "`. The file might be corrupted."); + } + + return Status::OK(); +} + +string GenerateNonConversionReport( + std::map>& nonconverted_ops_map) { + // Fetch whether to print a detailed version of the TF-TRT conversion report. + // TF_TRT_SHOW_DETAILED_REPORT triggers three possible behaviors: + // - If Number >= 1: Print detailed non-conversion report on stdout. + // Usage: TF_TRT_SHOW_DETAILED_REPORT=1 + // - If non empty string: Exports the non-conversion report in CSV format at + // the path defined by the environment variable. + // This will also print the detailed non-conversion + // report on stdout. + // Usage: TF_TRT_SHOW_DETAILED_REPORT=/path/to/file.csv + // - Else: Print normal (undetailed) non-conversion report on + // stdout. 
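ExportNonConversionReportToCSV above walks a nested map of op type -> (rejection reason -> count) and flattens it into separator-delimited rows. The following standalone sketch shows that data layout and the flattening with invented op names and reasons, using plain iostream in place of the TF Env/WritableFile API:

#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  // op type -> (reason it was rejected -> number of occurrences)
  std::map<std::string, std::map<std::string, int>> nonconverted_ops_map;
  nonconverted_ops_map["NonMaxSuppressionV5"]["no converter registered"] = 4;
  nonconverted_ops_map["Reshape"]["dynamic non-batch dimensions not allowed"] = 2;

  const std::string sep = "|";
  std::ostringstream csv;
  csv << "OP Name" << sep << "Reason" << sep << "Count" << '\n';
  for (const auto& op : nonconverted_ops_map) {
    for (const auto& reason : op.second) {
      csv << op.first << sep << reason.first << sep << reason.second << '\n';
    }
  }
  std::cout << csv.str();
  return 0;
}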
+ tensorflow::profiler::TraceMe activity( + "GenerateNonConversionReport", tensorflow::profiler::TraceMeLevel::kInfo); + + string detailed_report_var; + TF_CHECK_OK(ReadStringFromEnvVar("TF_TRT_SHOW_DETAILED_REPORT", + /*default_value=*/"", &detailed_report_var)); + + bool show_detailed_conversion_report = false; + + if (detailed_report_var != "") { + // Checking if `TF_TRT_SHOW_DETAILED_REPORT` env var is a string or a number + if (detailed_report_var.find_first_not_of("-0123456789") != string::npos) { + const Status status = ExportNonConversionReportToCSV( + detailed_report_var, nonconverted_ops_map); + + if (!status.ok()) { + // Log the error in case of issue, however do not stop execution. + LOG(ERROR) << "Problem encountered while generating the TF-TRT " + << "Non-Conversion Report in CSV Format:\n" + << status.error_message(); + } + show_detailed_conversion_report = true; + } else if (std::stoi(detailed_report_var) >= 1) { + show_detailed_conversion_report = true; + } + } + + string unsupported_op_report = + StrCat("\n\n", string(80, '#'), "\n", + "TensorRT unsupported/non-converted OP Report:"); + int total_nonconverted_ops{0}; + + // + using ReasonCounterVector = std::vector>; + // >> + using NotConvertedOPTuple = std::tuple; + + std::vector nonconverted_ops_vec; + + // Populate the vector from the map + for (auto& nonconverted_op_data : nonconverted_ops_map) { + int total_nonconverted_op{0}; + ReasonCounterVector reason_occurances_vect; + + auto op_name = nonconverted_op_data.first; + auto op_data = nonconverted_op_data.second; + + for (auto& notconversion_reason_data : op_data) { + auto reason_count = notconversion_reason_data.second; + total_nonconverted_op += reason_count; + reason_occurances_vect.push_back(notconversion_reason_data); + } + + // Sort in descending number of occurances for the reasons why a given + // TensorFlow OP was not converted. + std::sort(reason_occurances_vect.begin(), reason_occurances_vect.end(), + [](const std::pair& a, + const std::pair& b) -> bool { + return a.second > b.second; + }); + + nonconverted_ops_vec.push_back(std::make_tuple( + op_name, total_nonconverted_op, reason_occurances_vect)); + } + + // Sort the vector by descending OP names. 
+ std::sort(nonconverted_ops_vec.begin(), nonconverted_ops_vec.end(), + [](const NotConvertedOPTuple& a, const NotConvertedOPTuple& b) { + return std::get<1>(a) > std::get<1>(b); + }); + + for (auto& notconverted_op_detail : nonconverted_ops_vec) { + auto& op_name = std::get<0>(notconverted_op_detail); + auto& op_total_nonconverted = std::get<1>(notconverted_op_detail); + total_nonconverted_ops += op_total_nonconverted; + + unsupported_op_report = StrCat(unsupported_op_report, "\n\t- ", op_name, + " -> ", op_total_nonconverted, "x"); + + if (show_detailed_conversion_report) { + auto& nonconverted_ops_details = std::get<2>(notconverted_op_detail); + + for (auto& nonconversion_details : nonconverted_ops_details) { + auto& reason = nonconversion_details.first; + auto& reason_count = nonconversion_details.second; + if (reason_count == 0) { + continue; + } + + unsupported_op_report = StrCat(unsupported_op_report, "\n\t\t- ", + "[Count: ", reason_count, "x] ", reason); + } + unsupported_op_report = StrCat(unsupported_op_report, "\n"); + } + } + + unsupported_op_report = + StrCat(unsupported_op_report, "\n", string(80, '-'), + "\n\t- Total nonconverted OPs: ", total_nonconverted_ops, + "\n\t- Total nonconverted OP Types: ", nonconverted_ops_map.size(), + "\nFor more information see https://docs.nvidia.com/deeplearning", + "/frameworks/tf-trt-user-guide/index.html#supported-ops.", "\n", + string(80, '#'), "\n"); + + return unsupported_op_report; +} + Status SegmentGraph(const Graph* tf_graph, + const grappler::GraphProperties* graph_properties, const std::function& candidate_fn, const std::function& input_candidate_fn, const std::function& output_candidate_fn, - const SegmentOptions& options, - SegmentNodesVector* segments) { + const SegmentOptions& options, SegmentVector* segments) { + tensorflow::profiler::TraceMe activity( + "SegmentGraph", tensorflow::profiler::TraceMeLevel::kInfo); + if (!options.use_implicit_batch && !options.allow_dynamic_non_batch_dim) { + return errors::Internal( + "Explicit batch mode should allow dynamic non-batch dimensions"); + } + + if (options.use_implicit_batch && !options.maximum_batch_size.has_value()) { + return errors::Internal("Implicit batch mode requires maximum_batch_size"); + } + + if (!options.allow_dynamic_non_batch_dim && !graph_properties) { + return errors::Internal( + "Need graph propertities to disallow dynamic non-batch dimensions"); + } + // Steps: // 1. run the segmentation algorithm to find all the segments, which uses // candidate_fn to determine the candidates segment nodes; @@ -442,92 +886,96 @@ Status SegmentGraph(const Graph* tf_graph, // --------------------------------- Step 1 --------------------------------- auto graph = std::unique_ptr(new SimpleGraph(tf_graph)); + + // Fetch the user-provide TF operations denylisted for conversion by TF-TRT. + const absl::flat_hash_set tftrt_op_denylist = [] { + string tftrt_op_denylist_str; + TF_CHECK_OK(ReadStringFromEnvVar("TF_TRT_OP_DENYLIST", /*default_value=*/"", + &tftrt_op_denylist_str)); + absl::flat_hash_set tftrt_op_denylist{}; + for (const auto& x : str_util::Split(tftrt_op_denylist_str, ",")) { + tftrt_op_denylist.insert(x); + } + // Force a rehash of the flat hash set + tftrt_op_denylist.rehash(0); + return tftrt_op_denylist; + }(); + // Use a union-find to collect the nodes that belong to the same // segment. A node value of nullptr indicates that the node is not a candidate // for TRT. 
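The denylist above is built exactly once from a comma-separated environment variable and then only queried. A rough standalone equivalent using only the standard library, with std::getenv and std::unordered_set standing in for ReadStringFromEnvVar and absl::flat_hash_set:

#include <cstdlib>
#include <sstream>
#include <string>
#include <unordered_set>

std::unordered_set<std::string> DenylistFromEnv(const char* var_name) {
  std::unordered_set<std::string> denylist;
  const char* raw = std::getenv(var_name);
  if (raw == nullptr) return denylist;
  std::stringstream ss(raw);
  std::string op;
  while (std::getline(ss, op, ',')) {
    if (!op.empty()) denylist.insert(op);
  }
  return denylist;
}

// Usage mirroring the code above:
//   const auto tftrt_op_denylist = DenylistFromEnv("TF_TRT_OP_DENYLIST");
//   if (tftrt_op_denylist.count(op_type)) { /* report and skip the node */ }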
- std::unordered_set unsupported_ops; - int num_unsupported_ops = 0; - // Getting the nodes blacklisted for conversion - string tftrt_node_blacklist_str; - TF_CHECK_OK(ReadStringFromEnvVar( - "TF_TRT_OP_BLACKLIST", "", &tftrt_node_blacklist_str - )); - - auto tftrt_node_blacklist = gtl::FlatSet{}; - - for (const auto& x : str_util::Split(tftrt_node_blacklist_str, ",")) { - tftrt_node_blacklist.insert(x); - } - - // User defined special subgraphs which can not be convert to trt graph. - // e.g. some sparse lookup subgraphs. - auto labeled_node_blacklist = gtl::FlatSet{}; - GetLabeledNodes(&labeled_node_blacklist, const_cast(tf_graph)); + std::map> nonconverted_ops_map = {}; // Parsing each node of the graph std::vector> node_segments; for (int i = 0; i < graph->num_node_ids(); ++i) { SimpleNode* node = graph->FindNodeId(i); - if (options.exclude_node_list.count(node->name()) != 0) { + + if (!node) { + VLOG(3) << "Node " << i << " doesn't exist in the graph"; + continue; + } + + const string node_op_type{node->tf_node()->type_string()}; + + auto exclude_node = [&](absl::string_view reason) { VLOG(1) << "Not a TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op type: " << node_op_type << "), " << "(Op name: " << node->name() << "), " - << "(Reason: excluded by segmenter option)"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; + << "(Reason: " << reason << ")"; + nonconverted_ops_map[node_op_type][string(reason)]++; node = nullptr; + }; + absl::optional device_name = + GetDeviceParsedName(node->tf_node()); + // GetDeviceParseName capitalizes the device type. + if (!device_name.has_value() || + (device_name->has_type && device_name->type != "GPU")) { + exclude_node("node can't be placed on GPU"); + } else if (options.exclude_node_list.count(node->name()) != 0) { + exclude_node( + "excluded by segmenter option. Most likely an input or " + "output node."); + } else if (options.use_implicit_batch && + !OperationCanBeTranslatedToImplicitBatch(graph_properties, + node->tf_node())) { + exclude_node( + "implicit batch mode requires input shape with at least two " + "dimensions"); + } else if (!options.allow_dynamic_non_batch_dim && + OperationHasDynamicNonBatchDimension(graph_properties, + node->tf_node())) { + exclude_node("dynamic non-batch dimensions not allowed"); } else { const Status status = candidate_fn(node->tf_node()); if (!status.ok()) { - VLOG(1) << "Not a TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " - << "(Op name: " << node->name() << "), " - << "(Reason: " << status << ")"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; - node = nullptr; - } else if (tftrt_node_blacklist.count(node->tf_node()->type_string())) { + exclude_node(status.error_message()); + } else if (tftrt_op_denylist.contains(node->tf_node()->type_string())) { // WARNING verbosity since the user explicitly requests this behavior. 
- LOG(WARNING) << "Blacklisted as TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " - << "(Op name: " << node->name() << "), " - << "(Reason: Blacklisted with the env var TF_TRT_OP_BLACKLIST)"; - unsupported_ops.emplace(node->tf_node()->type_string()); - num_unsupported_ops++; - node = nullptr; - } else if (labeled_node_blacklist.count(node->tf_node()->name())) { - LOG(WARNING) << "Blacklisted as TF-TRT candidate, " - << "(Op name: " << node->name() << "), " - << "(Reason: User labeled nodes blacklist)"; - // TODO FIXME : delete - unsupported_ops.emplace(node->tf_node()->name()); - num_unsupported_ops++; - node = nullptr; + LOG_WARNING_WITH_PREFIX + << "Denylisted as TF-TRT candidate, " + << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op name: " << node->name() << ")"; + exclude_node("Denylisted with the env var TF_TRT_OP_DENYLIST"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name(); } } - node_segments.emplace_back(node); + AddSegmentForNode(graph_properties, &node_segments, node, *device_name, + options.use_implicit_batch); } - string msg = StrCat( - "There are ", num_unsupported_ops, " ops of ", unsupported_ops.size(), - " different types in the graph that", " are not converted to TensorRT: "); - for (const auto& elem : unsupported_ops) { - StrAppend(&msg, elem, ", "); - } - LOG(INFO) << msg << "(For more information see " - << "https://docs.nvidia.com/deeplearning" - << "/frameworks/tf-trt-user-guide/index.html#supported-ops)."; + + LOG(WARNING) << GenerateNonConversionReport(nonconverted_ops_map); // The segmentation algorithm below visits nodes in reverse topological order // and attempts to merge nodes along output edges. That means that subgraphs // grow from the output-side of the network towards the inputs. // // In general this is not guaranteed to produce a globally optimal - // segmentation. For exaample, consider graph with node {A, B, C, D} and edges + // segmentation. For example, consider graph with node {A, B, C, D} and edges // {A->B, A->C, B->D, C->D), where A, B, D are trt compatible but C is not, so // in theory we can choose to contract either A, B or B, D but not both, but // here it always choose to contract B, D. @@ -543,18 +991,25 @@ Status SegmentGraph(const Graph* tf_graph, return true; }); for (const SimpleNode* node : order) { - // All output nodes of 'node' have been visited... + // All output nodes of 'node' have been visited. VLOG(3) << "Trying node " << node->name() << " id=" << node->id(); - // 'node' must be a TRT candidate... + // 'node' must be a TRT candidate. if (node_segments[node->id()].Value() == nullptr) { VLOG(3) << "... not a TRT candidate"; continue; } - // Contract output edges to combine 'node' with output - // nodes. Iterate since combining two nodes may unblock other - // combining. + // Contract output edges to combine 'node' with output nodes. Repeat this + // step until no output edges can be further contracted. This is because + // contracting an output edge may unblock new edges for contracting. + ClusterBatchSize expected_batch_size = + node_segments[node->id()].Property().BatchSize(); + DeviceNameUtils::ParsedName expected_device_name = + node_segments[node->id()].Property().DeviceName(); + VLOG(3) << "batch size " << expected_batch_size; while (true) { std::set contract_edges; + // TODO(bixia): consider merging the loop to find the edges and the loop + // to contract the edges. 
for (const SimpleEdge* out_edge : node->out_edges()) { VLOG(3) << "... out node " << out_edge->dst()->name() << " ( " << out_edge->dst()->id() << " <- " << node->id() << " )"; @@ -562,14 +1017,39 @@ Status SegmentGraph(const Graph* tf_graph, VLOG(3) << "... ... Control Edge, Skipping"; continue; } - // Out node must be TRT candidate... - if (node_segments[out_edge->dst()->id()].Value() == nullptr) { + UnionFind* out_cluster = + &node_segments[out_edge->dst()->id()]; + // Out node must be a TRT candidate. + if (out_cluster->Value() == nullptr) { VLOG(3) << "... ... not a TRT candidate"; continue; } + // Out node must have compatible batch size. + ClusterBatchSize out_batch_size = out_cluster->Property().BatchSize(); + ClusterBatchSize merged_batch_size = expected_batch_size; + if (!merged_batch_size.MergeIfCompatible(out_batch_size)) { + VLOG(3) << "... ... incompatible batch sizes " + << expected_batch_size.ToString() << " " + << out_batch_size.ToString(); + continue; + } + + const DeviceNameUtils::ParsedName& out_device_name = + out_cluster->Property().DeviceName(); + absl::optional merged_device_name = + MergeIfCompatible(expected_device_name, out_device_name); + if (!merged_device_name.has_value()) { + VLOG(3) << "... ... incompatible device names " + << expected_device_name << " " << out_device_name; + continue; + } + if (CanContractEdge(out_edge, graph)) { - VLOG(3) << "... ... can contract"; + VLOG(3) << "... ... can contract. new batch size " + << merged_batch_size.ToString(); contract_edges.insert(out_edge); + expected_batch_size = merged_batch_size; + expected_device_name = *merged_device_name; } else { VLOG(3) << "... ... cannot contract, would form cycle"; } @@ -586,7 +1066,8 @@ Status SegmentGraph(const Graph* tf_graph, VLOG(3) << "Merge " << src->name() << " <- " << dst->name() << " (" << src->id() << " <- " << dst->id(); - node_segments[src->id()].Merge(&node_segments[dst->id()]); + TF_RETURN_IF_ERROR( + node_segments[src->id()].Merge(&node_segments[dst->id()])); // Contracting the edge leaves disconnected graph edges. // Remove these from the graph and from 'contract_edges' so we @@ -600,6 +1081,16 @@ Status SegmentGraph(const Graph* tf_graph, graph->RemoveEdge(r); } } + if (expected_batch_size != + node_segments[node->id()].Property().BatchSize()) { + return errors::Internal( + "expected batch size is not the same as the actual batch size"); + } + if (expected_device_name != + node_segments[node->id()].Property().DeviceName()) { + return errors::Internal( + "expected device name is not the same as the actual device name"); + } } } @@ -608,43 +1099,21 @@ Status SegmentGraph(const Graph* tf_graph, // A map from the segment identifier (currently the name of the root node of // the segment tree) to the segment nodes set. - std::map> sg_map; - - // A map from the segment identifier (currently the name of the root node of - // the segment tree) to the device names that the nodes in the segment are - // assigned to. - // - // TODO(aaroey): nodes assigned to different devices should not be merged, - // fix this. - std::unordered_map> device_maps; + std::map sg_map; for (auto& u : node_segments) { if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) { - sg_map[u.ParentValue()->name()].insert(u.Value()->tf_node()); - auto tf_node = u.Value()->tf_node(); - // has_assigned_device_name() is expected to return true - // when called from optimization pass. 
However, since graph - // is converted back and forth between graph and graphdef, - // assigned devices demoted to requested devices. If the graph - // is passed directly to this module, assigned devices will be set. - if (tf_node->has_assigned_device_name()) { - device_maps[u.ParentValue()->name()].insert( - tf_node->assigned_device_name()); - } else if (!tf_node->requested_device().empty()) { - device_maps[u.ParentValue()->name()].insert( - tf_node->requested_device()); - } else { - VLOG(2) << "Node " << tf_node->name() - << " has no device assigned requested device is: " - << tf_node->requested_device(); - } + sg_map[u.ParentValue()->name()].nodes.insert(u.Value()->tf_node()); + } + if ((u.Value() != nullptr) && (u.ParentValue() == u.Value())) { + sg_map[u.Value()->name()].property = u.Property(); } } // --------------------------------- Step 2 --------------------------------- // Remove ineligible input/output nodes. for (auto& itr : sg_map) { - std::set& segment_nodes = itr.second; + std::set& segment_nodes = itr.second.nodes; VLOG(1) << "Segment original size: " << segment_nodes.size(); while (true) { std::deque in_nodes_que, out_nodes_que; @@ -729,10 +1198,12 @@ Status SegmentGraph(const Graph* tf_graph, // --------------------------------- Step 3 --------------------------------- // Convert the segments into the expected return format + std::vector effective_nodes_counts; for (const auto& itr : sg_map) { const string& segment_root = itr.first; // Return format does not require set comparator. - std::set segment_nodes(itr.second.begin(), itr.second.end()); + std::set segment_nodes( + itr.second.nodes.begin(), itr.second.nodes.end()); if (VLOG_IS_ON(1) && !segment_nodes.empty()) { string s; for (auto node : segment_nodes) { @@ -750,36 +1221,89 @@ Status SegmentGraph(const Graph* tf_graph, }); // Don't use segments whose number of effective nodes is small. - if (num_effective_nodes < options.minimum_segment_size) { + if (num_effective_nodes == 0 || + num_effective_nodes < options.minimum_segment_size) { VLOG(1) << "Segment " << segments->size() << " has only " << num_effective_nodes << " effective nodes, dropping"; continue; } + segments->emplace_back(itr.second.property, segment_nodes); + effective_nodes_counts.push_back(num_effective_nodes); + } + + // --------------------------------- Step 4 --------------------------------- + // If the number of segments exceeds max_engines, prune the smallest ones. + + int64 max_trt_engine_ops; + TF_CHECK_OK(ReadInt64FromEnvVar("TF_TRT_MAX_ALLOWED_ENGINES", + /*default_value=*/20, &max_trt_engine_ops)); - const auto& dev_itr = device_maps.find(segment_root); - if (dev_itr == device_maps.end() || dev_itr->second.empty()) { - VLOG(1) << "No device assigned to segment " << segments->size(); - } else if (dev_itr->second.size() > 1) { - string s = StrCat("Segment ", segments->size(), - " has multiple devices attached: "); - for (const auto& dev : dev_itr->second) { - StrAppend(&s, dev, ", "); + if (max_trt_engine_ops <= 0) { + LOG(WARNING) << "The environment variable TF_TRT_MAX_ALLOWED_ENGINES is " + << "<= 0. TF-TRT did not limit the number of TensorRT engines " + << "created."; + + } else { + if (segments->size() > max_trt_engine_ops) { + LOG(WARNING) << "A total of " << segments->size() << " segments with at " + << "least minimum_segment_size=" + << options.minimum_segment_size << " nodes have been found. " + << "TF-TRT will only convert the " << max_trt_engine_ops + << " largest segments. 
You can change this behavior by " + << "modifying the environment variable " + << "TF_TRT_MAX_ALLOWED_ENGINES=" << max_trt_engine_ops; + + // Stable sort of the segment indices according to their effective sizes. + std::vector indices(segments->size()); + std::iota(indices.begin(), indices.end(), 0); + + std::stable_sort(indices.begin(), indices.end(), + [&effective_nodes_counts](int i1, int i2) { + return effective_nodes_counts[i1] > + effective_nodes_counts[i2]; + }); + + // Create a mask of segments to keep. + std::vector mask = std::vector(segments->size(), false); + + for (int i = 0; i < max_trt_engine_ops; i++) { + mask[indices[i]] = true; } - LOG(WARNING) << s; - } - segments->emplace_back(segment_nodes); - } - if (VLOG_IS_ON(1)) { - for (const auto& d : device_maps) { - string s("Segment "); - StrAppend(&s, ": '", d.first, "' "); - for (const auto& dd : d.second) { - StrAppend(&s, dd, ", "); + // Gather the masked elements at the start of the array, in place. + int j = 0; + VLOG(1) << "The following segments have been accepted by TF-TRT:"; + for (int i = 0; i < segments->size(); i++) { + if (mask[i]) { + VLOG(1) << "[*] Segment " << i + << " [node count: " << effective_nodes_counts[i] + << "] accepted. Re-assigned " + << "segment id=" << j; + segments->at(j) = segments->at(i); + j++; + } } - VLOG(1) << "Devices " << s; + + VLOG(1) << "The following segments have been rejected by TF-TRT:"; + for (int i = 0; i < segments->size(); i++) { + if (!mask[i]) { + VLOG(1) << "[*] Segment " << i + << " [node count: " << effective_nodes_counts[i] + << "] rejected."; + } + } + + // Resize the array. + segments->resize(max_trt_engine_ops); + } else { + LOG(WARNING) << "The environment variable TF_TRT_MAX_ALLOWED_ENGINES=" + << max_trt_engine_ops << " has no effect since there are " + << "only " << segments->size() << " TRT Engines with at " + << "least minimum_segment_size=" + << options.minimum_segment_size << " nodes."; } } + return Status::OK(); } @@ -787,5 +1311,4 @@ Status SegmentGraph(const Graph* tf_graph, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index 77c0af223c8..ad41d5eb40f 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -19,30 +19,59 @@ limitations under the License. #include #include +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace segment { -// Vector of segments, each entry contains a set of node pointers. -using SegmentNodesVector = std::vector>; +constexpr char kTftrtOpMaxBatchSizeAttr[] = "_tftrt_op_max_batch_size"; struct SegmentOptions { + // This struct holds per graph segmenting parameters. // Segment must contain at least this many nodes. int minimum_segment_size = 2; + bool use_implicit_batch = true; + // The maximum batch size used to build the engines in the graph, when + // use_implicit_batch is true. 
+ absl::optional maximum_batch_size = absl::nullopt; + // When use_implicit_batch is false or when we are building dynamic engines, + // we allow dynamic non-batch dimensions. + bool allow_dynamic_non_batch_dim = false; + // The name of the device to put the segment on. std::set exclude_node_list; }; +struct NodePtrCompare { + bool operator()(const Node* lhs, const Node* rhs) const { + return lhs->name() < rhs->name(); + } +}; + +struct Segment { + Segment() {} + Segment(const ClusterProperty& property, + const std::set& nodes) + : property(property), nodes(nodes) {} + ClusterProperty property; + std::set nodes; +}; + +// Vector of segments, each entry contains a set of node pointers. +using SegmentVector = std::vector; + // Get the subgraphs of a graph that can be handled by TensorRT. // -// @param graph Graph of the network +// @param tf_graph Graph of the network. +// @graph_properties is the static graph properties. // @param candidate_fn A function that returns OK for a Node* if // that node can be handled by TensorRT. // @param segments Returns the TensorRT segments/subgraphs. Each entry @@ -50,17 +79,16 @@ struct SegmentOptions { // all the NodeDefs in that subgraph. // @return the status. Status SegmentGraph(const Graph* tf_graph, + const grappler::GraphProperties* graph_properties, const std::function& candidate_fn, const std::function& input_candidate_fn, const std::function& output_candidate_fn, - const SegmentOptions& options, - SegmentNodesVector* segments); + const SegmentOptions& options, SegmentVector* segments); } // namespace segment } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index cb038e58126..12f3e7a5742 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -26,8 +26,7 @@ limitations under the License. 
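Step 4 of SegmentGraph above caps the number of engines by stable-sorting segment indices by effective node count, keeping the top TF_TRT_MAX_ALLOWED_ENGINES entries, and compacting them in place so the surviving segments keep their original relative order. The same idea in a self-contained sketch over plain vectors (segment payloads reduced to strings for illustration):

#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

void KeepLargestSegments(std::vector<std::string>* segments,
                         const std::vector<int>& effective_nodes_counts,
                         int max_segments) {
  if (segments->size() <= static_cast<size_t>(max_segments)) return;

  // Indices sorted by descending effective node count; stable sort keeps the
  // original order among equally sized segments.
  std::vector<int> indices(segments->size());
  std::iota(indices.begin(), indices.end(), 0);
  std::stable_sort(indices.begin(), indices.end(),
                   [&](int a, int b) {
                     return effective_nodes_counts[a] > effective_nodes_counts[b];
                   });

  // Mark the top-N segments, then gather them at the front in place.
  std::vector<bool> keep(segments->size(), false);
  for (int i = 0; i < max_segments; ++i) keep[indices[i]] = true;

  int j = 0;
  for (size_t i = 0; i < segments->size(); ++i) {
    if (keep[i]) (*segments)[j++] = (*segments)[i];
  }
  segments->resize(max_segments);
}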
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -42,7 +41,7 @@ class SegmentTest : public ::testing::Test { if (node_names.find(node->name()) != node_names.end()) { return Status::OK(); } - return errors::NotFound(""); + return errors::NotFound("Not a user specified candidate"); }; } @@ -60,24 +59,35 @@ class SegmentTest : public ::testing::Test { }; } - void RunTest(const Graph* graph, const std::set& candidates, + void RunTest(const Graph* graph, + const grappler::GraphProperties* graph_properties, + const std::set& candidates, const std::set& input_candidates, const std::set& output_candidates, const std::vector>& expected_segments) { - SegmentNodesVector segments; - TF_EXPECT_OK(SegmentGraph(graph, MakeCandidateFn(candidates), + SegmentVector segments; + TF_EXPECT_OK(SegmentGraph(graph, graph_properties, + MakeCandidateFn(candidates), MakeInputEdgeCandidateFn(input_candidates), MakeOutputEdgeCandidateFn(output_candidates), - default_options_, &segments)); + segment_options_, &segments)); ValidateSegment(segments, expected_segments); } - void ValidateSegment(const SegmentNodesVector& segments, + void RunTest(const Graph* graph, const std::set& candidates, + const std::set& input_candidates, + const std::set& output_candidates, + const std::vector>& expected_segments) { + RunTest(graph, nullptr, candidates, input_candidates, output_candidates, + expected_segments); + } + + void ValidateSegment(const SegmentVector& segments, const std::vector>& expected_segments) { EXPECT_EQ(expected_segments.size(), segments.size()); for (int i = 0; i < segments.size(); ++i) { std::set segment_node_names; - for (const Node* node : segments[i]) { + for (const Node* node : segments[i].nodes) { segment_node_names.insert(node->name()); } const auto& expected = expected_segments[i]; @@ -93,7 +103,18 @@ class SegmentTest : public ::testing::Test { } } - SegmentOptions default_options_; + void DisableImplicitBatchMode() { + segment_options_.use_implicit_batch = false; + segment_options_.allow_dynamic_non_batch_dim = true; + } + + void EnableImplicitBatchModeForStaticEngine(int maximum_batch_size = 1000) { + segment_options_.use_implicit_batch = true; + segment_options_.maximum_batch_size = maximum_batch_size; + segment_options_.allow_dynamic_non_batch_dim = false; + } + + SegmentOptions segment_options_; }; std::set operator-(const std::set& lhs, const string& rhs) { @@ -107,6 +128,7 @@ TEST_F(SegmentTest, Empty) { Graph g(OpRegistry::Global()); TF_EXPECT_OK(s.ToGraph(&g)); // Expect no segments/subgraphs. 
+ DisableImplicitBatchMode(); RunTest(&g, {}, {}, {}, {}); } @@ -133,6 +155,7 @@ TEST_F(SegmentTest, Simple) { // All Add operations are candidates, and we expect all of them to be // collapsed into a single segment const std::set all_adds = {"add0", "add1", "add2", "add3", "add4"}; + DisableImplicitBatchMode(); RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); // Make add1 not a candidate, and we expect all other Add operations to be @@ -157,6 +180,69 @@ TEST_F(SegmentTest, Simple) { RunTest(&g, all_adds, all_adds, without_add3, {all_adds}); } +TEST_F(SegmentTest, WithDeviceAssignments) { + // feed + // // \\ + // add0 add1 + // | \ / + // | add2 + // | / \\ + // add3 add4 + // \ / + // + Scope s = Scope::NewRootScope(); + auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT); + auto add0 = ops::Add(s.WithOpName("add0"), feed, feed); + auto add1 = ops::Add(s.WithOpName("add1"), feed, feed); + auto add2 = ops::Add(s.WithOpName("add2"), add0, add1); + auto add3 = ops::Add(s.WithOpName("add3"), add0, add2); + auto add4 = ops::Add(s.WithOpName("add4"), add2, add2); + + const std::set all_adds = {"add0", "add1", "add2", "add3", "add4"}; + DisableImplicitBatchMode(); + + { + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); + } + + { + // Assigning add1 to CPU to exclude it from the cluster. + add1.node()->set_assigned_device_name("/device:CPU:0"); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {all_adds - "add1"}); + add1.node()->set_assigned_device_name(""); + } + + { + // Assigning operations add3 and add4 to another GPU to exclude the + // operation from the cluster. + constexpr char kGpu0[] = "/device:GPU:0"; + add0.node()->set_assigned_device_name(kGpu0); + add1.node()->set_assigned_device_name(kGpu0); + add2.node()->set_assigned_device_name(kGpu0); + constexpr char kGpu1[] = "/device:GPU:1"; + add3.node()->set_assigned_device_name(kGpu1); + add4.node()->set_assigned_device_name(kGpu1); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {{"add0", "add1", "add2"}}); + } + + { + // Assigning the operations to two compatibile GPU devices resulting in + // one cluster with all operations. + constexpr char kGpuAny[] = "/device:GPU:*"; + add3.node()->set_assigned_device_name(kGpuAny); + add4.node()->set_assigned_device_name(kGpuAny); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + RunTest(&g, all_adds, all_adds, all_adds, {all_adds}); + } +} + TEST_F(SegmentTest, AvoidCycle) { // feed // // \\ @@ -179,6 +265,7 @@ TEST_F(SegmentTest, AvoidCycle) { // add2 is not a TRT candidate so there should be no segments generated. const std::set without_add2 = {"add0", "add1", "add3", "add4"}; + DisableImplicitBatchMode(); RunTest(&g, without_add2, without_add2, without_add2, {}); } @@ -212,6 +299,7 @@ TEST_F(SegmentTest, Multiple) { "add5", "add6", "add7", "add8"}; // Make add5 not a TRT candidate, and we expect two segments. auto without_add5 = all_adds - "add5"; + DisableImplicitBatchMode(); RunTest(&g, without_add5, without_add5, without_add5, {{"add0", "add1", "add2", "add3"}, {"add6", "add8"}}); @@ -258,6 +346,7 @@ TEST_F(SegmentTest, BigIfElse) { // Make add2 not a TRT candidate, and we expect 2 segments. 
const std::set all_adds = {"add0", "add1", "add2", "add3", "add4", "add5", "add6", "add7"}; + DisableImplicitBatchMode(); RunTest(&g, all_adds - "add2", all_adds, all_adds, {{"add0", "add1"}, {"add3", "add4", "add5", "add6", "add7"}}); } @@ -276,13 +365,229 @@ TEST_F(SegmentTest, IdentityOps) { "identity2", "identity3"}; // Identity ops are not counted as effective ops in the segment, so no segment // will be formed in this case. + DisableImplicitBatchMode(); RunTest(&g, all_identities, all_identities, all_identities, {}); } +// Testing implicit batch mode segmentation: it excludes the add-2 operation +// with a dynamic non-batch dimension. +TEST_F(SegmentTest, ExcludeAddWithDynamicNonBatchDimension) { + Scope s = Scope::NewRootScope(); + auto feed_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2, 3})); + auto feed_1_shape = ops::Placeholder::Shape(PartialTensorShape({-1, -1, 3})); + auto const_val = ops::Const(s, {1.0}, {}); + auto feed_0 = + ops::Placeholder(s.WithOpName("feed-1"), DT_FLOAT, feed_0_shape); + auto feed_1 = + ops::Placeholder(s.WithOpName("feed-2"), DT_FLOAT, feed_1_shape); + auto add_0 = ops::Add(s.WithOpName("add-0"), feed_0, const_val); + auto add_1 = ops::Add(s.WithOpName("add-1"), add_0, feed_0); + auto add_2 = ops::Add(s.WithOpName("add-2"), const_val, feed_1); + + grappler::GrapplerItem item; + item.fetch.push_back("add-2"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"add-0", "add-1", "add-2"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {all_nodes - "add-2"}); +} + +// Testing implicit batch mode segmentation: It excludes the reshape operation +// with a dynamic non-batch output dimension. +// TODO(bixia): hoist the check for reshape should not change batch size from +// the converter to the segmenter and add another test case for excluding +// a reshape without dynamic dimensions involved. 
+TEST_F(SegmentTest, ExcludeReshapeWithDynamicNonBatchDimensionInOutput) { + Scope s = Scope::NewRootScope(); + auto feed_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2, 3})); + auto const_val = ops::Const(s, {1.0}, {}); + auto feed_0 = + ops::Placeholder(s.WithOpName("feed-1"), DT_FLOAT, feed_0_shape); + auto add_0 = ops::Add(s.WithOpName("add-0"), feed_0, const_val); + auto reshape = ops::Reshape(s.WithOpName("reshape"), add_0, Input({6, -1})); + auto add_1 = ops::Add(s.WithOpName("add-1"), reshape, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("add-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"add-0", "reshape", "add-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} + +TEST_F(SegmentTest, RankOneCannotUseImplicitBatch) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({3})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({3})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-scalar"), 1.0f, {}); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, const_val); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} + +TEST_F(SegmentTest, TwoChainsDiffBatchSizes) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({2, 3})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({5, 3})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-scalar"), 1.0f, {}); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, const_val); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, const_val); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + /*expected_segments=*/{{"output-0", "const-scalar"}}); + + // Converter will 
create engines based on the static batch size + EnableImplicitBatchModeForStaticEngine(1); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + /*expected_segments=*/{{"output-0", "const-scalar"}}); +} + +TEST_F(SegmentTest, SameRankImplicitBroadcastingStaticBatchSize) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(TensorShape({2, 3, 1})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({1, 3, 4})); + auto input_2_shape = ops::Placeholder::Shape(TensorShape({2, 3, 4})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto input_2 = + ops::Placeholder(s.WithOpName("input-2"), DT_FLOAT, input_2_shape); + auto multiple = ops::Mul(s.WithOpName("multiple"), input_2, input_2); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, multiple); + auto output_1 = ops::Add(s.WithOpName("output-1"), input_1, multiple); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + item.fetch.push_back("output-1"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"multiple", "output-0", "output-1"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {all_nodes}); +} + +TEST_F(SegmentTest, SameRankImplicitBroadcastingDynamicBatchSize) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({1, 2})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-val"), 1.0f, {1, 1}); + auto add_0 = ops::Add(s.WithOpName("add-0"), input_0, const_val); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, add_0); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-val", "add-0", "output-0"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + {{"const-val", "add-0", "output-0"}}); +} + +TEST_F(SegmentTest, IncompatibleBatchSizes) { + Scope s = Scope::NewRootScope(); + auto input_0_shape = ops::Placeholder::Shape(PartialTensorShape({-1, 2})); + auto input_1_shape = ops::Placeholder::Shape(TensorShape({2, 2})); + auto input_0 = + ops::Placeholder(s.WithOpName("input-0"), DT_FLOAT, input_0_shape); + auto input_1 = + ops::Placeholder(s.WithOpName("input-1"), DT_FLOAT, input_1_shape); + auto const_val = ops::Const(s.WithOpName("const-val"), 1.0f, {2, 2}); + auto add_0 = ops::Add(s.WithOpName("add-0"), input_0, const_val); + auto output_0 = ops::Add(s.WithOpName("output-0"), input_0, add_0); + + grappler::GrapplerItem item; + item.fetch.push_back("output-0"); + 
TF_EXPECT_OK(s.ToGraphDef(&item.graph)); + + grappler::GraphProperties static_graph_properties(item); + TF_EXPECT_OK(static_graph_properties.InferStatically(true)); + + Graph g(OpRegistry::Global()); + TF_CHECK_OK( + ConvertGraphDefToGraph(GraphConstructorOptions(), item.graph, &g)); + + const std::set all_nodes = {"const-val", "add-0", "output-0"}; + EnableImplicitBatchModeForStaticEngine(); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, {}); +} } // namespace test } // namespace segment } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.cc b/tensorflow/compiler/tf2tensorrt/segment/union_find.cc new file mode 100644 index 00000000000..29882ed6e60 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.cc @@ -0,0 +1,154 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" + +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace segment { + +namespace { +template +inline bool CheckIfCompatible(const absl::optional& a, + const absl::optional& b) { + if (a.has_value() && b.has_value()) { + return *a == *b; + } + return true; +} + +template +inline bool UnifyValues(absl::optional& a, absl::optional& b) { + if (a.has_value()) { + b = a; + } else { + a = b; + } + return true; +} + +template +inline absl::optional MergeCompatible(const absl::optional& a, + const absl::optional& b) { + DCHECK(CheckIfCompatible(a, b)); + return a.has_value() ? 
a : b; +} + +} // namespace + +ClusterBatchSize::ClusterBatchSize() + : batch_size_(absl::nullopt), max_batch_size_(absl::nullopt) {} + +bool ClusterBatchSize::operator==(const ClusterBatchSize& other) { + return batch_size_ == other.batch_size_ && + max_batch_size_ == other.max_batch_size_; +} + +ClusterBatchSize& ClusterBatchSize::SetBatchSize(int batch_size) { + SetBatchSize(static_cast>(batch_size)); + return *this; +} + +ClusterBatchSize& ClusterBatchSize::SetBatchSize( + const absl::optional& batch_size) { + batch_size_ = MergeCompatible(batch_size_, batch_size); + if (batch_size_.has_value() && batch_size_.value() >= 0) { + SetMaxBatchSize(batch_size_); + } + return *this; +} + +bool ClusterBatchSize::HasBatchSize() const { return batch_size_.has_value(); } + +int ClusterBatchSize::GetBatchSize() const { + DCHECK(HasBatchSize()); + return batch_size_.value(); +} + +ClusterBatchSize& ClusterBatchSize::SetMaxBatchSize(int max_batch_size) { + SetBatchSize(static_cast>(max_batch_size)); + return *this; +} + +ClusterBatchSize& ClusterBatchSize::SetMaxBatchSize( + const absl::optional& max_batch_size) { + max_batch_size_ = MergeCompatible(max_batch_size_, max_batch_size); + return *this; +} + +absl::optional ClusterBatchSize::GetOptionalMaxBatchSize() const { + return max_batch_size_; +} + +bool ClusterBatchSize::MergeIfCompatible(const ClusterBatchSize& other) { + if (!CheckIfCompatible(batch_size_, other.batch_size_) || + !CheckIfCompatible(max_batch_size_, other.max_batch_size_)) { + return false; + } + + SetBatchSize(other.batch_size_); + SetMaxBatchSize(other.max_batch_size_); + return true; +} + +string ClusterBatchSize::ToString() const { + string s; + const auto append_optional_num = [&](const absl::optional& num) { + if (num.has_value()) { + absl::StrAppendFormat(&s, "%d", num.value()); + } else { + absl::StrAppendFormat(&s, "?"); + } + }; + absl::StrAppendFormat(&s, "batch_size="); + append_optional_num(batch_size_); + absl::StrAppendFormat(&s, ", max_batch_size="); + append_optional_num(max_batch_size_); + return s; +} + +ClusterProperty::ClusterProperty(const ClusterBatchSize& batch_size, + const DeviceNameUtils::ParsedName& device_name) + : batch_size_(batch_size), device_name_(device_name) {} + +Status ClusterProperty::Merge(const ClusterProperty& other) { + ClusterBatchSize merged_batch_size(batch_size_); + if (!merged_batch_size.MergeIfCompatible(other.batch_size_)) { + return errors::Internal( + "trying to merge clusters with incompatible batch sizes."); + } + + absl::optional merged_device_name = + MergeIfCompatible(device_name_, other.device_name_); + if (!merged_device_name.has_value()) { + return errors::Internal( + "trying to merge clusters with incompatible device assignment."); + } + + batch_size_ = std::move(merged_batch_size); + device_name_ = std::move(merged_device_name.value()); + return Status::OK(); +} + +} // namespace segment +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index 6458ae692fd..9a2f1e8dd5b 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -16,55 +16,192 @@ limitations under the License. 
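The batch-size bookkeeping in union_find.cc above rests on one small rule for optional values: two values conflict only when both are present and differ, and merging keeps whichever one is known. Restated with std::optional purely for illustration:

#include <optional>

template <typename T>
bool Compatible(const std::optional<T>& a, const std::optional<T>& b) {
  return !a.has_value() || !b.has_value() || *a == *b;
}

template <typename T>
std::optional<T> MergeOptionals(const std::optional<T>& a,
                                const std::optional<T>& b) {
  return a.has_value() ? a : b;  // precondition: Compatible(a, b)
}

// Example: batch sizes 8 and (unknown) merge to 8; 8 and 8 merge to 8;
// 8 and 16 are incompatible, so the two clusters are not unioned.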
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ +#include "absl/types/optional.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/device_name_utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + namespace tensorflow { namespace tensorrt { namespace segment { -// Union-Find data structure. -// Each cluster has an associated value; when merging clusters we can control -// which value becomes the representative of the merged clusters. Values must be -// copyable. -template +// ClusterBatchSize is a data structure to record the batch size we have seen +// for a cluster during segmentation. +// +// With the help of shape inference, all the dynamic batch sizes are converted +// to a negative integer number. +// If the number is -1, then nothing is known about the dynamic batch size. +// Ideally, we should not put nodes with -1 batch size into the same cluster, +// as they will likely have different batch sizes at runtime. However, we +// currently treat -1 as an equivalent class for simple implementation. We may +// need to revise this if it causes performance issues. +// If the number is strictly less than -1, then it represents a equivalent +// class. It is infered that all the nodes with the same equivalent class +// (strictly less than -1) shall have the same batch size at runtime. +// +// When constructing clusters for implicit batch mode, we support both +// dynamic batch sizes and static batch sizes. As all the nodes inside the same +// cluster shall have the same batch size at runtime, we restrict nodes inside a +// cluster to either have the same dynamic batch size equivalent class or the +// same static batch size value. +// +// Besides, all the nodes with an annotated max batch size inside the same +// cluster shall have the same annotated max batch size. (It is allowed if +// part or all the nodes inside the cluster doesn't have annotated max batch +// size). Static batch sizes are treated as max batch size annotations. The +// converter max batch size is used for an OP with a dynamic batch size and no +// annotated max batch size. +// +// cluster: a = a1[1,3] + a1[1,3] +// ClusterBatchSize: batch_size_ = 1 +// max_batch_size_ = 1 +// +// cluster: b = b1[-1,3] + b2[-1, 3] +// ClusterBatchSize: batch_size_ = -1 +// max_batch_size_ = null +// +// cluster: c = c1[-2,3] + c2[-2, 3](max_batch_size=100) +// ClusterBatchSize: batch_size_ = -2 +// max_batch_size_ = 100 +// +// When constructing cluster for explicit batch mode, all ClusterBatchSize is +// irrelevant. +// + +class ClusterBatchSize { + public: + ClusterBatchSize(); + + bool operator==(const ClusterBatchSize& other); + bool operator!=(const ClusterBatchSize& other) { return !(*this == other); } + + // Sets the batch size assuming that the object doesn't have a batch size yet: + // A non-negative input representing a static batch size value. + // A negative input representing a dynamic batch size equivalent class. + ClusterBatchSize& SetBatchSize(int batch_size); + bool HasBatchSize() const; + int GetBatchSize() const; + + // Sets the max batch size assuming that the object doesn't have a max batch + // size yet. + ClusterBatchSize& SetMaxBatchSize(int max_batch_size); + absl::optional GetOptionalMaxBatchSize() const; + + // Merge `other` into the current ClusterBatchSize if the two are not + // conflicting. Two ClusterBatchSizes are conflicting iff they both have a + // value and their values are different. 
+  bool MergeIfCompatible(const ClusterBatchSize& other);
+
+  // Returns a string for the batch size and the annotated max batch size.
+  // For the batch size:
+  //   If the object has a static batch size, return a string representing a
+  //   non-negative integer.
+  //   If the object has a dynamic batch size, return a string representing a
+  //   negative integer as an equivalence class.
+  //   If the object doesn't have a batch size yet, return "?".
+  // For the annotated max batch size:
+  //   If the cluster has an annotated max batch size in at least one of the
+  //   nodes, return a string representing the annotated max batch size.
+  //   Otherwise, return "?".
+  std::string ToString() const;
+
+ private:
+  ClusterBatchSize& SetBatchSize(const absl::optional<int>& batch_size);
+  ClusterBatchSize& SetMaxBatchSize(const absl::optional<int>& max_batch_size);
+
+  absl::optional<int> batch_size_;
+  absl::optional<int> max_batch_size_;
+};
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const ClusterBatchSize& batch_size) {
+  return os << batch_size.ToString();
+}
+
+// Represents the accumulated properties of a cluster during segmentation,
+// including information about batch size and device assignment. Clusters
+// shall have compatible properties in order to be merged together.
+class ClusterProperty {
+ public:
+  ClusterProperty() {}
+  ClusterProperty(const ClusterBatchSize& batch_size,
+                  const DeviceNameUtils::ParsedName& device_name);
+
+  // Returns the batch size of the cluster.
+  const ClusterBatchSize& BatchSize() const { return batch_size_; }
+
+  // Returns the device name of the cluster.
+  const DeviceNameUtils::ParsedName& DeviceName() const {
+    return device_name_;
+  }
+
+  Status Merge(const ClusterProperty& other);
+
+ private:
+  ClusterBatchSize batch_size_;
+  DeviceNameUtils::ParsedName device_name_;
+};
+
+// Represents a disjoint set of copyable values of type T together with an
+// accumulated property of type P for the values in the set. Most of the
+// methods in this class are side-effecting, as they also compress the path
+// from the object to the root of its containing set.
+template <typename T, typename P>
 class UnionFind {
  public:
   UnionFind() : size_(1), parent_(nullptr) {}
-  explicit UnionFind(const T& v) : size_(1), parent_(nullptr), value_(v) {}
+  UnionFind(const T& v, const P& p)
+      : size_(1), parent_(nullptr), value_(v), property_(p) {}
+  UnionFind(const T& v, P&& p)
+      : size_(1), parent_(nullptr), value_(v), property_(p) {}
 
-  // Returns the number of elements in a cluster.
+  // Returns the number of elements in the set and compresses the path from
+  // this object to the root of the set.
   int Size() { return FindRoot()->size_; }
 
-  // Merges this cluster with 'other'. This cluster's value becomes
-  // the value of the merged cluster; the value of 'other' is ignored.
-  void Merge(UnionFind* other);
+  // Returns the accumulated property of all the elements in the set and
+  // compresses the path from this object to the root of the set.
+  const P& Property() { return FindRoot()->property_; }
 
-  // Each cluster has an associated value. Retrieves the value associated
-  // with this cluster.
-  T& ParentValue() { return FindRoot()->value_; }
+  // Merges this set with 'other'. This updates the size_ and property_ of the
+  // set. The size_ and property_ of 'other' become inaccessible, as only the
+  // size_ and property_ of the root of the set are accessible.
+ Status Merge(UnionFind* other); - // Get the original value of this node. - T& Value() { return value_; } + // Retrieves the value for the root of the set. + const T& ParentValue() { return FindRoot()->value_; } + + // Returns the value for the object. + const T& Value() const { return value_; } private: - // Finds the root element of the cluster. Performs path compression. + // Returns the root object for the set and compresses the path from this + // object to the root object. UnionFind* FindRoot(); int size_; UnionFind* parent_; T value_; + P property_; }; -template -void UnionFind::Merge(UnionFind* other) { +template +Status UnionFind::Merge(UnionFind* other) { UnionFind* a = FindRoot(); UnionFind* b = other->FindRoot(); - if (a == b) return; + if (a == b) return Status::OK(); + P merged_property(a->property_); + TF_RETURN_IF_ERROR(merged_property.Merge(b->property_)); b->parent_ = a; a->size_ += b->size_; + a->property_ = std::move(merged_property); + return Status::OK(); } -template -UnionFind* UnionFind::FindRoot() { +template +UnionFind* UnionFind::FindRoot() { if (!parent_) return this; // Path compression: update intermediate nodes to point to the root of the // equivalence class. @@ -76,4 +213,6 @@ UnionFind* UnionFind::FindRoot() { } // namespace tensorrt } // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ diff --git a/tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc b/tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc new file mode 100644 index 00000000000..ca8de76aef9 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc @@ -0,0 +1,95 @@ +// Auto-generated, do not edit. + +extern "C" { + +nvinfer1::IPluginV2* createRPNROIPlugin(int featureStride, int preNmsTop, + int nmsMaxOut, float iouThreshold, + float minBoxSize, float spatialScale, + nvinfer1::DimsHW pooling, + nvinfer1::Weights anchorRatios, + nvinfer1::Weights anchorScales) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(int, int, int, float, float, float, nvinfer1::DimsHW, nvinfer1::Weights, nvinfer1::Weights); + static auto func_ptr = LoadSymbol("createRPNROIPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createRPNROIPlugin"); + return func_ptr(featureStride, preNmsTop, nmsMaxOut, iouThreshold, minBoxSize, spatialScale, pooling, anchorRatios, anchorScales); +} + +nvinfer1::IPluginV2* createNormalizePlugin(const nvinfer1::Weights* scales, + bool acrossSpatial, + bool channelShared, float eps) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(const nvinfer1::Weights *, bool, bool, float); + static auto func_ptr = LoadSymbol("createNormalizePlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createNormalizePlugin"); + return func_ptr(scales, acrossSpatial, channelShared, eps); +} + +nvinfer1::IPluginV2* createPriorBoxPlugin( + nvinfer1::plugin::PriorBoxParameters param) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::PriorBoxParameters); + static auto func_ptr = LoadSymbol("createPriorBoxPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createPriorBoxPlugin"); + return func_ptr(param); +} + +nvinfer1::IPluginV2* createAnchorGeneratorPlugin( + nvinfer1::plugin::GridAnchorParameters* param, int numLayers) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::GridAnchorParameters *, int); + static auto func_ptr = LoadSymbol("createAnchorGeneratorPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createAnchorGeneratorPlugin"); + return func_ptr(param, numLayers); +} + 
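Every wrapper in these auto-generated stub files follows the same pattern: resolve the real TensorRT entry point once, cache the function pointer in a static local, and report a fatal error if the symbol cannot be found. The sketch below illustrates that mechanism in isolation; it uses plain dlopen/dlsym and an illustrative library name rather than the DsoLoader/Env plumbing that the real stub .cc files provide, so none of it should be read as the actual loader implementation.

#include <dlfcn.h>

#include <cstdio>
#include <cstdlib>

// Open the plugin library once; RTLD_LAZY defers binding of other symbols.
void* GetNvInferPluginHandle() {
  static void* handle = dlopen("libnvinfer_plugin.so", RTLD_LAZY);
  return handle;
}

// Resolve a symbol and cast it to the expected function-pointer type.
template <typename FuncPtr>
FuncPtr LoadSymbol(const char* symbol_name) {
  void* handle = GetNvInferPluginHandle();
  void* symbol = handle ? dlsym(handle, symbol_name) : nullptr;
  return reinterpret_cast<FuncPtr>(symbol);
}

// Called by the generated wrappers when resolution fails.
void LogFatalSymbolNotFound(const char* symbol_name) {
  std::fprintf(stderr, "%s symbol not found.\n", symbol_name);
  std::abort();
}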
+nvinfer1::IPluginV2* createNMSPlugin( + nvinfer1::plugin::DetectionOutputParameters param) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::DetectionOutputParameters); + static auto func_ptr = LoadSymbol("createNMSPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createNMSPlugin"); + return func_ptr(param); +} + +nvinfer1::IPluginV2* createLReLUPlugin(float negSlope) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(float); + static auto func_ptr = LoadSymbol("createLReLUPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createLReLUPlugin"); + return func_ptr(negSlope); +} + +nvinfer1::IPluginV2* createReorgPlugin(int stride) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(int); + static auto func_ptr = LoadSymbol("createReorgPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createReorgPlugin"); + return func_ptr(stride); +} + +nvinfer1::IPluginV2* createRegionPlugin( + nvinfer1::plugin::RegionParameters params) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::RegionParameters); + static auto func_ptr = LoadSymbol("createRegionPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createRegionPlugin"); + return func_ptr(params); +} + +nvinfer1::IPluginV2* createClipPlugin(const char* layerName, float clipMin, + float clipMax) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(const char *, float, float); + static auto func_ptr = LoadSymbol("createClipPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createClipPlugin"); + return func_ptr(layerName, clipMin, clipMax); +} + +nvinfer1::IPluginV2* createBatchedNMSPlugin( + nvinfer1::plugin::NMSParameters param) { + using FuncPtr = nvinfer1::IPluginV2 * ( *)(nvinfer1::plugin::NMSParameters); + static auto func_ptr = LoadSymbol("createBatchedNMSPlugin"); + if (!func_ptr) LogFatalSymbolNotFound("createBatchedNMSPlugin"); + return func_ptr(param); +} + +bool initLibNvInferPlugins(void* logger, const char* libNamespace) { + using FuncPtr = bool ( *)(void *, const char *); + static auto func_ptr = LoadSymbol("initLibNvInferPlugins"); + if (!func_ptr) LogFatalSymbolNotFound("initLibNvInferPlugins"); + return func_ptr(logger, libNamespace); +} + +} // extern "C" diff --git a/tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc b/tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc new file mode 100644 index 00000000000..ad393f5c39c --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc @@ -0,0 +1,47 @@ +// Auto-generated, do not edit. 
+ +extern "C" { + +void* createInferBuilder_INTERNAL(void* logger, int version) { + using FuncPtr = void * (*)(void *, int); + static auto func_ptr = LoadSymbol("createInferBuilder_INTERNAL"); + if (!func_ptr) LogFatalSymbolNotFound("createInferBuilder_INTERNAL"); + return func_ptr(logger, version); +} + +void* createInferRefitter_INTERNAL(void* engine, void* logger, int version) { + using FuncPtr = void * (*)(void *, void *, int); + static auto func_ptr = LoadSymbol("createInferRefitter_INTERNAL"); + if (!func_ptr) LogFatalSymbolNotFound("createInferRefitter_INTERNAL"); + return func_ptr(engine, logger, version); +} + +void* createInferRuntime_INTERNAL(void* logger, int version) { + using FuncPtr = void * (*)(void *, int); + static auto func_ptr = LoadSymbol("createInferRuntime_INTERNAL"); + if (!func_ptr) LogFatalSymbolNotFound("createInferRuntime_INTERNAL"); + return func_ptr(logger, version); +} + +nvinfer1::ILogger* getLogger() { + using FuncPtr = nvinfer1::ILogger * (*)(); + static auto func_ptr = LoadSymbol("getLogger"); + if (!func_ptr) LogFatalSymbolNotFound("getLogger"); + return func_ptr(); +} + +int getInferLibVersion() { + using FuncPtr = int (*)(); + static auto func_ptr = LoadSymbol("getInferLibVersion"); + if (!func_ptr) LogFatalSymbolNotFound("getInferLibVersion"); + return func_ptr(); +} + +nvinfer1::IPluginRegistry* getPluginRegistry() { + using FuncPtr = nvinfer1::IPluginRegistry * (*)(); + static auto func_ptr = LoadSymbol("getPluginRegistry"); + if (!func_ptr) LogFatalSymbolNotFound("getPluginRegistry"); + return func_ptr(); +} + +} // extern "C" diff --git a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc index c884814e009..002406cf9eb 100644 --- a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc +++ b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_plugin_stub.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInferPlugin.h" @@ -51,10 +50,10 @@ void LogFatalSymbolNotFound(const char* symbol_name) { } } // namespace -#if IS_TRT_VERSION_GE(5, 1, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_5_1.inc" -#elif IS_TRT_VERSION_GE(5, 0, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_5_0.inc" +#if NV_TENSORRT_MAJOR < 7 +#error TensorRT version earlier than 7 is not supported. +#elif NV_TENSORRT_MAJOR == 7 || NV_TENSORRT_MAJOR == 8 +#include "tensorflow/compiler/tf2tensorrt/stub/NvInferPlugin_7_0.inc" #else -#error TensorRT version earlier than 5 is not supported. +#error This version of TensorRT is not supported. #endif diff --git a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc index 2feb785350d..a0a11766cd3 100644 --- a/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc +++ b/tensorflow/compiler/tf2tensorrt/stub/nvinfer_stub.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInfer.h" @@ -51,13 +50,12 @@ void LogFatalSymbolNotFound(const char* symbol_name) { } } // namespace -#if IS_TRT_VERSION_GE(8, 0, 0, 0) +#if NV_TENSORRT_MAJOR < 7 +#error TensorRT version earlier than 7 is not supported. +#elif NV_TENSORRT_MAJOR == 7 +#include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_7_0.inc" +#elif NV_TENSORRT_MAJOR == 8 #include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_8_0.inc" -#elif IS_TRT_VERSION_GE(5, 1, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_5_1.inc" -#elif IS_TRT_VERSION_GE(5, 0, 0, 0) -#include "tensorflow/compiler/tf2tensorrt/stub/NvInfer_5_0.inc" #else -#error TensorRT version earlier than 5 is not supported. +#error This version of TensorRT is not supported. #endif - diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc index 8a18e4eaf70..41e74928f77 100644 --- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc +++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc @@ -12,24 +12,65 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include +#include +#include -#include "tensorflow/core/common_runtime/gpu/gpu_init.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/platform/test.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#include "third_party/tensorrt/NvInfer.h" #include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/test.h" +#include "third_party/tensorrt/NvInfer.h" +#include "third_party/tensorrt/NvInferPlugin.h" +#include "third_party/tensorrt/NvInferRuntimeCommon.h" +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN +#include "third_party/tensorrt/plugin/efficientNMSPlugin/efficientNMSPlugin.h" namespace tensorflow { namespace tensorrt { +std::unique_ptr +MakeNMSPluginCreator(const std::string& plugin_namespace = "tftrt") { + auto pluginCreator = + std::make_unique(); + pluginCreator->setPluginNamespace(plugin_namespace.c_str()); + std::string pluginType = std::string{pluginCreator->getPluginNamespace()} + + "::" + std::string{pluginCreator->getPluginName()} + + " version " + + std::string{pluginCreator->getPluginVersion()}; + VLOG(0) << "Created plugin type " << pluginType; + return pluginCreator; +} + +struct PluginDeleter { + void operator()(nvinfer1::IPluginV2* t); +}; +void PluginDeleter::operator()(nvinfer1::IPluginV2* t) { t->destroy(); } + +std::unique_ptr createPlugin( + const std::string& name, nvinfer1::IPluginCreator* pluginCreator, + const std::vector& pluginFields) { + if (!pluginCreator) { + return nullptr; + } + nvinfer1::PluginFieldCollection fc; + 
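+  // The PluginFieldCollection is only a non-owning view over the
+  // caller-provided creator attributes, so 'pluginFields' must stay alive for
+  // the duration of the createPlugin() call. Passing an empty vector, as the
+  // test below does, leaves the creator with its default parameter values.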
fc.nbFields = pluginFields.size(); + fc.fields = pluginFields.data(); + return std::unique_ptr{ + pluginCreator->createPlugin(name.c_str(), &fc)}; +} +} // namespace tensorrt +} // namespace tensorflow +#endif + +namespace tensorflow { +namespace tensorrt { class ScopedWeights { public: @@ -45,65 +86,125 @@ class ScopedWeights { nvinfer1::Weights w; }; -const char* kInputTensor = "input"; -const char* kOutputTensor = "output"; +class ScopedShapedWeights { + public: + ScopedShapedWeights(nvinfer1::Dims dims, float value) + : dims_(dims), + value_(std::accumulate(dims.d, dims.d + dims.nbDims, 1, + std::multiplies<>()), + value) { + w.type = nvinfer1::DataType::kFLOAT; + w.values = value_.data(); + w.count = value_.size(); + } + + nvinfer1::Dims dims_; + std::vector value_; + nvinfer1::Weights w; +}; + +const char* kInputTensor1 = "input1"; +const char* kInputTensor2 = "input2"; +const char* kOutputTensor1 = "output"; +const char* kOutputTensor2 = "output-nms"; -// Creates a network to compute y=2x+3. +// Creates a network to compute x+y. TrtUniquePtrType CreateSerializedEngine() { Logger& logger = *Logger::GetLogger(); TrtUniquePtrType builder( nvinfer1::createInferBuilder(logger)); - ScopedWeights weights(2.0); - ScopedWeights bias(3.0); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) TrtUniquePtrType network( - builder->createNetworkV2(0L)); -#else - nvinfer1::INetworkDefinition* network = builder->createNetwork(); -#endif + builder->createNetworkV2( + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); // Add the input. - auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT, - nvinfer1::Dims3{1, 1, 1}); - EXPECT_NE(input, nullptr); - // Add the hidden layer. - auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get()); + auto input1 = network->addInput(kInputTensor1, nvinfer1::DataType::kFLOAT, + nvinfer1::Dims4{1, 1, 1, 1}); + auto input2 = network->addInput(kInputTensor2, nvinfer1::DataType::kFLOAT, + nvinfer1::Dims4{1, 1, 1, 1}); + EXPECT_NE(input1, nullptr); + EXPECT_NE(input2, nullptr); + // Add an ILayer layer. + auto layer = network->addElementWise(*input1, *input2, + nvinfer1::ElementWiseOperation::kSUM); EXPECT_NE(layer, nullptr); - // Mark the output. auto output = layer->getOutput(0); - output->setName(kOutputTensor); + output->setName(kOutputTensor1); network->markOutput(*output); - // Build the engine + +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN + // Add an efficient nms plugin. + ScopedShapedWeights boxes_weights(nvinfer1::Dims3(1, 10, 4), 0.0f); + ScopedShapedWeights scores_weights(nvinfer1::Dims3(1, 10, 10), 0.0f); + nvinfer1::IConstantLayer* boxes = + network->addConstant(boxes_weights.dims_, boxes_weights.w); + nvinfer1::IConstantLayer* scores = + network->addConstant(scores_weights.dims_, scores_weights.w); + + std::array nms_inputs = {boxes->getOutput(0), + scores->getOutput(0)}; + auto plugin_creator = MakeNMSPluginCreator("tftrt"); + auto plugin = createPlugin("nms_plugin_instance", plugin_creator.get(), {}); + auto nms = network->addPluginV2(nms_inputs.data(), 2, *plugin); + nms->getOutput(0)->setName(kOutputTensor2); + network->markOutput(*nms->getOutput(0)); +#else + auto sub_layer = network->addElementWise( + *input1, *input2, nvinfer1::ElementWiseOperation::kSUB); + EXPECT_NE(sub_layer, nullptr); + network->markOutput(*sub_layer->getOutput(0)); + sub_layer->getOutput(0)->setName(kOutputTensor2); +#endif + + // Build the engine. 
builder->setMaxBatchSize(1); -#if IS_TRT_VERSION_GE(6, 0, 0, 0) TrtUniquePtrType builderConfig( builder->createBuilderConfig()); - builderConfig->setMaxWorkspaceSize(1 << 10); + builderConfig->setMaxWorkspaceSize(1 << 20); TrtUniquePtrType engine( builder->buildEngineWithConfig(*network, *builderConfig)); -#else - builder->setMaxWorkspaceSize(1 << 10); - auto engine = builder->buildCudaEngine(*network); -#endif EXPECT_NE(engine, nullptr); // Serialize the engine to create a model, then close everything. TrtUniquePtrType model(engine->serialize()); return model; } +template +unsigned GetBindingSizeBytes(const nvinfer1::ICudaEngine& engine, int index, + unsigned batch_size) { + unsigned vol = batch_size; + auto dims = engine.getBindingDimensions(index); + int vecDim = engine.getBindingVectorizedDim(index); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = engine.getBindingComponentsPerElement(index); + // Divide round up. + dims.d[vecDim] = (dims.d[vecDim] + scalarsPerVec - 1 / scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<>()); + return vol * sizeof(T); +} + // Executes the network. -void Execute(nvinfer1::IExecutionContext* context, const float* input, - float* output) { +void Execute(nvinfer1::IExecutionContext* context, const float* input1, + const float* input2, float* output1, float* output2) { const nvinfer1::ICudaEngine& engine = context->getEngine(); // We have two bindings: input and output. - ASSERT_EQ(engine.getNbBindings(), 2); - const int input_index = engine.getBindingIndex(kInputTensor); - const int output_index = engine.getBindingIndex(kOutputTensor); + ASSERT_EQ(engine.getNbBindings(), 4); + const int input_index1 = engine.getBindingIndex(kInputTensor1); + const int input_index2 = engine.getBindingIndex(kInputTensor2); + const int output_index1 = engine.getBindingIndex(kOutputTensor1); + const int output_index2 = engine.getBindingIndex(kOutputTensor2); // Create GPU buffers and a stream - void* buffers[2]; - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float))); - ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float))); + std::vector buffers(engine.getNbBindings()); + for (int i = 0; i < buffers.size(); i++) { + ASSERT_EQ( + 0, cudaMalloc(&buffers[i], GetBindingSizeBytes(engine, i, 1))); + } + cudaStream_t stream; ASSERT_EQ(0, cudaStreamCreate(&stream)); @@ -112,22 +213,35 @@ void Execute(nvinfer1::IExecutionContext* context, const float* input, // Note that since the host buffer was not created as pinned memory, these // async copies are turned into sync copies. So the following synchronization // could be removed. 
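  // If truly asynchronous copies were needed, the host-side buffers could be
  // allocated as pinned memory instead (hypothetical variant, not used by this
  // test):
  //   float* pinned_input = nullptr;
  //   ASSERT_EQ(0, cudaMallocHost(&pinned_input, sizeof(float)));
  //   ... use pinned_input as the source of cudaMemcpyAsync ...
  //   ASSERT_EQ(0, cudaFreeHost(pinned_input));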
- ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float), + ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index1], input1, sizeof(float), + cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index2], input2, sizeof(float), cudaMemcpyHostToDevice, stream)); - context->enqueue(1, buffers, stream, nullptr); - ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float), + context->enqueueV2(buffers.data(), stream, nullptr); + ASSERT_EQ(0, cudaMemcpyAsync(output1, buffers[output_index1], sizeof(float), cudaMemcpyDeviceToHost, stream)); + ASSERT_EQ( + 0, cudaMemcpyAsync(output2, buffers[output_index2], + GetBindingSizeBytes(engine, output_index2, 1), + cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release the stream and the buffers - ASSERT_EQ(0, cudaFree(buffers[input_index])); - ASSERT_EQ(0, cudaFree(buffers[output_index])); + for (int i = 0; i < buffers.size(); i++) { + ASSERT_EQ(0, cudaFree(buffers[i])); + } cudaStreamDestroy(stream); } TEST(TensorrtTest, BasicFunctions) { + // We must register the plugin creator in order to deserialize the plugin. +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN + auto plugin_creator = MakeNMSPluginCreator("tftrt"); + getPluginRegistry()->registerCreator(*plugin_creator, "tftrt"); +#endif + // Handle the case where the test is run on machine with no gpu available. - if (CHECK_NOTNULL(GPUMachineManager())->VisibleDeviceCount() <= 0) { + if (CHECK_NOTNULL(se::GPUMachineManager())->VisibleDeviceCount() <= 0) { LOG(WARNING) << "No gpu device available, probably not being run on a gpu " "machine. Skipping..."; return; @@ -145,14 +259,29 @@ TEST(TensorrtTest, BasicFunctions) { engine->createExecutionContext()); // Execute the network. - float input = 1234; - float output; - Execute(context.get(), &input, &output); - EXPECT_EQ(output, input * 2 + 3); + float input1 = 1234; + float input2 = 567; + + std::vector output1( + GetBindingSizeBytes(*engine, 2, 1) / sizeof(float), 0.0f); + + std::vector output2( + GetBindingSizeBytes(*engine, 3, 1) / sizeof(int32), 0.0f); + + ASSERT_EQ(output1.size(), 1); + ASSERT_EQ(output2.size(), 1); + + Execute(context.get(), &input1, &input2, output1.data(), output2.data()); + EXPECT_EQ(output1[0], input1 + input2); + +#ifdef TF_TRT_USE_EFFICIENT_NMS_PLUGIN + EXPECT_EQ(output2[0], 0); +#else + EXPECT_EQ(output2[0], 667); +#endif // TF_TRT_USE_EFFICIENT_NMS_PLUGIN } } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc b/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc new file mode 100644 index 00000000000..a415c5fdd41 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc @@ -0,0 +1,512 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h" + +#include +#include +#include + +#include "absl/strings/str_join.h" +#include "tensorflow/cc/tools/freeze_saved_model.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/clusters/single_machine.h" +#include "tensorflow/core/grappler/clusters/utils.h" +#include "tensorflow/core/grappler/devices.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/grappler_item_builder.h" +#include "tensorflow/core/grappler/optimizers/meta_optimizer.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/public/session.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { + +namespace tensorrt { +namespace { + +// Creates and provisions a new cluster. The caller must call Shutdown before +// the cluster is destroyed. +Status NewCluster(grappler::Cluster** cluster) { + int num_cpu_cores = grappler::GetNumAvailableLogicalCPUCores(); + int num_gpus = grappler::GetNumAvailableGPUs(); + int timeout_s = 60 * 10; + *cluster = new grappler::SingleMachine(timeout_s, num_cpu_cores, num_gpus); + (*cluster)->DisableDetailedStats(true); + (*cluster)->AllowSoftPlacement(true); + (*cluster)->SetNumWarmupSteps(10); + TF_RETURN_IF_ERROR((*cluster)->Provision()); + return Status::OK(); +} + +Status RunGrappler(const MetaGraphDef& meta_graph_def, + const std::vector& input_names, + const std::vector& output_names, + const ConfigProto& config_proto, grappler::Cluster* cluster, + GraphDef* out_graph_def) { + grappler::ItemConfig item_config; + + for (const string& name : input_names) { + item_config.feed_nodes.insert(name); + } + for (const string& name : output_names) { + item_config.fetch_nodes.insert(name); + } + + std::unique_ptr item = + grappler::GrapplerItemFromMetaGraphDef("tf_graph", meta_graph_def, + item_config); + if (!item) { + return tensorflow::errors::Internal( + "Failed to create grappler item from MetaGraphDef."); + } + + tensorflow::DeviceBase* cpu_device = nullptr; + TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer( + std::move(*item), config_proto, cpu_device, cluster, out_graph_def)); + VLOG(2) << "Grappler finished\n"; + return Status::OK(); +} + +Status ImportGraphDefToSession(Session* session, const GraphDef& graph_def, + const string& prefix) { + ImportGraphDefOptions opts; + opts.prefix = prefix; + Graph graph(OpRegistry::Global()); + TF_RETURN_IF_ERROR(ImportGraphDef(opts, graph_def, &graph, nullptr)); + GraphDef new_graph_def; + graph.ToGraphDef(&new_graph_def); + TF_RETURN_IF_ERROR(session->Extend(new_graph_def)); + return Status::OK(); +} + +Status GetTrtRewriterConfig(const TfTrtConversionParams& params, + const GraphDef& frozen_graph_def, + RewriterConfig* opt_config) { + opt_config->set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); + opt_config->set_min_graph_nodes(-1); // do not skip small graphs + + // Turn off remapping. 
+ opt_config->set_remapping(RewriterConfig_Toggle::RewriterConfig_Toggle_OFF); + + // If the graph has QDQ nodes, then we need to disable folding of the + // QDQ with constants. Otherwise, the conversion will not work corectly. + // Ideally, we do this after segmentation and outlining of TRT regions to + // functions, but we currently lack that capability. Disabling QDQ-const + // folding doesn't matter if you don't have QDQ nodes, so we always enable + // this. + opt_config->set_experimental_disable_folding_quantization_emulation( + IS_TRT_VERSION_GE(8, 0, 0, 0)); + + // Initial transformations before TensorRTOptimizer is called + opt_config->add_optimizers("function"); + opt_config->add_optimizers("constfold"); + opt_config->add_optimizers("layout"); + opt_config->add_optimizers("constfold"); + + // Parameters for TensorRTOptimizer + auto trt_optimizer = opt_config->add_custom_optimizers(); + trt_optimizer->set_name("TensorRTOptimizer"); + + auto trt_parameter_map = trt_optimizer->mutable_parameter_map(); + (*trt_parameter_map)["is_dynamic_op"].set_b(true); + (*trt_parameter_map)["minimum_segment_size"].set_i( + params.minimum_segment_size); + string prec_string; + TF_RETURN_IF_ERROR( + TrtPrecisionModeToName(params.precision_mode, &prec_string)); + (*trt_parameter_map)["precision_mode"].set_s(prec_string); + (*trt_parameter_map)["max_batch_size"].set_i(1); + (*trt_parameter_map)["max_workspace_size_bytes"].set_i( + params.max_workspace_size_bytes); + (*trt_parameter_map)["max_cached_engines"].set_i(params.max_cached_engines); + (*trt_parameter_map)["use_calibration"].set_b(params.use_calibration); + (*trt_parameter_map)["profile_strategy"].set_s( + ProfileStrategyToName(params.profile_strategy)); + (*trt_parameter_map)["use_implicit_batch"].set_b(!params.use_dynamic_shape); + (*trt_parameter_map)["_allow_build_at_runtime"].set_b( + params.allow_build_at_runtime); + return Status::OK(); +} + +// Runs TRTOptimizer grappler pass. +Status RunTfTrt(const MetaGraphDef& meta_graph_def, + const std::vector& input_names, + const std::vector& output_names, + const RewriterConfig& rewriter_config, + GraphDef* segmented_graph_def) { + ConfigProto config_proto; + *config_proto.mutable_graph_options()->mutable_rewrite_options() = + rewriter_config; + + VLOG(4) << "Setting up Grappler parameters\n" << config_proto.DebugString(); + std::unique_ptr cluster; + grappler::Cluster* p_cluster; + mutex mu_cluster; // There can be only one provisioned cluster per process. + mutex_lock lock(mu_cluster); + TF_RETURN_IF_ERROR(NewCluster(&p_cluster)); + cluster.reset(p_cluster); + TF_RETURN_IF_ERROR(RunGrappler(meta_graph_def, input_names, output_names, + config_proto, cluster.get(), + segmented_graph_def)); + TF_RETURN_IF_ERROR(cluster->Shutdown()); + return Status::OK(); +} + +// Sets the _profile_generation mode attribute of all TRTEngineOp nodes in the +// graph to mode. 
+Status SetProfileGenerationMode(GraphDef* graph_def, bool mode) { + VLOG(3) << "Setting _profile_generation_mode=" << mode; + std::string op{"TRTEngineOp"}; + for (auto& node : *(graph_def->mutable_node())) { + if (!op.compare(node.op())) { + auto* attr = node.mutable_attr(); + AttrValue profile_generation_mode; + profile_generation_mode.set_b(mode); + (*attr)["_profile_generation_mode"] = profile_generation_mode; + } + } + return Status::OK(); +} + +Status RunSession(Session* session, const std::vector& input_names, + const std::vector& output_names, + const std::vector& input_tensors, + string prefix = "") { + TRT_ENSURE(!input_names.empty()); + TRT_ENSURE(!output_names.empty()); + TRT_ENSURE(!input_tensors.empty()); + + std::vector> input_pairs; + std::vector prefixed_output_names; + auto prefixed_name = [](std::string prefix, std::string name) { + return !prefix.empty() ? absl::StrJoin({prefix, name}, "/") : name; + }; + for (int i = 0; i < input_names.size(); i++) { + input_pairs.push_back( + {prefixed_name(prefix, input_names.at(i)), input_tensors.at(i)}); + } + for (int i = 0; i < output_names.size(); i++) { + prefixed_output_names.push_back(prefixed_name(prefix, output_names.at(i))); + } + std::vector output_tensors; + for (int i = 0; i < output_names.size(); i++) { + output_tensors.push_back({}); + } + VLOG(3) << "TF-TRT Build mode: running inference\n"; + TF_RETURN_IF_ERROR( + session->Run(input_pairs, prefixed_output_names, {}, &output_tensors)); + return Status::OK(); +} + +// Runs the model to create the engines. In dynamic shape mode, before creating +// the engines, we provide shapes to define optimization profiles. +Status Build(GraphDef& segmented_graph_def, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + Session* session, const TfTrtConversionParams params) { + VLOG(2) << "Building the model"; + bool need_collect_profiles = params.use_dynamic_shape && inputs.size() > 1; + if (need_collect_profiles) { + TF_RETURN_IF_ERROR(SetProfileGenerationMode(&segmented_graph_def, true)); + } + TF_RETURN_IF_ERROR(session->Create(segmented_graph_def)); + string prefix = ""; + if (need_collect_profiles) { + for (auto const& input : inputs) { + TF_RETURN_IF_ERROR(RunSession(session, input_names, output_names, input)); + } + prefix = "TrtBuildStep"; + TF_RETURN_IF_ERROR(SetProfileGenerationMode(&segmented_graph_def, false)); + VLOG(3) << "Importing graph with _profile_generation_mode disabled"; + TF_RETURN_IF_ERROR( + ImportGraphDefToSession(session, segmented_graph_def, prefix)); + } + TF_RETURN_IF_ERROR( + RunSession(session, input_names, output_names, *inputs.begin(), prefix)); + return Status::OK(); +} + +// Returns the resource manager associated with the node. +Status GetResourceManager(const NodeDef& node, Session* session, + ResourceMgr** rm) { + const DeviceMgr* device_mgr; + TF_RETURN_IF_ERROR(session->LocalDeviceManager(&device_mgr)); + Device* device; + string device_name = node.device().empty() + ? "/job:localhost/replica:0/task:0/device:GPU:0" + : node.device(); + TF_RETURN_IF_ERROR(device_mgr->LookupDevice(device_name, &device)); + *rm = device->resource_manager(); + return Status::OK(); +} + +// Looks up the cache resurce associated with the TRT node. 
+Status GetEngineCacheResource(const NodeDef& node, Session* session, + TRTEngineCacheResource** resource) { + ResourceMgr* rm; + TF_RETURN_IF_ERROR(GetResourceManager(node, session, &rm)); + + absl::string_view resource_name = node.name(); + size_t last_slash = resource_name.find_last_of('/'); + if (last_slash != absl::string_view::npos) { + resource_name.remove_prefix(last_slash + 1); + } + const std::string container(kTfTrtContainerName); + *resource = nullptr; + TF_RETURN_IF_ERROR( + rm->Lookup(container, std::string(resource_name), resource)); + if (resource == nullptr || (*resource)->cache_.size() == 0) { + return errors::Internal("Engine cache not found for", resource_name); + } + return Status::OK(); +} + +// Looks up the engine from the engine cache, and serializes the engine. +Status ReadSerializedEngine( + const NodeDef& node, Session* session, + TrtUniquePtrType* engine_data) { + TRTEngineCacheResource* resource; + TF_RETURN_IF_ERROR(GetEngineCacheResource(node, session, &resource)); + core::ScopedUnref unref_cache_res(resource); + if (resource->cache_.size() > 1) { + return errors::Internal( + "Multiple engines found, but we can only serialize one"); + } + const std::unique_ptr& engine = + resource->cache_.begin()->second; + if (!engine) { + return errors::Internal("Engine not found for", node.name()); + } + + if (engine->GetCudaEngine()) { + // Serialize the engine. + engine_data->reset(engine->GetCudaEngine()->serialize()); + } else { + LOG(WARNING) << "Engine cache contains nullptr"; + } + + return Status::OK(); +} + +// Saves the TRT engines as attributes of the TRTEngineOp nodes. +Status ConvertToStaticEngine(const GraphDef graph_def, + GraphDef* static_graph_def, Session* session) { + *static_graph_def = graph_def; + VLOG(1) << "Saving TRT engines as static engine"; + std::string op{"TRTEngineOp"}; + for (auto& node : *(static_graph_def->mutable_node())) { + if (!op.compare(node.op())) { + VLOG(2) << "Saving TRT engine for " << node.name() + << ", device: " << node.device(); + TrtUniquePtrType engine_data; + TF_RETURN_IF_ERROR(ReadSerializedEngine(node, session, &engine_data)); + auto* attr = node.mutable_attr(); + AttrValue static_engine; + static_engine.set_b(true); + AttrValue engine_string; + if (engine_data) { + engine_string.set_s(engine_data->data(), engine_data->size()); + } + (*attr)["static_engine"] = static_engine; + (*attr)["serialized_segment"] = engine_string; + } + } + return Status::OK(); +} + +Status ValidateConversionParams(const TfTrtConversionParams& p, int n_inputs) { + if (p.precision_mode == TrtPrecisionMode::INT8 && p.use_calibration) { + return errors::InvalidArgument( + "Calibration not yet implemented through the C++ interface. Please use " + "our Python API for calibration."); + } + if (p.convert_to_static_engine && n_inputs == 0) { + return errors::InvalidArgument( + "TRT Engine needs to be built before we can convert it to static " + "engine. Please provide input data to build the model."); + } + if (!p.convert_to_static_engine && n_inputs >= 0) { + // After the conversion, the session that was used to build the engines + // will be destroyed. If we do not convert the engine to static engine, + // then we loose the engines. + // + // TODO(tfeher): Provide a way to save dynamic engines and remove this + // warning. + LOG(WARNING) + << "Skipping build mode because we cannot save the " + "engines. 
Use convert_to_static_engines=true conversion " + "parameter to enable build mode and save the engines in the graph."; + } + if (!p.allow_build_at_runtime && n_inputs == 0) { + LOG(WARNING) + << "TRT will not be used since allow_build_at_runtime is disabled and " + "no inputs are provided to build during conversion."; + } + return Status::OK(); +} + +// Returns configuration used during the build step session run. +tensorflow::SessionOptions GetSessionConfg() { + // We also need to disable constant folding because we already ran constant + // folding and may have prevented quantization operation folding on purpose. + tensorflow::SessionOptions opts; + auto* rewriter_opts = + opts.config.mutable_graph_options()->mutable_rewrite_options(); + rewriter_opts->set_experimental_disable_folding_quantization_emulation(true); + + // It seems that we need to disable the optimizer entirely to prevent the + // folding. + rewriter_opts->set_disable_meta_optimizer(true); + + // Disable optimizations for static graph to allow calls to Session::Extend. + opts.config.mutable_experimental()->set_disable_optimize_for_static_graph( + true); + return opts; +} + +} // namespace + +::stream_executor::port::StatusOr ConvertAndBuild( + const GraphDef& frozen_graph_def, const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + const TfTrtConversionParams& conv_params) { + TF_RETURN_IF_ERROR(ValidateConversionParams(conv_params, inputs.size())); + MetaGraphDef meta_graph; + *meta_graph.mutable_graph_def() = frozen_graph_def; + + RewriterConfig rewriter_config; + TF_RETURN_IF_ERROR( + GetTrtRewriterConfig(conv_params, frozen_graph_def, &rewriter_config)); + + GraphDef segmented_graph_def; + TF_RETURN_IF_ERROR(RunTfTrt(meta_graph, input_names, output_names, + rewriter_config, &segmented_graph_def)); + + GraphDef output; + + if (!inputs.empty() && conv_params.convert_to_static_engine) { + // The TRTOptimization pass has inserted placeholder TRTEngineOps. Here we + // trigger conversion by inferring the graph. + std::unique_ptr session( + tensorflow::NewSession(GetSessionConfg())); + if (!session) { + return errors::Internal("Failed to create build session"); + } + + TF_RETURN_IF_ERROR(Build(segmented_graph_def, input_names, output_names, + inputs, session.get(), conv_params)); + + TF_RETURN_IF_ERROR( + ConvertToStaticEngine(segmented_graph_def, &output, session.get())); + } else { + output = segmented_graph_def; + } + VLOG(1) << "TF-TRT conversion finished"; + return output; +} + +Status InlineFunctions(const MetaGraphDef& meta_graph_def, + GraphDef* out_graph_def) { + ConfigProto config_proto; + auto opt_config = + config_proto.mutable_graph_options()->mutable_rewrite_options(); + + opt_config->set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); + opt_config->set_min_graph_nodes(-1); // do not skip small graphs + opt_config->add_optimizers("function"); + + TF_RETURN_IF_ERROR(RunGrappler(meta_graph_def, {}, {}, config_proto, nullptr, + out_graph_def)); + + VLOG(2) << "Graph is inlined"; + return Status::OK(); +} + +// Freezes the graph. It is assumed that the functions are inlined and the +// variables are initialized. 
+Status FreezeGraph(SavedModelBundle& bundle, MetaGraphDef* frozen_meta_graph) { + std::unordered_set inputs; + std::unordered_set outputs; + GraphDef frozen_graph_def; + TF_RETURN_IF_ERROR( + FreezeSavedModel(bundle, &frozen_graph_def, &inputs, &outputs)); + + *frozen_meta_graph = bundle.meta_graph_def; + GraphDef* gdef = frozen_meta_graph->mutable_graph_def(); + *gdef = frozen_graph_def; + + VLOG(2) << "Graph frozen"; + return Status::OK(); +} + +// Returns the name of nodes listed in the signature definition. +std::vector GetNodeNames( + const google::protobuf::Map& signature) { + std::vector names; + for (auto const& item : signature) { + absl::string_view name = item.second.name(); + // Remove tensor suffix like ":0". + size_t last_colon = name.find_last_of(':'); + if (last_colon != absl::string_view::npos) { + name.remove_suffix(name.size() - last_colon); + } + names.push_back(std::string(name)); + } + return names; +} + +::stream_executor::port::StatusOr ConvertAndBuild( + SavedModelBundle* bundle, const std::string& signature_key, + const std::vector>& inputs, + const TfTrtConversionParams& conversion_params) { + // Inline the functions. + GraphDef inlined_graph_def; + TF_RETURN_IF_ERROR( + InlineFunctions(bundle->meta_graph_def, &inlined_graph_def)); + + // Replace the graph_def with the inlined graph. Note that bundle->session + // still has the original graph. + *bundle->meta_graph_def.mutable_graph_def() = inlined_graph_def; + + // Freeze variables. + MetaGraphDef frozen_meta_graph; + TF_RETURN_IF_ERROR(FreezeGraph(*bundle, &frozen_meta_graph)); + + // Convert. + auto signature_map = bundle->GetSignatures(); + const tensorflow::SignatureDef& signature = signature_map[signature_key]; + std::vector input_names = GetNodeNames(signature.inputs()); + std::vector output_names = GetNodeNames(signature.outputs()); + return ConvertAndBuild(frozen_meta_graph.graph_def(), input_names, + output_names, inputs, conversion_params); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api.h b/tensorflow/compiler/tf2tensorrt/trt_convert_api.h new file mode 100644 index 00000000000..24240b802c0 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api.h @@ -0,0 +1,129 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ + +#include +#include +#include + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { + +struct SavedModelBundle; + +namespace tensorrt { + +struct TfTrtConversionParams { + // Corresponds 'workspaceSize' parameter of + // nvinfer1::IBuilderConfig::setMaxWorkspaceSize. +#if IS_TRT_VERSION_GE(8, 4, 0, 0) + // Must use `LLONG_MAX - 512` to avoid overflow during casting. + size_t max_workspace_size_bytes = LLONG_MAX - 512; +#else + size_t max_workspace_size_bytes = 1 << 30; // 1,073,741,824 +#endif + + // Minimum precision used by the TRT Engine. + TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; + + // The minimum number of nodes required for a subgraph to be replaced by + // TRTEngineOp. Note that many small TRT subgraphs could be detrimental for + // performance, increasing the minimum segment size can help avoid the + // problem. + int minimum_segment_size = 3; + + // Max number of cached TRT engines for dynamic TRT ops (by default we have + // dynamic TRT ops). + int max_cached_engines = 1; + + // Note that calibration is currently not implemented with the C++ converter. + // This argument is ignored if precision_mode is not INT8. If set to True, the + // implementation will use the user provided inputs to generate calibration + // data. If set to False, quantization nodes will be expected for every tensor + // in the graph (excluding those which will be fused). If a range is missing, + // an error will occur. Please note that accuracy may be negatively affected + // if there is a mismatch between which tensors TRT quantizes and which + // tensors were trained with fake quantization. + bool use_calibration = true; + + // Whether to enable dynamic shape mode for the TRT engines. It is + // recommended to use_dynamic_shape mode to handle dynamic input shape. + // Enabling dynamic shape mode can also improve the conversion rate of graphs + // with static input shape. + bool use_dynamic_shape = true; + + // In dynamic shape mode we create an engine that can handle various input + // shape ranges. We derive the shape optimization profiles for the TRT engines + // in the graph based on user provided input data and profile_strategy. + ProfileStrategy profile_strategy = ProfileStrategy::kRange; + + // Whether to allow bulding TRT engines at runtime. If no TensorRT engine can + // be found in cache that can handle the given inputs during runtime, then a + // new TensorRT engine is built at runtime if allow_build_at_runtime=True, + // otherwise native TF is used. We recommend to set this value false and build + // the engine in advance, to avoid runtime overhead. + bool allow_build_at_runtime = true; + + // Record the TRT engine as an attribute of the TRTEngineOp. This is only + // valid when max_cached_engines == 1. Note: the frozen graph together with + // the serialized engines have to be below 2GiB (protobuf size limit). If + // convert_to_static_engine = false, then the converted graph_def only + // contains placeholder TRTEngineOp nodes. 
+ bool convert_to_static_engine = true; +}; + +/** + * Converts the graph with TF-TRT. + * + * Performs TF-TRT conversion and returns the converted GraphDef. If inputs is + * not empty and convert_to_static_engine is requested, we also build the + * engines and convert the engines to static engines. + * + * Arguments: + * - frozen_graph_def input graph, it is assumed to be frozen + * - input_names names of the input tensors + * - output_names names of the output tensors + * - inputs tensors that we will use as input while building the TRT engines + * - conv_params parameters for the TF-TRT conversion + * + * Returns the converted graph_def. + */ +::stream_executor::port::StatusOr ConvertAndBuild( + const GraphDef& frozen_graph_def, const std::vector& input_names, + const std::vector& output_names, + const std::vector>& inputs, + const TfTrtConversionParams& conv_params); + +::stream_executor::port::StatusOr ConvertAndBuild( + SavedModelBundle* bundle, + const std::string& signature_key = "serving_default", + const std::vector>& inputs = {}, + const TfTrtConversionParams& conversion_params = TfTrtConversionParams()); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_API_H_ diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc b/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc new file mode 100644 index 00000000000..bcfbb2adab7 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc @@ -0,0 +1,358 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h" + +#include "tensorflow/cc/ops/resource_variable_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/state_ops.h" +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace tensorrt { + +struct TestParam { + TfTrtConversionParams conv_params; + std::vector> input_shapes; +}; + +class TrtConverterTest + : public ::testing::TestWithParam> { + protected: + TrtConverterTest() { + param_ = std::get<0>(GetParam()); + use_variable_ = std::get<1>(GetParam()); + use_function_ = std::get<2>(GetParam()); + input_tensors_ = GetInputTensors(); + } + + // Returns the following graph: output = input * [42, 137] + input + GraphDef GetGraphDef(PartialTensorShape input_shape) { + Scope root = Scope::NewRootScope(); + Output c; + c = ops::Const(root.WithOpName("my_const"), {{42.0f, 137.0f}}); + Output v; + if (use_variable_) { + Output v_handle = ops::VarHandleOp(root.WithOpName("my_var"), + DataType::DT_FLOAT, {1, 2}); + v = ops::ReadVariableOp(root.WithOpName("my_var/Read/ReadVariableOp"), + v_handle, DataType::DT_FLOAT); + auto v_init = + ops::AssignVariableOp(root.WithOpName("my_var/init"), v_handle, c); + } else { + v = c; + } + const auto attrs = ops::Placeholder::Shape(input_shape); + auto x = ops::Placeholder(root.WithOpName("input"), DT_FLOAT, attrs); + auto y = ops::Mul(root.WithOpName("my_mul"), x, v); + auto z = ops::Add(root.WithOpName("my_add"), x, y); + auto q = ops::Identity(root.WithOpName("output"), z); + + GraphDef out; + TF_CHECK_OK(root.ToGraphDef(&out)); + return out; + } + + GraphDef GetGraphWithFunction(PartialTensorShape input_shape) { + using ::tensorflow::test::function::GDef; + using ::tensorflow::test::function::NDef; + const Tensor kOne = test::AsScalar(1.0f); + TensorShapeProto value_shape_proto; + kOne.shape().AsProto(&value_shape_proto); + TensorShapeProto input_shape_proto; + input_shape.AsProto(&input_shape_proto); + NodeDef value_node; + if (use_variable_) { + value_node = + NDef("my_value", "Identity", {"my_var:0"}, {{"T", DT_RESOURCE}}); + } else { + value_node = + NDef("my_value", "Identity", {"my_const:0"}, {{"T", DT_FLOAT}}); + } + GraphDef gdef = GDef( + { + NDef("input", "Placeholder", {}, + {{"dtype", DT_FLOAT}, {"shape", input_shape_proto}}), + NDef("my_const", "Const", {}, + {{"dtype", DT_FLOAT}, {"value", kOne}}), + value_node, + NDef("call", "StatefulPartitionedCall", {"input", "my_value"}, + {{"Tin", DataTypeSlice{DT_FLOAT, use_variable_ ? 
DT_RESOURCE + : DT_FLOAT}}, + {"Tout", DataTypeSlice{DT_FLOAT}}, + {"f", FunctionDefHelper::FunctionRef("f", {})}}), + NDef("output", "Identity", {"call:0"}, {{"T", DT_FLOAT}}), + }, + {}); + FunctionDef fdef; + if (use_variable_) { + *gdef.add_node() = + NDef("my_var", "VarHandleOp", {}, + {{"dtype", DT_FLOAT}, {"shape", value_shape_proto}}); + + *gdef.add_node() = NDef("my_var/init", "AssignVariableOp", + {"my_var", "my_const"}, {{"dtype", DT_FLOAT}}); + *gdef.add_node() = NDef("my_var/Read/ReadVariableOp", "ReadVariableOp", + {"my_var"}, {{"dtype", DT_FLOAT}}); + // Define function f(x, v) = x * v + x, where v is a variable. + fdef = FunctionDefHelper::Define( + "f", // Name + {"x: float", "v: resource"}, // Args + {"q: float"}, // Returns + {}, // Attr def + // Nodes + {{{"my_var/Read/ReadVariableOp"}, + "ReadVariableOp", + {"v"}, + {{"dtype", DT_FLOAT}}}, + {{"my_mul"}, + "Mul", + {"x", "my_var/Read/ReadVariableOp"}, + {{"T", DT_FLOAT}}}, + {{"my_add"}, "AddV2", {"x", "my_mul"}, {{"T", DT_FLOAT}}}, + {{"q"}, "Identity", {"my_add"}, {{"T", DT_FLOAT}}}}); + } else { + // Define function f(x, v) = x * v + x, where v is const value. + fdef = FunctionDefHelper::Define( + "f", // Name + {"x: float", "v: float"}, // Args + {"q: float"}, // Returns + {}, // Attr def + // Nodes + {{{"my_mul"}, "Mul", {"x", "v"}, {{"T", DT_FLOAT}}}, + {{"my_add"}, "AddV2", {"x", "my_mul"}, {{"T", DT_FLOAT}}}, + {{"q"}, "Identity", {"my_add"}, {{"T", DT_FLOAT}}}}); + } + *gdef.mutable_library()->add_function() = fdef; + + return gdef; + } + + // Returns the following graph: output = input * [42, 137] + input + MetaGraphDef GetModel() { + PartialTensorShape shape({-1, 2}); + MetaGraphDef out; + if (use_function_) { + *(out.mutable_graph_def()) = GetGraphWithFunction(shape); + } else { + *(out.mutable_graph_def()) = GetGraphDef(shape); + } + VLOG(2) << out.graph_def().DebugString(); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + SignatureDef signature_def; + (*signature_def.mutable_inputs())["input"].set_name("input:0"); + (*signature_def.mutable_inputs())["input"].set_dtype(DT_FLOAT); + *(*signature_def.mutable_inputs())["input"].mutable_tensor_shape() = + shape_proto; + (*signature_def.mutable_outputs())["output"].set_name("output:0"); + (*signature_def.mutable_outputs())["output"].set_dtype(DT_FLOAT); + *(*signature_def.mutable_outputs())["output"].mutable_tensor_shape() = + shape_proto; + (*out.mutable_signature_def())["serving_default"] = signature_def; + + VLOG(2) << signature_def.DebugString(); + return out; + } + + Status GetSavedModelBundle(SavedModelBundle* bundle) { + bundle->meta_graph_def = GetModel(); + Session* session = nullptr; + TF_RETURN_IF_ERROR(NewSession(tensorflow::SessionOptions(), &session)); + TF_RETURN_IF_ERROR(session->Create(bundle->meta_graph_def.graph_def())); + bundle->session.reset(session); + TF_RETURN_IF_ERROR(session->Run(/* inputs */ {}, /*outputs*/ {}, + /*targets*/ {"my_var/init"}, nullptr)); + return Status::OK(); + } + + // Confirms that we have a TRT node with the correct attributes. 
+ void CheckTrtNode(const GraphDef& converted_graph_def) { + int n_trt_ops = 0; + string op_name{"TRTEngineOp"}; + for (const auto& node : converted_graph_def.node()) { + if (!op_name.compare(node.op())) { + n_trt_ops++; + const auto& attr = node.attr(); + EXPECT_EQ(attr.at("static_engine").b(), + param_.conv_params.convert_to_static_engine); + if (param_.conv_params.convert_to_static_engine) { + VLOG(2) << "Found serialized segment with size " + << attr.at("serialized_segment").s().size(); + EXPECT_GT(attr.at("serialized_segment").s().size(), 0); + } + } + } + EXPECT_EQ(n_trt_ops, 1); + } + + // Creates a list of input tensors, they will be used to build the engines. + std::vector> GetInputTensors() { + std::vector> input_tensors; + for (const std::vector& shape : param_.input_shapes) { + Tensor tensor(DT_FLOAT, TensorShape(shape)); + test::FillIota(&tensor, 1.0f); + input_tensors.push_back({tensor}); + } + return input_tensors; + } + + void RunAndCompareResults(Session* session, + const GraphDef& converted_graph_def) { + // Create a session to execute the converted graph. + Session* p_session = nullptr; + TF_EXPECT_OK(NewSession(SessionOptions(), &p_session)); + std::unique_ptr trt_session(p_session); + TF_EXPECT_OK(trt_session->Create(converted_graph_def)); + + // Run models and compare the output. + for (const std::vector& input : input_tensors_) { + std::vector outputs; + TF_EXPECT_OK( + session->Run({{"input", input.at(0)}}, {"output"}, {}, &outputs)); + std::cout << outputs.at(0).DebugString() << std::endl; + + std::vector trt_outputs; + TF_EXPECT_OK(trt_session->Run({{"input", input.at(0)}}, {"output"}, {}, + &trt_outputs)); + std::cout << trt_outputs.at(0).DebugString() << std::endl; + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(trt_outputs.size(), 1); + tensorflow::test::ExpectEqual(outputs[0], trt_outputs[0]); + } + } + + void ConvertAndRunFrozenGraph() { + MetaGraphDef meta_graph_def = GetModel(); + + ::stream_executor::port::StatusOr result = tensorrt::ConvertAndBuild( + meta_graph_def.graph_def(), {"input"}, {"output"}, input_tensors_, + param_.conv_params); + TF_ASSERT_OK(result.status()); + const GraphDef& converted_graph_def = result.value(); + CheckTrtNode(converted_graph_def); + + // Create a session to execute the original graph. + Session* p_session = nullptr; + TF_EXPECT_OK(NewSession(SessionOptions(), &p_session)); + std::unique_ptr session(p_session); + TF_EXPECT_OK(session->Create(meta_graph_def.graph_def())); + + RunAndCompareResults(session.get(), converted_graph_def); + } + + void ConvertAndRunSavedModel() { + SavedModelBundle bundle; + TF_CHECK_OK(GetSavedModelBundle(&bundle)); + + ::stream_executor::port::StatusOr result = tensorrt::ConvertAndBuild( + &bundle, "serving_default", input_tensors_, param_.conv_params); + TF_ASSERT_OK(result.status()); + const GraphDef& converted_graph_def = result.value(); + CheckTrtNode(converted_graph_def); + + RunAndCompareResults(bundle.GetSession(), converted_graph_def); + } + + TestParam param_; + bool use_variable_; + bool use_function_; + std::vector> input_tensors_; +}; + +INSTANTIATE_TEST_CASE_P( + TrtConverterTestInstantiation, TrtConverterTest, + ::testing::Combine( + ::testing::Values( + // Dynamic shape mode test with conver_to_static_engine=true. 
+ TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP32, + 3, // minimum_segment_size + 1, // max_cached_engines + false, // use_calibration + true, // use_dynamic_shape + ProfileStrategy::kOptimal, + true, // allow_build_at_runtime + true // convert_to_static_engine + }, + {{1, 2}, {4, 2}}}, + // Implicit batch mode test with conver_to_static_engine=true. + TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP16, + 3, // minimum_segment_size + 1, // max_cached_engines + false, // use_calibration + false, // use_dynamic_shape + ProfileStrategy::kRange, + true, // allow_build_at_runtime + true // convert_to_static_engine + }, + {{1, 2}}}, + // Dynamic shape mode test convert_to_static_engine=false: we cannot + // save the engines, therefore we do not generate profiles. A single + // engine will be built during runtime, with profile that matches + // the first shape ({1,2}). The second shape will run as native + // segment. + TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP32, + 3, // minimum_segment_size + 1, // max_cached_engines + false, // use_calibration + true, // use_dynamic_shape + ProfileStrategy::kOptimal, + true, // allow_build_at_runtime + false // convert_to_static_engine + }, + {{1, 2}, {4, 2}}}, + // Implicit batch mode test with convert_to_static_engine=false. + // We will have two engines in the cache to handle the two shapes. + TestParam{TfTrtConversionParams{ + 1 << 20, // max workspace size + TrtPrecisionMode::FP16, + 3, // minimum_segment_size + 2, // max_cached_engines + false, // use_calibration + false, // use_dynamic_shape + ProfileStrategy::kRange, + true, // allow_build_at_runtime + false // convert_to_static_engine + }, + {{1, 2}, {4, 2}}}), + ::testing::Values(false, true), // use_variables + ::testing::Values(false, true))); // use_function + +TEST_P(TrtConverterTest, Basic) { + if (use_variable_) { + ConvertAndRunSavedModel(); + } else { + ConvertAndRunFrozenGraph(); + } +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc index c76efd813b1..01fc982c573 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc @@ -15,7 +15,11 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" +#include + #if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInfer.h" #endif @@ -25,19 +29,19 @@ namespace tensorrt { bool IsGoogleTensorRTEnabled() { #if GOOGLE_CUDA && GOOGLE_TENSORRT +#if TF_USE_TENSORRT_STATIC + LOG(INFO) << "TensorRT libraries are statically linked, skip dlopen check"; + return true; +#else // TF_USE_TENSORRT_STATIC auto handle_or = se::internal::DsoLoader::TryDlopenTensorRTLibraries(); if (!handle_or.ok()) { - LOG(WARNING) << "Cannot dlopen some TensorRT libraries. 
If you would like " - "to use Nvidia GPU with TensorRT, please make sure the " - "missing libraries mentioned above are installed properly."; - return false; - } else { - LOG(INFO) << "TensorRT is enabled."; - return true; + LOG_WARNING_WITH_PREFIX << "Could not find TensorRT"; } -#else + return handle_or.ok(); +#endif // TF_USE_TENSORRT_STATIC +#else // GOOGLE_CUDA && GOOGLE_TENSORRT return false; -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT } void GetLinkedTensorRTVersion(int* major, int* minor, int* patch) { @@ -66,5 +70,14 @@ void GetLoadedTensorRTVersion(int* major, int* minor, int* patch) { #endif } +std::vector GetRegisteredOpConverters() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + auto* registry = tensorflow::tensorrt::convert::GetOpConverterRegistry(); + return registry->ListRegisteredOps(); +#else + return {"undef"}; +#endif +} + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h index f52bb6f1bad..60a0d78cee8 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_ +#include +#include + namespace tensorflow { namespace tensorrt { @@ -27,6 +30,8 @@ void GetLinkedTensorRTVersion(int* major, int* minor, int* patch); // Return runtime time TensorRT library version information {Maj, Min, Patch}. void GetLoadedTensorRTVersion(int* major, int* minor, int* patch); +std::vector GetRegisteredOpConverters(); + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.i b/tensorflow/compiler/tf2tensorrt/utils/py_utils.i index d6e8eac5836..1784f5a2a00 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.i +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.i @@ -83,4 +83,4 @@ version_struct get_linked_tensorrt_version(); version_struct get_loaded_tensorrt_version(); bool is_tensorrt_enabled(); -%rename("%s") ""; +%rename("%s") ""; \ No newline at end of file diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc new file mode 100644 index 00000000000..ea597383531 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11/stl.h" // from @pybind11 +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" + +std::tuple get_linked_tensorrt_version() { + return tensorflow::tensorrt::GetLinkedTensorRTVersion(); +} + +std::tuple get_loaded_tensorrt_version() { + return tensorflow::tensorrt::GetLoadedTensorRTVersion(); +} + +PYBIND11_MODULE(_pywrap_py_utils, m) { + m.doc() = "_pywrap_py_utils: Various TensorRT utilities"; + m.def("get_linked_tensorrt_version", get_linked_tensorrt_version, + "Return the compile time TensorRT library version as the tuple " + "(Major, Minor, Patch)."); + m.def("get_loaded_tensorrt_version", get_loaded_tensorrt_version, + "Return the runtime time TensorRT library version as the tuple " + "(Major, Minor, Patch)."); + m.def("is_tensorrt_enabled", tensorflow::tensorrt::IsGoogleTensorRTEnabled, + "Returns True if TensorRT is enabled."); + m.def("get_registered_op_converters", + tensorflow::tensorrt::GetRegisteredOpConverters, + "Return a list of registered op converters by operation name"); +} diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index f5b2a27f6ce..91b35a18378 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -17,11 +17,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -52,8 +50,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) { } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -70,11 +67,20 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, // TODO(aaroey): AllocateRaw takes size_t size as input, so it'll produce // unexpected result when TRT tries to allocate more bytes than size_t can // carry. Fix this. - void* mem = allocator_->AllocateRaw(alignment, total_size); + // + // Fail immediately if allocation fails, rather than waiting 10 seconds and + // failing then anyway. + // TensorRT 7 can also switch to a different algorithm for a layer if an + // algorithm uses too much memory. If we don't fail immediately building the + // engine can be *very* slow with TensorRT7 when GPU memory is limited. 
+ AllocationAttributes attributes; + attributes.no_retry_on_failure = true; + void* mem = allocator_->AllocateRaw(alignment, total_size, attributes); if (!mem) return nullptr; void* alloc_mem = mem; QCHECK(Align(alignment, size, mem, total_size)); + mutex_lock lock(mu_); if (mem != alloc_mem) { QCHECK(mem_map_.insert({mem, alloc_mem}).second); } @@ -90,6 +96,7 @@ TRTDeviceAllocator::TRTDeviceAllocator(Allocator* allocator) } void TRTDeviceAllocator::free(void* memory) noexcept { + mutex_lock lock(mu_); VLOG(2) << "Deallocating @ " << memory; // allocated memory adjusted for alignment, restore the original pointer if (memory) { @@ -105,5 +112,4 @@ void TRTDeviceAllocator::free(void* memory) noexcept { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h index 753e2e3f87d..2812aa06457 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h @@ -19,12 +19,11 @@ limitations under the License. #include #include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -33,8 +32,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space); } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -56,19 +54,20 @@ class TRTDeviceAllocator : public TRTBaseAllocator { virtual ~TRTDeviceAllocator() { VLOG(1) << "Destroying allocator attached to " << allocator_->Name(); } - void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) noexcept override; + void* allocate(uint64_t size, uint64_t alignment, + uint32_t flags) noexcept override; void free(void* memory) noexcept override; private: + mutex mu_; Allocator* allocator_; // supporting alignment from allocation request requires a map to free; - std::unordered_map mem_map_; + std::unordered_map mem_map_ TF_GUARDED_BY(mu_); }; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc new file mode 100755 index 00000000000..cab00a036a8 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc @@ -0,0 +1,286 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" + +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +using absl::StrCat; + +ExecutionContext ExecutionContext::Create(nvinfer1::ICudaEngine* cuda_engine) { + bool has_int32_output = false; + for (int i = 0; i < cuda_engine->getNbBindings(); i++) { + if (!cuda_engine->bindingIsInput(i) && + cuda_engine->getBindingDataType(i) == nvinfer1::DataType::kINT32) { + has_int32_output = true; + break; + } + } + if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && has_int32_output) { + // TODO(nvbugs/3390469): Remove this workaround when the bug is fixed. + nvinfer1::IExecutionContext* execution_context = + cuda_engine->createExecutionContext(); + return ExecutionContext(execution_context, true); + } + + nvinfer1::IExecutionContext* execution_context = + cuda_engine->createExecutionContextWithoutDeviceMemory(); + return ExecutionContext(execution_context, false); +} + +Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, + const nvinfer1::IExecutionContext* execution_context, + int binding_index, bool use_implicit_batch, + int batch_size, TensorShape& shape) { + tensorflow::profiler::TraceMe activity( + "getBindingDimensions", tensorflow::profiler::TraceMeLevel::kInfo); + nvinfer1::Dims dims = + use_implicit_batch + ? cuda_engine->getBindingDimensions(binding_index) + : execution_context->getBindingDimensions(binding_index); + if (!use_implicit_batch) { + if (dims.nbDims == -1) { + return errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile."); + } + } + TF_RETURN_IF_ERROR(DimsAdapter(dims).TensorShape( + &shape, + use_implicit_batch ? 
absl::optional(batch_size) : absl::nullopt)); + return Status::OK(); +} + +Status SetupBindings(nvinfer1::ICudaEngine* cuda_engine, const Tensor& tensor, + std::vector& buffers, int binding_index) { + tensorflow::profiler::TraceMe activity( + "SetBindingPointers", tensorflow::profiler::TraceMeLevel::kInfo); + const auto dtype = cuda_engine->getBindingDataType(binding_index); + VLOG(2) << "<<<<<<<<< SetupBindings with dtype = " << (int)dtype; + switch (dtype) { + case nvinfer1::DataType::kFLOAT: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; + case nvinfer1::DataType::kHALF: + buffers[binding_index] = + const_cast(tensor.flat().data()); + break; + case nvinfer1::DataType::kINT8: + return errors::Internal("INT8 inputs are not supported yet!"); + case nvinfer1::DataType::kINT32: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; +#if IS_TRT_VERSION_GE(8, 2, 0, 0) + case nvinfer1::DataType::kBOOL: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; +#endif +#if IS_TRT_VERSION_GE(8, 5, 0, 0) + case nvinfer1::DataType::kUINT8: + buffers[binding_index] = const_cast(tensor.flat().data()); + break; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + return errors::Internal("FP8 inputs are not supported yet!"); +#endif + default: + return errors::Internal("Unknown TRT data type: ", + static_cast(dtype)); + } + return Status::OK(); +} + +// Sets up bindings. +Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + const int trt_profile_idx, + std::vector& buffers, bool use_implicit_batch, + int num_batch, + const TrtShapeOptimizationProfile& profiles, + OpKernelContext* ctx, const DataVec* input_vec) { + tensorflow::profiler::TraceMe activity( + "SetTrtEngineInputs", tensorflow::profiler::TraceMeLevel::kInfo); + int n_inputs = ctx ? ctx->num_inputs() : (input_vec ? input_vec->size() : 0); + // Setup engine inputs. + for (int i = 0; i < n_inputs; i++) { + const Tensor& input_tensor = ctx ? ctx->input(i) : input_vec->at(i).tensor; + const TensorShape& input_shape = input_tensor.shape(); + + // Skip resource inputs. + if (input_tensor.dtype() == DataType::DT_RESOURCE) { + continue; + } + + const string input_name = + ctx ? StrCat(IONamePrefixes::kInputPHName, i) : input_vec->at(i).name; + int binding_index; + Status status = GetTrtBindingIndex(input_name.c_str(), trt_profile_idx, + cuda_engine, &binding_index); + if (IS_TRT_VERSION_GE(8, 0, 0, 0)) { + TF_RETURN_IF_ERROR(status); + } else if (!status.ok()) { + // Before TRT 8, an input tensor can be pruned if it is not used by the + // network (e.g. only its shape is used, but the shape is already defined + // by the optimization profile by setting min=max). nvbugs/3153064 + VLOG(2) << "Skipping pruned input " << input_name; + continue; + } + + if (use_implicit_batch && ctx) { + // Ensure all inputs have the same batch size + if (num_batch != input_shape.dim_size(0)) { + const string msg = + StrCat("Input data has inconsistent batch size: ", num_batch, + " vs ", input_shape.dim_size(0)); + return errors::NotFound(msg); + } + } + // Set known input dimensions. This is necessary because TRT network + // could be made with dynamic dimensions. 
+ if (!use_implicit_batch) { + TF_RETURN_IF_ERROR(profiles.SetInputShapeBinding( + i, binding_index, cuda_engine, execution_context)); + + if (cuda_engine->isExecutionBinding(binding_index)) { + tensorflow::profiler::TraceMe activity( + "SetTrtEngineInputs::setBindingDimensions", + tensorflow::profiler::TraceMeLevel::kInfo); + auto adap = DimsAdapter::Create(input_shape); + TRT_ENSURE_OK(adap); + nvinfer1::Dims trt_dims = adap.ValueOrDie().AsTrtDims(); + if (execution_context->getBindingDimensions(binding_index) != + trt_dims) { + VLOG(2) << "Setting binding dimensions for idx " << binding_index; + bool ret = + execution_context->setBindingDimensions(binding_index, trt_dims); + if (!ret) { + VLOG(2) << "Error setting engine input " << binding_index << " " + << DebugString(trt_dims); + return errors::Internal( + "Binding dimension does not fit selected profile."); + } + } + } + } + // Setup input bindings. + TF_RETURN_IF_ERROR( + SetupBindings(cuda_engine, input_tensor, buffers, binding_index)); + } + + // Ensure all network dynamic dimensions (if any) are set in execution + // context. + if (!execution_context->allInputDimensionsSpecified()) { + return errors::Internal( + "Failed to set dimensions for all dynamic input tensors"); + } + if (!execution_context->allInputShapesSpecified()) { + return errors::Internal( + "Failed to set dimensions for all shape input tensors."); + } + return Status::OK(); +} + +Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + int trt_profile_idx, std::vector& buffers, + bool use_implicit_batch, int batch_size, + OpKernelContext* ctx, DataVec* outputs) { + tensorflow::profiler::TraceMe activity( + "SetTrtEngineOutputs", tensorflow::profiler::TraceMeLevel::kInfo); + // Either one of ctx or outpus should be specified + int n_outputs = ctx ? ctx->num_outputs() : (outputs ? outputs->size() : 0); + for (int i = 0; i < n_outputs; i++) { + const string output_name = + ctx ? StrCat(IONamePrefixes::kOutputPHName, i) : outputs->at(i).name; + int binding_index; + TF_RETURN_IF_ERROR(GetTrtBindingIndex(output_name.c_str(), trt_profile_idx, + cuda_engine, &binding_index)); + + // Get TRT output shapes for allocating output memory. + TensorShape output_shape; + TF_RETURN_IF_ERROR(GetTrtBindingShape(cuda_engine, execution_context, + binding_index, use_implicit_batch, + batch_size, output_shape)); + + // Allocate output tensor of TRTEngineOp. + Tensor* output_tensor = nullptr; + if (ctx) { + tensorflow::profiler::TraceMe activity( + "AllocateOutput", tensorflow::profiler::TraceMeLevel::kInfo); + TF_RETURN_IF_ERROR(ctx->allocate_output(i, output_shape, &output_tensor)); + } else { + // This path is used for unit tests. The tensor is already allocated. + // Its shape is not necessarily set correctly, we fix that. + VLOG(2) << "Applying shape " << output_shape.DebugString() + << " on output."; + output_tensor = &(outputs->at(i).tensor); + bool status = output_tensor->CopyFrom(*output_tensor, output_shape); + if (!status) { + return errors::Internal( + "Buffer size (", output_tensor->NumElements(), + ") do not match while reshaping output tensors to shape ", + output_shape.DebugString()); + } + } + + // Set up output bindings. 
+ TF_RETURN_IF_ERROR( + SetupBindings(cuda_engine, *output_tensor, buffers, binding_index)); + } + return Status::OK(); +} + +Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, + std::vector& buffers, const cudaStream_t* stream, + bool use_implicit_batch, int batch_size) { + tensorflow::profiler::TraceMe activity( + "TrtEnqueue", tensorflow::profiler::TraceMeLevel::kInfo); + bool ret = false; + if (use_implicit_batch) { + ret = execution_context->enqueue(batch_size, &buffers[0], *stream, nullptr); + VLOG(1) << "Called IExecutionContext::enqueue"; + } else { + ret = execution_context->enqueueV2(&buffers[0], *stream, nullptr); + VLOG(1) << "Called IExecutionContext::enqueueV2"; + } + if (!ret) { + return errors::Internal("Failed to enqueue batch for TRT engine"); + } + // Synchronization will be done by TF. + return Status::OK(); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h new file mode 100644 index 00000000000..1eb1d852374 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ + +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +using ::stream_executor::port::StatusOr; + +// Creates a TensorRT execution context. +ExecutionContext CreateExecutionContext(nvinfer1::ICudaEngine* cuda_engine); + +// Sets input buffers for TRT from a list of input tensors. The input tensors +// are either defined by ctx or by input_vec. +Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + const int trt_profile_idx, + std::vector& buffers, bool use_implicit_batch, + int num_batch, + const TrtShapeOptimizationProfile& profiles, + OpKernelContext* ctx = nullptr, + const DataVec* input_vec = nullptr); + +// Returns the shape of a binding from TensorRT. +// +// The binding is identified by its binding_index. The batch_size argument is +// ignored if use_implicit_batch==false. The shape is returned in the last +// argument. 
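+//
+// A minimal usage sketch (variable names are illustrative only):
+//
+//   TensorShape output_shape;
+//   TF_RETURN_IF_ERROR(GetTrtBindingShape(cuda_engine, execution_context,
+//                                         /*binding_index=*/0,
+//                                         /*use_implicit_batch=*/false,
+//                                         /*batch_size=*/0, output_shape));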
+Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, + const nvinfer1::IExecutionContext* execution_context, + int binding_index, bool use_implicit_batch, + int batch_size, TensorShape& shape); + +// Defines output buffers for TRT. The buffers are allocated by ctx, if ctx is +// not null. Otherwise it is expected that the outputs DataVec is not null, and +// the Tensors in outputs are already allocated. +Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* execution_context, + int trt_profile_idx, std::vector& buffers, + bool use_implicit_batch, int batch_size = 0, + OpKernelContext* ctx = nullptr, + DataVec* outputs = nullptr); + +// Enqueues TensorRT inference job. The batch_size argument is only relevant in +// implicit batch mode. +Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, + std::vector& buffers, const cudaStream_t* stream, + bool use_implicit_batch, int batch_size = 1); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h b/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h new file mode 100644 index 00000000000..05b5cefbf94 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXECUTION_CONTEXT_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXECUTION_CONTEXT_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// A wrapper for the TensorRT execution context which will destroy the TensorRT +// execution context when the object goes out of scope. +class ExecutionContext : public TrtUniquePtrType { + public: + ExecutionContext(nvinfer1::IExecutionContext* context, bool has_memory) + : TrtUniquePtrType(context), + has_device_memory_(has_memory) {} + static ExecutionContext Create(nvinfer1::ICudaEngine* cuda_engine); + + bool HasDeviceMemory() { return has_device_memory_; } + + private: + bool has_device_memory_; +}; + +}; // namespace tensorrt +}; // namespace tensorflow +#endif +#endif diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc new file mode 100644 index 00000000000..319ebff642b --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.cc @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace tensorrt { + +bool isExperimentalFeatureActivated(string feature_name) { + string envvar_str; + TF_CHECK_OK( + ReadStringFromEnvVar("TF_TRT_EXPERIMENTAL_FEATURES", "", &envvar_str)); + return envvar_str.find(feature_name) != string::npos; +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h new file mode 100644 index 00000000000..1a502c5f7e7 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h @@ -0,0 +1,31 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +bool isExperimentalFeatureActivated(string feature_name); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_EXPERIMENTAL_FEATURES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc index 25a94502675..f9bf0e0e59d 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { @@ -59,7 +58,7 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map& data, VLOG(1) << "Set Batch Waiting finished"; // Sets the batch. 
- for (const auto it : data) { + for (const auto& it : data) { auto devptr = dev_buffers_.find(it.first); if (devptr == dev_buffers_.end()) { LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first @@ -122,7 +121,8 @@ void TRTInt8Calibrator::waitAndSetDone() { } } -const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) noexcept { +const void* TRTInt8Calibrator::readCalibrationCache( + std::size_t& length) noexcept { if (calibration_table_.empty()) return nullptr; length = calibration_table_.size(); return calibration_table_.data(); @@ -147,5 +147,4 @@ TRTInt8Calibrator::~TRTInt8Calibrator() { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h index d7a3df7ac1e..2fa22662521 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h @@ -20,10 +20,10 @@ limitations under the License. #include #include #include + #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -34,12 +34,8 @@ namespace tensorrt { // TRTs pull model for calibration. When TRT implements a means for // a push calibration This class should be updated accordingly -// IInt8EntropyCalibrator2 is prefferred for TRT 5.1+. -#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1) +// IInt8EntropyCalibrator2 is preferred for TRT 5.1+. struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { -#else -struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { -#endif public: // Construct a calibrator for future calibration. TRTInt8Calibrator( @@ -72,7 +68,8 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { // If not null, calibration is skipped. const void* readCalibrationCache(std::size_t& length) noexcept override; - void writeCalibrationCache(const void* ptr, std::size_t length) noexcept override; + void writeCalibrationCache(const void* ptr, + std::size_t length) noexcept override; const string& getCalibrationTableAsString() { return calibration_table_; } @@ -101,6 +98,5 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc index 7e72204604e..69e66038661 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc @@ -15,26 +15,72 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_experimental_features.h" #include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace tensorrt { +bool filter_string(string msg) { + // This function checks for known substrings that shall be ignored. 
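+  // A message containing any of the substrings below is dropped before it
+  // reaches TF logging. The filtering can be disabled by adding
+  // "disable_logger_filtering" to TF_TRT_EXPERIMENTAL_FEATURES (see
+  // Logger::log below).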
+ + static const std::vector substr_patterns{ + // Automatic messages generated by TensorRT when combined with + // Automatic Mixed Precision - TensorRT 8.2 + "Missing scale and zero-point for", + "Subnormal FP16 values detected", + "If this is not the desired behavior, please modify the weights", + "had the following issues when converted to FP16", + "Values less than smallest positive FP16 Subnormal value detected.", + // Deprecation Warnings + "The implicit batch dimension mode has been deprecated.", + "The getMaxBatchSize() function should not be used with an engine built", + // Input-Warnings + "[RemoveDeadLayers] Input Tensor input is unused or used only at", + "Unused Input:", + // Data Type Warnings + "Tensor DataType is determined at build time for tensors not marked as", + // Myelin Performance Warning in dynamic shape mode + "Myelin graph with multiple dynamic values may have poor performance", + "(# 0 (SHAPE", + "CUDA lazy loading is not enabled. Enabling it can significantly reduce", + }; + + for (int i = 0; i < substr_patterns.size(); i++) { + std::size_t is_found = msg.find(substr_patterns[i]); + if (is_found != string::npos) { + return true; + } + } + return false; +} + // Use TF logging for TensorRT informations void Logger::log(Severity severity, const char* msg) noexcept { + static const bool filter_messages = []() { + return !isExperimentalFeatureActivated("disable_logger_filtering"); + }(); + + if (filter_messages && filter_string(msg)) return; + + if (!isValidSeverity(severity, msg) || suppressedMsg_ & (1 << (int)severity)) + return; + // Suppress info-level messages switch (severity) { -#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1) case Severity::kVERBOSE: -#endif case Severity::kINFO: { // Mark TRT info messages as debug! VLOG(2) << name_ << " " << msg; break; } case Severity::kWARNING: { - LOG(WARNING) << name_ << " " << msg; + LOG_WARNING_WITH_PREFIX << name_ << " " << msg; break; } case Severity::kERROR: { @@ -45,21 +91,42 @@ void Logger::log(Severity severity, const char* msg) noexcept { LOG(FATAL) << name_ << " " << msg; break; } - // This is useless for now. But would catch it in future if enum changes. It - // is always good to have default case! 
- default: { - LOG(FATAL) << name_ << "Got unknown severity level " << int(severity) - << " from TensorRT: " << msg; - break; - } } } + +void Logger::suppressLoggerMsgs(Severity severity) { + if (isValidSeverity(severity)) { + suppressedMsg_ |= 1 << (int)severity; + } +} + +void Logger::unsuppressLoggerMsgs(Severity severity) { + if (isValidSeverity(severity)) { + suppressedMsg_ &= (-1) ^ (1 << (int)severity); + } +} + +bool Logger::isValidSeverity(Severity severity, const char* msg) noexcept { + switch (severity) { + case Severity::kVERBOSE: + case Severity::kINFO: + case Severity::kWARNING: + case Severity::kERROR: + case Severity::kINTERNAL_ERROR: + return true; + } + return false; +} + +// static Logger* Logger::GetLogger() { static Logger* logger = new Logger("DefaultLogger"); return logger; } + +REGISTER_TENSORRT_LOGGER("DefaultLogger", Logger::GetLogger()); + } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h index a9c1e80668a..8002df53e5c 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h @@ -18,8 +18,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -29,17 +28,23 @@ namespace tensorrt { class Logger : public nvinfer1::ILogger { public: Logger(string name = "DefaultLogger") : name_(name) {} - void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override; + void log(nvinfer1::ILogger::Severity severity, + const char* msg) noexcept override; + void suppressLoggerMsgs(nvinfer1::ILogger::Severity severity); + void unsuppressLoggerMsgs(nvinfer1::ILogger::Severity severity); + void unsuppressAllLoggerMsgs() { suppressedMsg_ = 0; } static Logger* GetLogger(); private: - string name_; + bool isValidSeverity(nvinfer1::ILogger::Severity severity, + const char* msg = nullptr) noexcept; + const string name_; + unsigned int suppressedMsg_ = 0; }; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 5ab6bf1a317..30aff91a76d 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -23,8 +23,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -87,15 +86,58 @@ string TRTEngineCacheResource::DebugString() const { for (const auto& item : cache_) { mutex_lock lock(item.second->mu); oss << TensorShapeUtils::ShapeListString(item.first) << ": " << hex - << "ICudaEngine: " << item.second->cuda_engine.get() << ", " - << "IExecutionContext: " << item.second->execution_context.get() << dec - << endl; + << "ICudaEngine: " << item.second->GetCudaEngine() << ", " + << "IExecutionContext: "; + absl::c_for_each( + item.second->execution_contexts, + [&](const ExecutionContext& ctx) { oss << ctx.get() << ","; }); + oss << dec << endl; } return oss.str(); } +EngineContext* TRTEngineCacheResource::GetEngineContext( + const std::vector& input_shapes) { + EngineContext* engine_context = nullptr; + int64 min_matched_batch_size = kint64max; + for (const auto& pair : cache_) { + const std::vector& cached_input_shapes = pair.first; + // This should not happen, but just for safety. + if (input_shapes.size() != cached_input_shapes.size()) { + LOG(ERROR) << "Input shape list size mismatch" + << ", cached size: " << cached_input_shapes.size() + << " vs. input size: " << input_shapes.size(); + } + if (AreShapesCompatible(input_shapes, cached_input_shapes)) { + const int cached_batch_size = cached_input_shapes[0].dim_size(0); + if (min_matched_batch_size > cached_batch_size) { + min_matched_batch_size = cached_batch_size; + engine_context = pair.second.get(); + } + } + } + return engine_context; +} + +EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) { + if (profiles_.NeedProfiles() && profile_id >= profiles_.GetNumProfiles()) { + LOG(ERROR) << "Out of range: profile_id " << profile_id + << " is larger than number of profiles " + << profiles_.GetNumProfiles(); + return nullptr; + } + if (cache_.size() > 1) { + LOG(ERROR) << "Cache is expected to have at most " + << "1 engine in explicit batch mode where profiles are used."; + return nullptr; + } + if (cache_.size() == 0) { + return nullptr; + } + return cache_.begin()->second.get(); +} + } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 8d603ac4d55..5c4a6c1fdd8 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -22,8 +22,10 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/errors.h" @@ -114,31 +116,75 @@ class LRUCache { } }; -// Define a hash function for vector because it is used as the key -// for the engine cache. 
-struct VectorTensorShapeHasher { - std::size_t operator()(const std::vector& key) const { - return std::hash()(TensorShapeUtils::ShapeListString(key)); - } -}; - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT struct EngineContext { EngineContext() {} // Creates an empty context. - EngineContext( - TrtUniquePtrType&& input_cuda_engine, - TrtUniquePtrType&& input_execution_context) - : cuda_engine(std::move(input_cuda_engine)), - execution_context(std::move(input_execution_context)) {} + EngineContext(TrtUniquePtrType&& cuda_engine, + ExecutionContext&& execution_context) + : cuda_engine_(std::move(cuda_engine)) { + execution_contexts.push_back(std::move(execution_context)); + device_memory_size_ = + cuda_engine_ ? cuda_engine_->getDeviceMemorySize() : 0; + } + EngineContext(TrtUniquePtrType&& cuda_engine, + std::vector&& execution_contexts) + : cuda_engine_(std::move(cuda_engine)), + execution_contexts(std::move(execution_contexts)) { + device_memory_size_ = + cuda_engine_ ? cuda_engine_->getDeviceMemorySize() : 0; + } mutex mu; - TrtUniquePtrType cuda_engine; - TrtUniquePtrType execution_context - GUARDED_BY(mu); -}; + nvinfer1::ICudaEngine* GetCudaEngine() { return cuda_engine_.get(); } + + Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx, + bool* has_device_memory) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu) { + if (idx >= execution_contexts.size()) { + return errors::Internal("Requested engine context with index ", idx, + ", but only ", execution_contexts.size(), + "contexts are present."); + } + *exec_ctx = execution_contexts[idx].get(); + *has_device_memory = execution_contexts[idx].HasDeviceMemory(); + return Status::OK(); + } + + int GetNumContexts() { + mutex_lock lock(mu); + return execution_contexts.size(); + } + + size_t GetDeviceMemorySize() { return device_memory_size_; } + + private: + // Note: declaration has to come before execution_contexts, to ensure proper + // order of destruction. + TrtUniquePtrType cuda_engine_; + + public: + // In explicit batch mode, we maintain a vector of contexts for each engine, + // where each context is created for a specific profile. This is because it is + // either not possible or non-trivial to change the profile of a context for + // the following reasons: + // - To switch profiles (from TRT 7), one must first ensure that all inference + // calls in that context are finished. This would require an additional + // synchronization before we call setOptimizationProfile. To avoid this + // extra sync call, we mantain separate execution context for each profile. + // IExecutionContext object is not thread safe: only one thread should use it + // for inference at a time therefore we need a mutex. More details at + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety + // Additional discussion about execution context management and thread safety + // at https://github.com/tensorflow/tensorflow/issues/36959 + std::vector execution_contexts TF_GUARDED_BY(mu); + + private: + // Until TRT 8.4 ICudaEngine::getDeviceMemorySize() has a non-negligible + // latency. Since its value remains constant, we can cache it. + size_t device_memory_size_; +}; // Contains the context required to build the calibration data. class CalibrationContext { public: @@ -148,7 +194,7 @@ class CalibrationContext { std::unordered_map> device_buffers_; // Temporary staging areas for calibration inputs. 
- std::vector device_tensors_; + std::vector device_tensors_; std::unique_ptr calibrator_; TrtUniquePtrType builder_; @@ -158,8 +204,8 @@ class CalibrationContext { private: mutex mu_; - bool terminated_ GUARDED_BY(mu_) = false; - std::string calibration_table_ GUARDED_BY(mu_); + bool terminated_ TF_GUARDED_BY(mu_) = false; + std::string calibration_table_ TF_GUARDED_BY(mu_); }; ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName; @@ -179,6 +225,16 @@ class TRTEngineCacheResource : public ResourceBase { string DebugString() const override; + // Returns the EngineContext that is compatible with input_shapes. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const std::vector& input_shapes); + + // Returns the EngineContext that is compatible with profile_id. + // This function should be only called in explicit batch mode where + // cache size is expected to be at most one. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const int profile_id); + // Keep device allocator for TRT. std::unique_ptr allocator_; @@ -190,10 +246,14 @@ class TRTEngineCacheResource : public ResourceBase { // TODO(hinsu): Use different calibration context for the available shapes and // attach it to each item of the cache. std::unique_ptr calib_ctx_; + + // This object maintains all the optimization profiles during profile + // generation and engine build. During runtime the list of profiles is used to + // look up a matching profile for the input data. + TrtShapeOptimizationProfile profiles_; }; -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc new file mode 100644 index 00000000000..ab9377057f9 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -0,0 +1,664 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" + +#include +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" + +namespace tensorflow { +namespace tensorrt { + +// Returns a vector of nvinfer1::Dims for a vector of TensorShapes. 
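+// The conversion of each shape is delegated to DimsAdapter; a shape that
+// cannot be converted is treated as a fatal error (TF_CHECK_OK). The template
+// parameter is a TensorShape-like type, e.g. TensorShape.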
+template +std::vector GetDimVec(std::vector shape_vec) { + std::vector dimvec(shape_vec.size()); + absl::c_transform(shape_vec, dimvec.begin(), [](TensorShapeType shape) { + auto adap = DimsAdapter::Create(shape); + TF_CHECK_OK(adap.status()); + return adap.ValueOrDie().AsTrtDims(); + }); + return dimvec; +} + +// In dynamic shape mode the optimization profile dims are only allowed to +// differ from the network input dims where the network input dims have -1 +// values. We enforce this condition by changing prof_dims if necessary. +void EnforceCompatibility(nvinfer1::Dims* prof_dims, + const PartialTensorShape& input_shape) { + for (int i = 0; i < input_shape.dims(); i++) { + if (input_shape.dim_size(i) != -1) { + prof_dims->d[i] = input_shape.dim_size(i); + } + } +} + +void SetImplicitBatchModeCompatibleProfile( + const std::vector& dimvec, std::vector* min, + std::vector* opt, std::vector* max) { + *min = dimvec; + for (auto& dim : *min) { + // Shape value tensors can have -1 value as a wildcard. We do not change + // in that case. + if (dim.d[0] != -1) dim.d[0] = 1; // Set min batch size to 1. + } + *opt = dimvec; + *max = dimvec; +} + +void TrtShapeOptimizationProfile::ImplicitBatchModeCompatibleStrategy( + const std::vector>& collected_shapes) { + for (auto& shape_vec : collected_shapes) { + std::vector min, opt, max; + SetImplicitBatchModeCompatibleProfile(shape_vec, &min, &opt, &max); + VLOG(2) << "Initializing optimization profile config with min=" + << DebugString(min) << ", opt=max=" << DebugString(max); + OptimizationProfileConfig profConfig{min, opt, max}; + profiles_.push_back(std::move(profConfig)); + } +} + +// Applies a binary operation for each dimension of the input shapes. +// x[i].d[k] = op(x[i].d[k], y[i].d[k]), where i enumerates the input tensors, +// and k enumerates the dimensions of the tensors. The BinaryOperation may be +// std::min, std::max etc. 
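+// For example, applying std::min element-wise to dims {2, 8} and {4, 3}
+// yields {2, 3}; RangeStrategy below uses this to grow an element-wise
+// [min, max] envelope over all collected shapes.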
+template +Status ShapeProfileBinaryOp(std::vector* x, + const std::vector& y, + BinaryOperation op) { + if (x->size() != y.size()) + return errors::InvalidArgument( + "Number of input tensors differ during profile creation"); + for (int i = 0; i < x->size(); i++) { + if (x->at(i).nbDims != y[i].nbDims) + return errors::InvalidArgument( + "Number of input dimensions differ during profile creation at dim ", + i, ", values ", x->at(i).nbDims, y[i].nbDims); + for (int j = 0; j < x->at(i).nbDims; j++) { + x->at(i).d[j] = op(x->at(i).d[j], y[i].d[j]); + } + } + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::RangeStrategy( + const std::vector>& collected_shapes) { + if (collected_shapes.empty()) return Status::OK(); + + std::vector min = collected_shapes[0]; + std::vector max = min; + + for (int i = 1; i < collected_shapes.size(); i++) { + TF_RETURN_IF_ERROR( + ShapeProfileBinaryOp(&min, collected_shapes[i], + [](int a, int b) { return std::min(a, b); })); + TF_RETURN_IF_ERROR( + ShapeProfileBinaryOp(&max, collected_shapes[i], + [](int a, int b) { return std::max(a, b); })); + } + VLOG(2) << "Initializing optimization profile config with min=" + << DebugString(min) << ", opt=max=" << DebugString(max); + OptimizationProfileConfig profConfig{min, max, max}; + profiles_.push_back(std::move(profConfig)); + return Status::OK(); +} + +void TrtShapeOptimizationProfile::OptimalStrategy( + const std::vector>& collected_shapes) { + for (auto& shape_vec : collected_shapes) { + std::vector min = shape_vec; + std::vector opt = min; + std::vector max = min; + VLOG(2) << "Initializing optimization profile config with min=opt=max=" + << DebugString(min); + OptimizationProfileConfig profConfig{min, opt, max}; + profiles_.push_back(std::move(profConfig)); + } +} + +// Collects the values of tensors that are ShapeTensorCompatible to. The values +// are stored in the actual_shape_values_ member variable. +Status TrtShapeOptimizationProfile::CollectShapeValues(OpKernelContext* ctx) { + tensorflow::profiler::TraceMe activity( + "TrtShapeOptimizationProfile::CollectShapeValues", + tensorflow::profiler::TraceMeLevel::kInfo); + cudaStream_t stream = reinterpret_cast( + CHECK_NOTNULL(ctx->op_device_context() + ->stream() + ->implementation() + ->GpuStreamMemberHack())); + actual_shape_values_.resize(ctx->num_inputs()); + if (is_shape_tensor_.empty()) { + is_shape_tensor_.resize(ctx->num_inputs()); + for (int i = 0; i < ctx->num_inputs(); i++) { + is_shape_tensor_[i] = IsTrtShapeTensorCompatible(ctx->input(i)); + } + } + int n_shape_val = 0; + // First copy all the shape value candidates into actual_shape_values_ vector. + for (int i = 0; i < ctx->num_inputs(); i++) { + if (is_shape_tensor_[i]) { + if (ctx->input_dtype(i) != DT_INT32) { + // In case the is_shape_tensor mask was initialized with the input + // shapes only (without knowledge of dtype) then we apply correction. + is_shape_tensor_[i] = false; + continue; + } + if (input_shape_values_.size() > 0 && + input_shape_values_[0][i].nbDims != ctx->input(i).NumElements()) { + // Shape tensor dims should not change. It must be a value tensor. + is_shape_tensor_[i] = false; + continue; + } + // We have to copy the shape values to the host, because TRT's + // ExecutionContext::setInputShapeBinding expects a host pointer. 
+ n_shape_val++; + const Tensor& input = ctx->input(i); + actual_shape_values_[i].nbDims = input.NumElements(); + auto ret = cudaMemcpyAsync( + actual_shape_values_[i].d, input.flat().data(), + input.NumElements() * sizeof(int32), cudaMemcpyDeviceToHost, stream); + if (ret != 0) { + return errors::Internal("Could not copy shape tensor values"); + } + VLOG(2) << "Input " << i << " is (probably) a shape tensor, n_values=" + << input.NumElements(); + } else { + actual_shape_values_[i] = {0, {}}; + } + } + if (n_shape_val > 0) { + // If we have any shape values candidates, then wait until data is copied + // to host. + cudaStreamSynchronize(stream); + } + return Status::OK(); +} + +// Collects the values of tensors that are ShapeTensorCompatible to. To be used +// for unit tests. +Status TrtShapeOptimizationProfile::CollectShapeValues(const DataVec& input) { + actual_shape_values_.resize(input.size()); + for (int i = 0; i < input.size(); i++) { + if (is_shape_tensor_[i]) { + if (!IsTrtShapeTensorCompatible(input[i].tensor)) { + return errors::Internal("Inconsistent shape tensor ", input[i].name, + ", ", i); + } + int n_elements = input[i].tensor.NumElements(); + actual_shape_values_[i].nbDims = n_elements; + // During unit tests, the data is in unified memory + std::copy(input[i].tensor.flat().data(), + input[i].tensor.flat().data() + n_elements, + actual_shape_values_[i].d); + VLOG(2) << "Collected tensor shape values " + << DebugString(actual_shape_values_[i]); + } else { + actual_shape_values_[i] = {0, {}}; + } + } + return Status::OK(); +} + +// Adjusts shape value profile to prevent TRT from removing shape value input +// bindings whose value is redundant (only a single value matches the profile). +// This should be removed once the NVIDIA bug 3153064 is fixed. +void FixShapeValueProfile(OptimizationProfileConfig* prof, + const std::vector& is_shape_tensor) { + int shape_value_offset = is_shape_tensor.size(); + for (int i = 0; i < is_shape_tensor.size(); i++) { + if (is_shape_tensor[i] && + std::equal(prof->min[shape_value_offset + i].d, + prof->min[shape_value_offset + i].d + + prof->min[shape_value_offset + i].nbDims, + prof->max[shape_value_offset + i].d)) { + prof->max[shape_value_offset + i].d[0]++; + VLOG(2) << "Adjusted profile for shape value tensor " << i << " " + << DebugString(prof->max[shape_value_offset + i]); + } else { + VLOG(2) << i << " is not a shape tensor." << is_shape_tensor[i]; + } + } +} + +// Checks whether rhs is already contained in values. +bool AlreadyCollected(const std::vector>& values, + const std::vector& rhs) { + for (auto& lhs : values) { + bool ret = lhs.size() == rhs.size(); + for (int i = 0; ret && i < lhs.size(); i++) { + ret &= lhs[i].nbDims == rhs[i].nbDims; + for (int j = 0; ret && j < lhs[i].nbDims; j++) { + ret &= (lhs[i].d[j] == rhs[i].d[j]); + } + } + if (ret) return true; + } + return false; +} + +void TrtShapeOptimizationProfile::InitProfiles( + const std::vector& input_partial_shapes, + ProfileStrategy strategy) { + strategy_ = strategy; + if (input_shapes_.size() == 0) { + VLOG(1) << "Not creating profiles without input_shapes. " + "You have to enable profile generation mode first (build)."; + return; + } + // Preprocess the vector of input shapes and shape values: + // - Converts TensorShape -> nvinfer::Dims. + // - Concatenates the shape values after the input shapes: + // dimvec = [dim0, dim1,..., shapeval0, shapval1, ...] + // - Ensures that the list is unique. 
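+  // Illustration with hypothetical values: for two inputs where input 0 has
+  // shape [8, 8, 10] and input 1 is a shape tensor of shape [3] holding the
+  // values (2, 4, 6), the collected entry is
+  //   dimvec = [ (8,8,10), (3), (0,{}), (2,4,6) ]
+  // i.e. the dims of both inputs followed by one shape-value entry per input
+  // (empty for inputs that are not shape tensors).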
+  std::vector<std::vector<nvinfer1::Dims>> collected_shapes;
+  for (int i = 0; i < input_shapes_.size(); i++) {
+    auto shape_vec = input_shapes_[i];
+    VLOG(2) << "Initprofiles, processing shape " << i;
+    if (!shape_vec.empty()) {
+      // Correct for values that are mistakenly used as shape values.
+      for (int k = 0; k < input_shape_values_[i].size(); k++) {
+        if (!is_shape_tensor_[k])
+          input_shape_values_[i][k] = nvinfer1::Dims{0, {}};
+      }
+      std::vector<nvinfer1::Dims> dimvec = GetDimVec(shape_vec);
+      dimvec.insert(dimvec.end(), input_shape_values_[i].begin(),
+                    input_shape_values_[i].end());
+      // TODO(tfeher): This condition should not apply for explicit profiles.
+      // In that case consecutive elements in collected_shapes contain the
+      // user-defined values of min, opt and max, and it is valid to have
+      // min = opt and opt = max.
+      if (!AlreadyCollected(collected_shapes, dimvec)) {
+        collected_shapes.push_back(dimvec);
+      }
+    }
+  }
+  switch (strategy_) {
+    case ProfileStrategy::kImplicitBatchModeCompatible:
+      VLOG(1) << "Creating profiles with ImplicitBatchModeCompatible strategy";
+      ImplicitBatchModeCompatibleStrategy(collected_shapes);
+      break;
+    // Treat all other strategies the same as kOptimal for now. Implementing
+    // those is outlined in the dynamic shape support implementation plan.
+    case ProfileStrategy::kRange:
+      VLOG(1) << "Creating profiles with Range strategy";
+      TF_CHECK_OK(RangeStrategy(collected_shapes));
+      break;
+    case ProfileStrategy::kRangeOptimal:
+      VLOG(1) << "Creating profiles with RangeOptimal strategy";
+      OptimalStrategy(collected_shapes);
+      TF_CHECK_OK(RangeStrategy(collected_shapes));
+      break;
+    case ProfileStrategy::kOptimal:
+      VLOG(1) << "Creating profiles with Optimal strategy";
+      OptimalStrategy(collected_shapes);
+      break;
+  }
+  // Define a mask that describes which inputs could be shape tensors. Note
+  // that here we can have false positives. The shape tensor mask will be
+  // updated once the network is constructed.
+  SetShapeTensorMask(input_partial_shapes);
+  if (input_partial_shapes.size() > 0) {
+    for (OptimizationProfileConfig& prof : profiles_) {
+      // TODO: Remove this when the bug is fixed.
+#if !IS_TRT_VERSION_GE(8, 0, 0, 0)
+      FixShapeValueProfile(&prof, is_shape_tensor_);
+#endif
+      for (int i = 0; i < input_partial_shapes.size(); i++) {
+        auto network_input = input_partial_shapes[i];
+        EnforceCompatibility(&prof.min[i], network_input);
+        EnforceCompatibility(&prof.opt[i], network_input);
+        EnforceCompatibility(&prof.max[i], network_input);
+      }
+    }
+  }
+}
+
+void TrtShapeOptimizationProfile::InitCalibProfile(
+    const std::vector<TensorShape>& shapes) {
+  VLOG(1) << "Collected shape(s) " << DebugString(shapes)
+          << " for calibration profile.";
+  auto shape_vec = shapes;
+  if (!shape_vec.empty()) {
+    std::vector<nvinfer1::Dims> dimvec = GetDimVec(shape_vec);
+    dimvec.insert(dimvec.end(), actual_shape_values_.begin(),
+                  actual_shape_values_.end());
+    VLOG(2) << "Initializing calibration optimization profile config with "
+            << "min=opt=max " << DebugString(dimvec);
+
+    OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec};
+    calib_profiles_ = std::move(profConfig);
+  } else {
+    VLOG(2) << "Failed to initialize calibration optimization profile.";
+  }
+}
+
+Status TrtShapeOptimizationProfile::AddProfiles(
+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+    const nvinfer1::INetworkDefinition* network) {
+  // Create optimization profile for calibration if necessary.
+ if (!calib_profiles_.min.empty()) { + VLOG(2) << "Setting up calibration profies"; + auto* calibProfile = builder->createOptimizationProfile(); + Status status = + calib_profiles_.SetDimensions(network, calibProfile, input_mask_); + if (!status.ok()) { + return status; + } + bool result = false; + if (calibProfile->isValid()) { + result = config->setCalibrationProfile(calibProfile); + } else { + VLOG(2) << "Calibration profile is not valid"; + } + if (result) { + VLOG(2) << "Added calibration optimization profile " + << calib_profiles_.DebugString() << " to builder config."; + } else { + VLOG(2) << "FAILED TO ADD PROFILE"; + LOG(ERROR) << "Failed to add calibration optimization profile " + << calib_profiles_.DebugString() + << ". This usually happens when profile is invalid."; + } + } + // Create a vector of optimization profiles. + for (int i = 0; i < profiles_.size(); i++) { + auto* optProfile = builder->createOptimizationProfile(); + Status status = + profiles_[i].SetDimensions(network, optProfile, input_mask_); + if (!status.ok()) { + return status; + } + int idx = -1; + if (optProfile->isValid()) { + idx = config->addOptimizationProfile(optProfile); + } + if (idx >= 0) { + if (i != idx) { + return errors::Internal( + "Profile index of engine config is different from source profile " + "index: ", + i, " != ", idx); + } + VLOG(1) << "Added optimization profile " << profiles_[i].DebugString() + << " with idx " << idx << " to builder config."; + } else { + LOG(ERROR) << "Failed to add optimization profile " + << profiles_[i].DebugString() + << ". This usually happens when profile is invalid."; + } + } + if (!profiles_.empty() && config->getNbOptimizationProfiles() == 0) { + return errors::Internal("Failure in adding an optimization profile."); + } + need_profiles_ = config->getNbOptimizationProfiles() > 0; + // Update the mask that flag shape tensors. The network is known now, + // the mask will be correct. + SetShapeTensorMask(network); + is_pruned_input_.resize(network->getNbInputs()); + absl::c_fill(is_pruned_input_, false); + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::ConfigureBuilder( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network) { + TF_RETURN_IF_ERROR(AddProfiles(builder, config, network)); + return Status::OK(); +} + +// Sets the shape tensor mask from the TRT engine definition. +void TrtShapeOptimizationProfile::SetShapeTensorMask( + const nvinfer1::ICudaEngine* engine, int n_inputs) { + is_shape_tensor_.resize(n_inputs, false); + for (int i = 0; i < n_inputs; i++) { + int binding_index; + Status status = GetTrtBindingIndex(i, 0, engine, &binding_index); + if (!status.ok()) { + continue; + } + is_shape_tensor_[i] = engine->isShapeBinding(binding_index); + if (is_shape_tensor_[i]) { + VLOG(2) << "Found shape tensor at " << i; + } + } + has_shape_tensor_ = + absl::c_any_of(is_shape_tensor_, [](bool b) { return b; }); +} + +// Sets the shape tensor mask using the network definition. 
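+// This is one of three overloads: the variant taking input partial shapes
+// gives a conservative guess before the network exists, this network-based
+// variant refines the mask once the TRT network is built, and the engine-based
+// variant above restores it after engine deserialization.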
+void TrtShapeOptimizationProfile::SetShapeTensorMask( + const nvinfer1::INetworkDefinition* network) { + int n_inputs = network->getNbInputs(); + is_shape_tensor_.resize(n_inputs, false); + for (int i = 0; i < n_inputs; i++) { + const ITensorProxyPtr input = network->getInput(i); + is_shape_tensor_[i] = input->isShapeTensor(); + if (is_shape_tensor_[i]) { + VLOG(2) << "Found shape tensor " << input->getName() << " at " << i; + } + } + has_shape_tensor_ = + absl::c_any_of(is_shape_tensor_, [](bool b) { return b; }); +} + +// Sets the shape tensor mask using the input partial shapes. This only tells +// whether the tensors are shape value compatible, only the final network +// definition or the engine would give concrete answers. +void TrtShapeOptimizationProfile::SetShapeTensorMask( + const std::vector& input_partial_shapes) { + if (is_shape_tensor_.size() == input_partial_shapes.size()) { + // Already initialized, e.g. by TRTEngineOp::ComputeAsync(). + return; + } + is_shape_tensor_.resize(input_partial_shapes.size(), false); + for (int i = 0; i < input_partial_shapes.size(); i++) { + is_shape_tensor_[i] = IsTrtShapeTensorCompatible(input_partial_shapes[i]); + if (is_shape_tensor_[i]) { + VLOG(2) << "Found shape compatible tensor at " << i; + } + } + has_shape_tensor_ = + absl::c_any_of(is_shape_tensor_, [](bool b) { return b; }); +} + +int TrtShapeOptimizationProfile::GetProfileNumber( + const std::vector& shapes) { + tensorflow::profiler::TraceMe activity( + "TrtShapeOptimizationProfile::GetProfileNumber", + tensorflow::profiler::TraceMeLevel::kInfo); + if (!need_profiles_) return 0; + // TODO(tfeher): Return the best profile not just the first compatible. + for (int i = 0; i < profiles_.size(); i++) { + if (profiles_[i].IncludesShapes(shapes, HasShapeTensor(), + actual_shape_values_, is_pruned_input_, + is_shape_tensor_)) { + return i; + } + } + VLOG(1) << "Profile not found for input shapes " << DebugString(shapes); + VLOG(2) << " and shape values " << DebugString(actual_shape_values_); + return -1; +} + +Status TrtShapeOptimizationProfile::CreateExecutionContexts( + nvinfer1::ICudaEngine* engine, + std::vector* exec_contexts) { + int i = 0; + // The following loop runs once if we have static shapes, to create a single + // execution context without profiles. In dynamic mode we create one context + // for each profile and set the corresponding optimization profile. + do { + VLOG(1) << "Creating execution context " << i; + ExecutionContext context = ExecutionContext::Create(engine); + if (i > 0) { + // This condition is needed for two reasons: + // - using static shapes we do not have any profiles so we cannot call + // set optimizationprofiles. + // - The 0th profile is set implicitly for the first execution context + // therefore we do not need to set. + if (!context->setOptimizationProfile(i)) { + return errors::Internal("Could not set TRT optimization profile."); + } + } + exec_contexts->push_back(std::move(context)); + i++; + } while (i < profiles_.size()); + + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::SetInputShapeBinding( + int input_index, int binding_index, nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* exec_context) const { + tensorflow::profiler::TraceMe activity( + "TrtShapeOptimizationProfile::SetInputShapeBinding", + tensorflow::profiler::TraceMeLevel::kInfo); + if (cuda_engine->isShapeBinding(binding_index)) { + // Input shape binding data has to be in host memory. That is the reason + // we can't use input_tensor.flat().data(). 
which contains the same + // values in device memory. Instead, we use data that was copied to host + // by CollectShapeValues. + VLOG(2) << "Setting input shape binding for idx " << binding_index + << ", with values " + << DebugString(actual_shape_values_.at(input_index)); + bool ret = exec_context->setInputShapeBinding( + binding_index, actual_shape_values_.at(input_index).d); + if (!ret) { + return errors::Internal("Could not set input shape binding for idx ", + binding_index); + } + } + return Status::OK(); +} + +// If binding_idx is a shape tensor, then returns the associated min/max/opt +// shape values from prof_idx. +nvinfer1::Dims GetDimsFromShapeVal(int prof_idx, int binding_idx, + nvinfer1::OptProfileSelector selector, + const nvinfer1::ICudaEngine* engine) { + if (engine->isShapeBinding(binding_idx)) { + const int32* shape_val_ptr = + engine->getProfileShapeValues(binding_idx, prof_idx, selector); + if (shape_val_ptr) { + VLOG(2) << "Found shape value in prof " << prof_idx << ", binding " + << binding_idx; + nvinfer1::Dims dims = engine->getBindingDimensions(binding_idx); + // nbDims == 0 represent scalar, -1 represents invalid dim + int n_values = (dims.nbDims == 0) ? 1 : dims.d[0]; + if (n_values > 0) { + dims.nbDims = n_values; + std::copy(shape_val_ptr, shape_val_ptr + n_values, dims.d); + } + return dims; + } + } + return {0, {0}}; +} + +Status TrtShapeOptimizationProfile::SetPrunedMask( + const nvinfer1::ICudaEngine* engine, int n_network_inputs) { + is_pruned_input_.resize(n_network_inputs); + absl::c_fill(is_pruned_input_, false); + for (int j = 0; j < n_network_inputs; j++) { + int binding_idx; + Status status = GetTrtBindingIndex(j, 0, engine, &binding_idx); + if (!status.ok()) { + // Before TRT 8, an input tensor can be pruned (nvbugs/3153064) + // Resource inputs are also unknown by TRT, so we can treat them as + // pruned (the engine includes the variable as weights). + is_pruned_input_[j] = true; + VLOG(2) << "Skipping pruned input " << j; + continue; + } + } + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::RestoreProfiles( + const nvinfer1::ICudaEngine* engine, int n_network_inputs) { + need_profiles_ = false; + if (!engine) { + // We do not need to restore profiles for an empty engine. + return Status::OK(); + } + if (engine->hasImplicitBatchDimension()) { + // Nothing to do, we cannot have profiles in implicit batch mode. 
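+    // (TensorRT optimization profiles are only defined for explicit-batch
+    // networks, so an implicit-batch engine never has any to restore.)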
+ return Status::OK(); + } + int n_profiles = engine->getNbOptimizationProfiles(); + need_profiles_ = n_profiles > 0; + int n_inputs = GetNumberOfEngineInputs(engine); + if (n_inputs > n_network_inputs) { + return errors::Internal("Incorrect number of engine inputs"); + } + VLOG(2) << "Attempting to restore " << n_profiles << " profiles, each with " + << n_inputs << " inputs"; + SetShapeTensorMask(engine, n_network_inputs); + + TF_RETURN_IF_ERROR(SetPrunedMask(engine, n_network_inputs)); + + for (int prof_idx = 0; prof_idx < n_profiles; prof_idx++) { + OptimizationProfileConfig cfg; + + cfg.min.resize(n_network_inputs * 2); + cfg.max.resize(n_network_inputs * 2); + cfg.opt.resize(n_network_inputs * 2); + // restore shape values + for (int j = 0; j < n_network_inputs; j++) { + if (is_pruned_input_[j]) continue; + int binding_idx; + TF_RETURN_IF_ERROR(GetTrtBindingIndex(j, 0, engine, &binding_idx)); + + nvinfer1::Dims min = engine->getProfileDimensions( + binding_idx, prof_idx, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims max = engine->getProfileDimensions( + binding_idx, prof_idx, nvinfer1::OptProfileSelector::kMAX); + nvinfer1::Dims opt = engine->getProfileDimensions( + binding_idx, prof_idx, nvinfer1::OptProfileSelector::kOPT); + cfg.min[j] = min; + cfg.max[j] = max; + cfg.opt[j] = opt; + + cfg.min[j + n_inputs] = GetDimsFromShapeVal( + prof_idx, binding_idx, nvinfer1::OptProfileSelector::kMIN, engine); + cfg.max[j + n_inputs] = GetDimsFromShapeVal( + prof_idx, binding_idx, nvinfer1::OptProfileSelector::kMAX, engine); + cfg.opt[j + n_inputs] = GetDimsFromShapeVal( + prof_idx, binding_idx, nvinfer1::OptProfileSelector::kOPT, engine); + } + VLOG(2) << "Restored profile " << cfg.DebugString(); + profiles_.push_back(std::move(cfg)); + } + return Status::OK(); +} + +int TrtShapeOptimizationProfile::GetNumProfiles() const { + return profiles_.size(); +} + +} // namespace tensorrt +} // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h new file mode 100644 index 00000000000..e5af88a1928 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -0,0 +1,351 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/common/datavec.h" +#include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_execution_context.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// Stores optimization profile parameters (min/opt/max of each input shape). +// +// A TensorRT optimization profile describes the possible min/max values of +// each dynamic input shape along with an optimum value. These values are used +// by the TensorRT builder to select the best kernel for the optimum value among +// those kernels that are valid for all input tensors in the [min, max] range. +struct OptimizationProfileConfig { + // Length of vector == 2*num_inputs to engine. min[0:num_inputs-1] are the min + // input dimensions for execution tensors. If engine has shape input tensors, + // then min[num_inputs + i] store the shape value for input i. For inputs that + // are not shape tensors min = opt = max = {0, {}}. + // + // When the OptimizationProfileConfig is created from the network definition + // (AddProfiles), then each elements of the min, opt, max vectors are defined. + // When the OptimizationProfileConfig object is restored during engine + // deserialization (RestoreProfiles), then some inputs can be pruned + // (see TrtShapeOptimizationProfile::is_pruned_input_). In that case min[i] + // is not defined for pruned inputs (same is true for opt and max). + std::vector min; + std::vector opt; + std::vector max; + + string DebugString() const { + using absl::StrCat; + return StrCat("[min: ", tensorflow::tensorrt::DebugString(min), + ", opt: : ", tensorflow::tensorrt::DebugString(opt), + ", max: ", tensorflow::tensorrt::DebugString(max), "]"); + } + + // Sets the min/opt/max dimensions for profile. + // + // The given min/opt/max dimensions should satisfy the condition + // min <= opt <= max. Additionally TRT requires that the min/opt/max values + // are compatible with the network input. Compatibility is defined the + // following way: let dim be the shape of an input binding and min/opt/max the + // corresponding profile dims. TRT requires that dim.d[k] must be -1 if + // (min.d[k] != dim.d[k] || opt.d[k] != dim.d[k] || max.d[k] != dim.d[k]). 
+ // + // Parameters: + // network - TensorRT network, used to enumerate all the input tensors + // profile - on exit the profile information will be set for each input tensor + // input_mask - 1 for TRT inputs, 0 for TF inputs that are not TRT inputs + Status SetDimensions(const nvinfer1::INetworkDefinition* network, + nvinfer1::IOptimizationProfile* profile, + const std::vector& input_mask) const { + int n_inputs_trt = network->getNbInputs(); + int n_inputs_tf = opt.size() / 2; + /// TODO(lsugy): check that the sum of the mask equals n_inputs. + if (input_mask.size() != n_inputs_tf) { + return errors::Internal("Incorrect input mask size: ", input_mask.size()); + } + int n_mask_true = 0; + for (bool mask_val : input_mask) { + if (mask_val) { + n_mask_true++; + } + } + if (n_mask_true != n_inputs_trt) { + return errors::Internal( + "Number of true elements in input_mask (", n_mask_true, + ") doesn't match expected TRT inputs (", n_inputs_trt, ")"); + } + int j = 0; + for (int i = 0; i < n_inputs_tf; i++) { + if (input_mask[i]) { + const ITensorProxyPtr input = network->getInput(j); + const char* name = input->getName(); + if (input->isShapeTensor()) { + int idx = i + n_inputs_tf; + VLOG(2) << "Setting shape values for " << name << ", " + << ::tensorflow::tensorrt::DebugString(opt[idx]); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, + min[idx].d, min[idx].nbDims); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, + opt[idx].d, opt[idx].nbDims); + profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, + max[idx].d, max[idx].nbDims); + } + VLOG(2) << "Setting input dimensions for " << name << ", " + << ::tensorflow::tensorrt::DebugString(opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, + min[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, + opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, + max[i]); + + j++; + } + } + return Status::OK(); + } + + // Returns true if profile range completely includes the given shapes. + bool IncludesShapes(const std::vector& shapes, + bool has_shape_tensor, + const std::vector& shape_values, + const std::vector& is_pruned_input, + const std::vector& is_shape_tensor) const { + // min, max, and opt must have the same size which is already verified in + // SetDimensions. + if (min.size() != shapes.size() * 2 || + (has_shape_tensor && min.size() != shape_values.size() * 2)) { + VLOG(2) << "Profile size mismatch min size " << min.size() + << " vs input shapes size " << shapes.size() << " " + << shape_values.size(); + return false; + } + for (int i = 0; i < shapes.size(); i++) { + if (is_pruned_input[i]) { + continue; + } + auto current_shape = shapes[i]; + // min, max, and opt must have the same nbDims, which is already verified + // in SetDimensions. + if (min[i].nbDims != current_shape.dims()) { + return false; + } + // Check if range [min, max] includes current_shape. + for (int dim = 0; dim < current_shape.dims(); dim++) { + if ((min[i].d[dim] > current_shape.dim_size(dim)) || + (max[i].d[dim] < current_shape.dim_size(dim))) { + return false; + } + } + } + // Check shape values. + if (has_shape_tensor) { + int offset = shapes.size(); + for (int i = 0; i < shape_values.size(); i++) { + if (is_pruned_input[i] || !is_shape_tensor[i]) { + continue; + } + auto shape_val = shape_values[i]; + // min, max, and opt must have the same nbDims, which is already + // verified in SetDimensions. 
+ if (min[i + offset].nbDims != shape_val.nbDims) { + return false; + } + // Check if range [min, max] includes shape_val. + for (int dim = 0; dim < shape_val.nbDims; dim++) { + if (min[i + offset].d[dim] > shape_val.d[dim] || + max[i + offset].d[dim] < shape_val.d[dim]) { + return false; + } + } + } + } + return true; + } +}; + +// Manages Optimization profiles during TRT Engine construction. +// +// An optimization profile describes a range of dimensions for each TRT network +// input, and the optimal dimensions that the auto-tuner should use for +// optimization. +// +// This class stores the list of input shapes that were seen during the +// build/profile_generation_mode phase, and using them it creates a set of +// OptimizationProfileConfigs. These configs will be added to IBuilderConfig +// before the engine is created. +class TrtShapeOptimizationProfile { + public: + TrtShapeOptimizationProfile() {} + + // Stores input shape information during profile_generation_mode. + void AddShape(const std::vector& shapes) { + input_shapes_.push_back(shapes); + input_shape_values_.push_back(actual_shape_values_); + VLOG(1) << "Collected shape(s) " << DebugString(shapes) << " for profiles."; + } + + // Stores the input mask. + void SetInputMask(const std::vector& input_mask) { + input_mask_ = input_mask; + } + + // Collects ShapeTensorCompatible tensor values. This is needed both during + // profile_generation_mode and during normal inference calls. + Status CollectShapeValues(OpKernelContext* ctx); + + // Collects ShapeTensorCompatible tensor values, used only for unit tests. + Status CollectShapeValues(const DataVec& input); + + void clear() { profiles_.clear(); } + + // Returns the profile number that should be used to execute the network with + // the given input shapes. Returns -1 if none of cached profiles are + // compatible with the given input shapes. + int GetProfileNumber(const std::vector& shapes); + + // Creates optimization profiles and add them to the builder config. + Status ConfigureBuilder(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); + + // Creates execution contexts for each optimization profile. + Status CreateExecutionContexts(nvinfer1::ICudaEngine* engine, + std::vector* exec_contexts); + + Status SetInputShapeBinding(int input_index, int binding_index, + nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::IExecutionContext* exec_context) const; + + // Creates optimization profiles profiles_ for the set of concrete input + // shapes collected in input_shapes_. The input_partial_shapes of the network + // is used to ensure that the created optimization profiles are compatible + // with the network. + void InitProfiles(const std::vector& input_partial_shapes, + ProfileStrategy strategy); + + void InitCalibProfile(const std::vector& shapes); + + // Returns number of created profiles. + int GetNumProfiles() const; + + bool HasShape() const { return !input_shapes_.empty(); } + bool NeedProfiles() const { return need_profiles_; } + + // Restores profiles from the engine (used after deserialization). + Status RestoreProfiles(const nvinfer1::ICudaEngine* engine, + int n_network_inputs); + + // Whether the network has any shape tensors. + bool HasShapeTensor() const { return has_shape_tensor_; } + + void SetShapeTensorMask(const nvinfer1::INetworkDefinition* network); + + // Whether the optimization profiles describe input that can be handled with + // a static engine (only 1 profile with min=max). 
+ bool IsStaticCompatible() { + return strategy_ == ProfileStrategy::kOptimal && profiles_.size() == 1 +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) + && !HasShapeTensor() +#endif + ; + // TODO(tfeher): remove !HasShapeTensor() condition once the + // FixShapeValueProfile workaround is turned off. + } + + private: + // Set of input shape vetors that we collect during profile_generation_mode. + std::vector> input_shapes_; + + // Input shape values that we collect during profile_generation_mode. If the + // tensor is not compatible with a TRT shape tensor then an empty shape is + // stored. + std::vector> input_shape_values_; + + // Shape values present in the current inference call. + std::vector actual_shape_values_; + + // The optimization profiles generated from input_shapes_. + std::vector profiles_; + + // The optimization profile for calibration. + OptimizationProfileConfig calib_profiles_; + + // A TRTEngineOp can have resource inputs. These are treated as constants: + // their value is read during conversion and stored as weights in the TRT + // engine. This means that resource inputs have no corresponding TRT engine + // input, and we do not need to provide profile information for these. The + // input mask helps to identify the TRT inputs, where we need to define + // optimization profiles. + std::vector input_mask_; + + // Whether the network has any shape tensors. Initially we assume that the + // network might have a shape value input. This will be updated when the + // network is created / engine is deserialized. + bool has_shape_tensor_ = true; + + // Whether the network/engine requires optimization profiles. + bool need_profiles_ = false; + + // Whether an input tensor is a shape tensor. + std::vector is_shape_tensor_; + + // Whether a network input was pruned (only in TRT 7). + std::vector is_pruned_input_; + + // Optimization profile generation strategy. + ProfileStrategy strategy_; + + // Adds optimization profiles to the builder config. + Status AddProfiles(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); + + void SetShapeTensorMask(const nvinfer1::ICudaEngine* engine, int n_inputs); + void SetShapeTensorMask( + const std::vector& input_partial_shapes); + + Status SetPrunedMask(const nvinfer1::ICudaEngine* engine, + int n_network_inputs); + + void ImplicitBatchModeCompatibleStrategy( + const std::vector>& collected_shapes); + void OptimalStrategy( + const std::vector>& collected_shapes); + Status RangeStrategy( + const std::vector>& collected_shapes); +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc new file mode 100644 index 00000000000..87e17a9fc3f --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -0,0 +1,256 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/test.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +std::vector DimVecToShapeVec( + std::vector dimvec, + bool expand_with_empty_shape_values = false) { + std::vector shapevec(dimvec.size()); + for (int i = 0; i < dimvec.size(); i++) { + TensorShape shape; + TF_CHECK_OK( + TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape)); + shapevec[i] = shape; + } + if (expand_with_empty_shape_values) { + shapevec.resize(2 * dimvec.size()); // Append empty shape values + } + return shapevec; +} + +bool DimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min, + const nvinfer1::Dims& max) { + if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) { + return false; + } + for (int i = 0; i < dim.nbDims; i++) { + if (dim.d[i] < min.d[i] || dim.d[i] > max.d[i]) { + return false; + } + } + return true; +} + +bool DimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) { + if (a.nbDims != b.nbDims) { + return false; + } + for (int i = 0; i < a.nbDims; i++) { + if (a.d[i] != b.d[i]) { + return false; + } + } + return true; +} + +class TrtShapeOptimizationProfileTest + : public ::testing::TestWithParam { + protected: + TrtShapeOptimizationProfileTest() { + strategy_ = GetParam(); + builder_ = TrtUniquePtrType( + nvinfer1::createInferBuilder(logger_)); + network_ = TrtUniquePtrType( + builder_->createNetworkV2(flags_)); + builder_config_ = TrtUniquePtrType( + builder_->createBuilderConfig()); + builder_config_->setMaxWorkspaceSize(1 << 10); + } + + // Defines a simple network: output = input1 + input2. + void DefineNetwork(nvinfer1::INetworkDefinition* network, + nvinfer1::Dims3& dims) { + ITensorProxyPtr input1 = + network->addInput("input1", nvinfer1::DataType::kFLOAT, dims); + EXPECT_NE(nullptr, input1->trt_tensor()); + + ITensorProxyPtr input2 = + network->addInput("input2", nvinfer1::DataType::kFLOAT, dims); + EXPECT_NE(nullptr, input2->trt_tensor()); + + auto layer = + network->addElementWise(*input1->trt_tensor(), *input2->trt_tensor(), + nvinfer1::ElementWiseOperation::kSUM); + EXPECT_NE(nullptr, layer); + // Mark the output. 
+ ITensorProxyPtr output = layer->getOutput(0); + output->setName("output"); + network->markOutput(*output->trt_tensor()); + } + + void CheckProfile(const std::vector& dimvec, + TrtShapeOptimizationProfile* profile, bool has_prof, + bool test_optimality) { + std::vector shape_vec = DimVecToShapeVec(dimvec); + int idx = profile->GetProfileNumber(shape_vec); + ASSERT_EQ(idx >= 0, has_prof); + if (idx < 0) return; + int prof_idx = exec_contexts_[idx]->getOptimizationProfile(); + ASSERT_GE(prof_idx, 0); + for (int j = 0; j < dimvec.size(); j++) { + nvinfer1::Dims min = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims max = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMAX); + nvinfer1::Dims opt = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kOPT); + + // This should always hold. + EXPECT_TRUE(DimsContained(dimvec[j], min, max)); + + if (test_optimality) { + // We shall have selected an optimal strategy. + EXPECT_TRUE(DimsEqual(dimvec[j], opt)); + } + } + } + + Logger& logger_ = *Logger::GetLogger(); + TrtUniquePtrType builder_; + TrtUniquePtrType network_; + TrtUniquePtrType builder_config_; + TrtUniquePtrType engine; + std::vector exec_contexts_; + // The order is important: exec_context_ must be destroyed first, and logger + // at last. + const uint32_t flags_ = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + ProfileStrategy strategy_; +}; + +INSTANTIATE_TEST_CASE_P( + OptProfilesTestInstantiation, TrtShapeOptimizationProfileTest, + ::testing::Values(ProfileStrategy::kRange, ProfileStrategy::kOptimal, + ProfileStrategy::kRangeOptimal, + ProfileStrategy::kImplicitBatchModeCompatible)); + +TEST_P(TrtShapeOptimizationProfileTest, Static) { + // Static mode does not depend on strategies, we test only once. + if (strategy_ != ProfileStrategy::kRange) return; + + // Network with static input shape. + nvinfer1::Dims3 dims(8, 8, 10); + DefineNetwork(network_.get(), dims); + + TrtShapeOptimizationProfile profile; + + // Configure and build engine - should be a no-op. + TF_CHECK_OK(profile.ConfigureBuilder(builder_.get(), builder_config_.get(), + network_.get())); + + engine = TrtUniquePtrType( + builder_->buildEngineWithConfig(*network_, *builder_config_)); + EXPECT_NE(nullptr, engine); + TF_CHECK_OK(profile.CreateExecutionContexts(engine.get(), &exec_contexts_)); + // A single execution context should be created for a graph with static input. + ASSERT_EQ(exec_contexts_.size(), 1); + EXPECT_NE(nullptr, exec_contexts_[0]); + + std::vector dim_vec(2, dims); + std::vector shape_vec = DimVecToShapeVec(dim_vec); + EXPECT_EQ(0, profile.GetProfileNumber(shape_vec)); +} + +TEST_P(TrtShapeOptimizationProfileTest, Dynamic) { + // Network with dynamic input shapes. + nvinfer1::Dims3 dims(-1, -1, 10); + DefineNetwork(network_.get(), dims); + + TrtShapeOptimizationProfile profile; + + // Set the input mask to true (no resource input) + std::vector input_mask(2, true); + profile.SetInputMask(input_mask); + + std::vector> input_profiles{ + {nvinfer1::Dims3(2, 2, 10), nvinfer1::Dims3(2, 2, 10)}, + {nvinfer1::Dims3(3, 3, 10), nvinfer1::Dims3(3, 3, 10)}, + {nvinfer1::Dims3(16, 16, 10), nvinfer1::Dims3(16, 16, 10)}, + }; + + std::vector unseen_shapes{nvinfer1::Dims3(5, 5, 10), + nvinfer1::Dims3(9, 9, 10)}; + + // Simulate a profile collection phase. 
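+  // Note: the DimVecToShapeVec(dim_vec, true) calls below append empty
+  // entries so that each collected shape vector has the 2 * num_inputs layout
+  // (input dims followed by shape-value slots) that OptimizationProfileConfig
+  // expects; none of the inputs here are shape tensors, so those slots stay
+  // empty.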
+ for (auto dim_vec : input_profiles) { + std::vector shape_vec = DimVecToShapeVec(dim_vec, true); + profile.AddShape(shape_vec); + } + std::vector input_partial_shapes; + TF_CHECK_OK(GetNetworkInputShapes(network_.get(), &input_partial_shapes)); + profile.InitProfiles(input_partial_shapes, strategy_); + + // Configure and build engine. + TF_CHECK_OK(profile.ConfigureBuilder(builder_.get(), builder_config_.get(), + network_.get())); + engine = TrtUniquePtrType( + builder_->buildEngineWithConfig(*network_.get(), *builder_config_.get())); + ASSERT_NE(nullptr, engine); + + TF_CHECK_OK(profile.CreateExecutionContexts(engine.get(), &exec_contexts_)); + + int n_profiles_exp; + switch (strategy_) { + case (ProfileStrategy::kImplicitBatchModeCompatible): + case (ProfileStrategy::kOptimal): + n_profiles_exp = input_profiles.size(); + break; + case (ProfileStrategy::kRange): + n_profiles_exp = 1; + break; + case (ProfileStrategy::kRangeOptimal): + n_profiles_exp = 1 + input_profiles.size(); + break; + } + // Each profile has an associated execution context. + EXPECT_EQ(exec_contexts_.size(), n_profiles_exp); + + profile.SetShapeTensorMask(network_.get()); + + EXPECT_EQ(profile.HasShapeTensor(), false); + + // Check if the profiles are assigned correctly. + for (auto dimvec : input_profiles) { + bool test_optimal_prof = strategy_ == ProfileStrategy::kOptimal || + strategy_ == ProfileStrategy::kRangeOptimal; + CheckProfile(dimvec, &profile, true, test_optimal_prof); + } + bool has_prof = (strategy_ == ProfileStrategy::kRange || + strategy_ == ProfileStrategy::kRangeOptimal); + CheckProfile(unseen_shapes, &profile, has_prof, false); +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h b/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h index 789c518f600..5eea183fa9a 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_tensor_proxy.h @@ -13,15 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_TENSOR_PROXY_H -#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_TENSOR_PROXY_H +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ -#include +#include #include #include #include #include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/core/platform/logging.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" @@ -142,28 +143,26 @@ class ITensorProxy { ttype_(TensorType::kSIMPLE) {} bool is_trt_tensor() const { - assert(validate()); - assert(ttype_ == TensorType::kTRT); + CHECK(validate()); return trt_tensor_ != nullptr; } bool is_simple_tensor() const { - assert(validate()); - assert(ttype_ == TensorType::kSIMPLE); + CHECK(validate()); return simple_tensor_ != nullptr; } TensorType ttype() const { return ttype_; } nvinfer1::ITensor* trt_tensor() const { - assert(trt_tensor_ != nullptr); - assert(ttype_ == TensorType::kTRT); + CHECK_NOTNULL(trt_tensor_); + CHECK(ttype_ == TensorType::kTRT); return trt_tensor_; } SimpleITensor* simple_tensor() const { - assert(simple_tensor_ != nullptr); - assert(ttype_ == TensorType::kSIMPLE); + CHECK_NOTNULL(simple_tensor_); + CHECK(ttype_ == TensorType::kSIMPLE); return simple_tensor_.get(); } @@ -174,7 +173,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setName(name); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } const char* getName() const { @@ -184,7 +183,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getName(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setDimensions(nvinfer1::Dims dimensions) { @@ -194,7 +193,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setDimensions(dimensions); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::Dims getDimensions() const { @@ -204,7 +203,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDimensions(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setType(nvinfer1::DataType type) { @@ -214,7 +213,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setType(type); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::DataType getType() const { @@ -224,7 +223,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getType(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isNetworkInput() const { @@ -234,7 +233,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isNetworkInput(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isNetworkOutput() const { @@ -244,7 +243,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isNetworkOutput(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setBroadcastAcrossBatch(bool broadcastAcrossBatch) { @@ -254,7 +253,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setBroadcastAcrossBatch(broadcastAcrossBatch); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } 
bool getBroadcastAcrossBatch() const { @@ -264,7 +263,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getBroadcastAcrossBatch(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::TensorLocation getLocation() const { @@ -274,7 +273,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getLocation(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void setLocation(nvinfer1::TensorLocation location) { @@ -284,7 +283,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setLocation(location); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool setDynamicRange(float min, float max) { @@ -294,7 +293,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setDynamicRange(min, max); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool dynamicRangeIsSet() const { @@ -304,7 +303,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->dynamicRangeIsSet(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } void resetDynamicRange() { @@ -314,7 +313,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->resetDynamicRange(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } float getDynamicRangeMin() const { switch (ttype_) { @@ -323,7 +322,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDynamicRangeMin(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } float getDynamicRangeMax() const { @@ -333,9 +332,9 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDynamicRangeMax(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } -#if IS_TRT_VERSION_GE(5, 0, 0, 0) && !IS_TRT_VERSION_GE(8, 0, 0, 0) +#if !IS_TRT_VERSION_GE(8, 0, 0, 0) float getDynamicRange() const { switch (ttype_) { case TensorType::kTRT: @@ -343,7 +342,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getDynamicRange(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } #endif void setAllowedFormats(nvinfer1::TensorFormats formats) { @@ -353,7 +352,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->setAllowedFormats(formats); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } nvinfer1::TensorFormats getAllowedFormats() const { @@ -363,7 +362,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->getAllowedFormats(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isShapeTensor() const { @@ -373,7 +372,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isShapeTensor(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } bool isExecutionTensor() const { @@ -383,7 +382,7 @@ class ITensorProxy { case TensorType::kSIMPLE: return simple_tensor_->isExecutionTensor(); } - assert(0 && "Unsupported itensor_ type"); + LOG(FATAL) << "Unsupported itensor_ type"; } private: @@ -412,7 +411,7 @@ class ITensorProxy { class ITensorProxyPtr { public: - ITensorProxyPtr(nullptr_t) : p_(nullptr) {} + ITensorProxyPtr(std::nullptr_t) : p_(nullptr) {} 
ITensorProxyPtr(ITensorProxy* p) : p_(p) {} ITensorProxyPtr(nvinfer1::ITensor* p) : p_(new ITensorProxy(p)) {} ITensorProxyPtr(SimpleITensor* p) : p_(new ITensorProxy(p)) {} @@ -442,6 +441,10 @@ inline bool operator==(const ITensorProxyPtr& p1, const ITensorProxyPtr& p2) { p1->simple_tensor() == p2->simple_tensor())); } +inline bool operator!=(const ITensorProxyPtr& p1, const ITensorProxyPtr& p2) { + return !(p1 == p2); +} + struct ITensorProxyHash { size_t operator()(const ITensorProxyPtr& tensor) const { return reinterpret_cast(tensor.p_.get()); @@ -452,4 +455,4 @@ struct ITensorProxyHash { } // namespace tensorflow #endif // GOOGLE_CUDA && GOOGLE_TENSORRT -#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_TENSOR_PROXY_H +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TENSOR_PROXY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc new file mode 100644 index 00000000000..82046a2978e --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.cc @@ -0,0 +1,76 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include + +#include + +namespace tensorflow { + +namespace tensorrt { +namespace convert { + +::testing::Matcher> ArrayFloatNear( + const std::vector& values, float max_abs_error, bool nan_sensitive) { + std::vector<::testing::Matcher> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + if (nan_sensitive) { + matchers.emplace_back(::testing::NanSensitiveFloatNear(v, max_abs_error)); + } else if (max_abs_error == 0) { + matchers.emplace_back(::testing::FloatEq(v)); + } else { + EXPECT_GE(max_abs_error, 0); + matchers.emplace_back(::testing::FloatNear(v, max_abs_error)); + } + } + return ::testing::ElementsAreArray(matchers); +} + +nvinfer1::Dims CreateDims(const std::vector& d) { + nvinfer1::Dims dims; + dims.nbDims = d.size(); + for (int i = 0; i < d.size(); ++i) { + dims.d[i] = d[i]; + } + return dims; +} + +NodeDef MakeNodeDef(const std::string& name, const std::string& op, + const std::vector& inputs, + const std::map attrs) { + NodeDef node_def; + node_def.set_name(name); + node_def.set_op(op); + for (const auto& input : inputs) { + node_def.add_input(input); + } + for (const auto& attr : attrs) { + (*node_def.mutable_attr())[attr.first] = attr.second; + } + return node_def; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h new file mode 100644 index 00000000000..e0b9a0366a5 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h @@ -0,0 +1,183 @@ +/* Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include +#include +#include +#include +#include + +#include +#include +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h" +#include "tensorflow/core/framework/node_def.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor.pb.h" // NOLINT +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { +namespace convert { +// Creates a node with the given op, inputs, and attributes. +NodeDef MakeNodeDef(const std::string& name, const std::string& op, + const std::vector& inputs, + const std::map attrs = {}); + +// Creates a constant node with the given name and values arranged in the given +// shape. +template +NodeDef MakeConstNodeDef(const std::string& name, const std::vector& vals, + const TensorShape& shape) { + Scope s = Scope::NewRootScope(); + Tensor t = test::AsTensor(vals, shape); + auto const_op = ops::Const(s.WithOpName(name), t); + return const_op.node()->def(); +} + +// Creates a constant node with the given name and values, assuming a 1-D shape. +template +NodeDef MakeConstNodeDef(const std::string& name, const std::vector& vals) { + TensorShape shape; + const std::vector shape_dims = {static_cast(vals.size())}; + TF_EXPECT_OK(TensorShapeUtils::MakeShape(shape_dims, &shape)); + return MakeConstNodeDef(name, vals, shape); +} + +// Creates an nvinfer1::Dims struct from the given vector. +nvinfer1::Dims CreateDims(const std::vector& d); + +// A gmock matcher that check that elements of a float vector match to a given +// tolerance. +::testing::Matcher> ArrayFloatNear( + const std::vector& values, float max_abs_error = 1e-5, + bool nan_sensitive = false); + +// nvinfer1::Dims gMock matchers + +// matches nvinfer1::Dims to initializer list or vector of ints +// Example: EXPECT_THAT(my_dims, DimsAreArray({1, 2, 3})) +MATCHER_P(DimsAreArrayHelper, array_value, + absl::StrFormat("%s [%s]", negation ? 
"are" : "are not", + ::testing::PrintToString(array_value))) { + if (arg.nbDims != array_value.size()) return false; + for (int i = 0; i < arg.nbDims; ++i) { + if (arg.d[i] != array_value[i]) { + return false; + } + } + return true; +} +using DimsAreArray = DimsAreArrayHelperMatcherP>; + +// nvinfer1::INetworkDefinition gMock matchers + +// Checks that layer names are equal to initializer list or vector of strings. +// Example: EXPECT_THAT(my_network, LayerNamesAreArray({"conv1", "conv2"})) +MATCHER_P(LayerNamesAreArrayHelper, array_value, + absl::StrFormat("layer names %s [%s]", negation ? "are" : "are not", + ::testing::PrintToString(array_value))) { + if (array_value.size() != arg->getNbLayers()) return false; + for (int i = 0; i < arg->getNbLayers(); ++i) { + if (arg->getLayer(i)->getName() == nullptr) { + return false; + } + } + return true; +} +using LayerNamesAreArray = + LayerNamesAreArrayHelperMatcherP>; + +// Checks layer names are all non-empty. +MATCHER(LayerNamesNonEmpty, "") { + for (int i = 0; i < arg->getNbLayers(); ++i) { + if (arg->getLayer(i)->getName() == nullptr) { + return false; + } + } + return true; +} + +// TRT_ShapedWeights gMock matchers. + +// Checks that the weight dimensions are values are equal to the given values. +// Example: EXPECT_THAT(my_weights, +// ShapedWeightsHasDimsAndValues({1, 2},{1.0f, 2.0f})) +MATCHER_P2(ShapedWeightsHasDimsAndValuesHelper, dims_vec, expected_values, "") { + DimsAdapter dims(dims_vec); + if (arg.Shape() != dims) { + return false; + } + if (arg.count() != expected_values.size()) { + return false; + } + using T = typename decltype(expected_values)::value_type; + const T* actual_values = arg.template GetPointer(); + for (int i = 0; i < expected_values.size(); ++i) { + if (expected_values[i] != actual_values[i]) { + return false; + } + } + return true; +} + +template +using ShapedWeightsHasDimsAndValues = + ShapedWeightsHasDimsAndValuesHelperMatcherP2, + std::vector>; + +// std::vector convenience utilities. + +// Creates a new vector by casting all values of the given InCType vector to +// OutCType. +template +std::vector CastVector( + const gtl::ArraySlice& vals) { // non-absl ok + std::vector res(vals.size()); + std::transform(vals.begin(), vals.end(), res.begin(), + [](const InCType in_val) -> OutCType { + return static_cast(in_val); + }); + return res; +} + +// Creates a new vector of the given size and fills it with an increasing +// sequence starting from the given start_value using std::iota. +template +std::vector CreateVectorIota(int size, CType start_value = CType(0)) { + std::vector res(size); + std::iota(res.begin(), res.end(), start_value); + return res; +} + +} // namespace convert +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_TESTUTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc new file mode 100644 index 00000000000..d5d9fcf99f5 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_testutils_test.cc @@ -0,0 +1,99 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h"
+
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "third_party/tensorrt/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+using ::testing::AllOf;
+using ::testing::AnyOf;
+using ::testing::Eq;
+using ::testing::Not;
+
+TEST(TrtDimsMatcher, ParameterizedMatchers) {
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), DimsAreArray({1, 2, 3, 4}));
+  // Check empty dims.
+  EXPECT_THAT(nvinfer1::Dims{}, Not(DimsAreArray({1, 2})));
+  std::vector<int> empty_dims;
+  EXPECT_THAT(nvinfer1::Dims{}, DimsAreArray(empty_dims));
+  // Check mismatching values.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Not(DimsAreArray({1, 2, 3, 5})));
+  // Check mismatching number of arguments.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Not(DimsAreArray({1, 2, 5})));
+}
+
+TEST(TrtDimsMatcher, EqualityMatcher) {
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Eq(nvinfer1::Dims4(1, 2, 3, 4)));
+  // Check empty dims.
+  EXPECT_THAT(nvinfer1::Dims{}, Eq(nvinfer1::Dims()));
+  // Check empty Dims is not equal to DimsHW, since their sizes differ.
+  EXPECT_THAT(nvinfer1::Dims{}, Not(Eq(nvinfer1::DimsHW())));
+  // Check mismatching values.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4),
+              Not(Eq(nvinfer1::Dims4(1, 2, 3, 3))));
+  // Check mismatching number of arguments.
+  EXPECT_THAT(nvinfer1::Dims4(1, 2, 3, 4), Not(Eq(nvinfer1::Dims2(1, 2))));
+}
+
+TEST(INetworkDefinitionMatchers, CorrectlyMatch) {
+  Logger& logger = *Logger::GetLogger();
+  TrtUniquePtrType<nvinfer1::IBuilder> builder(
+      nvinfer1::createInferBuilder(logger));
+  TrtUniquePtrType<nvinfer1::INetworkDefinition> network(
+      builder->createNetworkV2(0L));
+
+  // Empty network checks.
+  EXPECT_THAT(network.get(), AllOf(Not(LayerNamesAreArray({"some layer"})),
+                                   LayerNamesNonEmpty()));
+
+  // Add the input and FC layers.
+  nvinfer1::Weights weights;
+  weights.type = nvinfer1::DataType::kFLOAT;
+  std::array<float, 1> vals;
+  weights.values = vals.data();
+  weights.count = 1;
+  auto input = network->addInput("input-tensor", nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::Dims3{1, 1, 1});
+  ASSERT_NE(input, nullptr);
+
+  const char* fc_layer_name = "my-fc-layer";
+  auto layer = network->addFullyConnected(*input, 1, weights, weights);
+  ASSERT_NE(layer, nullptr);
+  layer->setName(fc_layer_name);
+
+  // Check layer names.
+  EXPECT_THAT(network.get(),
+              AllOf(LayerNamesNonEmpty(), LayerNamesAreArray({fc_layer_name})));
+
+  // Add a layer with a default name and check the layer names again.
+  layer = network->addFullyConnected(*input, 1, weights, weights);
+  EXPECT_THAT(network.get(), AllOf(LayerNamesNonEmpty(),
+                                   Not(LayerNamesAreArray({fc_layer_name}))));
+}
+
+}  // namespace convert
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/core/framework/selective_registration.h b/tensorflow/core/framework/selective_registration.h
index 4b281a04bf6..2b0225da604 100644
--- a/tensorflow/core/framework/selective_registration.h
+++ b/tensorflow/core/framework/selective_registration.h
@@ -55,4 +55,69 @@ static_assert(false, "ops_to_register.h must define SHOULD_REGISTER macros");
 #define SHOULD_REGISTER_OP_KERNEL(clz) true
 #endif
 
+namespace tensorflow {
+
+// An InitOnStartupMarker is 'initialized' on program startup, purely for the
+// side-effects of that initialization - the struct itself is empty. (The type
+// is expected to be used to define globals.)
+//
+// The '<<' operator should be used in initializer expressions to specify what
+// to run on startup. The following values are accepted:
+//   - An InitOnStartupMarker. Example:
+//       InitOnStartupMarker F();
+//       InitOnStartupMarker const kInitF =
+//           InitOnStartupMarker{} << F();
+//   - Something to call, which returns an InitOnStartupMarker. Example:
+//       InitOnStartupMarker const kInit =
+//           InitOnStartupMarker{} << []() { G(); return InitOnStartupMarker{}; };
+//
+// See also: TF_INIT_ON_STARTUP_IF
+struct InitOnStartupMarker {
+  constexpr InitOnStartupMarker operator<<(InitOnStartupMarker) const {
+    return *this;
+  }
+
+  template <typename T>
+  constexpr InitOnStartupMarker operator<<(T&& v) const {
+    return std::forward<T>(v)();
+  }
+};
+
+// Conditional initializer expressions for InitOnStartupMarker:
+//   TF_INIT_ON_STARTUP_IF(cond) << f
+// If 'cond' is true, 'f' is evaluated (and called, if applicable) on startup.
+// Otherwise, 'f' is *not evaluated*. Note that 'cond' is required to be a
+// constant-expression, and so this approximates #ifdef.
+//
+// The implementation uses the ?: operator (!cond prevents evaluation of 'f').
+// The relative precedence of ?: and << is significant; this effectively expands
+// to (see extra parens):
+//   !cond ? InitOnStartupMarker{} : (InitOnStartupMarker{} << f)
+//
+// Note that although forcing 'cond' to be a constant-expression should not
+// affect binary size (i.e. the same optimizations should apply if it 'happens'
+// to be one), it was found to be necessary (for a recent version of clang;
+// perhaps an optimizer bug).
+//
+// The parens are necessary to hide the ',' from the preprocessor; it could
+// otherwise act as a macro argument separator.
+#define TF_INIT_ON_STARTUP_IF(cond)                \
+  (::std::integral_constant<bool, !(cond)>::value) \
+      ? ::tensorflow::InitOnStartupMarker{}        \
+      : ::tensorflow::InitOnStartupMarker {}
+
+// Wrapper for generating unique IDs (for 'anonymous' InitOnStartup definitions)
+// using __COUNTER__. The new ID (__COUNTER__ already expanded) is provided as a
+// macro argument.
+//
+// Usage:
+//   #define M_IMPL(id, a, b) ...
+//   #define M(a, b) TF_NEW_ID_FOR_INIT(M_IMPL, a, b)
+#define TF_NEW_ID_FOR_INIT_2(m, c, ...) m(c, __VA_ARGS__)
+#define TF_NEW_ID_FOR_INIT_1(m, c, ...) TF_NEW_ID_FOR_INIT_2(m, c, __VA_ARGS__)
+#define TF_NEW_ID_FOR_INIT(m, ...) \
+  TF_NEW_ID_FOR_INIT_1(m, __COUNTER__, __VA_ARGS__)
+
+}  // namespace tensorflow
+
 #endif  // TENSORFLOW_CORE_FRAMEWORK_SELECTIVE_REGISTRATION_H_
diff --git a/tensorflow/core/profiler/lib/annotated_traceme.h b/tensorflow/core/profiler/lib/annotated_traceme.h
new file mode 100644
index 00000000000..24ab188674f
--- /dev/null
+++ b/tensorflow/core/profiler/lib/annotated_traceme.h
@@ -0,0 +1,59 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_
+#define TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_
+
+#include <utility>
+
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/scoped_annotation.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Combination of TraceMe and ScopedAnnotation which share the same label.
+// Optimizations are done to ensure that label generation is done only once.
+class AnnotatedTraceMe {
+ public:
+  template <typename NameGeneratorT>
+  explicit AnnotatedTraceMe(NameGeneratorT&& name_generator, int level = 1) {
+    DCHECK_GE(level, 1);
+    bool annotation_enabled = ScopedAnnotation::IsEnabled();
+    bool traceme_enabled = TraceMe::Active(level);
+    if (TF_PREDICT_FALSE(annotation_enabled || traceme_enabled)) {
+      string name = std::forward<NameGeneratorT>(name_generator)();
+      if (annotation_enabled) {
+        scoped_annotation_.emplace(absl::string_view(name));
+      }
+      if (TF_PREDICT_TRUE(traceme_enabled)) {
+        trace_me_.emplace([&name] { return std::move(name); }, level);
+      }
+    }
+  }
+
+ private:
+  absl::optional<TraceMe> trace_me_;
+  absl::optional<ScopedAnnotation> scoped_annotation_;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index 25ddd2402a5..5e57fdef4b5 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -74,6 +74,22 @@ class DeviceNameUtils {
              (has_id ? (other.has_id && id == other.id) : !other.has_id);
     }
 
+    bool operator!=(const ParsedName& other) const {
+      return (has_job ? ((other.has_job && job != other.job) || !other.has_job)
+                      : other.has_job) ||
+             (has_replica ? ((other.has_replica && replica != other.replica) ||
+                             !other.has_replica)
+                          : other.has_replica) ||
+             (has_task
+                  ? ((other.has_task && task != other.task) || !other.has_task)
+                  : other.has_task) ||
+             (has_type
+                  ? ((other.has_type && type != other.type) || !other.has_type)
+                  : other.has_type) ||
+             (has_id ? ((other.has_id && id != other.id) || !other.has_id)
+                     : other.has_id);
+    }
+
     bool has_job = false;
     string job;
     bool has_replica = false;
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 5e3b223e695..2b6ae6ca153 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -19,14 +19,26 @@ cc_library(
     strip_include_prefix = "tensorrt/include",
 )
 
+config_setting(
+    name = "use_static_tensorrt",
+    define_values = {"TF_TENSORRT_STATIC": "1"},
+)
+
 cc_library(
     name = "tensorrt",
-    srcs = [":tensorrt_lib"],
+    srcs = select({
+        ":use_static_tensorrt": [":tensorrt_static_lib"],
+        "//conditions:default": [":tensorrt_lib"],
+    }),
     copts = cuda_default_copts(),
-    data = [":tensorrt_lib"],
+    data = select({
+        ":use_static_tensorrt": [],
+        "//conditions:default": [":tensorrt_lib"],
+    }),
     linkstatic = 1,
     deps = [
         ":tensorrt_headers",
+        # TODO(b/174608722): fix this line.
         "@local_config_cuda//cuda",
     ],
 )
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 9316ef864bb..5c59ac6a513 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -16,6 +16,7 @@ load(
 )
 
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_STATIC_PATH = "TF_TENSORRT_STATIC_PATH"
 _TF_TENSORRT_CONFIG_REPO = "TF_TENSORRT_CONFIG_REPO"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
 _TF_NEED_TENSORRT = "TF_NEED_TENSORRT"
@@ -82,6 +83,21 @@ def enable_tensorrt(repository_ctx):
     """Returns whether to build with TensorRT support."""
     return int(repository_ctx.os.environ.get(_TF_NEED_TENSORRT, False))
 
+def get_host_environ(repository_ctx, env):
+    if env in repository_ctx.os.environ:
+        version = repository_ctx.os.environ[env].strip()
+        return version
+    else:
+        return ""
+
+def _get_tensorrt_static_path(repository_ctx):
+    """Returns the path for TensorRT static libraries."""
+    return get_host_environ(repository_ctx, _TF_TENSORRT_STATIC_PATH)
+
+def _get_tensorrt_full_version(repository_ctx):
+    """Returns the full version for TensorRT."""
+    return get_host_environ(repository_ctx, _TF_TENSORRT_VERSION)
+
 def _tensorrt_configure_impl(repository_ctx):
     """Implementation of the tensorrt_configure repository rule."""
     if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
@@ -116,8 +132,11 @@ def _tensorrt_configure_impl(repository_ctx):
         _create_dummy_repository(repository_ctx)
         return
 
-    config = find_cuda_config(repository_ctx, ["tensorrt"])
+    config = find_cuda_config(repository_ctx, ["cuda", "tensorrt"])
+    cuda_version = config["cuda_version"]
+    cuda_library_path = config["cuda_library_dir"] + "/"
     trt_version = config["tensorrt_version"]
+    trt_full_version = _get_tensorrt_full_version(repository_ctx)
     cpu_value = get_cpu_value(repository_ctx)
 
     # Copy the library and header files.
@@ -140,6 +159,33 @@ def _tensorrt_configure_impl(repository_ctx):
         ),
     ]
 
+    tensorrt_static_path = _get_tensorrt_static_path(repository_ctx)
+    if tensorrt_static_path:
+        tensorrt_static_path = tensorrt_static_path + "/"
+        if _at_least_version(trt_full_version, "8.4.1"):
+            raw_static_library_names = _TF_TENSORRT_LIBS
+            nvrtc_ptxjit_static_raw_names = ["nvrtc", "nvrtc-builtins", "nvptxcompiler"]
+            nvrtc_ptxjit_static_names = ["%s_static" % name for name in nvrtc_ptxjit_static_raw_names]
+            nvrtc_ptxjit_static_libraries = [lib_name(lib, cpu_value, trt_version, static = True) for lib in nvrtc_ptxjit_static_names]
+        elif _at_least_version(trt_version, "8"):
+            raw_static_library_names = _TF_TENSORRT_LIBS
+            nvrtc_ptxjit_static_libraries = []
+        else:
+            raw_static_library_names = _TF_TENSORRT_LIBS + ["nvrtc", "myelin_compiler", "myelin_executor", "myelin_pattern_library", "myelin_pattern_runtime"]
+            nvrtc_ptxjit_static_libraries = []
+        static_library_names = ["%s_static" % name for name in raw_static_library_names]
+        static_libraries = [lib_name(lib, cpu_value, trt_version, static = True) for lib in static_library_names]
+        copy_rules = copy_rules + [
+            make_copy_files_rule(
+                repository_ctx,
+                name = "tensorrt_static_lib",
+                srcs = [tensorrt_static_path + library for library in static_libraries] +
+                       [cuda_library_path + library for library in nvrtc_ptxjit_static_libraries],
+                outs = ["tensorrt/lib/" + library for library in static_libraries] +
+                       ["tensorrt/lib/" + library for library in nvrtc_ptxjit_static_libraries],
+            ),
+        ]
+
     # Set up config file.
     _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_true"})
 
@@ -161,6 +207,7 @@ tensorrt_configure = repository_rule(
         _TF_TENSORRT_VERSION,
         _TF_TENSORRT_CONFIG_REPO,
         _TF_NEED_TENSORRT,
+        _TF_TENSORRT_STATIC_PATH,
         "TF_CUDA_PATHS",
     ],
 )
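
For reference, below is a minimal sketch (not part of the patch) of how the new helpers declared in trt_testutils.h are expected to compose inside a converter unit test. The test name, node name, and constant values are illustrative only; the sketch assumes a build with GOOGLE_CUDA && GOOGLE_TENSORRT, matching the guards in the headers above.

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "tensorflow/compiler/tf2tensorrt/utils/trt_testutils.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {
namespace {

TEST(TrtTestUtilsSketch, HelpersCompose) {
  // Build a 1-D float Const NodeDef from the increasing sequence {0, 1, 2, 3}.
  const std::vector<float> vals = CreateVectorIota<float>(4);
  NodeDef const_node = MakeConstNodeDef<float>("my_const", vals);
  EXPECT_EQ(const_node.op(), "Const");

  // CastVector converts element types (here float -> int), and CreateDims plus
  // DimsAreArray give a convenient way to build and check nvinfer1::Dims.
  const std::vector<int> as_int = CastVector<int, float>(vals);
  EXPECT_THAT(CreateDims(as_int), DimsAreArray(as_int));

  // ArrayFloatNear compares float vectors element-wise with a tolerance.
  EXPECT_THAT(vals, ArrayFloatNear({0.0f, 1.0f, 2.0f, 3.0f}));
}

}  // namespace
}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow

These helpers are primarily consumed by the converter tests elsewhere in this patch (for example convert_nodes_test.cc), which is why they live in the shared trt_testutils target rather than in an individual test file.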