Handle the punctuation definition mismatch between different Unicode versions.

PiperOrigin-RevId: 707345112
tf-text-github-robot committed Dec 18, 2024
1 parent 31f22e9 commit 70bd094
Showing 5 changed files with 85 additions and 5 deletions.
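
For context: ICU's u_ispunct() classifies characters according to the Unicode data version ICU was built with, so the same code point can be punctuation in one runtime environment and not in another. A minimal standalone probe of this (illustrative only, not part of the commit; u_ispunct and U_UNICODE_VERSION are standard ICU4C):

#include <cstdio>
#include "unicode/uchar.h"  // ICU4C: u_ispunct(), U_UNICODE_VERSION

int main() {
  // U+1B7F is the code point the new test below keys on; it is classified
  // as punctuation starting with Unicode 16.0.
  const UChar32 cp = 0x1B7F;
  std::printf("Unicode data version: %s\n", U_UNICODE_VERSION);
  std::printf("u_ispunct(U+1B7F) = %d\n", static_cast<int>(u_ispunct(cp)));
  return 0;
}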
3 changes: 3 additions & 0 deletions tensorflow_text/core/kernels/BUILD
@@ -403,12 +403,15 @@ cc_test(
     srcs = ["fast_wordpiece_tokenizer_test.cc"],
     data = [
         "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model.fb",
+        "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb",
+        "//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb",
     ],
     deps = [
         ":fast_wordpiece_tokenizer",
         ":fast_wordpiece_tokenizer_model_builder",
         "@com_google_googletest//:gtest_main",
         "@com_google_absl//absl/flags:flag",
+        "//third_party/icu:headers",
         # tf:lib tensorflow dep,
     ],
 )
20 changes: 15 additions & 5 deletions tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc
@@ -278,14 +278,24 @@ void FastWordpieceTokenizer::TokenizeTextImpl(
                             prev_unicode_char))) {
       // If the current Unicode character is a valid word boundary, collect the
       // remaining tokens stored on a path on the trie.
+      absl::string_view cur_str = absl::string_view(
+          input_substr.data(), cur_pos - input_word_offset_in_text);
       HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
-          absl::string_view(input_substr.data(),
-                            cur_pos - input_word_offset_in_text),
-          input_word_offset_in_text, cur_node, original_num_tokens,
+          cur_str, input_word_offset_in_text, cur_node, original_num_tokens,
           cur_offset_in_input_word, output_pieces, output_ids,
           output_start_offsets, output_end_offsets);
-      // Skip the whitespace.
-      if (is_white_space) cur_pos = next_pos;
+      if (is_white_space) {
+        // Skip the whitespace.
+        cur_pos = next_pos;
+      } else if (cur_str.empty()) {
+        // If the remaining tokens are empty, it means we encountered an
+        // unmappable separator, so output an unknown token and continue.
+        cur_pos = next_pos;
+        ResetOutputAppendUnknownToken<kGetPieces, kGetIds, kGetOffsets>(
+            input_word_offset_in_text, (cur_pos - input_word_offset_in_text),
+            original_num_tokens, output_pieces, output_ids,
+            output_start_offsets, output_end_offsets);
+      }
       // Continue in the outer while loop to process the remaining input.
       continue;
     }
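
Distilled, the effect of the new branch (a simplified standalone sketch with assumed stand-in values, not the production code path): a word-boundary character that is neither whitespace nor mappable to vocab pieces on the trie now yields an unknown token rather than no token.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Stand-ins for the three boundary cases the branch distinguishes.
  struct Boundary {
    std::string text;
    bool is_white_space;
    bool mappable;  // does the trie know pieces for this separator?
  };
  const std::vector<Boundary> cases = {
      {">", false, true},              // known punctuation: emit its piece
      {" ", true, false},              // whitespace: skipped silently
      {"\xE1\xAD\xBF", false, false},  // U+1B7F, unknown to the trie
  };
  for (const auto& b : cases) {
    if (b.is_white_space) continue;  // "Skip the whitespace."
    // The new else-if emits the unknown token for unmappable separators.
    std::cout << (b.mappable ? b.text : "<unk>") << "\n";
  }
  return 0;
}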
67 changes: 67 additions & 0 deletions tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc
@@ -17,13 +17,15 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/flags/flag.h"
+#include "icu4c/source/common/unicode/uchar.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h"
 
 namespace tensorflow {
 namespace text {
 namespace {
 
+using ::testing::AnyOf;
 using ::testing::ElementsAre;
 
 constexpr char kTestConfigPath[] =
@@ -58,6 +60,71 @@ TEST(FastWordpieceTokenizerTest, LoadAndTokenize) {
   EXPECT_THAT(output_end_offsets, ElementsAre(3, 5, 6, 9));
 }
 
+using TestPunctuationVersionMismatch = testing::TestWithParam<std::string>;
+
+TEST_P(TestPunctuationVersionMismatch, Test) {
+  // The config_flatbuffer used here is built from the following config:
+  //  * vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f",
+  //             "##ghz", "<unk>"}
+  //  * unk_token = "<unk>"
+  //  * suffix_indicator = "##"
+  //  * max_bytes_per_token = 100
+  //  * end_to_end = True
+
+  const std::string kTestConfigUnicodePath = GetParam();
+
+  // We test the new punctuation symbol "\341\255\277" (U+1B7F), which is
+  // classified as punctuation in Unicode 16.0
+  // (https://www.fileformat.info/info/unicode/char/1b7f/index.htm) but not
+  // in 15.1. We also test an existing punctuation symbol ">".
+  std::string input = "abc>abc\341\255\277abc";
+
+  std::string config_flatbuffer;
+  auto status = tensorflow::ReadFileToString(
+      tensorflow::Env::Default(), kTestConfigUnicodePath, &config_flatbuffer);
+  ASSERT_TRUE(status.ok());
+
+  ASSERT_OK_AND_ASSIGN(
+      auto tokenizer, FastWordpieceTokenizer::Create(config_flatbuffer.data()));
+
+  std::vector<std::string> output_tokens;
+  std::vector<int> output_ids;
+  std::vector<int> output_start_offsets;
+  std::vector<int> output_end_offsets;
+  tokenizer.Tokenize(input, &output_tokens, &output_ids, &output_start_offsets,
+                     &output_end_offsets);
+
+  // If the runtime environment has Unicode <= 15.1, "\341\255\277" is not
+  // punctuation, so "abc\341\255\277abc" stays a single word and maps to one
+  // token. If the runtime environment has Unicode >= 16.0, "\341\255\277" is
+  // punctuation, so that span yields the tokens "abc", "<unk>", "abc".
+  EXPECT_THAT(output_tokens.size(), AnyOf(3, 5));
+  if (!u_ispunct(0x1b7f)) {
+    // The runtime environment has Unicode <= 15.1.
+    EXPECT_THAT(output_tokens, ElementsAre("abc", "<unk>", "<unk>"));
+    EXPECT_THAT(output_ids, ElementsAre(1, 8, 8));
+    EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4));
+    EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 13));
+  } else {
+    // The runtime environment has Unicode >= 16.0.
+    EXPECT_THAT(output_tokens,
+                ElementsAre("abc", "<unk>", "abc", "<unk>", "abc"));
+    EXPECT_THAT(output_ids, ElementsAre(1, 8, 1, 8, 1));
+    EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4, 7, 10));
+    EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 7, 10, 13));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(FastWordpieceTokenizerPunctuationTest,
+                         TestPunctuationVersionMismatch,
+                         testing::Values(
+                             // Unicode v15.1 config.
+                             "third_party/tensorflow_text/python/ops/test_data/"
+                             "fast_wordpiece_tokenizer_model_ver_15_1.fb",
+                             // Unicode v16.0 config.
+                             "third_party/tensorflow_text/python/ops/test_data/"
+                             "fast_wordpiece_tokenizer_model_ver_16_0.fb"));
+
 template <typename T>
 std::string ListToString(const std::vector<T>& list) {
   return absl::StrCat("[", absl::StrJoin(list, ", "), "]");
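
The test infers the runtime Unicode version indirectly via u_ispunct(0x1b7f). A more direct probe, if ever needed, is u_getUnicodeVersion() (standard ICU4C; this snippet is illustrative and not part of the commit):

#include <cstdio>
#include "unicode/uchar.h"     // u_getUnicodeVersion()
#include "unicode/uversion.h"  // UVersionInfo (uint8_t[4])

int main() {
  UVersionInfo v;
  u_getUnicodeVersion(v);  // e.g. {16, 0, 0, 0} when ICU ships Unicode 16.0
  std::printf("Runtime Unicode data version: %d.%d.%d\n", v[0], v[1], v[2]);
  return 0;
}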
Binary file tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb not shown
Binary file tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb not shown
