Skip to content

Commit

Permalink
Fix out-of-bounds read in whitespace tokenizer
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 655738272
  • Loading branch information
tf-text-github-robot committed Jul 25, 2024
1 parent 767dfc8 commit 240d049
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
3 changes: 2 additions & 1 deletion tensorflow_text/core/kernels/whitespace_tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ class WhitespaceTokenizerConfig {
: config_(*config), max_codepoint_(config->length() * 8) {}

// Returns true when `codepoint` has its bit set in the whitespace bitmap.
//
// The bitmap holds one bit per codepoint: byte `codepoint >> 3`, bit
// `codepoint & 0x7` within that byte. Negative values are rejected before
// indexing; this covers U_SENTINEL (-1 in ICU) as well as any other negative
// value, which would otherwise turn into a negative index into `config_`.
//
// NOTE(review): `<=` admits codepoint == max_codepoint_, which indexes the
// element just past the last bitmap byte when max_codepoint_ is
// 8 * config length -- confirm whether `<` is intended.
inline bool IsWhitespace(const UChar32 codepoint) const {
  return codepoint >= 0 && codepoint <= max_codepoint_ &&
         (config_[codepoint >> 3] & (1 << (codepoint & 0x7))) != 0;
}

Expand Down
12 changes: 12 additions & 0 deletions tensorflow_text/core/kernels/whitespace_tokenizer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ TEST(WhitespaceTokenizerTest, Internationalization) {
EXPECT_THAT(output_end_offsets, ElementsAre(5, 10, 15));
}

// Regression test for input that is not valid UTF-8: a single 0xE3 byte is a
// lead byte with no continuation bytes. The tokenizer is expected to report
// exactly one token spanning byte offsets [0, 1).
TEST(WhitespaceTokenizerTest, InvalidCodepoint) {
  // Tokenizer under test, built from the default whitespace config.
  std::string config = BuildWhitespaceTokenizerConfig();
  WhitespaceTokenizer t(&config);

  // Output buffers filled by Tokenize().
  std::vector<std::string> tokens;
  std::vector<int> starts;
  std::vector<int> ends;

  absl::string_view truncated_utf8("\xE3");
  t.Tokenize(truncated_utf8, &tokens, &starts, &ends);

  EXPECT_THAT(starts, ElementsAre(0));
  EXPECT_THAT(ends, ElementsAre(1));
}

} // namespace
} // namespace text
} // namespace tensorflow

0 comments on commit 240d049

Please sign in to comment.