From 9024a4c976610c85cf0cbf1b0a38ae75c6aa6266 Mon Sep 17 00:00:00 2001
From: HenryL27
Date: Thu, 7 Dec 2023 14:36:45 -0800
Subject: [PATCH] use truncate flag in huggingface tokenizer

Signed-off-by: HenryL27
---
 .../engine/algorithms/SentenceTransformerTranslator.java | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/SentenceTransformerTranslator.java b/ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/SentenceTransformerTranslator.java
index 30b0bacc11..d62a2cd992 100644
--- a/ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/SentenceTransformerTranslator.java
+++ b/ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/SentenceTransformerTranslator.java
@@ -25,7 +25,12 @@ public Batchifier getBatchifier() {
     @Override
     public void prepare(TranslatorContext ctx) throws IOException {
         Path path = ctx.getModel().getModelPath();
-        tokenizer = HuggingFaceTokenizer.builder().optPadding(true).optTokenizerPath(path.resolve("tokenizer.json")).build();
+        tokenizer = HuggingFaceTokenizer
+            .builder()
+            .optPadding(true)
+            .optTruncation(true)
+            .optTokenizerPath(path.resolve("tokenizer.json"))
+            .build();
     }
 
     @Override