From df7a3d836291f14e05363e99919a4deffd3baa51 Mon Sep 17 00:00:00 2001 From: aravindMahadevan <15685389+aravindMahadevan@users.noreply.github.com> Date: Sat, 7 Dec 2024 14:46:14 -0500 Subject: [PATCH] Fix whisper timestamp extraction for tokenizers with added tokens (#804) * support user defined tokens by bounding timestamp token if statement * Update src/tokenizers.js Co-authored-by: Joshua Lochner * calculate timestamp_end instead of hardcoding * Update tokenizers.js * Merge conflict resolution --------- Co-authored-by: Joshua Lochner --- src/tokenizers.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 8cc07f8f4..36dbe1981 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3583,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer { let chunk = new_chunk(); let time_offset = 0.0; const timestamp_begin = this.timestamp_begin; + // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments. + // We can calculate the last timestamp token as timestamp_begin plus the number of 0.02 steps + // from 0.00 to 30.00, which is 1500. + const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02 + const timestamp_end = timestamp_begin + total_timestamp_tokens; let previous_tokens = []; let previous_token_timestamps = []; @@ -3670,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer { } else { // 2/ This is a regular special token, ignoring it } - } else if (token >= timestamp_begin) { + } else if (token >= timestamp_begin && token <= timestamp_end) { // 3/ Timestamp token const time = (token - timestamp_begin) * time_precision + time_offset; const rounded_time = round(time, 2);