Skip to content

Commit

Permalink
Fix whisper timestamp extraction for tokenizers with added tokens (#804)
Browse files Browse the repository at this point in the history
* support user defined tokens by bounding timestamp token if statement

* Update src/tokenizers.js

Co-authored-by: Joshua Lochner <[email protected]>

* calculate timestamp_end instead of hardcoding

* Update tokenizers.js

* Merge conflict resolution

---------

Co-authored-by: Joshua Lochner <[email protected]>
  • Loading branch information
aravindMahadevan and xenova authored Dec 7, 2024
1 parent 7dffb9a commit df7a3d8
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -3583,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
let chunk = new_chunk();
let time_offset = 0.0;
const timestamp_begin = this.timestamp_begin;
// Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
// The last timestamp token id is therefore timestamp_begin plus the number of
// 0.02 steps between 0.00 and 30.00, which is 1500.
const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
const timestamp_end = timestamp_begin + total_timestamp_tokens;

let previous_tokens = [];
let previous_token_timestamps = [];
Expand Down Expand Up @@ -3670,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
} else {
// 2/ This is a regular special token, ignoring it
}
} else if (token >= timestamp_begin) {
} else if (token >= timestamp_begin && token <= timestamp_end) {
// 3/ Timestamp token
const time = (token - timestamp_begin) * time_precision + time_offset;
const rounded_time = round(time, 2);
Expand Down

0 comments on commit df7a3d8

Please sign in to comment.