Skip to content

Commit

Permalink
Fix pyannote processor post_process_speaker_diarization (#1082)
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova authored Dec 8, 2024
1 parent 9914e7a commit 14bf689
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 54 deletions.
56 changes: 56 additions & 0 deletions src/models/pyannote/feature_extraction_pyannote.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
import { Tensor } from '../../utils/tensor.js';
import { max, softmax } from '../../utils/maths.js';


export class PyAnnoteFeatureExtractor extends FeatureExtractor {
Expand All @@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
};
}

/**
* NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
* @param {number} samples The number of frames in the audio.
* @returns {number} The number of frames in the audio.
*/
samples_to_frames(samples) {
return ((samples - this.config.offset) / this.config.step);
}

/**
* Post-processes the speaker diarization logits output by the model.
* @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
* @param {number} num_samples Number of samples in the input audio.
* @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
*/
post_process_speaker_diarization(logits, num_samples) {
const ratio = (
num_samples / this.samples_to_frames(num_samples)
) / this.config.sampling_rate;

const results = [];
for (const scores of logits.tolist()) {
const accumulated_segments = [];

let current_speaker = -1;
for (let i = 0; i < scores.length; ++i) {
const probabilities = softmax(scores[i]);
const [score, id] = max(probabilities);
const [start, end] = [i, i + 1];

if (id !== current_speaker) {
// Speaker has changed
current_speaker = id;
accumulated_segments.push({ id, start, end, score });
} else {
// Continue the current segment
accumulated_segments.at(-1).end = end;
accumulated_segments.at(-1).score += score;
}
}

results.push(accumulated_segments.map(
// Convert frame-space to time-space
// and compute the confidence
({ id, start, end, score }) => ({
id,
start: start * ratio,
end: end * ratio,
confidence: score / (end - start),
})
));
}
return results;
}

}
61 changes: 7 additions & 54 deletions src/models/pyannote/processing_pyannote.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import { Processor } from '../../base/processing_utils.js';
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
import { max, softmax } from '../../utils/maths.js';
import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';

export class PyAnnoteProcessor extends Processor {
static feature_extractor_class = AutoFeatureExtractor
static feature_extractor_class = PyAnnoteFeatureExtractor

/**
* Calls the feature_extractor function with the given audio input.
Expand All @@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
return await this.feature_extractor(audio)
}

/**
* NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
* @param {number} samples The number of frames in the audio.
* @returns {number} The number of frames in the audio.
*/
samples_to_frames(samples) {
return ((samples - this.config.offset) / this.config.step);
/** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
post_process_speaker_diarization(...args) {
return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
}

/**
* Post-processes the speaker diarization logits output by the model.
* @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
* @param {number} num_samples Number of samples in the input audio.
* @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
*/
post_process_speaker_diarization(logits, num_samples) {
const ratio = (
num_samples / this.samples_to_frames(num_samples)
) / this.config.sampling_rate;

const results = [];
for (const scores of logits.tolist()) {
const accumulated_segments = [];

let current_speaker = -1;
for (let i = 0; i < scores.length; ++i) {
const probabilities = softmax(scores[i]);
const [score, id] = max(probabilities);
const [start, end] = [i, i + 1];

if (id !== current_speaker) {
// Speaker has changed
current_speaker = id;
accumulated_segments.push({ id, start, end, score });
} else {
// Continue the current segment
accumulated_segments.at(-1).end = end;
accumulated_segments.at(-1).score += score;
}
}

results.push(accumulated_segments.map(
// Convert frame-space to time-space
// and compute the confidence
({ id, start, end, score }) => ({
id,
start: start * ratio,
end: end * ratio,
confidence: score / (end - start),
})
));
}
return results;
get sampling_rate() {
return this.feature_extractor.config.sampling_rate;
}
}

0 comments on commit 14bf689

Please sign in to comment.