From 8862bee1f8ee4974bc66bf61620a51383f7508d8 Mon Sep 17 00:00:00 2001 From: ABen <44395133+ben91lin@users.noreply.github.com> Date: Mon, 1 Jul 2024 17:12:45 +0800 Subject: [PATCH] Improve language detection when using clip_timestamps (#867) --- faster_whisper/transcribe.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index ce7fa99..9d603bd 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -370,16 +370,27 @@ class WhisperModel: or language_detection_segments < 1 ): language_detection_segments = 1 - seek = 0 - detected_language_info = {} + start_timestamp = ( + float(clip_timestamps.split(",")[0]) + if isinstance(clip_timestamps, str) + else clip_timestamps[0] + ) content_frames = ( features.shape[-1] - self.feature_extractor.nb_max_frames ) - while ( - seek <= content_frames - and seek - < self.feature_extractor.nb_max_frames * language_detection_segments - ): + seek = ( + int(start_timestamp * self.frames_per_second) + if start_timestamp * self.frames_per_second < content_frames + else 0 + ) + end_frames = min( + seek + + self.feature_extractor.nb_max_frames + * language_detection_segments, + content_frames, + ) + detected_language_info = {} + while seek < end_frames: segment = features[ :, seek : seek + self.feature_extractor.nb_max_frames ]