Improve language detection when using clip_timestamps (#867)

This commit is contained in:
ABen
2024-07-01 17:12:45 +08:00
committed by GitHub
parent 8d400e9870
commit 8862bee1f8

View File

@@ -370,16 +370,27 @@ class WhisperModel:
or language_detection_segments < 1 or language_detection_segments < 1
): ):
language_detection_segments = 1 language_detection_segments = 1
seek = 0 start_timestamp = (
detected_language_info = {} float(clip_timestamps.split(",")[0])
if isinstance(clip_timestamps, str)
else clip_timestamps[0]
)
content_frames = ( content_frames = (
features.shape[-1] - self.feature_extractor.nb_max_frames features.shape[-1] - self.feature_extractor.nb_max_frames
) )
while ( seek = (
seek <= content_frames int(start_timestamp * self.frames_per_second)
and seek if start_timestamp * self.frames_per_second < content_frames
< self.feature_extractor.nb_max_frames * language_detection_segments else 0
): )
end_frames = min(
seek
+ self.feature_extractor.nb_max_frames
* language_detection_segments,
content_frames,
)
detected_language_info = {}
while seek < end_frames:
segment = features[ segment = features[
:, seek : seek + self.feature_extractor.nb_max_frames :, seek : seek + self.feature_extractor.nb_max_frames
] ]