Improve language detection when using clip_timestamps (#867)
This commit is contained in:
@@ -370,16 +370,27 @@ class WhisperModel:
     or language_detection_segments < 1
 ):
     language_detection_segments = 1
-seek = 0
-detected_language_info = {}
+start_timestamp = (
+    float(clip_timestamps.split(",")[0])
+    if isinstance(clip_timestamps, str)
+    else clip_timestamps[0]
+)
 content_frames = (
     features.shape[-1] - self.feature_extractor.nb_max_frames
 )
-while (
-    seek <= content_frames
-    and seek
-    < self.feature_extractor.nb_max_frames * language_detection_segments
-):
+seek = (
+    int(start_timestamp * self.frames_per_second)
+    if start_timestamp * self.frames_per_second < content_frames
+    else 0
+)
+end_frames = min(
+    seek
+    + self.feature_extractor.nb_max_frames
+    * language_detection_segments,
+    content_frames,
+)
+detected_language_info = {}
+while seek < end_frames:
     segment = features[
         :, seek : seek + self.feature_extractor.nb_max_frames
     ]
Reference in New Issue
Block a user