Added audio duration after VAD to TranscriptionInfo object (#445)

* Added VAD removed audio duration to TranscriptionInfo object

Along with the duration of the original audio, this commit adds the number of seconds of audio removed by the VAD to the returned info object.

* Changing naming for duration_after_vad

Instead of the property returning the audio duration removed, it now returns the final duration after the VAD.
If vad_filter is False or if it doesn't remove any audio, the original duration is returned.
This commit is contained in:
MinorJinx
2023-08-31 10:19:48 -05:00
committed by GitHub
parent 7b271da035
commit e87fbf8a49

View File

@@ -69,6 +69,7 @@ class TranscriptionInfo(NamedTuple):
language: str
language_probability: float
duration: float
duration_after_vad: float
all_language_probs: Optional[List[Tuple[str, float]]]
transcription_options: TranscriptionOptions
vad_options: VadOptions
@@ -249,6 +250,7 @@ class WhisperModel:
audio = decode_audio(audio, sampling_rate=sampling_rate)
duration = audio.shape[0] / sampling_rate
duration_after_vad = duration
self.logger.info(
"Processing audio with duration %s", format_timestamp(duration)
@@ -261,10 +263,11 @@ class WhisperModel:
vad_parameters = VadOptions(**vad_parameters)
speech_chunks = get_speech_timestamps(audio, vad_parameters)
audio = collect_chunks(audio, speech_chunks)
duration_after_vad = audio.shape[0] / sampling_rate
self.logger.info(
"VAD filter removed %s of audio",
format_timestamp(duration - (audio.shape[0] / sampling_rate)),
format_timestamp(duration - duration_after_vad),
)
if self.logger.isEnabledFor(logging.DEBUG):
@@ -352,6 +355,7 @@ class WhisperModel:
language=language,
language_probability=language_probability,
duration=duration,
duration_after_vad=duration_after_vad,
transcription_options=options,
vad_options=vad_parameters,
all_language_probs=all_language_probs,