From 2646906596d01b330f3072e81c9a56979c75fc6c Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 7 Mar 2023 10:15:36 +0100 Subject: [PATCH] Fix error in decode_audio for long audio inputs --- faster_whisper/audio.py | 53 ++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py index eeb9f7f..0b7dfae 100644 --- a/faster_whisper/audio.py +++ b/faster_whisper/audio.py @@ -1,4 +1,14 @@ +"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV + +The advantage of PyAV is that it bundles the FFmpeg libraries so there is no additional +system dependencies. FFmpeg does not need to be installed on the system. + +However, the API is quite low-level so we need to manipulate audio frames directly. +""" + import av +import io +import itertools import numpy as np @@ -12,25 +22,46 @@ def decode_audio(input_file, sampling_rate=16000): Returns: A float32 Numpy array. """ - fifo = av.audio.fifo.AudioFifo() resampler = av.audio.resampler.AudioResampler( format="s16", layout="mono", rate=sampling_rate, ) + raw_buffer = io.BytesIO() + dtype = None + with av.open(input_file) as container: - # Decode and resample each audio frame. - for frame in container.decode(audio=0): - frame.pts = None - for new_frame in resampler.resample(frame): - fifo.write(new_frame) + frames = container.decode(audio=0) + frames = _group_frames(frames, 500000) + frames = _resample_frames(frames, resampler) - # Flush the resampler. - for new_frame in resampler.resample(None): - fifo.write(new_frame) + for frame in frames: + array = frame.to_ndarray() + dtype = array.dtype + raw_buffer.write(array) - frame = fifo.read() + audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype) # Convert s16 back to f32. - return frame.to_ndarray().flatten().astype(np.float32) / 32768.0 + return audio.astype(np.float32) / 32768.0 + + +def _group_frames(frames, num_samples=None): + fifo = av.audio.fifo.AudioFifo() + + for frame in frames: + frame.pts = None # Ignore timestamp check. + fifo.write(frame) + + if num_samples is not None and fifo.samples >= num_samples: + yield fifo.read() + + if fifo.samples > 0: + yield fifo.read() + + +def _resample_frames(frames, resampler): + # Add None to flush the resampler. + for frame in itertools.chain(frames, [None]): + yield from resampler.resample(frame)