Fix error in decode_audio for long audio inputs

2023-03-07 10:15:36 +01:00
parent 01ef12a6a0
commit 2646906596
1 changed files with 42 additions and 11 deletions
--- a/faster_whisper/audio.py
+++ b/faster_whisper/audio.py
@@ -1,4 +1,14 @@
 """We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
 The advantage of PyAV is that it bundles the FFmpeg libraries so there are no additional
 system dependencies. FFmpeg does not need to be installed on the system.
 However, the API is quite low-level so we need to manipulate audio frames directly.
 """
 import av
 import io
 import itertools
 import numpy as np
@@ -12,25 +22,46 @@ def decode_audio(input_file, sampling_rate=16000):
    Returns:
      A float32 Numpy array.
    """
    fifo = av.audio.fifo.AudioFifo()
    resampler = av.audio.resampler.AudioResampler(
        format="s16",
        layout="mono",
        rate=sampling_rate,
    )
    raw_buffer = io.BytesIO()
    dtype = None
    with av.open(input_file) as container:
-        # Decode and resample each audio frame.
+        frames = container.decode(audio=0)
-        for frame in container.decode(audio=0):
+        frames = _group_frames(frames, 500000)
-            frame.pts = None
+        frames = _resample_frames(frames, resampler)
            for new_frame in resampler.resample(frame):
                fifo.write(new_frame)
-        # Flush the resampler.
+        for frame in frames:
-        for new_frame in resampler.resample(None):
+            array = frame.to_ndarray()
-            fifo.write(new_frame)
+            dtype = array.dtype
            raw_buffer.write(array)
-    frame = fifo.read()
+    audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)
    # Convert s16 back to f32.
-    return frame.to_ndarray().flatten().astype(np.float32) / 32768.0
+    return audio.astype(np.float32) / 32768.0
 def _group_frames(frames, num_samples=None):
    """Buffer incoming frames in an AudioFifo and yield them back in groups.

    Args:
      frames: Iterable of audio frames. Each frame has its ``pts`` reset to
        None before it is written to the FIFO.
      num_samples: Buffered sample count at which the FIFO is read out. When
        None, nothing is yielded until the input is exhausted.

    Yields:
      The result of ``fifo.read()`` each time the threshold is reached, plus
      one final read if samples are still buffered once the input ends.
    """
    buffer = av.audio.fifo.AudioFifo()
    for chunk in frames:
        # Clear the timestamp so the timestamp check is ignored on write.
        chunk.pts = None
        buffer.write(chunk)
        if num_samples is None:
            continue
        if buffer.samples >= num_samples:
            yield buffer.read()
    # Emit whatever is still buffered after the last input frame.
    if buffer.samples > 0:
        yield buffer.read()
 def _resample_frames(frames, resampler):
    """Run every frame through the resampler and yield the resampled output.

    Args:
      frames: Iterable of frames, each passed to ``resampler.resample``.
      resampler: Object exposing ``resample(frame)`` that returns an iterable
        of output frames.

    Yields:
      Each item produced by ``resampler.resample`` for every input frame,
      followed by the items produced by a final ``resample(None)`` call.
    """
    for item in frames:
        yield from resampler.resample(item)
    # A trailing None flushes the resampler.
    yield from resampler.resample(None)