# faster-whisper/faster_whisper/audio.py

import av
import numpy as np


def decode_audio(input_file, sampling_rate=16000):
    """Decodes the audio.

    The first audio stream of the input is decoded, converted to mono signed
    16-bit samples at the requested sample rate, and returned as a single
    one-dimensional array scaled by 1/32768.

    Args:
      input_file: Path to the input file or a file-like object.
      sampling_rate: Resample the audio to this sample rate.

    Returns:
      A float32 Numpy array. The array is empty when the input yields no
      audio samples.
    """
    fifo = av.audio.fifo.AudioFifo()
    resampler = av.audio.resampler.AudioResampler(
        format="s16",
        layout="mono",
        rate=sampling_rate,
    )

    with av.open(input_file) as container:
        # Decode and resample each audio frame.
        for frame in container.decode(audio=0):
            # Drop the source timestamp before resampling.
            # NOTE(review): presumably this avoids PyAV's pts-continuity
            # check on inputs with gaps or irregular timestamps - confirm.
            frame.pts = None
            for new_frame in resampler.resample(frame):
                fifo.write(new_frame)

        # Flush the resampler so samples it still buffers are not lost.
        for new_frame in resampler.resample(None):
            fifo.write(new_frame)

    # Read everything accumulated in the FIFO as one frame.
    frame = fifo.read()

    # Nothing was written to the FIFO (no decodable samples): read() gives
    # back None, so return an empty array instead of failing on None.
    if frame is None:
        return np.zeros(0, dtype=np.float32)

    # Convert s16 back to f32.
    return frame.to_ndarray().flatten().astype(np.float32) / 32768.0