Support separating the left and right audio channels (#97)
@@ -15,19 +15,27 @@ import av
 import numpy as np
 
 
-def decode_audio(input_file: Union[str, BinaryIO], sampling_rate: int = 16000):
+def decode_audio(
+    input_file: Union[str, BinaryIO],
+    sampling_rate: int = 16000,
+    split_stereo: bool = False,
+):
     """Decodes the audio.
 
     Args:
       input_file: Path to the input file or a file-like object.
       sampling_rate: Resample the audio to this sample rate.
+      split_stereo: Return separate left and right channels.
 
     Returns:
       A float32 Numpy array.
+
+      If `split_stereo` is enabled, the function returns a 2-tuple with the
+      separated left and right channels.
     """
     resampler = av.audio.resampler.AudioResampler(
         format="s16",
-        layout="mono",
+        layout="mono" if not split_stereo else "stereo",
         rate=sampling_rate,
     )
 
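Taken together, the new flag changes decode_audio's return value from a single array to a (left, right) tuple. A minimal usage sketch of the new API (the file path and model size below are placeholders, not part of this commit):

    from faster_whisper import WhisperModel, decode_audio

    # Decode a stereo file into two mono float32 arrays, one per channel.
    # "call_recording.wav" is a placeholder path for illustration.
    left, right = decode_audio("call_recording.wav", split_stereo=True)

    # Each channel can then be transcribed on its own, for example to
    # attribute speech to the speaker recorded on that channel.
    model = WhisperModel("tiny")
    segments, _ = model.transcribe(left)
    print("".join(segment.text for segment in segments))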
@@ -48,7 +56,14 @@ def decode_audio(input_file: Union[str, BinaryIO], sampling_rate: int = 16000):
     audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)
 
     # Convert s16 back to f32.
-    return audio.astype(np.float32) / 32768.0
+    audio = audio.astype(np.float32) / 32768.0
+
+    if split_stereo:
+        left_channel = audio[0::2]
+        right_channel = audio[1::2]
+        return left_channel, right_channel
+
+    return audio
 
 
 def _ignore_invalid_frames(frames):
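The [0::2] / [1::2] slicing works because the stereo s16 resampler output is interleaved (L, R, L, R, ...), so even indices hold the left channel and odd indices the right. A self-contained sketch of this deinterleave-and-scale step on synthetic int16 data:

    import numpy as np

    # Synthetic interleaved stereo buffer, int16 as produced by the s16 resampler.
    interleaved = np.array([100, -100, 200, -200, 300, -300], dtype=np.int16)

    # Scale the int16 range [-32768, 32767] into float32, roughly [-1, 1).
    audio = interleaved.astype(np.float32) / 32768.0

    # Every second sample, starting at index 0 (left) and index 1 (right).
    left, right = audio[0::2], audio[1::2]

    assert np.allclose(left * 32768.0, [100, 200, 300])
    assert np.allclose(right * 32768.0, [-100, -200, -300])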
BIN  tests/data/stereo_diarization.wav  (new file)
Binary file not shown.
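The committed fixture is binary, so its contents are not shown. A comparable 16 kHz, 16-bit stereo WAV with distinct content in each channel could be generated with the standard-library wave module, along these lines (sine tones stand in for the two spoken passages; the output name is arbitrary):

    import wave

    import numpy as np

    rate = 16000
    t = np.arange(rate * 2) / rate  # two seconds of samples

    # Two distinguishable tones as stand-ins for the two speakers.
    left = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    right = (0.5 * np.sin(2 * np.pi * 220 * t) * 32767).astype(np.int16)

    # Interleave the channels (L, R, L, R, ...) and write a 16-bit stereo WAV.
    frames = np.empty(left.size * 2, dtype=np.int16)
    frames[0::2], frames[1::2] = left, right

    with wave.open("stereo_fixture.wav", "wb") as f:
        f.setnchannels(2)
        f.setsampwidth(2)  # 16-bit samples
        f.setframerate(rate)
        f.writeframes(frames.tobytes())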
@@ -1,4 +1,6 @@
-from faster_whisper import WhisperModel
+import os
+
+from faster_whisper import WhisperModel, decode_audio
 
 
 def test_transcribe(jfk_path):
@@ -23,3 +25,21 @@ def test_transcribe(jfk_path):
         assert segment.text == "".join(word.word for word in segment.words)
         assert segment.start == segment.words[0].start
         assert segment.end == segment.words[-1].end
+
+
+def test_stereo_diarization(data_dir):
+    model = WhisperModel("tiny")
+
+    audio_path = os.path.join(data_dir, "stereo_diarization.wav")
+    left, right = decode_audio(audio_path, split_stereo=True)
+
+    segments, _ = model.transcribe(left)
+    transcription = "".join(segment.text for segment in segments).strip()
+    assert transcription == (
+        "He began a confused complaint against the wizard, "
+        "who had vanished behind the curtain on the left."
+    )
+
+    segments, _ = model.transcribe(right)
+    transcription = "".join(segment.text for segment in segments).strip()
+    assert transcription == "The horizon seems extremely distant."
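The new test relies on a data_dir pytest fixture (and the existing test on jfk_path) defined outside this diff. A plausible conftest.py sketch, assuming both fixtures simply resolve paths under tests/data (the jfk.flac filename is an assumption, not shown in this commit):

    import os

    import pytest


    @pytest.fixture
    def data_dir():
        # Directory holding the audio fixtures, next to the test modules.
        return os.path.join(os.path.dirname(__file__), "data")


    @pytest.fixture
    def jfk_path(data_dir):
        # Hypothetical path for the existing test's fixture.
        return os.path.join(data_dir, "jfk.flac")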