Upgrade to Silero-Vad V5 (#884)

* Fix window_size_samples to 512 * Update SileroVADModel * Replace ONNX file with V5 version
2024-07-01 17:40:37 +09:00
parent bced5f04c0
commit 8d400e9870
2 changed files with 13 additions and 26 deletions
--- a/faster_whisper/assets/silero_vad.onnx
+++ b/faster_whisper/assets/silero_vad.onnx
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -1,7 +1,6 @@
 import bisect
 import functools
 import os
-import warnings

 from typing import List, NamedTuple, Optional

@@ -25,9 +24,6 @@ class VadOptions(NamedTuple):
        split aggressively just before max_speech_duration_s.
      min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
        before separating it
-      window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model.
-        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
-        Values other than these may affect model performance!!
      speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
    """

@@ -35,7 +31,6 @@ class VadOptions(NamedTuple):
    min_speech_duration_ms: int = 250
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
-    window_size_samples: int = 1024
    speech_pad_ms: int = 400


@@ -61,15 +56,8 @@ def get_speech_timestamps(
    min_speech_duration_ms = vad_options.min_speech_duration_ms
    max_speech_duration_s = vad_options.max_speech_duration_s
    min_silence_duration_ms = vad_options.min_silence_duration_ms
-    window_size_samples = vad_options.window_size_samples
+    window_size_samples = 512
    speech_pad_ms = vad_options.speech_pad_ms
-
-    if window_size_samples not in [512, 1024, 1536]:
-        warnings.warn(
-            "Unusual window_size_samples! Supported window_size_samples:\n"
-            " - [512, 1024, 1536] for 16000 sampling_rate"
-        )
-
    sampling_rate = 16000
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -84,14 +72,14 @@ def get_speech_timestamps(
    audio_length_samples = len(audio)

    model = get_vad_model()
-    state = model.get_initial_state(batch_size=1)
+    state, context = model.get_initial_states(batch_size=1)

    speech_probs = []
    for current_start_sample in range(0, audio_length_samples, window_size_samples):
        chunk = audio[current_start_sample : current_start_sample + window_size_samples]
        if len(chunk) < window_size_samples:
            chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-        speech_prob, state = model(chunk, state, sampling_rate)
+        speech_prob, state, context = model(chunk, state, context, sampling_rate)
        speech_probs.append(speech_prob)

    triggered = False
@@ -261,12 +249,12 @@ class SileroVADModel:
            sess_options=opts,
        )

-    def get_initial_state(self, batch_size: int):
-        h = np.zeros((2, batch_size, 64), dtype=np.float32)
-        c = np.zeros((2, batch_size, 64), dtype=np.float32)
-        return h, c
+    def get_initial_states(self, batch_size: int):
+        state = np.zeros((2, batch_size, 128), dtype=np.float32)
+        context = np.zeros((batch_size, 64), dtype=np.float32)
+        return state, context

-    def __call__(self, x, state, sr: int):
+    def __call__(self, x, state, context, sr: int):
        if len(x.shape) == 1:
            x = np.expand_dims(x, 0)
        if len(x.shape) > 2:
@@ -276,16 +264,15 @@ class SileroVADModel:
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

-        h, c = state
+        x = np.concatenate([context, x], axis=1)

        ort_inputs = {
            "input": x,
-            "h": h,
-            "c": c,
+            "state": state,
            "sr": np.array(sr, dtype="int64"),
        }

-        out, h, c = self.session.run(None, ort_inputs)
-        state = (h, c)
+        out, state = self.session.run(None, ort_inputs)
+        context = x[..., -64:]

-        return out, state
+        return out, state, context