Upgrade to Silero-Vad V5 (#884)

* Fix window_size_samples to 512

* Update SileroVADModel

* Replace ONNX file with V5 version
This commit is contained in:
Ki Hoon Kim
2024-07-01 17:40:37 +09:00
committed by GitHub
parent bced5f04c0
commit 8d400e9870
2 changed files with 13 additions and 26 deletions

View File

@@ -1,7 +1,6 @@
 import bisect
 import functools
 import os
-import warnings
 from typing import List, NamedTuple, Optional
@@ -25,9 +24,6 @@ class VadOptions(NamedTuple):
         split aggressively just before max_speech_duration_s.
       min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
         before separating it
-      window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model.
-        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
-        Values other than these may affect model performance!!
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
     """
@@ -35,7 +31,6 @@ class VadOptions(NamedTuple):
     min_speech_duration_ms: int = 250
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
-    window_size_samples: int = 1024
     speech_pad_ms: int = 400
@@ -61,15 +56,8 @@ def get_speech_timestamps(
     min_speech_duration_ms = vad_options.min_speech_duration_ms
     max_speech_duration_s = vad_options.max_speech_duration_s
     min_silence_duration_ms = vad_options.min_silence_duration_ms
-    window_size_samples = vad_options.window_size_samples
+    window_size_samples = 512
     speech_pad_ms = vad_options.speech_pad_ms
-    if window_size_samples not in [512, 1024, 1536]:
-        warnings.warn(
-            "Unusual window_size_samples! Supported window_size_samples:\n"
-            " - [512, 1024, 1536] for 16000 sampling_rate"
-        )
     sampling_rate = 16000
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -84,14 +72,14 @@ def get_speech_timestamps(
     audio_length_samples = len(audio)
     model = get_vad_model()
-    state = model.get_initial_state(batch_size=1)
+    state, context = model.get_initial_states(batch_size=1)
     speech_probs = []
     for current_start_sample in range(0, audio_length_samples, window_size_samples):
         chunk = audio[current_start_sample : current_start_sample + window_size_samples]
         if len(chunk) < window_size_samples:
             chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-        speech_prob, state = model(chunk, state, sampling_rate)
+        speech_prob, state, context = model(chunk, state, context, sampling_rate)
         speech_probs.append(speech_prob)
     triggered = False
@@ -261,12 +249,12 @@ class SileroVADModel:
             sess_options=opts,
         )
-    def get_initial_state(self, batch_size: int):
-        h = np.zeros((2, batch_size, 64), dtype=np.float32)
-        c = np.zeros((2, batch_size, 64), dtype=np.float32)
-        return h, c
+    def get_initial_states(self, batch_size: int):
+        state = np.zeros((2, batch_size, 128), dtype=np.float32)
+        context = np.zeros((batch_size, 64), dtype=np.float32)
+        return state, context
-    def __call__(self, x, state, sr: int):
+    def __call__(self, x, state, context, sr: int):
         if len(x.shape) == 1:
             x = np.expand_dims(x, 0)
         if len(x.shape) > 2:
@@ -276,16 +264,15 @@ class SileroVADModel:
         if sr / x.shape[1] > 31.25:
             raise ValueError("Input audio chunk is too short")
-        h, c = state
+        x = np.concatenate([context, x], axis=1)
         ort_inputs = {
             "input": x,
-            "h": h,
-            "c": c,
+            "state": state,
             "sr": np.array(sr, dtype="int64"),
         }
-        out, h, c = self.session.run(None, ort_inputs)
-        state = (h, c)
-        return out, state
+        out, state = self.session.run(None, ort_inputs)
+        context = x[..., -64:]
+        return out, state, context