Upgrade to Silero-Vad V5 (#884)
* Fix window_size_samples to 512
* Update SileroVADModel
* Replace ONNX file with V5 version
Binary file not shown. (The bundled Silero VAD ONNX model is replaced with its V5 version.)
@@ -1,7 +1,6 @@
 import bisect
 import functools
 import os
-import warnings
 
 from typing import List, NamedTuple, Optional
 
@@ -25,9 +24,6 @@ class VadOptions(NamedTuple):
         split aggressively just before max_speech_duration_s.
       min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
         before separating it
-      window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model.
-        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
-        Values other than these may affect model performance!!
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
     """
 
@@ -35,7 +31,6 @@ class VadOptions(NamedTuple):
     min_speech_duration_ms: int = 250
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
-    window_size_samples: int = 1024
     speech_pad_ms: int = 400
 
 
@@ -61,15 +56,8 @@ def get_speech_timestamps(
     min_speech_duration_ms = vad_options.min_speech_duration_ms
     max_speech_duration_s = vad_options.max_speech_duration_s
     min_silence_duration_ms = vad_options.min_silence_duration_ms
-    window_size_samples = vad_options.window_size_samples
+    window_size_samples = 512
     speech_pad_ms = vad_options.speech_pad_ms
-
-    if window_size_samples not in [512, 1024, 1536]:
-        warnings.warn(
-            "Unusual window_size_samples! Supported window_size_samples:\n"
-            " - [512, 1024, 1536] for 16000 sampling_rate"
-        )
-
     sampling_rate = 16000
     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
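Silero VAD V5 no longer accepts arbitrary window sizes: it expects exactly 512-sample chunks at 16 kHz, so the configurable option and its warning are dropped and the value is hard-coded. A minimal sketch of the resulting fixed-size chunking, assuming a 16 kHz float32 NumPy waveform (the helper name iter_windows is illustrative, not part of the patch):

import numpy as np

SAMPLING_RATE = 16000  # Silero VAD V5 operates on 16 kHz audio
WINDOW_SIZE = 512      # the only window size V5 supports at this rate

def iter_windows(audio: np.ndarray):
    """Yield fixed 512-sample windows, zero-padding the last one."""
    for start in range(0, len(audio), WINDOW_SIZE):
        chunk = audio[start : start + WINDOW_SIZE]
        if len(chunk) < WINDOW_SIZE:
            chunk = np.pad(chunk, (0, WINDOW_SIZE - len(chunk)))
        yield chunk

# One second of audio -> ceil(16000 / 512) = 32 windows
windows = list(iter_windows(np.zeros(SAMPLING_RATE, dtype=np.float32)))
assert len(windows) == 32 and all(w.shape == (512,) for w in windows)

The zero-padding of the final short window mirrors the np.pad call kept in the inference loop below.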
@@ -84,14 +72,14 @@ def get_speech_timestamps(
     audio_length_samples = len(audio)
 
     model = get_vad_model()
-    state = model.get_initial_state(batch_size=1)
+    state, context = model.get_initial_states(batch_size=1)
 
     speech_probs = []
     for current_start_sample in range(0, audio_length_samples, window_size_samples):
         chunk = audio[current_start_sample : current_start_sample + window_size_samples]
         if len(chunk) < window_size_samples:
             chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-        speech_prob, state = model(chunk, state, sampling_rate)
+        speech_prob, state, context = model(chunk, state, context, sampling_rate)
         speech_probs.append(speech_prob)
 
     triggered = False
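The loop now threads two tensors between iterations instead of one: the packed recurrent state and a 64-sample audio context carried over from the previous window. A sketch of that calling convention with a hypothetical stand-in model (fake_v5_model exists only for illustration):

import numpy as np

def fake_v5_model(chunk, state, context, sr):
    """Stand-in with the V5 contract: returns (prob, state, context)."""
    prob = np.zeros((1, 1), dtype=np.float32)          # dummy speech probability
    return prob, state, np.atleast_2d(chunk)[:, -64:]  # last 64 samples carry over

state = np.zeros((2, 1, 128), dtype=np.float32)  # packed recurrent state (V5)
context = np.zeros((1, 64), dtype=np.float32)    # audio carried between windows
probs = []
for chunk in np.zeros((4, 512), dtype=np.float32):  # four dummy windows
    prob, state, context = fake_v5_model(chunk, state, context, 16000)
    probs.append(prob)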
@@ -261,12 +249,12 @@ class SileroVADModel:
             sess_options=opts,
         )
 
-    def get_initial_state(self, batch_size: int):
-        h = np.zeros((2, batch_size, 64), dtype=np.float32)
-        c = np.zeros((2, batch_size, 64), dtype=np.float32)
-        return h, c
+    def get_initial_states(self, batch_size: int):
+        state = np.zeros((2, batch_size, 128), dtype=np.float32)
+        context = np.zeros((batch_size, 64), dtype=np.float32)
+        return state, context
 
-    def __call__(self, x, state, sr: int):
+    def __call__(self, x, state, context, sr: int):
         if len(x.shape) == 1:
             x = np.expand_dims(x, 0)
         if len(x.shape) > 2:
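The shapes summarize the V4-to-V5 change: V4 kept separate LSTM hidden and cell tensors h and c, each of shape (2, batch, 64), while V5 exposes a single packed state of shape (2, batch, 128) plus a 64-sample context buffer. A side-by-side sketch of the two layouts (the closing assertion is one plausible reading of the packing, not something the patch states):

import numpy as np

batch_size = 1

# V4: separate LSTM hidden and cell states, 64 units each
h = np.zeros((2, batch_size, 64), dtype=np.float32)
c = np.zeros((2, batch_size, 64), dtype=np.float32)

# V5: one packed recurrent state plus a 64-sample audio context
state = np.zeros((2, batch_size, 128), dtype=np.float32)
context = np.zeros((batch_size, 64), dtype=np.float32)

# Assumption: the V5 state has room for h and c side by side
assert state.shape[-1] == h.shape[-1] + c.shape[-1]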
@@ -276,16 +264,15 @@ class SileroVADModel:
         if sr / x.shape[1] > 31.25:
             raise ValueError("Input audio chunk is too short")
 
-        h, c = state
+        x = np.concatenate([context, x], axis=1)
 
         ort_inputs = {
             "input": x,
-            "h": h,
-            "c": c,
+            "state": state,
             "sr": np.array(sr, dtype="int64"),
         }
 
-        out, h, c = self.session.run(None, ort_inputs)
-        state = (h, c)
+        out, state = self.session.run(None, ort_inputs)
+        context = x[..., -64:]
 
-        return out, state
+        return out, state, context
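Putting the __call__ changes together: each incoming window is prefixed with the previous 64 samples of audio, the V5 graph takes a single "state" input instead of separate "h" and "c", and the tail of the prefixed input becomes the context for the next call. A self-contained sketch under those assumptions, with a fake session standing in for the onnxruntime one:

import numpy as np

class FakeSession:
    """Stand-in for the ONNX session, mimicking the V5 output signature."""
    def run(self, _output_names, inputs):
        batch = inputs["input"].shape[0]
        prob = np.full((batch, 1), 0.5, dtype=np.float32)  # dummy probability
        return prob, inputs["state"]                       # state passed through

def v5_forward(session, x, state, context, sr=16000):
    # Prefix the window with the 64 samples carried over from the last call
    x = np.concatenate([context, x], axis=1)
    out, state = session.run(
        None, {"input": x, "state": state, "sr": np.array(sr, dtype="int64")}
    )
    context = x[..., -64:]  # the tail seeds the next window
    return out, state, context

x = np.zeros((1, 512), dtype=np.float32)
state = np.zeros((2, 1, 128), dtype=np.float32)
context = np.zeros((1, 64), dtype=np.float32)
out, state, context = v5_forward(FakeSession(), x, state, context)
assert out.shape == (1, 1) and context.shape == (1, 64)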