support distil-whisper (#557)

This commit is contained in:
metame
2024-01-24 17:17:12 +08:00
committed by GitHub
parent 72ff979a2e
commit ad3c83045b
4 changed files with 76 additions and 3 deletions

View File

@@ -142,11 +142,15 @@ class FeatureExtractor:
data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
return data.T
def __call__(self, waveform, padding=True):
def __call__(self, waveform, padding=True, chunk_length=None):
"""
Compute the log-Mel spectrogram of the provided audio. Gives results similar to
whisper's original torch implementation, within 1e-5 tolerance.
"""
if chunk_length is not None:
self.n_samples = chunk_length * self.sampling_rate
self.nb_max_frames = self.n_samples // self.hop_length
if padding:
waveform = np.pad(waveform, [(0, self.n_samples)])