Pad the audio instead of the spectrogram

See 919a713499
This commit is contained in:
Guillaume Klein
2023-03-08 10:50:46 +01:00
parent 2646906596
commit 6b16b8a69c
2 changed files with 18 additions and 25 deletions

View File

@@ -142,11 +142,14 @@ class FeatureExtractor:
data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
return data.T
def __call__(self, waveform):
def __call__(self, waveform, padding=True):
"""
Compute the log-Mel spectrogram of the provided audio. Gives results similar to
whisper's original torch implementation, within a 1e-5 tolerance.
"""
if padding:
waveform = np.pad(waveform, [(0, self.n_samples)])
window = np.hanning(self.n_fft + 1)[:-1]
frames = self.fram_wave(waveform)