Pad the audio instead of the spectrogram

See 919a713499
2023-03-08 10:50:46 +01:00
parent 2646906596
commit 6b16b8a69c
2 changed files with 18 additions and 25 deletions
--- a/faster_whisper/feature_extractor.py
+++ b/faster_whisper/feature_extractor.py
@@ -142,11 +142,14 @@ class FeatureExtractor:
            data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
        return data.T

-    def __call__(self, waveform):
+    def __call__(self, waveform, padding=True):
        """
        Compute the log-Mel spectrogram of the provided audio. Gives results similar
        to whisper's original torch implementation, within 1e-5 tolerance.
        """
+        if padding:
+            waveform = np.pad(waveform, [(0, self.n_samples)])
+
        window = np.hanning(self.n_fft + 1)[:-1]

        frames = self.fram_wave(waveform)