Merge remote-tracking branch 'upstream/master' into prompt

2024-03-10 11:53:58 +08:00
parent 4b64ef1f70 a342b028b7
commit e50d82c18c
4 changed files with 19 additions and 11 deletions
--- a/README.md
+++ b/README.md
@@ -160,7 +160,7 @@ segments, _ = model.transcribe("audio.mp3")
 segments = list(segments)  # The transcription will actually run here.
 ```
 ### Faster-distil-whisper
-For usage of `faster-ditil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533
+For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533

 ```python
 model_size = "distil-large-v2"
--- a/faster_whisper/audio.py
+++ b/faster_whisper/audio.py
@@ -102,3 +102,18 @@ def _resample_frames(frames, resampler):
    # Add None to flush the resampler.
    for frame in itertools.chain(frames, [None]):
        yield from resampler.resample(frame)
+
+
+def pad_or_trim(array, length: int, *, axis: int = -1):
+    """
+    Trim or zero-pad `array` along `axis` so that it holds exactly `length` entries.
+    """
+    if array.shape[axis] > length:
+        array = array.take(indices=range(length), axis=axis)  # keep the leading entries
+
+    if array.shape[axis] < length:
+        pad_widths = [(0, 0)] * array.ndim  # no padding on the other axes
+        pad_widths[axis] = (0, length - array.shape[axis])  # pad at the end only
+        array = np.pad(array, pad_widths)  # np.pad's default mode fills with zeros
+
+    return array
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -11,7 +11,7 @@ import ctranslate2
 import numpy as np
 import tokenizers

-from faster_whisper.audio import decode_audio
+from faster_whisper.audio import decode_audio, pad_or_trim
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
 from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger
@@ -493,6 +493,7 @@ class WhisperModel:
            )
            segment = features[:, seek : seek + segment_size]
            segment_duration = segment_size * self.feature_extractor.time_per_frame
+            segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames)

            if self.logger.isEnabledFor(logging.DEBUG):
                self.logger.debug(
@@ -661,14 +662,6 @@ class WhisperModel:
                # skip silence before possible hallucinations
                if options.hallucination_silence_threshold is not None:
                    threshold = options.hallucination_silence_threshold
-                    if not single_timestamp_ending:
-                        last_word_end = get_end(current_segments)
-                        if last_word_end is not None and last_word_end > time_offset:
-                            remaining_duration = window_end_time - last_word_end
-                            if remaining_duration > threshold:
-                                seek = round(last_word_end * self.frames_per_second)
-                            else:
-                                seek = previous_seek + segment_size

                    # if first segment might be a hallucination, skip leading silence
                    first_segment = next_words_segment(current_segments)
--- a/faster_whisper/version.py
+++ b/faster_whisper/version.py
@@ -1,3 +1,3 @@
 """Version information."""

-__version__ = "1.0.0"
+__version__ = "1.0.1"