diff --git a/README.md b/README.md index 570cd66..e81e3a2 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ segments, _ = model.transcribe("audio.mp3") segments = list(segments) # The transcription will actually run here. ``` ### Faster-distil-whisper -For usage of `faster-ditil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 +For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 ```python model_size = "distil-large-v2" diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py index 3190619..a597fd8 100644 --- a/faster_whisper/audio.py +++ b/faster_whisper/audio.py @@ -102,3 +102,18 @@ def _resample_frames(frames, resampler): # Add None to flush the resampler. for frame in itertools.chain(frames, [None]): yield from resampler.resample(frame) + + +def pad_or_trim(array, length: int, *, axis: int = -1): + """ + Pad or trim the audio array along *axis* to exactly *length* frames, as expected by the encoder. + """ + if array.shape[axis] > length: + array = array.take(indices=range(length), axis=axis) + + if array.shape[axis] < length: + pad_widths = [(0, 0)] * array.ndim + pad_widths[axis] = (0, length - array.shape[axis]) + array = np.pad(array, pad_widths) + + return array diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index ed6000c..f9bb0a1 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -11,7 +11,7 @@ import ctranslate2 import numpy as np import tokenizers -from faster_whisper.audio import decode_audio +from faster_whisper.audio import decode_audio, pad_or_trim from faster_whisper.feature_extractor import FeatureExtractor from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger @@ -493,6 +493,7 @@ class WhisperModel: ) segment = features[:, seek : seek + segment_size] segment_duration = segment_size * 
self.feature_extractor.time_per_frame + segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames) if self.logger.isEnabledFor(logging.DEBUG): self.logger.debug( @@ -661,14 +662,6 @@ class WhisperModel: # skip silence before possible hallucinations if options.hallucination_silence_threshold is not None: threshold = options.hallucination_silence_threshold - if not single_timestamp_ending: - last_word_end = get_end(current_segments) - if last_word_end is not None and last_word_end > time_offset: - remaining_duration = window_end_time - last_word_end - if remaining_duration > threshold: - seek = round(last_word_end * self.frames_per_second) - else: - seek = previous_seek + segment_size # if first segment might be a hallucination, skip leading silence first_segment = next_words_segment(current_segments) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index 01ad014..3b64d12 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "1.0.0" +__version__ = "1.0.1"