From 16141e65d902e5aee1737d39110d05c927208c0a Mon Sep 17 00:00:00 2001 From: trungkienbkhn Date: Thu, 29 Feb 2024 23:08:28 +0700 Subject: [PATCH 1/4] Add pad_or_trim function to handle segment before encoding (#705) --- faster_whisper/audio.py | 15 +++++++++++++++ faster_whisper/transcribe.py | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py index 3190619..a597fd8 100644 --- a/faster_whisper/audio.py +++ b/faster_whisper/audio.py @@ -102,3 +102,18 @@ def _resample_frames(frames, resampler): # Add None to flush the resampler. for frame in itertools.chain(frames, [None]): yield from resampler.resample(frame) + + +def pad_or_trim(array, length: int, *, axis: int = -1): + """ + Pad or trim the audio array to N_SAMPLES, as expected by the encoder. + """ + if array.shape[axis] > length: + array = array.take(indices=range(length), axis=axis) + + if array.shape[axis] < length: + pad_widths = [(0, 0)] * array.ndim + pad_widths[axis] = (0, length - array.shape[axis]) + array = np.pad(array, pad_widths) + + return array diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index c1ea390..bce84d2 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -11,7 +11,7 @@ import ctranslate2 import numpy as np import tokenizers -from faster_whisper.audio import decode_audio +from faster_whisper.audio import decode_audio, pad_or_trim from faster_whisper.feature_extractor import FeatureExtractor from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger @@ -492,6 +492,7 @@ class WhisperModel: ) segment = features[:, seek : seek + segment_size] segment_duration = segment_size * self.feature_extractor.time_per_frame + segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames) if self.logger.isEnabledFor(logging.DEBUG): self.logger.debug( From 09cd57e7f32f6bf1396a0f2fe469e46355ba3258 Mon Sep 17 00:00:00 2001 From: Gabriel F Date: Thu, 29 Feb 2024 13:08:58 -0300 Subject: [PATCH 2/4] Fix typo 'ditil' (#721) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 570cd66..e81e3a2 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ segments, _ = model.transcribe("audio.mp3") segments = list(segments) # The transcription will actually run here. ``` ### Faster-distil-whisper -For usage of `faster-ditil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 +For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 ```python model_size = "distil-large-v2" From 5090cc9d0d3048731d63b4e2fe4bf7cd73ccbfdc Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Thu, 29 Feb 2024 16:59:32 +0000 Subject: [PATCH 3/4] Fix window end heuristic for hallucination_silence_threshold (#706) Removes the wishful heuristic causing more issues than it's fixing. Same as https://github.com/openai/whisper/pull/2043 Example of the issue: https://github.com/openai/whisper/pull/1838#issuecomment-1960041500 --- faster_whisper/transcribe.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index bce84d2..d3d5deb 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -661,14 +661,6 @@ class WhisperModel: # skip silence before possible hallucinations if options.hallucination_silence_threshold is not None: threshold = options.hallucination_silence_threshold - if not single_timestamp_ending: - last_word_end = get_end(current_segments) - if last_word_end is not None and last_word_end > time_offset: - remaining_duration = window_end_time - last_word_end - if remaining_duration > threshold: - seek = round(last_word_end * self.frames_per_second) - else: - seek = previous_seek + segment_size # if first segment might be a hallucination, skip leading silence first_segment = next_words_segment(current_segments) From a342b028b7e875465dca14c15533484ccbc9f725 Mon Sep 17 00:00:00 2001 From: trungkienbkhn Date: Fri, 1 Mar 2024 17:32:12 +0700 Subject: [PATCH 4/4] Bump version to 1.0.1 (#725) --- faster_whisper/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index 01ad014..3b64d12 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "1.0.0" +__version__ = "1.0.1"