From 4ab646035ff90a6b88522209aed356f6d003617d Mon Sep 17 00:00:00 2001
From: trungkienbkhn
Date: Tue, 20 Feb 2024 23:26:55 +0700
Subject: [PATCH 1/8] Upgrade ctranslate2 version to support CUDA 12 (#694)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ba0da20..4df8d9c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 av==10.*
-ctranslate2>=3.22,<4
+ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<0.16
 onnxruntime>=1.14,<2

From c6b28ed3a0f8700070e23940e3b2aa83f2cd88fd Mon Sep 17 00:00:00 2001
From: IlianP
Date: Tue, 20 Feb 2024 17:28:00 +0100
Subject: [PATCH 2/8] Update README.md (#685)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I'm surprised that WhisperX hasn't made it into this list yet, as it has
more stars than faster-whisper itself 🚀
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 02f4c0a..70306d2 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,8 @@ See more model and transcription options in the [`WhisperModel`](https://github.
 
 Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
 
+
+* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment
 * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
 * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
 * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS.

From 52695567c9b31cfbfa7f8f697551db19d0a4231a Mon Sep 17 00:00:00 2001
From: Purfview <69023953+Purfview@users.noreply.github.com>
Date: Tue, 20 Feb 2024 16:31:07 +0000
Subject: [PATCH 3/8] Bumps up PyAV version to support Python 3.12.x (#679)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4df8d9c..1bbf0b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-av==10.*
+av==11.*
 ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<0.16
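With patches 1 and 3 applied, a quick way to confirm an environment actually picked up a CUDA 12 build of CTranslate2 and a Python 3.12-capable PyAV. A minimal sketch, assuming both packages are installed; `ctranslate2.get_cuda_device_count()` and the `__version__` attributes are standard parts of each package:

```python
import av
import ctranslate2

# ctranslate2 4.x wheels are built against CUDA 12; a non-zero device
# count means the GPU runtime is usable from this environment.
print("ctranslate2:", ctranslate2.__version__)
print("CUDA devices:", ctranslate2.get_cuda_device_count())

# av 11.* is the series that, per the patch above, adds Python 3.12 support.
print("PyAV:", av.__version__)
```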
From 6ffcbdfbc290d03813078b60b2abd58d3621aef0 Mon Sep 17 00:00:00 2001
From: Jordi Mas
Date: Tue, 20 Feb 2024 17:33:17 +0100
Subject: [PATCH 4/8] Fix typos in README.md (#668)

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 70306d2..64f86b2 100644
--- a/README.md
+++ b/README.md
@@ -170,7 +170,7 @@
 segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", max_new_tokens=128, condition_on_previous_text=False)
 ```
 
-NOTE: emprically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too.
+NOTE: Empirically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too.
 
 ### Word-level timestamps
 
@@ -233,7 +233,7 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
 
 ## Model conversion
 
-When loading a model from its size such as `WhisperModel("large-v3")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
 
 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.

From 092067208b45e5eb935bfe8b0ac42b8341da9434 Mon Sep 17 00:00:00 2001
From: trungkienbkhn
Date: Tue, 20 Feb 2024 23:34:54 +0700
Subject: [PATCH 5/8] Add clip_timestamps and hallucination_silence_threshold
 options (#646)

---
 faster_whisper/transcribe.py | 169 +++++++++++++++++++++++++++++++----
 faster_whisper/utils.py      |   7 ++
 2 files changed, 157 insertions(+), 19 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 824e4e3..9b0ce6c 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -14,7 +14,7 @@ import tokenizers
 from faster_whisper.audio import decode_audio
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
-from faster_whisper.utils import download_model, format_timestamp, get_logger
+from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger
 from faster_whisper.vad import (
     SpeechTimestampsMap,
     VadOptions,
@@ -67,6 +67,8 @@ class TranscriptionOptions(NamedTuple):
     prepend_punctuations: str
     append_punctuations: str
     max_new_tokens: Optional[int]
+    clip_timestamps: Union[str, List[float]]
+    hallucination_silence_threshold: Optional[float]
 
 
 class TranscriptionInfo(NamedTuple):
@@ -216,6 +218,8 @@ class WhisperModel:
         vad_parameters: Optional[Union[dict, VadOptions]] = None,
         max_new_tokens: Optional[int] = None,
         chunk_length: Optional[int] = None,
+        clip_timestamps: Union[str, List[float]] = "0",
+        hallucination_silence_threshold: Optional[float] = None,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """Transcribes an input file.
 
@@ -271,6 +275,12 @@ class WhisperModel:
             the maximum will be set by the default max_length.
           chunk_length: The length of audio segments. If it is not None, it will overwrite the
             default chunk_length of the FeatureExtractor.
+          clip_timestamps: Union[str, List[float]]
+            Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
+            process. The last end timestamp defaults to the end of the file.
+          hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this threshold
+            (in seconds) when a possible hallucination is detected.
 
         Returns:
           A tuple with:
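The two new options plug into the existing `transcribe` call. A minimal sketch of how they might be combined; the file name and the 2.0 s threshold are illustrative, and `word_timestamps=True` is required so the hallucination heuristic has word times to inspect:

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3")

# Decode only 0s-30s and 45s-<end of file>: the odd-length timestamp list
# means the last clip runs to the end, as documented above.
segments, info = model.transcribe(
    "audio.mp3",
    word_timestamps=True,
    clip_timestamps="0,30,45",
    hallucination_silence_threshold=2.0,
)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```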
@@ -387,6 +397,8 @@ class WhisperModel:
             prepend_punctuations=prepend_punctuations,
             append_punctuations=append_punctuations,
             max_new_tokens=max_new_tokens,
+            clip_timestamps=clip_timestamps,
+            hallucination_silence_threshold=hallucination_silence_threshold,
         )
 
         segments = self.generate_segments(features, tokenizer, options, encoder_output)
@@ -414,8 +426,33 @@ class WhisperModel:
         encoder_output: Optional[ctranslate2.StorageView] = None,
     ) -> Iterable[Segment]:
         content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
+        content_duration = float(content_frames * self.feature_extractor.time_per_frame)
+
+        if isinstance(options.clip_timestamps, str):
+            # TranscriptionOptions is an immutable NamedTuple, so build a copy
+            # carrying the parsed list rather than assigning to the class.
+            options = options._replace(clip_timestamps=[
+                float(ts)
+                for ts in (
+                    options.clip_timestamps.split(",")
+                    if options.clip_timestamps
+                    else []
+                )
+            ])
+        seek_points: List[int] = [
+            round(ts * self.frames_per_second) for ts in options.clip_timestamps
+        ]
+        if len(seek_points) == 0:
+            seek_points.append(0)
+        if len(seek_points) % 2 == 1:
+            seek_points.append(content_frames)
+        seek_clips: List[Tuple[int, int]] = list(
+            zip(seek_points[::2], seek_points[1::2])
+        )
+
+        punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
+
         idx = 0
-        seek = 0
+        clip_idx = 0
+        seek = seek_clips[clip_idx][0]
         all_tokens = []
         prompt_reset_since = 0
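The pairing logic above is compact enough to trace by hand. A standalone sketch with made-up numbers; 100 frames per second corresponds to Whisper's 10 ms feature hop:

```python
frames_per_second = 100   # Whisper features use a 10 ms hop
content_frames = 60_000   # a hypothetical 10-minute file

# "0,30,45" parses to [0.0, 30.0, 45.0]; the odd element count means the
# last clip is open-ended, so content_frames is appended as its end.
seek_points = [round(ts * frames_per_second) for ts in (0.0, 30.0, 45.0)]
if len(seek_points) % 2 == 1:
    seek_points.append(content_frames)

# Adjacent (start, end) pairs: [(0, 3000), (4500, 60000)]
seek_clips = list(zip(seek_points[::2], seek_points[1::2]))
print(seek_clips)
```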
 
             all_tokens.extend(options.initial_prompt)
 
         last_speech_timestamp = 0.0
-        while seek < content_frames:
+        # NOTE: This loop is obscurely flattened to make the diff readable.
+        # A later commit should turn this into a simpler nested loop.
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end
+        while clip_idx < len(seek_clips):
+            seek_clip_start, seek_clip_end = seek_clips[clip_idx]
+            if seek < seek_clip_start:
+                seek = seek_clip_start
+            if seek >= seek_clip_end:
+                clip_idx += 1
+                if clip_idx < len(seek_clips):
+                    seek = seek_clips[clip_idx][0]
+                continue
             time_offset = seek * self.feature_extractor.time_per_frame
-            segment = features[:, seek : seek + self.feature_extractor.nb_max_frames]
-            segment_size = min(
-                self.feature_extractor.nb_max_frames, content_frames - seek
+            window_end_time = float(
+                (seek + self.feature_extractor.nb_max_frames)
+                * self.feature_extractor.time_per_frame
             )
+            segment_size = min(
+                self.feature_extractor.nb_max_frames,
+                content_frames - seek,
+                seek_clip_end - seek,
+            )
+            segment = features[:, seek : seek + segment_size]
             segment_duration = segment_size * self.feature_extractor.time_per_frame
 
             if self.logger.isEnabledFor(logging.DEBUG):
@@ -486,10 +541,33 @@ class WhisperModel:
             previous_seek = seek
             current_segments = []
 
+            # anomalous words are very long/short/improbable
+            def word_anomaly_score(word: dict) -> float:
+                probability = word.get("probability", 0.0)
+                duration = word["end"] - word["start"]
+                score = 0.0
+                if probability < 0.15:
+                    score += 1.0
+                if duration < 0.133:
+                    score += (0.133 - duration) * 15
+                if duration > 2.0:
+                    score += duration - 2.0
+                return score
+
+            def is_segment_anomaly(segment: Optional[dict]) -> bool:
+                if segment is None or not segment["words"]:
+                    return False
+                words = [w for w in segment["words"] if w["word"] not in punctuation]
+                words = words[:8]
+                score = sum(word_anomaly_score(w) for w in words)
+                return score >= 3 or score + 0.01 >= len(words)
+
+            def next_words_segment(segments: List[dict]) -> Optional[dict]:
+                return next((s for s in segments if s["words"]), None)
+
             single_timestamp_ending = (
                 len(tokens) >= 2
-                and tokens[-2] < tokenizer.timestamp_begin
-                and tokens[-1] >= tokenizer.timestamp_begin
+                and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1]
             )
 
             consecutive_timestamps = [
@@ -572,18 +650,70 @@ class WhisperModel:
                         last_speech_timestamp=last_speech_timestamp,
                     )
 
-                word_end_timestamps = [
-                    w["end"] for s in current_segments for w in s["words"]
-                ]
-                if len(word_end_timestamps) > 0:
-                    last_speech_timestamp = word_end_timestamps[-1]
-                if not single_timestamp_ending and len(word_end_timestamps) > 0:
-                    seek_shift = round(
-                        (word_end_timestamps[-1] - time_offset) * self.frames_per_second
-                    )
+                if not single_timestamp_ending:
+                    last_word_end = get_end(current_segments)
+                    if last_word_end is not None and last_word_end > time_offset:
+                        seek = round(last_word_end * self.frames_per_second)
 
-                    if seek_shift > 0:
-                        seek = previous_seek + seek_shift
+                # skip silence before possible hallucinations
+                if options.hallucination_silence_threshold is not None:
+                    threshold = options.hallucination_silence_threshold
+                    if not single_timestamp_ending:
+                        last_word_end = get_end(current_segments)
+                        if last_word_end is not None and last_word_end > time_offset:
+                            remaining_duration = window_end_time - last_word_end
+                            if remaining_duration > threshold:
+                                seek = round(last_word_end * self.frames_per_second)
+                            else:
+                                seek = previous_seek + segment_size
+
+                    # if first segment might be a hallucination, skip leading silence
+                    first_segment = next_words_segment(current_segments)
+                    if first_segment is not None and is_segment_anomaly(first_segment):
+                        gap = first_segment["start"] - time_offset
+                        if gap > threshold:
+                            seek = previous_seek + round(gap * self.frames_per_second)
+                            continue
+
+                    # skip silence before any possible hallucination that is surrounded
+                    # by silence or more hallucinations
+                    hal_last_end = last_speech_timestamp
+                    for si in range(len(current_segments)):
+                        segment = current_segments[si]
+                        if not segment["words"]:
+                            continue
+                        if is_segment_anomaly(segment):
+                            next_segment = next_words_segment(
+                                current_segments[si + 1 :]
+                            )
+                            if next_segment is not None:
+                                hal_next_start = next_segment["words"][0]["start"]
+                            else:
+                                hal_next_start = time_offset + segment_duration
+                            silence_before = (
+                                segment["start"] - hal_last_end > threshold
+                                or segment["start"] < threshold
+                                or segment["start"] - time_offset < 2.0
+                            )
+                            silence_after = (
+                                hal_next_start - segment["end"] > threshold
+                                or is_segment_anomaly(next_segment)
+                                or window_end_time - segment["end"] < 2.0
+                            )
+                            if silence_before and silence_after:
+                                seek = round(
+                                    max(time_offset + 1, segment["start"])
+                                    * self.frames_per_second
+                                )
+                                if content_duration - segment["end"] < threshold:
+                                    seek = content_frames
+                                current_segments[si:] = []
+                                break
+                        hal_last_end = segment["end"]
+
+                last_word_end = get_end(current_segments)
+                if last_word_end is not None:
+                    last_speech_timestamp = last_word_end
 
             for segment in current_segments:
                 tokens = segment["tokens"]
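The anomaly scoring in the hunk above is easy to check by hand. A worked example with an invented word dict, mirroring `word_anomaly_score` term by term:

```python
word = {"word": "the", "start": 1.00, "end": 1.05, "probability": 0.10}

duration = word["end"] - word["start"]                              # 0.05 s
probability_term = 1.0 if word["probability"] < 0.15 else 0.0      # +1.0
short_term = (0.133 - duration) * 15 if duration < 0.133 else 0.0  # +1.245
long_term = duration - 2.0 if duration > 2.0 else 0.0              # +0.0

score = probability_term + short_term + long_term
print(round(score, 3))  # 2.245

# Even this single word is flagged: in is_segment_anomaly, a one-word
# segment is anomalous when score + 0.01 >= 1, and 2.255 >= 1.
```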
@@ -819,6 +949,7 @@ class WhisperModel:
         word_durations = np.array([word["end"] - word["start"] for word in alignment])
         word_durations = word_durations[word_durations.nonzero()]
         median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
+        median_duration = min(0.7, float(median_duration))
         max_duration = median_duration * 2
 
         # hack: truncate long words at sentence boundaries.
diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py
index 9876956..0b5f375 100644
--- a/faster_whisper/utils.py
+++ b/faster_whisper/utils.py
@@ -146,3 +146,10 @@ class disabled_tqdm(tqdm):
     def __init__(self, *args, **kwargs):
         kwargs["disable"] = True
         super().__init__(*args, **kwargs)
+
+
+def get_end(segments: List[dict]) -> Optional[float]:
+    return next(
+        (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
+        segments[-1]["end"] if segments else None,
+    )
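The new `get_end` helper scans words from the back and falls back to the last segment's own end time. A quick illustration with hypothetical segment dicts, assuming `get_end` as defined above:

```python
segments = [
    {"end": 3.0, "words": [{"end": 1.2}, {"end": 2.8}]},
    {"end": 5.0, "words": []},  # e.g. a segment decoded without word timestamps
]

print(get_end(segments))                     # 2.8  (last word end, scanning backwards)
print(get_end([{"end": 5.0, "words": []}]))  # 5.0  (falls back to the segment end)
print(get_end([]))                           # None
```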
From 22c75d0cc362e003e7be2fe654f9e3866a4b4692 Mon Sep 17 00:00:00 2001
From: BBC-Esq
Date: Wed, 21 Feb 2024 04:18:11 -0500
Subject: [PATCH 6/8] Update README.md (#672)

Add Faster-Whisper-Transcriber to community integrations.
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 64f86b2..570cd66 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,7 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
 * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
 * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
 * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
+* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.
 
 ## Model conversion
 

From 30d6043e90e963f2d378047a00bf53c4d958cac1 Mon Sep 17 00:00:00 2001
From: Purfview <69023953+Purfview@users.noreply.github.com>
Date: Thu, 22 Feb 2024 08:48:35 +0000
Subject: [PATCH 7/8] Prevent infinite loop for out-of-bound timestamps in
 clip_timestamps (#697)

Same as https://github.com/openai/whisper/pull/2005
---
 faster_whisper/transcribe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 9b0ce6c..c1ea390 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -471,6 +471,8 @@ class WhisperModel:
         #     while seek < seek_clip_end
         while clip_idx < len(seek_clips):
             seek_clip_start, seek_clip_end = seek_clips[clip_idx]
+            if seek_clip_end > content_frames:
+                seek_clip_end = content_frames
             if seek < seek_clip_start:
                 seek = seek_clip_start
             if seek >= seek_clip_end:

From 06d32bf0c18848bc6e7c0335e184e295d58bbe9d Mon Sep 17 00:00:00 2001
From: trungkienbkhn
Date: Thu, 22 Feb 2024 15:49:01 +0700
Subject: [PATCH 8/8] Bump version to 1.0.0 (#696)

---
 faster_whisper/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_whisper/version.py b/faster_whisper/version.py
index e1f6d31..01ad014 100644
--- a/faster_whisper/version.py
+++ b/faster_whisper/version.py
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.10.0"
+__version__ = "1.0.0"
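With the series applied, the installed package should identify itself as 1.0.0. Since `faster_whisper/version.py` shown above is the single source of the version string, the check is one import:

```python
from faster_whisper.version import __version__

print(__version__)  # expected: "1.0.0" once this series is applied
```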