From 1eb9a8004c509a4af2960955374520c35b7b793a Mon Sep 17 00:00:00 2001
From: trungkienbkhn
Date: Tue, 12 Mar 2024 21:44:49 +0700
Subject: [PATCH 1/6] Improve language detection (#732)

---
 faster_whisper/transcribe.py | 59 ++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 9 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index d3d5deb..1c002ed 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -220,6 +220,8 @@ class WhisperModel:
         chunk_length: Optional[int] = None,
         clip_timestamps: Union[str, List[float]] = "0",
         hallucination_silence_threshold: Optional[float] = None,
+        language_detection_threshold: Optional[float] = None,
+        language_detection_segments: int = 1,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """Transcribes an input file.
 
@@ -281,6 +283,9 @@ class WhisperModel:
             hallucination_silence_threshold: Optional[float]
                 When word_timestamps is True, skip silent periods longer than this threshold
                 (in seconds) when a possible hallucination is detected
+            language_detection_threshold: If the maximum probability of the language tokens is higher
+                than this value, the language is detected.
+            language_detection_segments: Number of segments to consider for the language detection.
 
         Returns:
           A tuple with:
@@ -340,15 +345,51 @@ class WhisperModel:
                 language = "en"
                 language_probability = 1
             else:
-                segment = features[:, : self.feature_extractor.nb_max_frames]
-                encoder_output = self.encode(segment)
-                # results is a list of tuple[str, float] with language names and
-                # probabilities.
-                results = self.model.detect_language(encoder_output)[0]
-                # Parse language names to strip out markers
-                all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
-                # Get top language token and probability
-                language, language_probability = all_language_probs[0]
+                if (
+                    language_detection_segments is None
+                    or language_detection_segments < 1
+                ):
+                    language_detection_segments = 1
+                seek = 0
+                detected_language_info = {}
+                content_frames = (
+                    features.shape[-1] - self.feature_extractor.nb_max_frames
+                )
+                while (
+                    seek < content_frames
+                    and seek
+                    < self.feature_extractor.nb_max_frames * language_detection_segments
+                ):
+                    segment = features[
+                        :, seek : seek + self.feature_extractor.nb_max_frames
+                    ]
+                    encoder_output = self.encode(segment)
+                    # results is a list of tuple[str, float] with language names and
+                    # probabilities.
+                    results = self.model.detect_language(encoder_output)[0]
+                    # Parse language names to strip out markers
+                    all_language_probs = [
+                        (token[2:-2], prob) for (token, prob) in results
+                    ]
+                    # Get top language token and probability
+                    language, language_probability = all_language_probs[0]
+                    if (
+                        language_detection_threshold is None
+                        or language_probability > language_detection_threshold
+                    ):
+                        break
+                    detected_language_info.setdefault(language, []).append(
+                        language_probability
+                    )
+                    seek += segment.shape[-1]
+                else:
+                    # If no language is detected above the threshold in any segment, pick the
+                    # language by majority vote over the per-segment top predictions.
+ language = max( + detected_language_info, + key=lambda lang: len(detected_language_info[lang]), + ) + language_probability = max(detected_language_info[language]) self.logger.info( "Detected language '%s' with probability %.2f", From a67e0e47aec5dabcb8c519fa971334a91cecc2a2 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:58:39 +0000 Subject: [PATCH 2/6] Add support for distil-large-v3 (#755) * add distil-large-v3 * Update README.md * use fp16 weights from Systran --- README.md | 23 +++++++++++++++-------- faster_whisper/utils.py | 1 + 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e81e3a2..dbd69ee 100644 --- a/README.md +++ b/README.md @@ -159,18 +159,25 @@ for segment in segments: segments, _ = model.transcribe("audio.mp3") segments = list(segments) # The transcription will actually run here. ``` -### Faster-distil-whisper -For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 +### Faster Distil-Whisper + +The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) +checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet +demonstrates how to run inference with distil-large-v3 on a specified audio file: ```python -model_size = "distil-large-v2" -# model_size = "distil-medium.en" -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, - language="en", max_new_tokens=128, condition_on_previous_text=False) +from faster_whisper import WhisperModel +model_size = "distil-large-v3" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") +segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False) + +for segment in segments: + print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) ``` -NOTE: Empirically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too. + +For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3). ### Word-level timestamps diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 0b5f375..1abf808 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -25,6 +25,7 @@ _MODELS = { "distil-large-v2": "Systran/faster-distil-whisper-large-v2", "distil-medium.en": "Systran/faster-distil-whisper-medium.en", "distil-small.en": "Systran/faster-distil-whisper-small.en", + "distil-large-v3": "Systran/faster-distil-whisper-large-v3", } From e0c3a9ed34f77db5c6e2e8432bb0d1ccb155e1af Mon Sep 17 00:00:00 2001 From: trungkienbkhn Date: Wed, 27 Mar 2024 14:31:17 +0700 Subject: [PATCH 3/6] Update project github link to SYSTRAN (#746) --- CONTRIBUTING.md | 2 +- LICENSE | 2 +- README.md | 12 ++++++------ faster_whisper/utils.py | 2 +- setup.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 379b9ad..8d6a9c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,7 +7,7 @@ Contributions are welcome! 
Here are some pointers to help you install the librar We recommend installing the module in editable mode with the `dev` extra requirements: ```bash -git clone https://github.com/guillaumekln/faster-whisper.git +git clone https://github.com/SYSTRAN/faster-whisper.git cd faster-whisper/ pip install -e .[dev] ``` diff --git a/LICENSE b/LICENSE index 62f34be..2d92330 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Guillaume Klein +Copyright (c) 2023 SYSTRAN Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index dbd69ee..037bad8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![CI](https://github.com/guillaumekln/faster-whisper/workflows/CI/badge.svg)](https://github.com/guillaumekln/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) +[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) # Faster Whisper transcription with CTranslate2 @@ -14,7 +14,7 @@ For reference, here's the time and memory usage that are required to transcribe * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258) * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362) -* [faster-whisper](https://github.com/guillaumekln/faster-whisper)@[cce6b53e](https://github.com/guillaumekln/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) +* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) ### Large-v2 model on GPU @@ -117,13 +117,13 @@ pip install faster-whisper ### Install the master branch ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/refs/heads/master.tar.gz" +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz" ``` ### Install a specific commit ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" ``` @@ -197,7 +197,7 @@ The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) segments, _ = model.transcribe("audio.mp3", vad_filter=True) ``` -The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: +The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). 
They can be customized with the dictionary argument `vad_parameters`: ```python segments, _ = model.transcribe( @@ -220,7 +220,7 @@ logging.getLogger("faster_whisper").setLevel(logging.DEBUG) ### Going further -See more model and transcription options in the [`WhisperModel`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. +See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. ## Community integrations diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 1abf808..93ade3a 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -53,7 +53,7 @@ def download_model( """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. Args: - size_or_id: Size of the model to download from https://huggingface.co/guillaumekln + size_or_id: Size of the model to download from https://huggingface.co/Systran (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub (e.g. Systran/faster-whisper-large-v3). diff --git a/setup.py b/setup.py index 1deca3b..782f1b2 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup( long_description=get_long_description(), long_description_content_type="text/markdown", author="Guillaume Klein", - url="https://github.com/guillaumekln/faster-whisper", + url="https://github.com/SYSTRAN/faster-whisper", classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", From 8ae82c8372c7deab3ee0dc21cbe4c70a2ee9803a Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:18:12 +0100 Subject: [PATCH 4/6] Bugfix: code breaks if audio is empty (#768) * Bugfix: code breaks if audio is empty Regression since https://github.com/SYSTRAN/faster-whisper/pull/732 PR --- faster_whisper/transcribe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 1c002ed..34cd271 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -356,7 +356,7 @@ class WhisperModel: features.shape[-1] - self.feature_extractor.nb_max_frames ) while ( - seek < content_frames + seek <= content_frames and seek < self.feature_extractor.nb_max_frames * language_detection_segments ): From b024972a566c1a61279548b3db1d7249cc1d0151 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:20:34 +0100 Subject: [PATCH 5/6] Foolproof: Disable VAD if clip_timestamps is in use (#769) * Foolproof: Disable VAD if clip_timestamps is in use Prevent silly things to happen. --- faster_whisper/transcribe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 34cd271..337cc42 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -280,6 +280,7 @@ class WhisperModel: clip_timestamps: Union[str, List[float]] Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process. The last end timestamp defaults to the end of the file. + vad_filter will be ignored if clip_timestamps is used. 
hallucination_silence_threshold: Optional[float] When word_timestamps is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected @@ -305,7 +306,7 @@ class WhisperModel: "Processing audio with duration %s", format_timestamp(duration) ) - if vad_filter: + if vad_filter and clip_timestamps == "0": if vad_parameters is None: vad_parameters = VadOptions() elif isinstance(vad_parameters, dict): From 91c8307aa6b305dcaf12fc112b0187ee0d512b00 Mon Sep 17 00:00:00 2001 From: otakutyrant <64188229+otakutyrant@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:22:22 +0800 Subject: [PATCH 6/6] make faster_whisper.assets as a valid python package to distribute (#772) (#774) --- faster_whisper/assets/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 faster_whisper/assets/__init__.py diff --git a/faster_whisper/assets/__init__.py b/faster_whisper/assets/__init__.py new file mode 100644 index 0000000..e69de29
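
As a usage sketch for the options introduced in PATCH 1/6, the snippet below passes `language_detection_threshold` and `language_detection_segments` to `transcribe()`. The model size, audio file name, and threshold values are illustrative assumptions, not part of the patch.

```python
from faster_whisper import WhisperModel

# Assumed model size and audio file, mirroring the README examples.
model = WhisperModel("large-v3", device="cuda", compute_type="float16")

# Accept a language as soon as its probability exceeds 0.5; otherwise scan up to
# 4 segments and fall back to a majority vote over the per-segment top languages.
segments, info = model.transcribe(
    "audio.mp3",
    language_detection_threshold=0.5,
    language_detection_segments=4,
)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
```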
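
PATCH 5/6 makes `vad_filter` a no-op whenever `clip_timestamps` is changed from its default of `"0"`. A minimal sketch of that interaction, again with an assumed audio file and clip range:

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3")  # assumed model size

# clip_timestamps takes comma-separated start,end pairs in seconds (or a list of floats).
# With this patch applied, the VAD filter is skipped because clip_timestamps is no
# longer at its default value of "0", even though vad_filter=True is requested.
segments, info = model.transcribe(
    "audio.mp3",
    clip_timestamps="10,30",
    vad_filter=True,
)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```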