diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 379b9ad..8d6a9c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,7 +7,7 @@ Contributions are welcome! Here are some pointers to help you install the librar We recommend installing the module in editable mode with the `dev` extra requirements: ```bash -git clone https://github.com/guillaumekln/faster-whisper.git +git clone https://github.com/SYSTRAN/faster-whisper.git cd faster-whisper/ pip install -e .[dev] ``` diff --git a/LICENSE b/LICENSE index 62f34be..2d92330 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Guillaume Klein +Copyright (c) 2023 SYSTRAN Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index e81e3a2..037bad8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![CI](https://github.com/guillaumekln/faster-whisper/workflows/CI/badge.svg)](https://github.com/guillaumekln/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) +[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) # Faster Whisper transcription with CTranslate2 @@ -14,7 +14,7 @@ For reference, here's the time and memory usage that are required to transcribe * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258) * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362) -* 
[faster-whisper](https://github.com/guillaumekln/faster-whisper)@[cce6b53e](https://github.com/guillaumekln/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) +* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) ### Large-v2 model on GPU @@ -117,13 +117,13 @@ pip install faster-whisper ### Install the master branch ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/refs/heads/master.tar.gz" +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz" ``` ### Install a specific commit ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" ``` @@ -159,18 +159,25 @@ for segment in segments: segments, _ = model.transcribe("audio.mp3") segments = list(segments) # The transcription will actually run here. ``` -### Faster-distil-whisper -For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 +### Faster Distil-Whisper + +The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) +checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. 
The following code snippet +demonstrates how to run inference with distil-large-v3 on a specified audio file: ```python -model_size = "distil-large-v2" -# model_size = "distil-medium.en" -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, - language="en", max_new_tokens=128, condition_on_previous_text=False) +from faster_whisper import WhisperModel +model_size = "distil-large-v3" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") +segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False) + +for segment in segments: + print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) ``` -NOTE: Empirically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too. + +For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3). ### Word-level timestamps @@ -190,7 +197,7 @@ The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) segments, _ = model.transcribe("audio.mp3", vad_filter=True) ``` -The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: +The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). 
They can be customized with the dictionary argument `vad_parameters`: ```python segments, _ = model.transcribe( @@ -213,7 +220,7 @@ logging.getLogger("faster_whisper").setLevel(logging.DEBUG) ### Going further -See more model and transcription options in the [`WhisperModel`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. +See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. ## Community integrations diff --git a/faster_whisper/assets/__init__.py b/faster_whisper/assets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index f9bb0a1..db2121f 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -220,6 +220,8 @@ class WhisperModel: chunk_length: Optional[int] = None, clip_timestamps: Union[str, List[float]] = "0", hallucination_silence_threshold: Optional[float] = None, + language_detection_threshold: Optional[float] = None, + language_detection_segments: int = 1, ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """Transcribes an input file. @@ -278,9 +280,13 @@ class WhisperModel: clip_timestamps: Union[str, List[float]] Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process. The last end timestamp defaults to the end of the file. + vad_filter will be ignored if clip_timestamps is used. hallucination_silence_threshold: Optional[float] When word_timestamps is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected + language_detection_threshold: If the maximum probability of the language tokens is higher + than this value, the language is considered detected and no further segments are examined. + language_detection_segments: Maximum number of segments to examine for the language detection. 
Returns: A tuple with: @@ -300,7 +306,7 @@ class WhisperModel: "Processing audio with duration %s", format_timestamp(duration) ) - if vad_filter: + if vad_filter and clip_timestamps == "0": if vad_parameters is None: vad_parameters = VadOptions() elif isinstance(vad_parameters, dict): @@ -340,15 +346,51 @@ class WhisperModel: language = "en" language_probability = 1 else: - segment = features[:, : self.feature_extractor.nb_max_frames] - encoder_output = self.encode(segment) - # results is a list of tuple[str, float] with language names and - # probabilities. - results = self.model.detect_language(encoder_output)[0] - # Parse language names to strip out markers - all_language_probs = [(token[2:-2], prob) for (token, prob) in results] - # Get top language token and probability - language, language_probability = all_language_probs[0] + if ( + language_detection_segments is None + or language_detection_segments < 1 + ): + language_detection_segments = 1 + seek = 0 + detected_language_info = {} + content_frames = ( + features.shape[-1] - self.feature_extractor.nb_max_frames + ) + while ( + seek <= content_frames + and seek + < self.feature_extractor.nb_max_frames * language_detection_segments + ): + segment = features[ + :, seek : seek + self.feature_extractor.nb_max_frames + ] + encoder_output = self.encode(segment) + # results is a list of tuple[str, float] with language names and + # probabilities. 
+ results = self.model.detect_language(encoder_output)[0] + # Parse language names to strip out markers + all_language_probs = [ + (token[2:-2], prob) for (token, prob) in results + ] + # Get top language token and probability + language, language_probability = all_language_probs[0] + if ( + language_detection_threshold is None + or language_probability > language_detection_threshold + ): + break + detected_language_info.setdefault(language, []).append( + language_probability + ) + seek += segment.shape[-1] + else: + # If no language detected for all segments, the majority vote of the highest + # projected languages for all segments is used to determine the language. + language = max( + detected_language_info, + key=lambda lang: len(detected_language_info[lang]), + ) + language_probability = max(detected_language_info[language]) self.logger.info( "Detected language '%s' with probability %.2f", diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 0b5f375..93ade3a 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -25,6 +25,7 @@ _MODELS = { "distil-large-v2": "Systran/faster-distil-whisper-large-v2", "distil-medium.en": "Systran/faster-distil-whisper-medium.en", "distil-small.en": "Systran/faster-distil-whisper-small.en", + "distil-large-v3": "Systran/faster-distil-whisper-large-v3", } @@ -52,7 +53,7 @@ def download_model( """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. Args: - size_or_id: Size of the model to download from https://huggingface.co/guillaumekln + size_or_id: Size of the model to download from https://huggingface.co/Systran (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub (e.g. Systran/faster-whisper-large-v3). 
diff --git a/setup.py b/setup.py index 1deca3b..782f1b2 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup( long_description=get_long_description(), long_description_content_type="text/markdown", author="Guillaume Klein", - url="https://github.com/guillaumekln/faster-whisper", + url="https://github.com/SYSTRAN/faster-whisper", classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers",