From 1eb9a8004c509a4af2960955374520c35b7b793a Mon Sep 17 00:00:00 2001
From: trungkienbkhn
Date: Tue, 12 Mar 2024 21:44:49 +0700
Subject: [PATCH 1/6] Improve language detection (#732)

---
 faster_whisper/transcribe.py | 59 ++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 9 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index d3d5deb..1c002ed 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -220,6 +220,8 @@ class WhisperModel:
         chunk_length: Optional[int] = None,
         clip_timestamps: Union[str, List[float]] = "0",
         hallucination_silence_threshold: Optional[float] = None,
+        language_detection_threshold: Optional[float] = None,
+        language_detection_segments: int = 1,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """Transcribes an input file.
 
@@ -281,6 +283,9 @@ class WhisperModel:
             hallucination_silence_threshold: Optional[float]
                 When word_timestamps is True, skip silent periods longer than this threshold
                 (in seconds) when a possible hallucination is detected
+            language_detection_threshold: If the maximum probability of the language tokens is higher
+                than this value, the language is detected.
+            language_detection_segments: Number of segments to consider for the language detection.
 
         Returns:
           A tuple with:
@@ -340,15 +345,51 @@ class WhisperModel:
                 language = "en"
                 language_probability = 1
             else:
-                segment = features[:, : self.feature_extractor.nb_max_frames]
-                encoder_output = self.encode(segment)
-                # results is a list of tuple[str, float] with language names and
-                # probabilities.
-                results = self.model.detect_language(encoder_output)[0]
-                # Parse language names to strip out markers
-                all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
-                # Get top language token and probability
-                language, language_probability = all_language_probs[0]
+                if (
+                    language_detection_segments is None
+                    or language_detection_segments < 1
+                ):
+                    language_detection_segments = 1
+                seek = 0
+                detected_language_info = {}
+                content_frames = (
+                    features.shape[-1] - self.feature_extractor.nb_max_frames
+                )
+                while (
+                    seek < content_frames
+                    and seek
+                    < self.feature_extractor.nb_max_frames * language_detection_segments
+                ):
+                    segment = features[
+                        :, seek : seek + self.feature_extractor.nb_max_frames
+                    ]
+                    encoder_output = self.encode(segment)
+                    # results is a list of tuple[str, float] with language names and
+                    # probabilities.
+                    results = self.model.detect_language(encoder_output)[0]
+                    # Parse language names to strip out markers
+                    all_language_probs = [
+                        (token[2:-2], prob) for (token, prob) in results
+                    ]
+                    # Get top language token and probability
+                    language, language_probability = all_language_probs[0]
+                    if (
+                        language_detection_threshold is None
+                        or language_probability > language_detection_threshold
+                    ):
+                        break
+                    detected_language_info.setdefault(language, []).append(
+                        language_probability
+                    )
+                    seek += segment.shape[-1]
+                else:
+                    # If no language is detected above the threshold in any segment, pick the
+                    # language by majority vote over the per-segment top predictions.
+ language = max( + detected_language_info, + key=lambda lang: len(detected_language_info[lang]), + ) + language_probability = max(detected_language_info[language]) self.logger.info( "Detected language '%s' with probability %.2f", From a67e0e47aec5dabcb8c519fa971334a91cecc2a2 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:58:39 +0000 Subject: [PATCH 2/6] Add support for distil-large-v3 (#755) * add distil-large-v3 * Update README.md * use fp16 weights from Systran --- README.md | 23 +++++++++++++++-------- faster_whisper/utils.py | 1 + 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e81e3a2..dbd69ee 100644 --- a/README.md +++ b/README.md @@ -159,18 +159,25 @@ for segment in segments: segments, _ = model.transcribe("audio.mp3") segments = list(segments) # The transcription will actually run here. ``` -### Faster-distil-whisper -For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533 +### Faster Distil-Whisper + +The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) +checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet +demonstrates how to run inference with distil-large-v3 on a specified audio file: ```python -model_size = "distil-large-v2" -# model_size = "distil-medium.en" -model = WhisperModel(model_size, device="cuda", compute_type="float16") -segments, info = model.transcribe("audio.mp3", beam_size=5, - language="en", max_new_tokens=128, condition_on_previous_text=False) +from faster_whisper import WhisperModel +model_size = "distil-large-v3" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") +segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False) + +for segment in segments: + print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) ``` -NOTE: Empirically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too. + +For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3). ### Word-level timestamps diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 0b5f375..1abf808 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -25,6 +25,7 @@ _MODELS = { "distil-large-v2": "Systran/faster-distil-whisper-large-v2", "distil-medium.en": "Systran/faster-distil-whisper-medium.en", "distil-small.en": "Systran/faster-distil-whisper-small.en", + "distil-large-v3": "Systran/faster-distil-whisper-large-v3", } From e0c3a9ed34f77db5c6e2e8432bb0d1ccb155e1af Mon Sep 17 00:00:00 2001 From: trungkienbkhn Date: Wed, 27 Mar 2024 14:31:17 +0700 Subject: [PATCH 3/6] Update project github link to SYSTRAN (#746) --- CONTRIBUTING.md | 2 +- LICENSE | 2 +- README.md | 12 ++++++------ faster_whisper/utils.py | 2 +- setup.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 379b9ad..8d6a9c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,7 +7,7 @@ Contributions are welcome! 
Here are some pointers to help you install the librar We recommend installing the module in editable mode with the `dev` extra requirements: ```bash -git clone https://github.com/guillaumekln/faster-whisper.git +git clone https://github.com/SYSTRAN/faster-whisper.git cd faster-whisper/ pip install -e .[dev] ``` diff --git a/LICENSE b/LICENSE index 62f34be..2d92330 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Guillaume Klein +Copyright (c) 2023 SYSTRAN Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index dbd69ee..037bad8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![CI](https://github.com/guillaumekln/faster-whisper/workflows/CI/badge.svg)](https://github.com/guillaumekln/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) +[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) # Faster Whisper transcription with CTranslate2 @@ -14,7 +14,7 @@ For reference, here's the time and memory usage that are required to transcribe * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258) * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362) -* [faster-whisper](https://github.com/guillaumekln/faster-whisper)@[cce6b53e](https://github.com/guillaumekln/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) +* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) ### Large-v2 model on GPU @@ -117,13 +117,13 @@ pip install faster-whisper ### Install the master branch ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/refs/heads/master.tar.gz" +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz" ``` ### Install a specific commit ```bash -pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" ``` @@ -197,7 +197,7 @@ The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) segments, _ = model.transcribe("audio.mp3", vad_filter=True) ``` -The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: +The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). 
They can be customized with the dictionary argument `vad_parameters`: ```python segments, _ = model.transcribe( @@ -220,7 +220,7 @@ logging.getLogger("faster_whisper").setLevel(logging.DEBUG) ### Going further -See more model and transcription options in the [`WhisperModel`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. +See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. ## Community integrations diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 1abf808..93ade3a 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -53,7 +53,7 @@ def download_model( """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. Args: - size_or_id: Size of the model to download from https://huggingface.co/guillaumekln + size_or_id: Size of the model to download from https://huggingface.co/Systran (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub (e.g. Systran/faster-whisper-large-v3). diff --git a/setup.py b/setup.py index 1deca3b..782f1b2 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup( long_description=get_long_description(), long_description_content_type="text/markdown", author="Guillaume Klein", - url="https://github.com/guillaumekln/faster-whisper", + url="https://github.com/SYSTRAN/faster-whisper", classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", From 8ae82c8372c7deab3ee0dc21cbe4c70a2ee9803a Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:18:12 +0100 Subject: [PATCH 4/6] Bugfix: code breaks if audio is empty (#768) * Bugfix: code breaks if audio is empty Regression since https://github.com/SYSTRAN/faster-whisper/pull/732 PR --- faster_whisper/transcribe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 1c002ed..34cd271 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -356,7 +356,7 @@ class WhisperModel: features.shape[-1] - self.feature_extractor.nb_max_frames ) while ( - seek < content_frames + seek <= content_frames and seek < self.feature_extractor.nb_max_frames * language_detection_segments ): From b024972a566c1a61279548b3db1d7249cc1d0151 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:20:34 +0100 Subject: [PATCH 5/6] Foolproof: Disable VAD if clip_timestamps is in use (#769) * Foolproof: Disable VAD if clip_timestamps is in use Prevent silly things to happen. --- faster_whisper/transcribe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 34cd271..337cc42 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -280,6 +280,7 @@ class WhisperModel: clip_timestamps: Union[str, List[float]] Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process. The last end timestamp defaults to the end of the file. + vad_filter will be ignored if clip_timestamps is used. 
hallucination_silence_threshold: Optional[float] When word_timestamps is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected @@ -305,7 +306,7 @@ class WhisperModel: "Processing audio with duration %s", format_timestamp(duration) ) - if vad_filter: + if vad_filter and clip_timestamps == "0": if vad_parameters is None: vad_parameters = VadOptions() elif isinstance(vad_parameters, dict): From 91c8307aa6b305dcaf12fc112b0187ee0d512b00 Mon Sep 17 00:00:00 2001 From: otakutyrant <64188229+otakutyrant@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:22:22 +0800 Subject: [PATCH 6/6] make faster_whisper.assets as a valid python package to distribute (#772) (#774) --- faster_whisper/assets/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 faster_whisper/assets/__init__.py diff --git a/faster_whisper/assets/__init__.py b/faster_whisper/assets/__init__.py new file mode 100644 index 0000000..e69de29
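
As a usage sketch for the options introduced in PATCH 1/6, the snippet below passes `language_detection_threshold` and `language_detection_segments` to `transcribe()`. The model size, audio file name, and threshold values are illustrative assumptions, not part of the patch.

```python
from faster_whisper import WhisperModel

# Assumed model size and audio file, mirroring the README examples.
model = WhisperModel("large-v3", device="cuda", compute_type="float16")

# Accept a language as soon as its probability exceeds 0.5; otherwise scan up to
# 4 segments and fall back to a majority vote over the per-segment top languages.
segments, info = model.transcribe(
    "audio.mp3",
    language_detection_threshold=0.5,
    language_detection_segments=4,
)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
```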
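
PATCH 5/6 makes `vad_filter` a no-op whenever `clip_timestamps` is changed from its default of `"0"`. A minimal sketch of that interaction, again with an assumed audio file and clip range:

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3")  # assumed model size

# clip_timestamps takes comma-separated start,end pairs in seconds (or a list of floats).
# With this patch applied, the VAD filter is skipped because clip_timestamps is no
# longer at its default value of "0", even though vad_filter=True is requested.
segments, info = model.transcribe(
    "audio.mp3",
    clip_timestamps="10,30",
    vad_filter=True,
)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```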