Update CTranslate2 to 3.8.0

2023-03-06 16:21:48 +01:00
parent 4a18adc382
commit 469244a57d
3 changed files with 33 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -68,10 +68,13 @@ A Whisper model should be first converted into the CTranslate2 format. We provid
 For example the command below converts the "large-v2" Whisper model and saves the weights in FP16:

 ```bash
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 --quantization float16
+ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
+    --copy_files tokenizer.json --quantization float16
 ```

-If needed, models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
+If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
+
+Models can also be converted programmatically from Python. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).

 ### Transcription

--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -26,6 +26,7 @@ class TranscriptionOptions(
    collections.namedtuple(
        "TranscriptionOptions",
        (
+            "language",
            "task",
            "beam_size",
            "best_of",
@@ -38,7 +39,10 @@ class TranscriptionOptions(
            "temperatures",
            "initial_prompt",
            "prefix",
+            "suppress_blank",
+            "suppress_tokens",
            "without_timestamps",
+            "max_initial_timestamp",
        ),
    )
 ):
@@ -120,7 +124,10 @@ class WhisperModel:
        condition_on_previous_text: bool = True,
        initial_prompt: Optional[str] = None,
        prefix: Optional[str] = None,
+        suppress_blank: bool = True,
+        suppress_tokens: Optional[List[int]] = [-1],
        without_timestamps: bool = False,
+        max_initial_timestamp: float = 1.0,
    ):
        """Transcribes an input file.

@@ -150,7 +157,11 @@ class WhisperModel:
            such as repetition looping or timestamps going out of sync.
          initial_prompt: Optional text to provide as a prompt for the first window.
          prefix: Optional text to provide as a prefix for the first window.
+          suppress_blank: Suppress blank outputs at the beginning of the sampling.
+          suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
+            of symbols as defined in the model config.json file.
          without_timestamps: Only sample text tokens.
+          max_initial_timestamp: The initial timestamp cannot be later than this.

        Returns:
          A tuple with:
@@ -181,6 +192,7 @@ class WhisperModel:
            language_probability = 1

        options = TranscriptionOptions(
+            language=language,
            task=task,
            beam_size=beam_size,
            best_of=best_of,
@@ -195,10 +207,13 @@ class WhisperModel:
            ),
            initial_prompt=initial_prompt,
            prefix=prefix,
+            suppress_blank=suppress_blank,
+            suppress_tokens=suppress_tokens,
            without_timestamps=without_timestamps,
+            max_initial_timestamp=max_initial_timestamp,
        )

-        segments = self.generate_segments(features, language, options)
+        segments = self.generate_segments(features, options)

        audio_info = AudioInfo(
            language=language,
@@ -207,10 +222,8 @@ class WhisperModel:

        return segments, audio_info

-    def generate_segments(self, features, language, options):
-        tokenized_segments = self.generate_tokenized_segments(
-            features, language, options
-        )
+    def generate_segments(self, features, options):
+        tokenized_segments = self.generate_tokenized_segments(features, options)

        for start, end, tokens in tokenized_segments:
            text = self.decode_text_tokens(tokens)
@@ -223,7 +236,7 @@ class WhisperModel:
                text=text,
            )

-    def generate_tokenized_segments(self, features, language, options):
+    def generate_tokenized_segments(self, features, options):
        num_frames = features.shape[-1]
        offset = 0
        all_tokens = []
@@ -241,7 +254,7 @@ class WhisperModel:

            previous_tokens = all_tokens[prompt_reset_since:]
            prompt = self.get_prompt(
-                language,
+                options.language,
                previous_tokens,
                task=options.task,
                without_timestamps=options.without_timestamps,
@@ -338,6 +351,10 @@ class WhisperModel:
        avg_log_prob = None
        final_temperature = None

+        max_initial_timestamp_index = int(
+            round(options.max_initial_timestamp / self.time_precision)
+        )
+
        for temperature in options.temperatures:
            if temperature > 0:
                kwargs = {
@@ -360,6 +377,9 @@ class WhisperModel:
                max_length=self.max_length,
                return_scores=True,
                return_no_speech_prob=True,
+                suppress_blank=options.suppress_blank,
+                suppress_tokens=options.suppress_tokens,
+                max_initial_timestamp_index=max_initial_timestamp_index,
                **kwargs,
            )[0]

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 av==10.*
-ctranslate2>=3.7,<4
+ctranslate2>=3.8,<4
 tokenizers==0.13.*