Update CTranslate2 to 3.8.0

2023-03-06 16:21:48 +01:00
parent 4a18adc382
commit 469244a57d
3 changed files with 33 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -68,10 +68,13 @@ A Whisper model should be first converted into the CTranslate2 format. We provid
 For example the command below converts the "large-v2" Whisper model and saves the weights in FP16:
 ```bash
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 --quantization float16
+ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
+    --copy_files tokenizer.json --quantization float16
 ```
-If needed, models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
+If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
+Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
 ### Transcription
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -26,6 +26,7 @@ class TranscriptionOptions(
    collections.namedtuple(
        "TranscriptionOptions",
        (
+            "language",
            "task",
            "beam_size",
            "best_of",
@@ -38,7 +39,10 @@ class TranscriptionOptions(
            "temperatures",
            "initial_prompt",
            "prefix",
+            "suppress_blank",
+            "suppress_tokens",
            "without_timestamps",
+            "max_initial_timestamp",
        ),
    )
 ):
@@ -120,7 +124,10 @@ class WhisperModel:
        condition_on_previous_text: bool = True,
        initial_prompt: Optional[str] = None,
        prefix: Optional[str] = None,
+        suppress_blank: bool = True,
+        suppress_tokens: Optional[List[int]] = [-1],
        without_timestamps: bool = False,
+        max_initial_timestamp: float = 1.0,
    ):
        """Transcribes an input file.
@@ -150,7 +157,11 @@ class WhisperModel:
            such as repetition looping or timestamps going out of sync.
          initial_prompt: Optional text to provide as a prompt for the first window.
          prefix: Optional text to provide as a prefix for the first window.
+          suppress_blank: Suppress blank outputs at the beginning of the sampling.
+          suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
+            of symbols as defined in the model config.json file.
          without_timestamps: Only sample text tokens.
+          max_initial_timestamp: The initial timestamp cannot be later than this.
        Returns:
          A tuple with:
@@ -181,6 +192,7 @@ class WhisperModel:
            language_probability = 1
        options = TranscriptionOptions(
+            language=language,
            task=task,
            beam_size=beam_size,
            best_of=best_of,
@@ -195,10 +207,13 @@ class WhisperModel:
            ),
            initial_prompt=initial_prompt,
            prefix=prefix,
+            suppress_blank=suppress_blank,
+            suppress_tokens=suppress_tokens,
            without_timestamps=without_timestamps,
+            max_initial_timestamp=max_initial_timestamp,
        )
-        segments = self.generate_segments(features, language, options)
+        segments = self.generate_segments(features, options)
        audio_info = AudioInfo(
            language=language,
@@ -207,10 +222,8 @@ class WhisperModel:
        return segments, audio_info
-    def generate_segments(self, features, language, options):
+    def generate_segments(self, features, options):
-        tokenized_segments = self.generate_tokenized_segments(
-            features, language, options
-        )
+        tokenized_segments = self.generate_tokenized_segments(features, options)
        for start, end, tokens in tokenized_segments:
            text = self.decode_text_tokens(tokens)
@@ -223,7 +236,7 @@ class WhisperModel:
                text=text,
            )
-    def generate_tokenized_segments(self, features, language, options):
+    def generate_tokenized_segments(self, features, options):
        num_frames = features.shape[-1]
        offset = 0
        all_tokens = []
@@ -241,7 +254,7 @@ class WhisperModel:
            previous_tokens = all_tokens[prompt_reset_since:]
            prompt = self.get_prompt(
-                language,
+                options.language,
                previous_tokens,
                task=options.task,
                without_timestamps=options.without_timestamps,
@@ -338,6 +351,10 @@ class WhisperModel:
        avg_log_prob = None
        final_temperature = None
+        max_initial_timestamp_index = int(
+            round(options.max_initial_timestamp / self.time_precision)
+        )
        for temperature in options.temperatures:
            if temperature > 0:
                kwargs = {
@@ -360,6 +377,9 @@ class WhisperModel:
                max_length=self.max_length,
                return_scores=True,
                return_no_speech_prob=True,
+                suppress_blank=options.suppress_blank,
+                suppress_tokens=options.suppress_tokens,
+                max_initial_timestamp_index=max_initial_timestamp_index,
                **kwargs,
            )[0]
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 av==10.*
-ctranslate2>=3.7,<4
+ctranslate2>=3.8,<4
 tokenizers==0.13.*