Update CTranslate2 to 3.8.0

This commit is contained in:
Guillaume Klein
2023-03-06 16:21:48 +01:00
parent 4a18adc382
commit 469244a57d
3 changed files with 33 additions and 10 deletions

View File

@@ -68,10 +68,13 @@ A Whisper model should be first converted into the CTranslate2 format. We provid
For example, the command below converts the "large-v2" Whisper model and saves the weights in FP16: For example, the command below converts the "large-v2" Whisper model and saves the weights in FP16:
```bash ```bash
ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 --quantization float16 ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
--copy_files tokenizer.json --quantization float16
``` ```
If needed, models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html). If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
### Transcription ### Transcription

View File

@@ -26,6 +26,7 @@ class TranscriptionOptions(
collections.namedtuple( collections.namedtuple(
"TranscriptionOptions", "TranscriptionOptions",
( (
"language",
"task", "task",
"beam_size", "beam_size",
"best_of", "best_of",
@@ -38,7 +39,10 @@ class TranscriptionOptions(
"temperatures", "temperatures",
"initial_prompt", "initial_prompt",
"prefix", "prefix",
"suppress_blank",
"suppress_tokens",
"without_timestamps", "without_timestamps",
"max_initial_timestamp",
), ),
) )
): ):
@@ -120,7 +124,10 @@ class WhisperModel:
condition_on_previous_text: bool = True, condition_on_previous_text: bool = True,
initial_prompt: Optional[str] = None, initial_prompt: Optional[str] = None,
prefix: Optional[str] = None, prefix: Optional[str] = None,
suppress_blank: bool = True,
suppress_tokens: Optional[List[int]] = [-1],
without_timestamps: bool = False, without_timestamps: bool = False,
max_initial_timestamp: float = 1.0,
): ):
"""Transcribes an input file. """Transcribes an input file.
@@ -150,7 +157,11 @@ class WhisperModel:
such as repetition looping or timestamps going out of sync. such as repetition looping or timestamps going out of sync.
initial_prompt: Optional text to provide as a prompt for the first window. initial_prompt: Optional text to provide as a prompt for the first window.
prefix: Optional text to provide as a prefix for the first window. prefix: Optional text to provide as a prefix for the first window.
suppress_blank: Suppress blank outputs at the beginning of the sampling.
suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
of symbols as defined in the model config.json file.
without_timestamps: Only sample text tokens. without_timestamps: Only sample text tokens.
max_initial_timestamp: The initial timestamp cannot be later than this.
Returns: Returns:
A tuple with: A tuple with:
@@ -181,6 +192,7 @@ class WhisperModel:
language_probability = 1 language_probability = 1
options = TranscriptionOptions( options = TranscriptionOptions(
language=language,
task=task, task=task,
beam_size=beam_size, beam_size=beam_size,
best_of=best_of, best_of=best_of,
@@ -195,10 +207,13 @@ class WhisperModel:
), ),
initial_prompt=initial_prompt, initial_prompt=initial_prompt,
prefix=prefix, prefix=prefix,
suppress_blank=suppress_blank,
suppress_tokens=suppress_tokens,
without_timestamps=without_timestamps, without_timestamps=without_timestamps,
max_initial_timestamp=max_initial_timestamp,
) )
segments = self.generate_segments(features, language, options) segments = self.generate_segments(features, options)
audio_info = AudioInfo( audio_info = AudioInfo(
language=language, language=language,
@@ -207,10 +222,8 @@ class WhisperModel:
return segments, audio_info return segments, audio_info
def generate_segments(self, features, language, options): def generate_segments(self, features, options):
tokenized_segments = self.generate_tokenized_segments( tokenized_segments = self.generate_tokenized_segments(features, options)
features, language, options
)
for start, end, tokens in tokenized_segments: for start, end, tokens in tokenized_segments:
text = self.decode_text_tokens(tokens) text = self.decode_text_tokens(tokens)
@@ -223,7 +236,7 @@ class WhisperModel:
text=text, text=text,
) )
def generate_tokenized_segments(self, features, language, options): def generate_tokenized_segments(self, features, options):
num_frames = features.shape[-1] num_frames = features.shape[-1]
offset = 0 offset = 0
all_tokens = [] all_tokens = []
@@ -241,7 +254,7 @@ class WhisperModel:
previous_tokens = all_tokens[prompt_reset_since:] previous_tokens = all_tokens[prompt_reset_since:]
prompt = self.get_prompt( prompt = self.get_prompt(
language, options.language,
previous_tokens, previous_tokens,
task=options.task, task=options.task,
without_timestamps=options.without_timestamps, without_timestamps=options.without_timestamps,
@@ -338,6 +351,10 @@ class WhisperModel:
avg_log_prob = None avg_log_prob = None
final_temperature = None final_temperature = None
max_initial_timestamp_index = int(
round(options.max_initial_timestamp / self.time_precision)
)
for temperature in options.temperatures: for temperature in options.temperatures:
if temperature > 0: if temperature > 0:
kwargs = { kwargs = {
@@ -360,6 +377,9 @@ class WhisperModel:
max_length=self.max_length, max_length=self.max_length,
return_scores=True, return_scores=True,
return_no_speech_prob=True, return_no_speech_prob=True,
suppress_blank=options.suppress_blank,
suppress_tokens=options.suppress_tokens,
max_initial_timestamp_index=max_initial_timestamp_index,
**kwargs, **kwargs,
)[0] )[0]

View File

@@ -1,3 +1,3 @@
av==10.* av==10.*
ctranslate2>=3.7,<4 ctranslate2>=3.8,<4
tokenizers==0.13.* tokenizers==0.13.*