Add V3 Support (#578)

* Add V3 Support

* update conversion example

---------

Co-authored-by: oscaarjs <oscar.johansson@conversy.se>
This commit is contained in:
Oscaarjs
2023-11-24 23:16:12 +01:00
committed by GitHub
parent 5a0541ea7d
commit 3084409633
5 changed files with 48 additions and 26 deletions

View File

@@ -1,8 +1,10 @@
import itertools
import json
import logging
import os
import zlib
from inspect import signature
from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
import ctranslate2
@@ -92,8 +94,8 @@ class WhisperModel:
Args:
model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
When a size or a model ID is configured, the converted model is downloaded
from the Hugging Face Hub.
device: Device to use for computation ("cpu", "cuda", "auto").
@@ -142,7 +144,8 @@ class WhisperModel:
"openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
)
self.feature_extractor = FeatureExtractor()
self.feat_kwargs = self._get_feature_kwargs(model_path)
self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
self.num_samples_per_token = self.feature_extractor.hop_length * 2
self.frames_per_second = (
self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
@@ -159,6 +162,22 @@ class WhisperModel:
"""The languages supported by the model."""
return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
def _get_feature_kwargs(self, model_path) -> dict:
preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json")
config = {}
if os.path.isfile(preprocessor_config_file):
try:
with open(preprocessor_config_file, "r", encoding="utf-8") as json_file:
config = json.load(json_file)
valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
config = {k: v for k, v in config.items() if k in valid_keys}
except json.JSONDecodeError as e:
self.logger.warning(
"Could not load preprocessor_config.json: %s", str(e)
)
return config
def transcribe(
self,
audio: Union[str, BinaryIO, np.ndarray],