Add V3 Support (#578)

* Add V3 Support

* update conversion example

---------

Co-authored-by: oscaarjs <oscar.johansson@conversy.se>
This commit is contained in:
Oscaarjs
2023-11-24 23:16:12 +01:00
committed by GitHub
parent 5a0541ea7d
commit 3084409633
5 changed files with 48 additions and 26 deletions

View File

@@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/
```python ```python
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
model_size = "large-v2" model_size = "large-v3"
# Run on GPU with FP16 # Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16") model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -185,17 +185,17 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
## Model conversion ## Model conversion
When loading a model from its size such as `WhisperModel("large-v2")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln). When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models. We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
For example the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16: For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
```bash ```bash
pip install transformers[torch]>=4.23 pip install transformers[torch]>=4.23
ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \ ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2
--copy_files tokenizer.json --quantization float16 --copy_files tokenizer.json preprocessor_config.json --quantization float16
``` ```
* The option `--model` accepts a model name on the Hub or a path to a model directory. * The option `--model` accepts a model name on the Hub or a path to a model directory.
@@ -207,12 +207,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
1. Directly load the model from a local directory: 1. Directly load the model from a local directory:
```python ```python
model = faster_whisper.WhisperModel("whisper-large-v2-ct2") model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
``` ```
2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name: 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
```python ```python
model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2") model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
``` ```
## Comparing performance against other implementations ## Comparing performance against other implementations

View File

@@ -108,7 +108,7 @@ class Tokenizer:
def split_to_word_tokens( def split_to_word_tokens(
self, tokens: List[int] self, tokens: List[int]
) -> Tuple[List[str], List[List[int]]]: ) -> Tuple[List[str], List[List[int]]]:
if self.language_code in {"zh", "ja", "th", "lo", "my"}: if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
# These languages don't typically use spaces, so it is difficult to split words # These languages don't typically use spaces, so it is difficult to split words
# without morpheme analysis. Here, we instead split words at any # without morpheme analysis. Here, we instead split words at any
# position where the tokens are decoded as valid unicode points # position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ _LANGUAGE_CODES = (
"yi", "yi",
"yo", "yo",
"zh", "zh",
"yue",
) )

View File

@@ -1,8 +1,10 @@
import itertools import itertools
import json
import logging import logging
import os import os
import zlib import zlib
from inspect import signature
from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
import ctranslate2 import ctranslate2
@@ -92,8 +94,8 @@ class WhisperModel:
Args: Args:
model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en, model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub. converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
When a size or a model ID is configured, the converted model is downloaded When a size or a model ID is configured, the converted model is downloaded
from the Hugging Face Hub. from the Hugging Face Hub.
device: Device to use for computation ("cpu", "cuda", "auto"). device: Device to use for computation ("cpu", "cuda", "auto").
@@ -142,7 +144,8 @@ class WhisperModel:
"openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en") "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
) )
self.feature_extractor = FeatureExtractor() self.feat_kwargs = self._get_feature_kwargs(model_path)
self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
self.num_samples_per_token = self.feature_extractor.hop_length * 2 self.num_samples_per_token = self.feature_extractor.hop_length * 2
self.frames_per_second = ( self.frames_per_second = (
self.feature_extractor.sampling_rate // self.feature_extractor.hop_length self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
@@ -159,6 +162,22 @@ class WhisperModel:
"""The languages supported by the model.""" """The languages supported by the model."""
return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"] return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
def _get_feature_kwargs(self, model_path) -> dict:
    """Read feature-extractor settings shipped alongside a converted model.

    Looks for ``preprocessor_config.json`` inside ``model_path`` and keeps
    only the entries whose names match parameters of
    ``FeatureExtractor.__init__``, so the result can be splatted directly
    into the ``FeatureExtractor`` constructor.

    Args:
      model_path: Directory containing the converted model files.

    Returns:
      A dict of keyword arguments for ``FeatureExtractor``. It is empty when
      the file is missing, is not valid JSON, or does not hold a JSON object.
    """
    preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json")
    if not os.path.isfile(preprocessor_config_file):
        return {}

    try:
        with open(preprocessor_config_file, "r", encoding="utf-8") as json_file:
            config = json.load(json_file)
    except json.JSONDecodeError as e:
        self.logger.warning("Could not load preprocessor_config.json: %s", str(e))
        return {}

    # A syntactically valid file may still hold a list/string/number at the
    # top level; treat that like an unreadable config instead of crashing
    # on ``.items()``.
    if not isinstance(config, dict):
        self.logger.warning(
            "Could not load preprocessor_config.json: expected a JSON object, got %s",
            type(config).__name__,
        )
        return {}

    # ``self`` shows up in the unbound ``__init__`` signature but must never
    # be forwarded as a keyword argument.
    valid_keys = set(signature(FeatureExtractor.__init__).parameters) - {"self"}
    return {k: v for k, v in config.items() if k in valid_keys}
def transcribe( def transcribe(
self, self,
audio: Union[str, BinaryIO, np.ndarray], audio: Union[str, BinaryIO, np.ndarray],

View File

@@ -10,17 +10,18 @@ import requests
from tqdm.auto import tqdm from tqdm.auto import tqdm
_MODELS = { _MODELS = {
"tiny.en": "guillaumekln/faster-whisper-tiny.en", "tiny.en": "Systran/faster-whisper-tiny.en",
"tiny": "guillaumekln/faster-whisper-tiny", "tiny": "Systran/faster-whisper-tiny",
"base.en": "guillaumekln/faster-whisper-base.en", "base.en": "Systran/faster-whisper-base.en",
"base": "guillaumekln/faster-whisper-base", "base": "Systran/faster-whisper-base",
"small.en": "guillaumekln/faster-whisper-small.en", "small.en": "Systran/faster-whisper-small.en",
"small": "guillaumekln/faster-whisper-small", "small": "Systran/faster-whisper-small",
"medium.en": "guillaumekln/faster-whisper-medium.en", "medium.en": "Systran/faster-whisper-medium.en",
"medium": "guillaumekln/faster-whisper-medium", "medium": "Systran/faster-whisper-medium",
"large-v1": "guillaumekln/faster-whisper-large-v1", "large-v1": "Systran/faster-whisper-large-v1",
"large-v2": "guillaumekln/faster-whisper-large-v2", "large-v2": "Systran/faster-whisper-large-v2",
"large": "guillaumekln/faster-whisper-large-v2", "large-v3": "Systran/faster-whisper-large-v3",
"large": "Systran/faster-whisper-large-v3",
} }
@@ -50,8 +51,8 @@ def download_model(
Args: Args:
size_or_id: Size of the model to download from https://huggingface.co/guillaumekln size_or_id: Size of the model to download from https://huggingface.co/Systran
(tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, (tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2,
large), or a CTranslate2-converted model ID from the Hugging Face Hub large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub
(e.g. guillaumekln/faster-whisper-large-v2). (e.g. Systran/faster-whisper-large-v3).
output_dir: Directory where the model should be saved. If not set, the model is saved in output_dir: Directory where the model should be saved. If not set, the model is saved in
the cache directory. the cache directory.
local_files_only: If True, avoid downloading the file and return the path to the local local_files_only: If True, avoid downloading the file and return the path to the local
@@ -76,6 +77,7 @@ def download_model(
allow_patterns = [ allow_patterns = [
"config.json", "config.json",
"preprocessor_config.json",
"model.bin", "model.bin",
"tokenizer.json", "tokenizer.json",
"vocabulary.*", "vocabulary.*",

View File

@@ -1,5 +1,5 @@
av==10.* av==10.*
ctranslate2>=3.17,<4 ctranslate2>=3.22,<4
huggingface_hub>=0.13 huggingface_hub>=0.13
tokenizers>=0.13,<0.15 tokenizers>=0.13,<0.16
onnxruntime>=1.14,<2 onnxruntime>=1.14,<2