Add V3 Support (#578)
* Add V3 Support

* update conversion example

---------

Co-authored-by: oscaarjs <oscar.johansson@conversy.se>
README.md

@@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/
 ```python
 from faster_whisper import WhisperModel
 
-model_size = "large-v2"
+model_size = "large-v3"
 
 # Run on GPU with FP16
 model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -185,17 +185,17 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
 
 ## Model conversion
 
-When loading a model from its size such as `WhisperModel("large-v2")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
 
 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
 
-For example the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16:
+For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
 
 ```bash
 pip install transformers[torch]>=4.23
 
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
---copy_files tokenizer.json --quantization float16
+ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
+--copy_files tokenizer.json preprocessor_config.json --quantization float16
 ```
 
 * The option `--model` accepts a model name on the Hub or a path to a model directory.
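Note the extended `--copy_files` argument: `preprocessor_config.json` is now copied alongside the tokenizer. This matters because large-v3 changed the audio front end (it expects 128 mel bins where earlier models used 80), and faster-whisper reads that value from the copied config. A minimal sanity check after conversion might look like the sketch below; the `whisper-large-v3-ct2` path and the expected `feature_size` of 128 are assumptions based on the Hugging Face `openai/whisper-large-v3` release, not part of this commit:

```python
import json
import os

output_dir = "whisper-large-v3-ct2"  # output of the conversion command above

# The converter should have copied preprocessor_config.json next to model.bin.
config_path = os.path.join(output_dir, "preprocessor_config.json")
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)

# large-v3 is expected to report 128 mel bins (earlier models used 80).
print(config.get("feature_size"))
```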
@@ -207,12 +207,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
 
 1. Directly load the model from a local directory:
 ```python
-model = faster_whisper.WhisperModel("whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
 ```
 
 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
 ```python
-model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
 ```
 
 ## Comparing performance against other implementations
faster_whisper/tokenizer.py

@@ -108,7 +108,7 @@ class Tokenizer:
     def split_to_word_tokens(
         self, tokens: List[int]
     ) -> Tuple[List[str], List[List[int]]]:
-        if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
             # These languages don't typically use spaces, so it is difficult to split words
             # without morpheme analysis. Here, we instead split words at any
             # position where the tokens are decoded as valid unicode points
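The comment above describes the fallback now also applied to Cantonese (`yue`). A rough sketch of the idea, not the library's exact code: tokens are accumulated until they decode to text free of the U+FFFD replacement character, i.e. to complete unicode points, and each such chunk becomes one "word". The `decode` callable stands in for the tokenizer's decoder:

```python
from typing import Callable, List, Tuple

def split_tokens_on_unicode(
    tokens: List[int], decode: Callable[[List[int]], str]
) -> Tuple[List[str], List[List[int]]]:
    words: List[str] = []
    word_tokens: List[List[int]] = []
    current: List[int] = []
    for token in tokens:
        current.append(token)
        decoded = decode(current)
        # U+FFFD means the accumulated bytes still form an incomplete code point.
        if "\ufffd" not in decoded:
            words.append(decoded)
            word_tokens.append(current)
            current = []
    return words, word_tokens
```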
@@ -274,4 +274,5 @@ _LANGUAGE_CODES = (
     "yi",
     "yo",
     "zh",
+    "yue",
 )
faster_whisper/transcribe.py

@@ -1,8 +1,10 @@
 import itertools
+import json
 import logging
 import os
 import zlib
 
+from inspect import signature
 from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
 
 import ctranslate2
@@ -92,8 +94,8 @@ class WhisperModel:
 
         Args:
           model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
-            small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
-            model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
+            small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
+            converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
             When a size or a model ID is configured, the converted model is downloaded
             from the Hugging Face Hub.
           device: Device to use for computation ("cpu", "cuda", "auto").
@@ -142,7 +144,8 @@ class WhisperModel:
             "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
         )
 
-        self.feature_extractor = FeatureExtractor()
+        self.feat_kwargs = self._get_feature_kwargs(model_path)
+        self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
         self.num_samples_per_token = self.feature_extractor.hop_length * 2
         self.frames_per_second = (
             self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
@@ -159,6 +162,22 @@ class WhisperModel:
         """The languages supported by the model."""
         return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
 
+    def _get_feature_kwargs(self, model_path) -> dict:
+        preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json")
+        config = {}
+        if os.path.isfile(preprocessor_config_file):
+            try:
+                with open(preprocessor_config_file, "r", encoding="utf-8") as json_file:
+                    config = json.load(json_file)
+                valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
+                config = {k: v for k, v in config.items() if k in valid_keys}
+            except json.JSONDecodeError as e:
+                self.logger.warning(
+                    "Could not load preprocessor_config.json: %s", str(e)
+                )
+
+        return config
+
     def transcribe(
         self,
         audio: Union[str, BinaryIO, np.ndarray],
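This is the piece that makes large-v3's 128-bin features work end to end: whatever `preprocessor_config.json` sits next to the downloaded or converted model is loaded, and only the keys that `FeatureExtractor.__init__` actually accepts are passed through, so extra Transformers-specific fields are ignored. A small self-contained illustration of that signature-based filtering (`FakeFeatureExtractor` and the config values are made up for the example):

```python
from inspect import signature

class FakeFeatureExtractor:
    # Stand-in for faster_whisper's FeatureExtractor; only the parameter
    # names matter for the filtering shown below.
    def __init__(self, feature_size=80, sampling_rate=16000):
        self.feature_size = feature_size
        self.sampling_rate = sampling_rate

# A preprocessor_config.json typically carries fields the extractor
# does not accept, alongside the ones it does.
config = {"feature_size": 128, "processor_class": "WhisperProcessor"}

valid_keys = signature(FakeFeatureExtractor.__init__).parameters.keys()
filtered = {k: v for k, v in config.items() if k in valid_keys}

print(filtered)  # {'feature_size': 128} -- unknown keys are dropped
extractor = FakeFeatureExtractor(**filtered)
```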
faster_whisper/utils.py

@@ -10,17 +10,18 @@ import requests
 from tqdm.auto import tqdm
 
 _MODELS = {
-    "tiny.en": "guillaumekln/faster-whisper-tiny.en",
-    "tiny": "guillaumekln/faster-whisper-tiny",
-    "base.en": "guillaumekln/faster-whisper-base.en",
-    "base": "guillaumekln/faster-whisper-base",
-    "small.en": "guillaumekln/faster-whisper-small.en",
-    "small": "guillaumekln/faster-whisper-small",
-    "medium.en": "guillaumekln/faster-whisper-medium.en",
-    "medium": "guillaumekln/faster-whisper-medium",
-    "large-v1": "guillaumekln/faster-whisper-large-v1",
-    "large-v2": "guillaumekln/faster-whisper-large-v2",
-    "large": "guillaumekln/faster-whisper-large-v2",
+    "tiny.en": "Systran/faster-whisper-tiny.en",
+    "tiny": "Systran/faster-whisper-tiny",
+    "base.en": "Systran/faster-whisper-base.en",
+    "base": "Systran/faster-whisper-base",
+    "small.en": "Systran/faster-whisper-small.en",
+    "small": "Systran/faster-whisper-small",
+    "medium.en": "Systran/faster-whisper-medium.en",
+    "medium": "Systran/faster-whisper-medium",
+    "large-v1": "Systran/faster-whisper-large-v1",
+    "large-v2": "Systran/faster-whisper-large-v2",
+    "large-v3": "Systran/faster-whisper-large-v3",
+    "large": "Systran/faster-whisper-large-v3",
 }
 
 
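Two behavior changes are worth noticing in this mapping: every repository moves from the guillaumekln namespace to Systran, and the bare "large" alias now silently resolves to large-v3 instead of large-v2. A minimal sketch of the lookup, under the assumption that the real `download_model` also handles caching, `allow_patterns`, and full repo IDs:

```python
_MODELS = {
    "large-v2": "Systran/faster-whisper-large-v2",
    "large-v3": "Systran/faster-whisper-large-v3",
    "large": "Systran/faster-whisper-large-v3",
}

def resolve_repo_id(size_or_id: str) -> str:
    # Sizes map through the table; anything else is assumed to already be
    # a CTranslate2-converted model ID on the Hugging Face Hub.
    return _MODELS.get(size_or_id, size_or_id)

print(resolve_repo_id("large"))  # Systran/faster-whisper-large-v3
print(resolve_repo_id("username/whisper-large-v3-ct2"))  # passed through
```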
@@ -50,8 +51,8 @@ def download_model(
     Args:
       size_or_id: Size of the model to download from https://huggingface.co/guillaumekln
         (tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2,
-        large), or a CTranslate2-converted model ID from the Hugging Face Hub
-        (e.g. guillaumekln/faster-whisper-large-v2).
+        large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub
+        (e.g. Systran/faster-whisper-large-v3).
       output_dir: Directory where the model should be saved. If not set, the model is saved in
         the cache directory.
       local_files_only: If True, avoid downloading the file and return the path to the local
@@ -76,6 +77,7 @@ def download_model(
 
     allow_patterns = [
         "config.json",
+        "preprocessor_config.json",
         "model.bin",
         "tokenizer.json",
         "vocabulary.*",
requirements.txt

@@ -1,5 +1,5 @@
 av==10.*
-ctranslate2>=3.17,<4
+ctranslate2>=3.22,<4
 huggingface_hub>=0.13
-tokenizers>=0.13,<0.15
+tokenizers>=0.13,<0.16
 onnxruntime>=1.14,<2