Add V3 Support (#578)
* Add V3 Support * update conversion example --------- Co-authored-by: oscaarjs <oscar.johansson@conversy.se>
This commit is contained in:
14
README.md
14
README.md
@@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/
|
|||||||
```python
|
```python
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
model_size = "large-v2"
|
model_size = "large-v3"
|
||||||
|
|
||||||
# Run on GPU with FP16
|
# Run on GPU with FP16
|
||||||
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
||||||
@@ -185,17 +185,17 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
|
|||||||
|
|
||||||
## Model conversion
|
## Model conversion
|
||||||
|
|
||||||
When loading a model from its size such as `WhisperModel("large-v2")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln).
|
When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
|
||||||
|
|
||||||
We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
|
We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
|
||||||
|
|
||||||
For example the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16:
|
For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install "transformers[torch]>=4.23"
|
pip install "transformers[torch]>=4.23"
|
||||||
|
|
||||||
ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
|
ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
|
||||||
--copy_files tokenizer.json --quantization float16
|
--copy_files tokenizer.json preprocessor_config.json --quantization float16
|
||||||
```
|
```
|
||||||
|
|
||||||
* The option `--model` accepts a model name on the Hub or a path to a model directory.
|
* The option `--model` accepts a model name on the Hub or a path to a model directory.
|
||||||
@@ -207,12 +207,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
|
|||||||
|
|
||||||
1. Directly load the model from a local directory:
|
1. Directly load the model from a local directory:
|
||||||
```python
|
```python
|
||||||
model = faster_whisper.WhisperModel("whisper-large-v2-ct2")
|
model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
|
||||||
```
|
```
|
||||||
|
|
||||||
2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
|
2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
|
||||||
```python
|
```python
|
||||||
model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2")
|
model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
|
||||||
```
|
```
|
||||||
|
|
||||||
## Comparing performance against other implementations
|
## Comparing performance against other implementations
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ class Tokenizer:
|
|||||||
def split_to_word_tokens(
|
def split_to_word_tokens(
|
||||||
self, tokens: List[int]
|
self, tokens: List[int]
|
||||||
) -> Tuple[List[str], List[List[int]]]:
|
) -> Tuple[List[str], List[List[int]]]:
|
||||||
if self.language_code in {"zh", "ja", "th", "lo", "my"}:
|
if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
|
||||||
# These languages don't typically use spaces, so it is difficult to split words
|
# These languages don't typically use spaces, so it is difficult to split words
|
||||||
# without morpheme analysis. Here, we instead split words at any
|
# without morpheme analysis. Here, we instead split words at any
|
||||||
# position where the tokens are decoded as valid unicode points
|
# position where the tokens are decoded as valid unicode points
|
||||||
@@ -274,4 +274,5 @@ _LANGUAGE_CODES = (
|
|||||||
"yi",
|
"yi",
|
||||||
"yo",
|
"yo",
|
||||||
"zh",
|
"zh",
|
||||||
|
"yue",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
import itertools
|
import itertools
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import zlib
|
import zlib
|
||||||
|
|
||||||
|
from inspect import signature
|
||||||
from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
|
from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
|
||||||
|
|
||||||
import ctranslate2
|
import ctranslate2
|
||||||
@@ -92,8 +94,8 @@ class WhisperModel:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
|
model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
|
||||||
small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
|
small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
|
||||||
model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
|
converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
|
||||||
When a size or a model ID is configured, the converted model is downloaded
|
When a size or a model ID is configured, the converted model is downloaded
|
||||||
from the Hugging Face Hub.
|
from the Hugging Face Hub.
|
||||||
device: Device to use for computation ("cpu", "cuda", "auto").
|
device: Device to use for computation ("cpu", "cuda", "auto").
|
||||||
@@ -142,7 +144,8 @@ class WhisperModel:
|
|||||||
"openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
|
"openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
|
||||||
)
|
)
|
||||||
|
|
||||||
self.feature_extractor = FeatureExtractor()
|
self.feat_kwargs = self._get_feature_kwargs(model_path)
|
||||||
|
self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
|
||||||
self.num_samples_per_token = self.feature_extractor.hop_length * 2
|
self.num_samples_per_token = self.feature_extractor.hop_length * 2
|
||||||
self.frames_per_second = (
|
self.frames_per_second = (
|
||||||
self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
|
self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
|
||||||
@@ -159,6 +162,22 @@ class WhisperModel:
|
|||||||
"""The languages supported by the model."""
|
"""The languages supported by the model."""
|
||||||
return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
|
return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
|
||||||
|
|
||||||
|
def _get_feature_kwargs(self, model_path) -> dict:
    """Read feature-extractor settings from the model directory.

    Looks for ``preprocessor_config.json`` inside ``model_path`` and keeps only
    the entries whose names match parameters of ``FeatureExtractor.__init__``.

    Args:
      model_path: Directory that is expected to hold the converted model files.

    Returns:
      Keyword arguments to pass to ``FeatureExtractor``. The dictionary is empty
      when the file is missing, is not valid JSON, or does not contain a JSON
      object at the top level.
    """
    preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json")
    if not os.path.isfile(preprocessor_config_file):
        return {}

    try:
        with open(preprocessor_config_file, "r", encoding="utf-8") as json_file:
            loaded = json.load(json_file)
    except json.JSONDecodeError as e:
        self.logger.warning("Could not load preprocessor_config.json: %s", str(e))
        return {}

    # Valid JSON is not necessarily an object; a list or scalar has no
    # ``items()`` and would otherwise raise AttributeError here.
    if not isinstance(loaded, dict):
        self.logger.warning(
            "Could not load preprocessor_config.json: %s",
            "expected a JSON object at the top level",
        )
        return {}

    # ``self`` appears in the signature but must never be forwarded as a
    # keyword argument.
    valid_keys = set(signature(FeatureExtractor.__init__).parameters) - {"self"}
    return {k: v for k, v in loaded.items() if k in valid_keys}
|
||||||
|
|
||||||
def transcribe(
|
def transcribe(
|
||||||
self,
|
self,
|
||||||
audio: Union[str, BinaryIO, np.ndarray],
|
audio: Union[str, BinaryIO, np.ndarray],
|
||||||
|
|||||||
@@ -10,17 +10,18 @@ import requests
|
|||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
_MODELS = {
|
_MODELS = {
|
||||||
"tiny.en": "guillaumekln/faster-whisper-tiny.en",
|
"tiny.en": "Systran/faster-whisper-tiny.en",
|
||||||
"tiny": "guillaumekln/faster-whisper-tiny",
|
"tiny": "Systran/faster-whisper-tiny",
|
||||||
"base.en": "guillaumekln/faster-whisper-base.en",
|
"base.en": "Systran/faster-whisper-base.en",
|
||||||
"base": "guillaumekln/faster-whisper-base",
|
"base": "Systran/faster-whisper-base",
|
||||||
"small.en": "guillaumekln/faster-whisper-small.en",
|
"small.en": "Systran/faster-whisper-small.en",
|
||||||
"small": "guillaumekln/faster-whisper-small",
|
"small": "Systran/faster-whisper-small",
|
||||||
"medium.en": "guillaumekln/faster-whisper-medium.en",
|
"medium.en": "Systran/faster-whisper-medium.en",
|
||||||
"medium": "guillaumekln/faster-whisper-medium",
|
"medium": "Systran/faster-whisper-medium",
|
||||||
"large-v1": "guillaumekln/faster-whisper-large-v1",
|
"large-v1": "Systran/faster-whisper-large-v1",
|
||||||
"large-v2": "guillaumekln/faster-whisper-large-v2",
|
"large-v2": "Systran/faster-whisper-large-v2",
|
||||||
"large": "guillaumekln/faster-whisper-large-v2",
|
"large-v3": "Systran/faster-whisper-large-v3",
|
||||||
|
"large": "Systran/faster-whisper-large-v3",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -50,8 +51,8 @@ def download_model(
|
|||||||
Args:
|
Args:
|
||||||
size_or_id: Size of the model to download from https://huggingface.co/guillaumekln
|
size_or_id: Size of the model to download from https://huggingface.co/Systran
|
||||||
(tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2,
|
(tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2,
|
||||||
large), or a CTranslate2-converted model ID from the Hugging Face Hub
|
large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub
|
||||||
(e.g. guillaumekln/faster-whisper-large-v2).
|
(e.g. Systran/faster-whisper-large-v3).
|
||||||
output_dir: Directory where the model should be saved. If not set, the model is saved in
|
output_dir: Directory where the model should be saved. If not set, the model is saved in
|
||||||
the cache directory.
|
the cache directory.
|
||||||
local_files_only: If True, avoid downloading the file and return the path to the local
|
local_files_only: If True, avoid downloading the file and return the path to the local
|
||||||
@@ -76,6 +77,7 @@ def download_model(
|
|||||||
|
|
||||||
allow_patterns = [
|
allow_patterns = [
|
||||||
"config.json",
|
"config.json",
|
||||||
|
"preprocessor_config.json",
|
||||||
"model.bin",
|
"model.bin",
|
||||||
"tokenizer.json",
|
"tokenizer.json",
|
||||||
"vocabulary.*",
|
"vocabulary.*",
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
av==10.*
|
av==10.*
|
||||||
ctranslate2>=3.17,<4
|
ctranslate2>=3.22,<4
|
||||||
huggingface_hub>=0.13
|
huggingface_hub>=0.13
|
||||||
tokenizers>=0.13,<0.15
|
tokenizers>=0.13,<0.16
|
||||||
onnxruntime>=1.14,<2
|
onnxruntime>=1.14,<2
|
||||||
|
|||||||
Reference in New Issue
Block a user