From 19c294f978be4991ba303da6af358f6acd251a25 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 3 Jul 2023 10:20:20 +0200 Subject: [PATCH 01/37] Squash long words at window and sentence boundaries (#226) Port commit https://github.com/openai/whisper/commit/255887f219e6b632bc1a6aac1caf28eecfca1bac --- faster_whisper/transcribe.py | 40 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 71b0ea1..b88686e 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -732,9 +732,19 @@ class WhisperModel: word_index += 1 if len(words) > 0: - # adjust the segment-level timestamps based on the word-level timestamps segment["start"] = words[0]["start"] - segment["end"] = words[-1]["end"] + + # hack: prefer the segment-level end timestamp if the last word is too long. + # a better segmentation algorithm based on VAD should be able to replace this. + if ( + segment["end"] > words[-1]["start"] + and segment["end"] + 0.5 < words[-1]["end"] + ): + # adjust the word-level timestamps based on the segment-level timestamps + words[-1]["end"] = segment["end"] + else: + # adjust the segment-level timestamps based on the word-level timestamps + segment["end"] = words[-1]["end"] segment["words"] = words @@ -779,20 +789,30 @@ class WhisperModel: for i, j in zip(word_boundaries[:-1], word_boundaries[1:]) ] - # hack: ensure the first and second word is not longer than twice the median word duration. + # hack: truncate long words at the start of a window and the start of a sentence. # a better segmentation algorithm based on VAD should be able to replace this. word_durations = end_times - start_times word_durations = word_durations[word_durations.nonzero()] if len(word_durations) > 0: median_duration = np.median(word_durations) max_duration = median_duration * 2 - if len(word_durations) >= 2 and word_durations[1] > max_duration: - boundary = max(end_times[2] / 2, end_times[2] - max_duration) - end_times[0] = start_times[1] = boundary - if ( - len(word_durations) >= 1 - and end_times[0] - start_times[0] > max_duration - ): + sentence_end_marks = ".。!!??" + # ensure words at sentence boundaries are not longer than twice the median + # word duration. + for i in range(1, len(start_times)): + if end_times[i] - start_times[i] > max_duration: + if words[i] in sentence_end_marks: + end_times[i] = start_times[i] + max_duration + elif words[i - 1] in sentence_end_marks: + start_times[i] = end_times[i] - max_duration + # ensure the first and second word is not longer than twice the median word duration. 
+ if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration: + if ( + len(start_times) > 1 + and end_times[1] - start_times[1] > max_duration + ): + boundary = max(end_times[1] / 2, end_times[1] - max_duration) + end_times[0] = start_times[1] = boundary start_times[0] = max(0, end_times[0] - max_duration) return [ From c0d93d0829ccbf658e68b6b80ec0c2650bc902a8 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 3 Jul 2023 10:20:36 +0200 Subject: [PATCH 02/37] Avoid computing higher temperatures on no_speech segments (#225) Port commit https://github.com/openai/whisper/commit/e334ff141d5444fbf6904edaaf408e5b0b416fe8 --- faster_whisper/transcribe.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index b88686e..017d398 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -649,6 +649,12 @@ class WhisperModel: options.log_prob_threshold, ) + if ( + options.no_speech_threshold is not None + and result.no_speech_prob > options.no_speech_threshold + ): + needs_fallback = False # silence + if not needs_fallback: break From c7cb2aa8d409b106f2b69b2f9637f59f6733f36e Mon Sep 17 00:00:00 2001 From: zh-plus Date: Mon, 3 Jul 2023 23:40:10 +0800 Subject: [PATCH 03/37] Add support for using whisper models from Huggingface by specifying the model id. (#334) * Add support for downloading CTranslate-converted models from Huggingface. * Update utils.py to pass Flake8. * Update utils.py to pass black. * Remove redundant usage instructions. * Apply suggestions from code review Co-authored-by: Guillaume Klein --------- Co-authored-by: Guillaume Klein --- README.md | 12 ++++++++++++ faster_whisper/transcribe.py | 5 +++-- faster_whisper/utils.py | 22 ++++++++++++++-------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index daee860..54d7445 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,18 @@ ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper- Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html). +### Load a converted model + +1. Directly load the model from a local directory: +```python +model = faster_whisper.WhisperModel('whisper-large-v2-ct2') +``` + +2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name: +```python +model = faster_whisper.WhisperModel('username/whisper-large-v2-ct2') +``` + ## Comparing performance against other implementations If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular: diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 017d398..cfb2e8a 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -88,8 +88,9 @@ class WhisperModel: Args: model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en, - small, small.en, medium, medium.en, large-v1, or large-v2) or a path to a converted - model directory. When a size is configured, the converted model is downloaded + small, small.en, medium, medium.en, large-v1, or large-v2), a path to a converted + model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub. 
+ When a size or a model ID is configured, the converted model is downloaded from the Hugging Face Hub. device: Device to use for computation ("cpu", "cuda", "auto"). device_index: Device ID to use. diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 950b0da..e86b89e 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -1,5 +1,6 @@ import logging import os +import re from typing import Optional @@ -33,7 +34,7 @@ def get_logger(): def download_model( - size: str, + size_or_id: str, output_dir: Optional[str] = None, local_files_only: bool = False, cache_dir: Optional[str] = None, @@ -43,8 +44,9 @@ def download_model( The model is downloaded from https://huggingface.co/guillaumekln. Args: - size: Size of the model to download (tiny, tiny.en, base, base.en, small, small.en, - medium, medium.en, large-v1, or large-v2). + size_or_id: Size of the model to download (tiny, tiny.en, base, base.en, small, small.en, + medium, medium.en, large-v1, or large-v2), or a CTranslate2-converted model ID + from the Hugging Face Hub (e.g. guillaumekln/faster-whisper-large-v2). output_dir: Directory where the model should be saved. If not set, the model is saved in the cache directory. local_files_only: If True, avoid downloading the file and return the path to the local @@ -57,12 +59,16 @@ def download_model( Raises: ValueError: if the model size is invalid. """ - if size not in _MODELS: - raise ValueError( - "Invalid model size '%s', expected one of: %s" % (size, ", ".join(_MODELS)) - ) + if re.match(r".*/.*", size_or_id): + repo_id = size_or_id + else: + if size_or_id not in _MODELS: + raise ValueError( + "Invalid model size '%s', expected one of: %s" + % (size_or_id, ", ".join(_MODELS)) + ) - repo_id = "guillaumekln/faster-whisper-%s" % size + repo_id = "guillaumekln/faster-whisper-%s" % size_or_id allow_patterns = [ "config.json", From 3b4a6aa1c22d293ddde9f08bdd31fc842086a6ea Mon Sep 17 00:00:00 2001 From: Hoon Date: Wed, 5 Jul 2023 22:16:53 +0900 Subject: [PATCH 04/37] Improve timestamp heuristics (#336) * Improve timestamp heuristics * Chore --- faster_whisper/transcribe.py | 95 +++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index cfb2e8a..8cb492d 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -370,6 +370,7 @@ class WhisperModel: else: all_tokens.extend(options.initial_prompt) + last_speech_timestamp = 0.0 while seek < content_frames: time_offset = seek * self.feature_extractor.time_per_frame segment = features[:, seek : seek + self.feature_extractor.nb_max_frames] @@ -511,12 +512,14 @@ class WhisperModel: segment_size, options.prepend_punctuations, options.append_punctuations, + last_speech_timestamp=last_speech_timestamp, ) word_end_timestamps = [ w["end"] for s in current_segments for w in s["words"] ] - + if len(word_end_timestamps) > 0: + last_speech_timestamp = word_end_timestamps[-1] if not single_timestamp_ending and len(word_end_timestamps) > 0: seek_shift = round( (word_end_timestamps[-1] - time_offset) * self.frames_per_second @@ -695,6 +698,7 @@ class WhisperModel: num_frames: int, prepend_punctuations: str, append_punctuations: str, + last_speech_timestamp: float, ): if len(segments) == 0: return @@ -708,6 +712,26 @@ class WhisperModel: alignment = self.find_alignment( tokenizer, text_tokens, encoder_output, num_frames ) + word_durations = np.array([word["end"] - word["start"] for word in alignment]) + 
word_durations = word_durations[word_durations.nonzero()] + median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0 + max_duration = median_duration * 2 + + # hack: truncate long words at sentence boundaries. + # a better segmentation algorithm based on VAD should be able to replace this. + if len(word_durations) > 0: + median_duration = np.median(word_durations) + max_duration = median_duration * 2 + sentence_end_marks = ".。!!??" + # ensure words at sentence boundaries + # are not longer than twice the median word duration. + for i in range(1, len(alignment)): + if alignment[i]["end"] - alignment[i]["start"] > max_duration: + if alignment[i]["word"] in sentence_end_marks: + alignment[i]["end"] = alignment[i]["start"] + max_duration + elif alignment[i - 1]["word"] in sentence_end_marks: + alignment[i]["start"] = alignment[i]["end"] - max_duration + merge_punctuations(alignment, prepend_punctuations, append_punctuations) time_offset = ( @@ -738,21 +762,52 @@ class WhisperModel: saved_tokens += len(timing["tokens"]) word_index += 1 + # hack: truncate long words at segment boundaries. + # a better segmentation algorithm based on VAD should be able to replace this. if len(words) > 0: - segment["start"] = words[0]["start"] + # ensure the first and second word after a pause is not longer than + # twice the median word duration. + if words[0]["end"] - last_speech_timestamp > median_duration * 4 and ( + words[0]["end"] - words[0]["start"] > max_duration + or ( + len(words) > 1 + and words[1]["end"] - words[0]["start"] > max_duration * 2 + ) + ): + if ( + len(words) > 1 + and words[1]["end"] - words[1]["start"] > max_duration + ): + boundary = max( + words[1]["end"] / 2, words[1]["end"] - max_duration + ) + words[0]["end"] = words[1]["start"] = boundary + words[0]["start"] = max(0, words[0]["end"] - max_duration) - # hack: prefer the segment-level end timestamp if the last word is too long. - # a better segmentation algorithm based on VAD should be able to replace this. + # prefer the segment-level start timestamp if the first word is too long. + if ( + segment["start"] < words[0]["end"] + and segment["start"] - 0.5 > words[0]["start"] + ): + words[0]["start"] = max( + 0, min(words[0]["end"] - median_duration, segment["start"]) + ) + else: + segment["start"] = words[0]["start"] + + # prefer the segment-level end timestamp if the last word is too long. if ( segment["end"] > words[-1]["start"] and segment["end"] + 0.5 < words[-1]["end"] ): - # adjust the word-level timestamps based on the segment-level timestamps - words[-1]["end"] = segment["end"] + words[-1]["end"] = max( + words[-1]["start"] + median_duration, segment["end"] + ) else: - # adjust the segment-level timestamps based on the word-level timestamps segment["end"] = words[-1]["end"] + last_speech_timestamp = segment["end"] + segment["words"] = words def find_alignment( @@ -796,32 +851,6 @@ class WhisperModel: for i, j in zip(word_boundaries[:-1], word_boundaries[1:]) ] - # hack: truncate long words at the start of a window and the start of a sentence. - # a better segmentation algorithm based on VAD should be able to replace this. - word_durations = end_times - start_times - word_durations = word_durations[word_durations.nonzero()] - if len(word_durations) > 0: - median_duration = np.median(word_durations) - max_duration = median_duration * 2 - sentence_end_marks = ".。!!??" - # ensure words at sentence boundaries are not longer than twice the median - # word duration. 
- for i in range(1, len(start_times)): - if end_times[i] - start_times[i] > max_duration: - if words[i] in sentence_end_marks: - end_times[i] = start_times[i] + max_duration - elif words[i - 1] in sentence_end_marks: - start_times[i] = end_times[i] - max_duration - # ensure the first and second word is not longer than twice the median word duration. - if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration: - if ( - len(start_times) > 1 - and end_times[1] - start_times[1] > max_duration - ): - boundary = max(end_times[1] / 2, end_times[1] - max_duration) - end_times[0] = start_times[1] = boundary - start_times[0] = max(0, end_times[0] - max_duration) - return [ dict( word=word, tokens=tokens, start=start, end=end, probability=probability From 2a37390fed873f0c6d645ca234eab2bc0c9b16e5 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 18 Jul 2023 15:08:53 +0200 Subject: [PATCH 05/37] Minor reformatting in code snippet --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 54d7445..44685e6 100644 --- a/README.md +++ b/README.md @@ -165,12 +165,12 @@ Models can also be converted from the code. See the [conversion API](https://ope 1. Directly load the model from a local directory: ```python -model = faster_whisper.WhisperModel('whisper-large-v2-ct2') +model = faster_whisper.WhisperModel("whisper-large-v2-ct2") ``` 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name: ```python -model = faster_whisper.WhisperModel('username/whisper-large-v2-ct2') +model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2") ``` ## Comparing performance against other implementations From 0e051a5b7751d4a609033dbc84c8ecee43d8955f Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 18 Jul 2023 15:22:39 +0200 Subject: [PATCH 06/37] Prepend prefix tokens with the initial timestamp token (#358) --- faster_whisper/transcribe.py | 2 ++ requirements.txt | 2 +- tests/test_transcribe.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 8cb492d..538bb93 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -686,6 +686,8 @@ class WhisperModel: prefix_tokens = tokenizer.encode(" " + prefix.strip()) if len(prefix_tokens) >= self.max_length // 2: prefix_tokens = prefix_tokens[: self.max_length // 2 - 1] + if not without_timestamps: + prompt.append(tokenizer.timestamp_begin) prompt.extend(prefix_tokens) return prompt diff --git a/requirements.txt b/requirements.txt index 4dd8bac..819d3d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ av==10.* -ctranslate2>=3.10,<4 +ctranslate2>=3.17,<4 huggingface_hub>=0.13 tokenizers==0.13.* onnxruntime>=1.14,<2 diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 6ecf2c4..ca8d5a9 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -34,6 +34,24 @@ def test_transcribe(jfk_path): assert segment.end == segment.words[-1].end +def test_prefix_with_timestamps(jfk_path): + model = WhisperModel("tiny") + segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans") + segments = list(segments) + + assert len(segments) == 1 + + segment = segments[0] + + assert segment.text == ( + " And so my fellow Americans ask not what your country can do for you, " + "ask what you can do for your country." 
+ ) + + assert segment.start == 0 + assert 10 < segment.end < 11 + + def test_vad(jfk_path): model = WhisperModel("tiny") segments, info = model.transcribe( From 171d90dd1f4827376a33b75e24bea59f870bb734 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 18 Jul 2023 15:23:47 +0200 Subject: [PATCH 07/37] Bump version to 0.7.0 --- faster_whisper/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index bf288f0..26a803c 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.6.0" +__version__ = "0.7.0" From 687db319e0ba10f47239415dd74e8bfbd480d433 Mon Sep 17 00:00:00 2001 From: KH Date: Tue, 18 Jul 2023 23:03:01 +0900 Subject: [PATCH 08/37] Remove duplicate code (#359) --- faster_whisper/transcribe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 538bb93..e4cf904 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -722,8 +722,6 @@ class WhisperModel: # hack: truncate long words at sentence boundaries. # a better segmentation algorithm based on VAD should be able to replace this. if len(word_durations) > 0: - median_duration = np.median(word_durations) - max_duration = median_duration * 2 sentence_end_marks = ".。!!??" # ensure words at sentence boundaries # are not longer than twice the median word duration. From e786e26f75f49b7d638412f3bf2b2b75a9c3c9e8 Mon Sep 17 00:00:00 2001 From: KH Date: Thu, 20 Jul 2023 23:13:11 +0900 Subject: [PATCH 09/37] Return result with best log prob when all temperature fallbacks failed (#356) * Resolve Inference Selection Bug * Refactor for better readability * Filter out results with compression_ratio * Refactor to avoid variable repetition * Fix incorrect index and perform minor refactoring * Remove final_temperature variable --- faster_whisper/transcribe.py | 45 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index e4cf904..c351c0a 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -578,10 +578,9 @@ class WhisperModel: tokenizer: Tokenizer, options: TranscriptionOptions, ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]: - result = None - avg_logprob = None - final_temperature = None - compression_ratio = None + decode_result = None + all_results = [] + below_cr_threshold_results = [] max_initial_timestamp_index = int( round(options.max_initial_timestamp / self.time_precision) @@ -601,7 +600,6 @@ class WhisperModel: "patience": options.patience, } - final_temperature = temperature result = self.model.generate( encoder_output, [prompt], @@ -625,20 +623,28 @@ class WhisperModel: text = tokenizer.decode(tokens).strip() compression_ratio = get_compression_ratio(text) + decode_result = ( + result, + avg_logprob, + temperature, + compression_ratio, + ) + all_results.append(decode_result) + needs_fallback = False - if ( - options.compression_ratio_threshold is not None - and compression_ratio > options.compression_ratio_threshold - ): - needs_fallback = True # too repetitive + if options.compression_ratio_threshold is not None: + if compression_ratio > options.compression_ratio_threshold: + needs_fallback = True # too repetitive - self.logger.debug( - "Compression ratio threshold is not met with temperature %.1f (%f > %f)", - temperature, - 
compression_ratio, - options.compression_ratio_threshold, - ) + self.logger.debug( + "Compression ratio threshold is not met with temperature %.1f (%f > %f)", + temperature, + compression_ratio, + options.compression_ratio_threshold, + ) + else: + below_cr_threshold_results.append(decode_result) if ( options.log_prob_threshold is not None @@ -661,8 +667,13 @@ class WhisperModel: if not needs_fallback: break + else: + # all failed, select the result with the highest average log probability + decode_result = max( + below_cr_threshold_results or all_results, key=lambda x: x[1] + ) - return result, avg_logprob, final_temperature, compression_ratio + return decode_result def get_prompt( self, From 0f55c436fe4ac2e5417366b4eda168ecee6b68eb Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 24 Jul 2023 10:57:15 +0200 Subject: [PATCH 10/37] Invalidate the cached encoder output when no_speech threshold is met (#376) --- faster_whisper/transcribe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index c351c0a..70717f5 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -422,6 +422,7 @@ class WhisperModel: # fast-forward to the next segment boundary seek += segment_size + encoder_output = None continue tokens = result.sequences_ids[0] From 5c17de17713f65929c7c33add3a9735ff75a945c Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 24 Jul 2023 11:10:12 +0200 Subject: [PATCH 11/37] Bump version to 0.7.1 --- faster_whisper/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index 26a803c..c8de23a 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.7.0" +__version__ = "0.7.1" From 1a1eb1a027c0bccabd86759b9772509f9c29f9dc Mon Sep 17 00:00:00 2001 From: KH Date: Thu, 3 Aug 2023 22:40:58 +0900 Subject: [PATCH 12/37] Add clear_previous_text_on_temperature parameter (#397) * Add clear_previous_text_on_temperature parameter * Add a description --- faster_whisper/transcribe.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 70717f5..2f9b2a3 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -51,6 +51,7 @@ class TranscriptionOptions(NamedTuple): no_speech_threshold: Optional[float] compression_ratio_threshold: Optional[float] condition_on_previous_text: bool + clear_previous_text_on_temperature: float temperatures: List[float] initial_prompt: Optional[Union[str, Iterable[int]]] prefix: Optional[str] @@ -171,6 +172,7 @@ class WhisperModel: log_prob_threshold: Optional[float] = -1.0, no_speech_threshold: Optional[float] = 0.6, condition_on_previous_text: bool = True, + clear_previous_text_on_temperature: float = 0.5, initial_prompt: Optional[Union[str, Iterable[int]]] = None, prefix: Optional[str] = None, suppress_blank: bool = True, @@ -209,6 +211,8 @@ class WhisperModel: as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. + clear_previous_text_on_temperature: If the temperature is above this value, + clear the previous text. initial_prompt: Optional text string or iterable of token ids to provide as a prompt for the first window. 
prefix: Optional text to provide as a prefix for the first window. @@ -319,6 +323,7 @@ class WhisperModel: no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, condition_on_previous_text=condition_on_previous_text, + clear_previous_text_on_temperature=clear_previous_text_on_temperature, temperatures=( temperature if isinstance(temperature, (list, tuple)) else [temperature] ), @@ -559,7 +564,10 @@ class WhisperModel: ), ) - if not options.condition_on_previous_text or temperature > 0.5: + if ( + not options.condition_on_previous_text + or temperature > options.clear_previous_text_on_temperature + ): prompt_reset_since = len(all_tokens) def encode(self, features: np.ndarray) -> ctranslate2.StorageView: From 857be6f621c4b940cf784925cd2c94e40d16235f Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Thu, 3 Aug 2023 17:44:37 +0100 Subject: [PATCH 13/37] Rename clear_previous_text_on_temperature argument (#398) `prompt_reset_on_temperature` is more clear what it does. --- faster_whisper/transcribe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 2f9b2a3..6d6ef41 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -51,7 +51,7 @@ class TranscriptionOptions(NamedTuple): no_speech_threshold: Optional[float] compression_ratio_threshold: Optional[float] condition_on_previous_text: bool - clear_previous_text_on_temperature: float + prompt_reset_on_temperature: float temperatures: List[float] initial_prompt: Optional[Union[str, Iterable[int]]] prefix: Optional[str] @@ -172,7 +172,7 @@ class WhisperModel: log_prob_threshold: Optional[float] = -1.0, no_speech_threshold: Optional[float] = 0.6, condition_on_previous_text: bool = True, - clear_previous_text_on_temperature: float = 0.5, + prompt_reset_on_temperature: float = 0.5, initial_prompt: Optional[Union[str, Iterable[int]]] = None, prefix: Optional[str] = None, suppress_blank: bool = True, @@ -211,8 +211,8 @@ class WhisperModel: as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. - clear_previous_text_on_temperature: If the temperature is above this value, - clear the previous text. + prompt_reset_on_temperature: Resets prompt if temperature is above this value. + Arg has effect only if condition_on_previous_text is True. initial_prompt: Optional text string or iterable of token ids to provide as a prompt for the first window. prefix: Optional text to provide as a prefix for the first window. 
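(For illustration only, not part of the patch: a minimal sketch of the renamed option in use; the model size and file name are placeholders.)

```python
from faster_whisper import WhisperModel

model = WhisperModel("tiny")

# The prompt built from the previous window is discarded whenever a segment
# had to fall back to a sampling temperature above the given threshold.
segments, info = model.transcribe(
    "audio.wav",
    condition_on_previous_text=True,
    prompt_reset_on_temperature=0.5,
)
```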
@@ -323,7 +323,7 @@ class WhisperModel: no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, condition_on_previous_text=condition_on_previous_text, - clear_previous_text_on_temperature=clear_previous_text_on_temperature, + prompt_reset_on_temperature=prompt_reset_on_temperature, temperatures=( temperature if isinstance(temperature, (list, tuple)) else [temperature] ), @@ -566,7 +566,7 @@ class WhisperModel: if ( not options.condition_on_previous_text - or temperature > options.clear_previous_text_on_temperature + or temperature > options.prompt_reset_on_temperature ): prompt_reset_since = len(all_tokens) From 1ce16652ee1da3864c34356e4f7d61eb4f941b20 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Fri, 4 Aug 2023 08:06:17 +0100 Subject: [PATCH 14/37] Adds DEBUG log message for prompt_reset_on_temperature (#399) Produce DEBUG log message if prompt_reset_on_temperature threshold is met. --- faster_whisper/transcribe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 6d6ef41..593d1be 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -568,6 +568,13 @@ class WhisperModel: not options.condition_on_previous_text or temperature > options.prompt_reset_on_temperature ): + if options.condition_on_previous_text: + self.logger.debug( + "Reset prompt. prompt_reset_on_temperature threshold is met %f > %f", + temperature, + options.prompt_reset_on_temperature, + ) + prompt_reset_since = len(all_tokens) def encode(self, features: np.ndarray) -> ctranslate2.StorageView: From 1562b02345c8587eff1c6694b28c7f91f5c20ec2 Mon Sep 17 00:00:00 2001 From: Aisu Wata Date: Sun, 6 Aug 2023 05:08:24 -0300 Subject: [PATCH 15/37] added repetition_penalty to TranscriptionOptions (#403) Co-authored-by: Aisu Wata --- faster_whisper/transcribe.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 593d1be..ba55adc 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -47,6 +47,7 @@ class TranscriptionOptions(NamedTuple): best_of: int patience: float length_penalty: float + repetition_penalty: float log_prob_threshold: Optional[float] no_speech_threshold: Optional[float] compression_ratio_threshold: Optional[float] @@ -160,6 +161,7 @@ class WhisperModel: best_of: int = 5, patience: float = 1, length_penalty: float = 1, + repetition_penalty: float = 1, temperature: Union[float, List[float], Tuple[float, ...]] = [ 0.0, 0.2, @@ -197,6 +199,8 @@ class WhisperModel: best_of: Number of candidates when sampling with non-zero temperature. patience: Beam search patience factor. length_penalty: Exponential length penalty constant. + repetition_penalty: Penalty applied to the score of previously generated tokens + (set > 1 to penalize). temperature: Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `log_prob_threshold`. 
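(A hedged usage sketch, not part of the patch; the file name and penalty value are placeholders.)

```python
from faster_whisper import WhisperModel

model = WhisperModel("tiny")

# Values above 1 down-weight tokens that were already generated, which can
# reduce the repetition loops that the compression-ratio check guards against.
segments, info = model.transcribe("audio.wav", repetition_penalty=1.2)
```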
@@ -319,6 +323,7 @@ class WhisperModel: best_of=best_of, patience=patience, length_penalty=length_penalty, + repetition_penalty=repetition_penalty, log_prob_threshold=log_prob_threshold, no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, @@ -620,6 +625,7 @@ class WhisperModel: encoder_output, [prompt], length_penalty=options.length_penalty, + repetition_penalty=options.repetition_penalty, max_length=self.max_length, return_scores=True, return_no_speech_prob=True, From 7b271da0351e4f81f80e8bb4d2c21c9406475aa9 Mon Sep 17 00:00:00 2001 From: Hrishikesh Barman Date: Thu, 17 Aug 2023 12:20:24 +0530 Subject: [PATCH 16/37] docs: add wscribe to community integrations (#427) wscribe is a utility to generate transcript specifically to make it easy for further manual edits accompanied by the wscribe-editor --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44685e6..1993271 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of faster-whisper for Windows. * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines. * [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT. +* [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper, it can export word level transcript and the exported transcript then can be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor) ## Model conversion From e87fbf8a496699ce79ed6a4d2e52af389eed7197 Mon Sep 17 00:00:00 2001 From: MinorJinx <47308878+MinorJinx@users.noreply.github.com> Date: Thu, 31 Aug 2023 10:19:48 -0500 Subject: [PATCH 17/37] Added audio duration after VAD to TranscriptionInfo object (#445) * Added VAD removed audio duration to TranscriptionInfo object Along with the duration of the original audio, this commit adds the seconds of audio removed by the VAD to the returned info obj * Chaning naming for duration_after_vad Instead of the property returning the audio duration removed, it now returns the final duration after the vad. If vad_filter is False or if it doesn't remove any audio, the original duration is returned. 
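A minimal sketch of reading the new field (the model size and file name are
illustrative placeholders):

    from faster_whisper import WhisperModel

    model = WhisperModel("tiny")
    segments, info = model.transcribe("audio.wav", vad_filter=True)

    # duration is the length of the original audio; duration_after_vad is
    # what remains once the VAD has removed non-speech (the two are equal
    # when vad_filter is False or nothing was removed).
    print(info.duration, info.duration_after_vad)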
--- faster_whisper/transcribe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index ba55adc..7ff27d2 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -69,6 +69,7 @@ class TranscriptionInfo(NamedTuple): language: str language_probability: float duration: float + duration_after_vad: float all_language_probs: Optional[List[Tuple[str, float]]] transcription_options: TranscriptionOptions vad_options: VadOptions @@ -249,6 +250,7 @@ class WhisperModel: audio = decode_audio(audio, sampling_rate=sampling_rate) duration = audio.shape[0] / sampling_rate + duration_after_vad = duration self.logger.info( "Processing audio with duration %s", format_timestamp(duration) @@ -261,10 +263,11 @@ class WhisperModel: vad_parameters = VadOptions(**vad_parameters) speech_chunks = get_speech_timestamps(audio, vad_parameters) audio = collect_chunks(audio, speech_chunks) + duration_after_vad = audio.shape[0] / sampling_rate self.logger.info( "VAD filter removed %s of audio", - format_timestamp(duration - (audio.shape[0] / sampling_rate)), + format_timestamp(duration - duration_after_vad), ) if self.logger.isEnabledFor(logging.DEBUG): @@ -352,6 +355,7 @@ class WhisperModel: language=language, language_probability=language_probability, duration=duration, + duration_after_vad=duration_after_vad, transcription_options=options, vad_options=vad_parameters, all_language_probs=all_language_probs, From 5871858a5f12536e823a70f8e77265a462ff37b9 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Fri, 1 Sep 2023 15:25:13 +0200 Subject: [PATCH 18/37] Force the garbage collector to run after decoding the audio with PyAV (#448) --- faster_whisper/audio.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py index fbecc48..4dd8aae 100644 --- a/faster_whisper/audio.py +++ b/faster_whisper/audio.py @@ -6,6 +6,7 @@ system dependencies. FFmpeg does not need to be installed on the system. However, the API is quite low-level so we need to manipulate audio frames directly. """ +import gc import io import itertools @@ -53,6 +54,11 @@ def decode_audio( dtype = array.dtype raw_buffer.write(array) + # It appears that some objects related to the resampler are not freed + # unless the garbage collector is manually run. + del resampler + gc.collect() + audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype) # Convert s16 back to f32. From f0ff12965af9c7a026e1d9c1884629adf9b8e9ab Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Fri, 1 Sep 2023 17:31:30 +0200 Subject: [PATCH 19/37] Expose generation parameter no_repeat_ngram_size (#449) --- faster_whisper/transcribe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 7ff27d2..326eecb 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -48,6 +48,7 @@ class TranscriptionOptions(NamedTuple): patience: float length_penalty: float repetition_penalty: float + no_repeat_ngram_size: int log_prob_threshold: Optional[float] no_speech_threshold: Optional[float] compression_ratio_threshold: Optional[float] @@ -163,6 +164,7 @@ class WhisperModel: patience: float = 1, length_penalty: float = 1, repetition_penalty: float = 1, + no_repeat_ngram_size: int = 0, temperature: Union[float, List[float], Tuple[float, ...]] = [ 0.0, 0.2, @@ -202,6 +204,7 @@ class WhisperModel: length_penalty: Exponential length penalty constant. 
repetition_penalty: Penalty applied to the score of previously generated tokens (set > 1 to penalize). + no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). temperature: Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `log_prob_threshold`. @@ -327,6 +330,7 @@ class WhisperModel: patience=patience, length_penalty=length_penalty, repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, log_prob_threshold=log_prob_threshold, no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, @@ -630,6 +634,7 @@ class WhisperModel: [prompt], length_penalty=options.length_penalty, repetition_penalty=options.repetition_penalty, + no_repeat_ngram_size=options.no_repeat_ngram_size, max_length=self.max_length, return_scores=True, return_no_speech_prob=True, From 1e6eb967c92e801427f737e9c98a8f61833eb0ab Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 4 Sep 2023 11:54:42 +0200 Subject: [PATCH 20/37] Add "large" alias for "large-v2" model (#453) --- faster_whisper/transcribe.py | 2 +- faster_whisper/utils.py | 41 ++++++++++++++++++------------------ 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 326eecb..1534efb 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -92,7 +92,7 @@ class WhisperModel: Args: model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en, - small, small.en, medium, medium.en, large-v1, or large-v2), a path to a converted + small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub. When a size or a model ID is configured, the converted model is downloaded from the Hugging Face Hub. diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index e86b89e..5987aee 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -9,18 +9,19 @@ import requests from tqdm.auto import tqdm -_MODELS = ( - "tiny.en", - "tiny", - "base.en", - "base", - "small.en", - "small", - "medium.en", - "medium", - "large-v1", - "large-v2", -) +_MODELS = { + "tiny.en": "guillaumekln/faster-whisper-tiny.en", + "tiny": "guillaumekln/faster-whisper-tiny", + "base.en": "guillaumekln/faster-whisper-base.en", + "base": "guillaumekln/faster-whisper-base", + "small.en": "guillaumekln/faster-whisper-small.en", + "small": "guillaumekln/faster-whisper-small", + "medium.en": "guillaumekln/faster-whisper-medium.en", + "medium": "guillaumekln/faster-whisper-medium", + "large-v1": "guillaumekln/faster-whisper-large-v1", + "large-v2": "guillaumekln/faster-whisper-large-v2", + "large": "guillaumekln/faster-whisper-large-v2", +} def get_assets_path(): @@ -41,12 +42,11 @@ def download_model( ): """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. - The model is downloaded from https://huggingface.co/guillaumekln. - Args: - size_or_id: Size of the model to download (tiny, tiny.en, base, base.en, small, small.en, - medium, medium.en, large-v1, or large-v2), or a CTranslate2-converted model ID - from the Hugging Face Hub (e.g. guillaumekln/faster-whisper-large-v2). 
+ size_or_id: Size of the model to download from https://huggingface.co/guillaumekln + (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, + large), or a CTranslate2-converted model ID from the Hugging Face Hub + (e.g. guillaumekln/faster-whisper-large-v2). output_dir: Directory where the model should be saved. If not set, the model is saved in the cache directory. local_files_only: If True, avoid downloading the file and return the path to the local @@ -62,14 +62,13 @@ def download_model( if re.match(r".*/.*", size_or_id): repo_id = size_or_id else: - if size_or_id not in _MODELS: + repo_id = _MODELS.get(size_or_id) + if repo_id is None: raise ValueError( "Invalid model size '%s', expected one of: %s" - % (size_or_id, ", ".join(_MODELS)) + % (size_or_id, ", ".join(_MODELS.keys())) ) - repo_id = "guillaumekln/faster-whisper-%s" % size_or_id - allow_patterns = [ "config.json", "model.bin", From 4a41746e55251741ecefa3451bbdb06b234d149b Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 4 Sep 2023 11:55:40 +0200 Subject: [PATCH 21/37] Log a warning when the model is English-only but the language is set to something else (#454) --- faster_whisper/transcribe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 1534efb..717cb59 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -315,6 +315,13 @@ class WhisperModel: language_probability, ) else: + if not self.model.is_multilingual and language != "en": + self.logger.warning( + "The current model is English-only but the language parameter is set to '%s'; " + "using 'en' instead." % language + ) + language = "en" + language_probability = 1 tokenizer = Tokenizer( From ad388cd394d43c0c13a0dde4577dd611a980c679 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 4 Sep 2023 11:56:48 +0200 Subject: [PATCH 22/37] Bump version to 0.8.0 --- faster_whisper/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index c8de23a..028b8b0 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.7.1" +__version__ = "0.8.0" From 0285d46f6f476be5db17db8c48a9a5aca85e96c3 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Fri, 8 Sep 2023 14:35:17 +0200 Subject: [PATCH 23/37] Add more details about the requirements in the README (#463) --- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1993271..36320fa 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,44 @@ For reference, here's the time and memory usage that are required to transcribe *Executed with 8 threads on a Intel(R) Xeon(R) Gold 6226R.* +## Requirements + +* Python 3.8 or greater + +Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV) which bundles the FFmpeg libraries in its package. + +### GPU + +GPU execution requires the following NVIDIA libraries to be installed: + +* [cuBLAS for CUDA 11](https://developer.nvidia.com/cublas) +* [cuDNN 8 for CUDA 11](https://developer.nvidia.com/cudnn) + +There are multiple ways to install these libraries. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below. + +
+<details>
+<summary>Other installation methods (click to expand)</summary>
+
+#### Use Docker
+
+The libraries are installed in this official NVIDIA Docker image: `nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04`.
+
+#### Install with `pip` (Linux only)
+
+On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python.
+
+```bash
+pip install nvidia-cublas-cu11 nvidia-cudnn-cu11
+
+export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
+```
+
+#### Download the libraries from Purfview's repository (Windows only)
+
+Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
+
+</details>
 
 ## Installation
 
 The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/):
@@ -44,19 +82,22 @@ The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/
 pip install faster-whisper
 ```
 
-**Other installation methods:**
+<details>
+<summary>Other installation methods (click to expand)</summary>
+
+### Install the master branch
 
 ```bash
-# Install the master branch:
 pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/refs/heads/master.tar.gz"
+```
 
-# Install a specific commit:
+### Install a specific commit
+
+```bash
 pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz"
 ```
 
-### GPU support
-
-GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be installed on the system. Please refer to the [CTranslate2 documentation](https://opennmt.net/CTranslate2/installation.html).
+</details>
## Usage From 727ab81f31fccddb9fc4a9a5028871bc598d8c41 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 12 Sep 2023 10:02:23 +0200 Subject: [PATCH 24/37] Improve error message for invalid task and language parameters (#466) --- faster_whisper/tokenizer.py | 128 ++++++++++++++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 7 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index b040044..1af70b9 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -19,15 +19,21 @@ class Tokenizer: self.tokenizer = tokenizer if multilingual: + if task not in _TASKS: + raise ValueError( + "'%s' is not a valid task (accepted tasks: %s)" + % (task, ", ".join(_TASKS)) + ) + + if language not in _LANGUAGE_CODES: + raise ValueError( + "'%s' is not a valid language code (accepted language codes: %s)" + % (language, ", ".join(_LANGUAGE_CODES)) + ) + self.task = self.tokenizer.token_to_id("<|%s|>" % task) - if self.task is None: - raise ValueError("%s is not a valid task" % task) - - self.language_code = language self.language = self.tokenizer.token_to_id("<|%s|>" % language) - if self.language is None: - raise ValueError("%s is not a valid language code" % language) - + self.language_code = language else: self.task = None self.language = None @@ -161,3 +167,111 @@ class Tokenizer: word_tokens[-1].extend(subword_tokens) return words, word_tokens + + +_TASKS = ( + "transcribe", + "translate", +) + +_LANGUAGE_CODES = ( + "af", + "am", + "ar", + "as", + "az", + "ba", + "be", + "bg", + "bn", + "bo", + "br", + "bs", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "eu", + "fa", + "fi", + "fo", + "fr", + "gl", + "gu", + "ha", + "haw", + "he", + "hi", + "hr", + "ht", + "hu", + "hy", + "id", + "is", + "it", + "ja", + "jw", + "ka", + "kk", + "km", + "kn", + "ko", + "la", + "lb", + "ln", + "lo", + "lt", + "lv", + "mg", + "mi", + "mk", + "ml", + "mn", + "mr", + "ms", + "mt", + "my", + "ne", + "nl", + "nn", + "no", + "oc", + "pa", + "pl", + "ps", + "pt", + "ro", + "ru", + "sa", + "sd", + "si", + "sk", + "sl", + "sn", + "so", + "sq", + "sr", + "su", + "sv", + "sw", + "ta", + "te", + "tg", + "th", + "tk", + "tl", + "tr", + "tt", + "uk", + "ur", + "uz", + "vi", + "yi", + "yo", + "zh", +) From f6979456913709307b33a3a8d3687b6a43f5813d Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 12 Sep 2023 14:44:22 +0200 Subject: [PATCH 25/37] Update tokenizers requirement to include version 0.14 (#469) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 819d3d2..fa037f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ av==10.* ctranslate2>=3.17,<4 huggingface_hub>=0.13 -tokenizers==0.13.* +tokenizers>=0.13,<0.15 onnxruntime>=1.14,<2 From 81086f6d33675c390bc385e2f9e5b1aa975db579 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 12 Sep 2023 14:44:37 +0200 Subject: [PATCH 26/37] Always run the encoder at the beginning of the loop (#468) --- faster_whisper/transcribe.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 717cb59..dd01b80 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -417,7 +417,7 @@ class WhisperModel: prefix=options.prefix if seek == 0 else None, ) - if encoder_output is None: + if seek > 0 or encoder_output is None: encoder_output = self.encode(segment) ( @@ -447,7 +447,6 @@ class WhisperModel: # 
fast-forward to the next segment boundary seek += segment_size - encoder_output = None continue tokens = result.sequences_ids[0] @@ -554,8 +553,6 @@ class WhisperModel: if seek_shift > 0: seek = previous_seek + seek_shift - encoder_output = None - for segment in current_segments: tokens = segment["tokens"] text = tokenizer.decode(tokens) From a49097e655e7c7a28f19bf14dadc6ce244c1fe73 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 12 Sep 2023 15:45:54 +0200 Subject: [PATCH 27/37] Add some missing typing annotations in transcribe.py --- faster_whisper/transcribe.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index dd01b80..9053d3c 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -747,7 +747,7 @@ class WhisperModel: prepend_punctuations: str, append_punctuations: str, last_speech_timestamp: float, - ): + ) -> None: if len(segments) == 0: return @@ -953,7 +953,10 @@ def get_compression_ratio(text: str) -> float: return len(text_bytes) / len(zlib.compress(text_bytes)) -def get_suppressed_tokens(tokenizer, suppress_tokens): +def get_suppressed_tokens( + tokenizer: Tokenizer, + suppress_tokens: Optional[List[int]], +) -> Optional[List[int]]: if not suppress_tokens or -1 in suppress_tokens: return suppress_tokens @@ -974,7 +977,7 @@ def get_suppressed_tokens(tokenizer, suppress_tokens): return sorted(set(suppress_tokens)) -def merge_punctuations(alignment: List[dict], prepended: str, appended: str): +def merge_punctuations(alignment: List[dict], prepended: str, appended: str) -> None: # merge prepended punctuations i = len(alignment) - 2 j = len(alignment) - 1 From 0048844f54417a1337e35d858fdcd3434550bdd2 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 14 Sep 2023 17:17:01 +0200 Subject: [PATCH 28/37] Expose function available_models (#475) * Expose function available_models * Add test case --- faster_whisper/__init__.py | 3 ++- faster_whisper/utils.py | 7 ++++++- tests/test_utils.py | 8 +++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/faster_whisper/__init__.py b/faster_whisper/__init__.py index e2fe00d..9b56a39 100644 --- a/faster_whisper/__init__.py +++ b/faster_whisper/__init__.py @@ -1,9 +1,10 @@ from faster_whisper.audio import decode_audio from faster_whisper.transcribe import WhisperModel -from faster_whisper.utils import download_model, format_timestamp +from faster_whisper.utils import available_models, download_model, format_timestamp from faster_whisper.version import __version__ __all__ = [ + "available_models", "decode_audio", "WhisperModel", "download_model", diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 5987aee..f020bc2 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -2,7 +2,7 @@ import logging import os import re -from typing import Optional +from typing import List, Optional import huggingface_hub import requests @@ -24,6 +24,11 @@ _MODELS = { } +def available_models() -> List[str]: + """Returns the names of available models.""" + return list(_MODELS.keys()) + + def get_assets_path(): """Returns the path to the assets directory.""" return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets") diff --git a/tests/test_utils.py b/tests/test_utils.py index ee404bf..bb488fe 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,12 @@ import os -from faster_whisper import download_model +from faster_whisper import available_models, download_model + + 
+def test_available_models(): + models = available_models() + assert isinstance(models, list) + assert "tiny" in models def test_download_model(tmpdir): From e94711bb5cee175d8165a22fd9ce03ef295c7170 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 14 Sep 2023 17:42:02 +0200 Subject: [PATCH 29/37] Add property WhisperModel.supported_languages (#476) * Expose function supported_languages * Make it a method --- faster_whisper/transcribe.py | 7 ++++++- tests/test_transcribe.py | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 9053d3c..86187fc 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -11,7 +11,7 @@ import tokenizers from faster_whisper.audio import decode_audio from faster_whisper.feature_extractor import FeatureExtractor -from faster_whisper.tokenizer import Tokenizer +from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer from faster_whisper.utils import download_model, format_timestamp, get_logger from faster_whisper.vad import ( SpeechTimestampsMap, @@ -154,6 +154,11 @@ class WhisperModel: self.time_precision = 0.02 self.max_length = 448 + @property + def supported_languages(self) -> List[str]: + """The languages supported by the model.""" + return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"] + def transcribe( self, audio: Union[str, BinaryIO, np.ndarray], diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index ca8d5a9..d30a0fb 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -3,6 +3,11 @@ import os from faster_whisper import WhisperModel, decode_audio +def test_supported_languages(): + model = WhisperModel("tiny.en") + assert model.supported_languages == ["en"] + + def test_transcribe(jfk_path): model = WhisperModel("tiny") segments, info = model.transcribe(jfk_path, word_timestamps=True) From 5a0541ea7d054aa3716ac492491de30158c20057 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 18 Sep 2023 16:21:37 +0200 Subject: [PATCH 30/37] Bump version to 0.9.0 --- faster_whisper/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index 028b8b0..ca25270 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.8.0" +__version__ = "0.9.0" From 30844096334757d1cf0f8f2e4f317398e16090ac Mon Sep 17 00:00:00 2001 From: Oscaarjs <37636054+Oscaarjs@users.noreply.github.com> Date: Fri, 24 Nov 2023 23:16:12 +0100 Subject: [PATCH 31/37] Add V3 Support (#578) * Add V3 Support * update conversion example --------- Co-authored-by: oscaarjs --- README.md | 14 +++++++------- faster_whisper/tokenizer.py | 3 ++- faster_whisper/transcribe.py | 25 ++++++++++++++++++++++--- faster_whisper/utils.py | 28 +++++++++++++++------------- requirements.txt | 4 ++-- 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 36320fa..a18bddf 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/ ```python from faster_whisper import WhisperModel -model_size = "large-v2" +model_size = "large-v3" # Run on GPU with FP16 model = WhisperModel(model_size, device="cuda", compute_type="float16") @@ -185,17 +185,17 @@ Here is a non exhaustive list of open-source projects using faster-whisper. 
 
 ## Model conversion
 
-When loading a model from its size such as `WhisperModel("large-v2")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
 
 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
 
-For example the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16:
+For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
 
 ```bash
 pip install transformers[torch]>=4.23
 
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
-    --copy_files tokenizer.json --quantization float16
+ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
+--copy_files tokenizer.json preprocessor_config.json --quantization float16
 ```
 
 * The option `--model` accepts a model name on the Hub or a path to a model directory.
@@ -207,12 +207,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
 
 1. Directly load the model from a local directory:
 ```python
-model = faster_whisper.WhisperModel("whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
 ```
 
 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
 ```python
-model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
 ```
 
 ## Comparing performance against other implementations
diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py
index 1af70b9..c3b13b4 100644
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -108,7 +108,7 @@ class Tokenizer:
     def split_to_word_tokens(
         self, tokens: List[int]
     ) -> Tuple[List[str], List[List[int]]]:
-        if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
             # These languages don't typically use spaces, so it is difficult to split words
             # without morpheme analysis. Here, we instead split words at any
             # position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ _LANGUAGE_CODES = (
     "yi",
     "yo",
     "zh",
+    "yue",
 )
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 86187fc..e0525b9 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -1,8 +1,10 @@
 import itertools
+import json
 import logging
 import os
 import zlib
+
+from inspect import signature
 from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
 
 import ctranslate2
@@ -92,8 +94,8 @@ class WhisperModel:
 
     Args:
         model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
-        small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
-        model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
+ small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a + converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub. When a size or a model ID is configured, the converted model is downloaded from the Hugging Face Hub. device: Device to use for computation ("cpu", "cuda", "auto"). @@ -142,7 +144,8 @@ class WhisperModel: "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en") ) - self.feature_extractor = FeatureExtractor() + self.feat_kwargs = self._get_feature_kwargs(model_path) + self.feature_extractor = FeatureExtractor(**self.feat_kwargs) self.num_samples_per_token = self.feature_extractor.hop_length * 2 self.frames_per_second = ( self.feature_extractor.sampling_rate // self.feature_extractor.hop_length @@ -159,6 +162,22 @@ class WhisperModel: """The languages supported by the model.""" return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"] + def _get_feature_kwargs(self, model_path) -> dict: + preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json") + config = {} + if os.path.isfile(preprocessor_config_file): + try: + with open(preprocessor_config_file, "r", encoding="utf-8") as json_file: + config = json.load(json_file) + valid_keys = signature(FeatureExtractor.__init__).parameters.keys() + config = {k: v for k, v in config.items() if k in valid_keys} + except json.JSONDecodeError as e: + self.logger.warning( + "Could not load preprocessor_config.json: %s", str(e) + ) + + return config + def transcribe( self, audio: Union[str, BinaryIO, np.ndarray], diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index f020bc2..343a635 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -10,17 +10,18 @@ import requests from tqdm.auto import tqdm _MODELS = { - "tiny.en": "guillaumekln/faster-whisper-tiny.en", - "tiny": "guillaumekln/faster-whisper-tiny", - "base.en": "guillaumekln/faster-whisper-base.en", - "base": "guillaumekln/faster-whisper-base", - "small.en": "guillaumekln/faster-whisper-small.en", - "small": "guillaumekln/faster-whisper-small", - "medium.en": "guillaumekln/faster-whisper-medium.en", - "medium": "guillaumekln/faster-whisper-medium", - "large-v1": "guillaumekln/faster-whisper-large-v1", - "large-v2": "guillaumekln/faster-whisper-large-v2", - "large": "guillaumekln/faster-whisper-large-v2", + "tiny.en": "Systran/faster-whisper-tiny.en", + "tiny": "Systran/faster-whisper-tiny", + "base.en": "Systran/faster-whisper-base.en", + "base": "Systran/faster-whisper-base", + "small.en": "Systran/faster-whisper-small.en", + "small": "Systran/faster-whisper-small", + "medium.en": "Systran/faster-whisper-medium.en", + "medium": "Systran/faster-whisper-medium", + "large-v1": "Systran/faster-whisper-large-v1", + "large-v2": "Systran/faster-whisper-large-v2", + "large-v3": "Systran/faster-whisper-large-v3", + "large": "Systran/faster-whisper-large-v3", } @@ -50,8 +51,8 @@ def download_model( Args: size_or_id: Size of the model to download from https://huggingface.co/guillaumekln (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, - large), or a CTranslate2-converted model ID from the Hugging Face Hub - (e.g. guillaumekln/faster-whisper-large-v2). + large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub + (e.g. Systran/faster-whisper-large-v3). output_dir: Directory where the model should be saved. If not set, the model is saved in the cache directory. 
      local_files_only:  If True, avoid downloading the file and return the path to the local
 
@@ -76,6 +77,7 @@ def download_model(
 
     allow_patterns = [
         "config.json",
+        "preprocessor_config.json",
         "model.bin",
         "tokenizer.json",
         "vocabulary.*",
diff --git a/requirements.txt b/requirements.txt
index fa037f7..ba0da20 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 av==10.*
-ctranslate2>=3.17,<4
+ctranslate2>=3.22,<4
 huggingface_hub>=0.13
-tokenizers>=0.13,<0.15
+tokenizers>=0.13,<0.16
 onnxruntime>=1.14,<2

From e1a218fab1ab02d637b79565995bf1a9c4c83a09 Mon Sep 17 00:00:00 2001
From: Dang Chuan Nguyen
Date: Fri, 24 Nov 2023 23:19:47 +0100
Subject: [PATCH 32/37] Bump version to 0.10.0

---
 faster_whisper/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_whisper/version.py b/faster_whisper/version.py
index ca25270..e1f6d31 100644
--- a/faster_whisper/version.py
+++ b/faster_whisper/version.py
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.9.0"
+__version__ = "0.10.0"

From 9641d5f56ac16b92938928c50f94409a3dce4e56 Mon Sep 17 00:00:00 2001
From: Clayton Yochum
Date: Mon, 27 Nov 2023 02:43:35 -0700
Subject: [PATCH 33/37] Force read-mode in `av.open` (#566)

The `av.open` function checks input metadata to determine the mode to
open with ("r" or "w"). Without this change, an input to `decode_audio`
that is detected as being in write mode cannot be read; forcing read
mode fixes this.

---
 faster_whisper/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py
index 4dd8aae..3190619 100644
--- a/faster_whisper/audio.py
+++ b/faster_whisper/audio.py
@@ -43,7 +43,7 @@ def decode_audio(
     raw_buffer = io.BytesIO()
     dtype = None
 
-    with av.open(input_file, metadata_errors="ignore") as container:
+    with av.open(input_file, mode="r", metadata_errors="ignore") as container:
         frames = container.decode(audio=0)
         frames = _ignore_invalid_frames(frames)
         frames = _group_frames(frames, 500000)

From 65094b779e28e391bbba80a169e091144162db17 Mon Sep 17 00:00:00 2001
From: Purfview <69023953+Purfview@users.noreply.github.com>
Date: Mon, 27 Nov 2023 11:12:47 +0000
Subject: [PATCH 34/37] Update info on cuBLAS and cuDNN libs in README.md
 (#513)

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a18bddf..b1c65c3 100644
--- a/README.md
+++ b/README.md
@@ -68,9 +68,9 @@ pip install nvidia-cublas-cu11 nvidia-cudnn-cu11
 export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
 ```
 
-#### Download the libraries from Purfview's repository (Windows only)
+#### Download the libraries from Purfview's repository (Windows & Linux)
 
-Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
+Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
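
To see the failure mode [PATCH 33/37] guards against, consider handing `decode_audio` an in-memory buffer. The sketch below is a minimal reproduction under stated assumptions — `audio.wav` is a placeholder input file, not something shipped with the patch — and uses only the public `decode_audio` helper:

```python
import io

from faster_whisper import decode_audio

# Read a local file into an in-memory buffer. BytesIO objects are both
# readable and writable, exactly the kind of input whose mode av.open
# could previously mis-detect.
with open("audio.wav", "rb") as audio_file:
    buffer = io.BytesIO(audio_file.read())

# With mode="r" forced inside decode_audio, the buffer decodes normally.
# decode_audio returns a float32 NumPy array resampled to sampling_rate.
samples = decode_audio(buffer, sampling_rate=16000)
print(samples.shape, samples.dtype)
```

Passing a file path instead of a buffer was never affected; only file-like objects whose metadata lets `av.open` infer write mode hit the error this patch removes.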
From 19329a361150bab0596c9f35486a57c94ae6f78e Mon Sep 17 00:00:00 2001 From: trungkienbkhn Date: Wed, 13 Dec 2023 18:38:44 +0700 Subject: [PATCH 35/37] Word timing tweaks (#616) --- faster_whisper/transcribe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index e0525b9..c082546 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -908,6 +908,13 @@ class WhisperModel: words, word_tokens = tokenizer.split_to_word_tokens( text_tokens + [tokenizer.eot] ) + if len(word_tokens) <= 1: + # return on eot only + # >>> np.pad([], (1, 0)) + # array([0.]) + # This results in crashes when we lookup jump_times with float, like + # IndexError: arrays used as indices must be of integer (or boolean) type + return [] word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)) if len(word_boundaries) <= 1: return [] From ebcfd6b9646f5176fba8b7f3429d0de28a70192c Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:14:39 +0000 Subject: [PATCH 36/37] Fix broken prompt_reset_on_temperature (#604) * Fix broken prompt_reset_on_temperature Fixing: https://github.com/SYSTRAN/faster-whisper/issues/603 Broken because `generate_with_fallback()` doesn't return final temperature. Regression since PR356 -> https://github.com/SYSTRAN/faster-whisper/pull/356 --- faster_whisper/transcribe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index c082546..7996321 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -731,6 +731,13 @@ class WhisperModel: decode_result = max( below_cr_threshold_results or all_results, key=lambda x: x[1] ) + # to pass final temperature for prompt_reset_on_temperature + decode_result = ( + decode_result[0], + decode_result[1], + temperature, + decode_result[3], + ) return decode_result From 44f7e589478866546bfcd1d105e254a74e2caad5 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Thu, 14 Dec 2023 12:03:46 +0000 Subject: [PATCH 37/37] Update whisper-standalone-win description in README.md (#508) * Update whisper-standalone-win description in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b1c65c3..01417a9 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper. * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. -* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of faster-whisper for Windows. +* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS. * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines. 
* [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT. * [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper, it can export word level transcript and the exported transcript then can be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor)
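
The last two behavioural changes in this batch, [PATCH 35/37] and [PATCH 36/37], can both be exercised from a single transcription call. The snippet below is an illustrative sketch, not part of any patch: the model size, the `audio.wav` file name, and the `0.5` threshold are arbitrary placeholder choices.

```python
from faster_whisper import WhisperModel

model = WhisperModel("tiny")

# word_timestamps=True runs the word-timing path that PATCH 35 guards
# against an eot-only token list; prompt_reset_on_temperature only takes
# effect when the final fallback temperature is propagated out of
# generate_with_fallback(), which PATCH 36 restores.
segments, info = model.transcribe(
    "audio.wav",
    word_timestamps=True,
    condition_on_previous_text=True,
    prompt_reset_on_temperature=0.5,
)

for segment in segments:
    for word in segment.words:
        print(f"[{word.start:.2f}s -> {word.end:.2f}s]{word.word}")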