From 358d373691c95205021bd4bbf28cde7ce4d10030 Mon Sep 17 00:00:00 2001 From: Jordi Mas Date: Thu, 20 Apr 2023 14:26:06 +0200 Subject: [PATCH 01/36] Allow specifying local_files_only to prevent checking the Internet everytime (#166) --- faster_whisper/transcribe.py | 7 ++++++- faster_whisper/utils.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 6d31271..2c544fc 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -73,6 +73,7 @@ class WhisperModel: cpu_threads: int = 0, num_workers: int = 1, download_root: Optional[str] = None, + local_files_only: Optional[bool] = False, ): """Initializes the Whisper model. @@ -96,13 +97,17 @@ class WhisperModel: This can improve the global throughput at the cost of increased memory usage. download_root: Directory where the model should be saved. If not set, the model is saved in the standard Hugging Face cache directory. + local_files_only: If True, avoid downloading the file and return the path to the + local cached file if it exists. """ self.logger = get_logger() if os.path.isdir(model_size_or_path): model_path = model_size_or_path else: - model_path = download_model(model_size_or_path, download_root) + model_path = download_model( + model_size_or_path, download_root, local_files_only + ) self.model = ctranslate2.models.Whisper( model_path, diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 66c7161..649906a 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -31,7 +31,11 @@ def get_logger(): return logging.getLogger("faster_whisper") -def download_model(size: str, output_dir: Optional[str] = None): +def download_model( + size: str, + output_dir: Optional[str] = None, + local_files_only: Optional[bool] = False, +): """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. The model is downloaded from https://huggingface.co/guillaumekln. 
@@ -41,6 +45,8 @@ def download_model(size: str, output_dir: Optional[str] = None): medium, medium.en, large-v1, or large-v2). output_dir: Directory where the model should be saved. If not set, the model is saved in the standard Hugging Face cache directory. + local_files_only: If True, avoid downloading the file and return the path to the local + cached file if it exists. Returns: The path to the downloaded model. @@ -55,7 +61,7 @@ def download_model(size: str, output_dir: Optional[str] = None): repo_id = "guillaumekln/faster-whisper-%s" % size kwargs = {} - + kwargs["local_files_only"] = local_files_only if output_dir is not None: kwargs["local_dir"] = output_dir kwargs["local_dir_use_symlinks"] = False From 2b51a97e61ac99261b828bd7fdd918e84e98b5bb Mon Sep 17 00:00:00 2001 From: FlippFuzz <41221030+FlippFuzz@users.noreply.github.com> Date: Mon, 24 Apr 2023 21:02:19 +0800 Subject: [PATCH 02/36] Add transcription_options to AudioInfo (#170) * Add transcription_options to AudioInfo It would be great if we can include the transcription_options in AudioInfo. My application is only making a few changes but leaving the rest as default. However, I would like to record down all settings (including those that I did not specify) so that the audio can be transcribed again identically in future if need be. 
* Make TranscriptionOptions appear before AudioInfo * Remove unnecessary whitespace --- faster_whisper/transcribe.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 2c544fc..5da7048 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -36,12 +36,6 @@ class Segment(NamedTuple): no_speech_prob: float -class AudioInfo(NamedTuple): - language: str - language_probability: float - duration: float - - class TranscriptionOptions(NamedTuple): beam_size: int best_of: int @@ -63,6 +57,13 @@ class TranscriptionOptions(NamedTuple): append_punctuations: str +class AudioInfo(NamedTuple): + language: str + language_probability: float + duration: float + transcription_options: TranscriptionOptions + + class WhisperModel: def __init__( self, @@ -321,6 +322,7 @@ class WhisperModel: language=language, language_probability=language_probability, duration=duration, + transcription_options=options, ) return segments, audio_info From f893113759b325d77340ff70d9b165618ae9e88c Mon Sep 17 00:00:00 2001 From: Amar Sood Date: Mon, 24 Apr 2023 09:04:42 -0400 Subject: [PATCH 03/36] Align segment structure with openai/whisper (#154) * Align segment structure with openai/whisper * Update code to apply requested changes * Move increment below the segment filtering --------- Co-authored-by: Guillaume Klein --- faster_whisper/transcribe.py | 46 ++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 5da7048..39d25d5 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -28,12 +28,17 @@ class Word(NamedTuple): class Segment(NamedTuple): + id: int + seek: int start: float end: float text: str - words: Optional[List[Word]] - avg_log_prob: float + tokens: List[int] + temperature: float + avg_logprob: float + compression_ratio: float no_speech_prob: 
float + words: Optional[List[Word]] class TranscriptionOptions(NamedTuple): @@ -335,6 +340,7 @@ class WhisperModel: encoder_output: Optional[ctranslate2.StorageView] = None, ) -> Iterable[Segment]: content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames + idx = 0 seek = 0 all_tokens = [] prompt_reset_since = 0 @@ -368,9 +374,12 @@ class WhisperModel: if encoder_output is None: encoder_output = self.encode(segment) - result, avg_log_prob, temperature = self.generate_with_fallback( - encoder_output, prompt, tokenizer, options - ) + ( + result, + avg_logprob, + temperature, + compression_ratio, + ) = self.generate_with_fallback(encoder_output, prompt, tokenizer, options) if options.no_speech_threshold is not None: # no voice activity check @@ -378,7 +387,7 @@ class WhisperModel: if ( options.log_prob_threshold is not None - and avg_log_prob > options.log_prob_threshold + and avg_logprob > options.log_prob_threshold ): # don't skip if the logprob is high enough, despite the no_speech_prob should_skip = False @@ -509,18 +518,24 @@ class WhisperModel: continue all_tokens.extend(tokens) + idx += 1 yield Segment( + id=idx, + seek=seek, start=segment["start"], end=segment["end"], text=text, + tokens=tokens, + temperature=temperature, + avg_logprob=avg_logprob, + compression_ratio=compression_ratio, + no_speech_prob=result.no_speech_prob, words=( [Word(**word) for word in segment["words"]] if options.word_timestamps else None ), - avg_log_prob=avg_log_prob, - no_speech_prob=result.no_speech_prob, ) def encode(self, features: np.ndarray) -> ctranslate2.StorageView: @@ -539,10 +554,11 @@ class WhisperModel: prompt: List[int], tokenizer: Tokenizer, options: TranscriptionOptions, - ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float]: + ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]: result = None - avg_log_prob = None + avg_logprob = None final_temperature = None + compression_ratio = None 
max_initial_timestamp_index = int( round(options.max_initial_timestamp / self.time_precision) @@ -580,8 +596,8 @@ class WhisperModel: # Recover the average log prob from the returned score. seq_len = len(tokens) - cum_log_prob = result.scores[0] * (seq_len**options.length_penalty) - avg_log_prob = cum_log_prob / (seq_len + 1) + cum_logprob = result.scores[0] * (seq_len**options.length_penalty) + avg_logprob = cum_logprob / (seq_len + 1) text = tokenizer.decode(tokens).strip() compression_ratio = get_compression_ratio(text) @@ -603,21 +619,21 @@ class WhisperModel: if ( options.log_prob_threshold is not None - and avg_log_prob < options.log_prob_threshold + and avg_logprob < options.log_prob_threshold ): needs_fallback = True # average log probability is too low self.logger.debug( "Log probability threshold is not met with temperature %.1f (%f < %f)", temperature, - avg_log_prob, + avg_logprob, options.log_prob_threshold, ) if not needs_fallback: break - return result, avg_log_prob, final_temperature + return result, avg_logprob, final_temperature, compression_ratio def get_prompt( self, From 338a725ff8005cb37db8752c149c32900ecf3d0b Mon Sep 17 00:00:00 2001 From: Anthony Date: Mon, 24 Apr 2023 16:28:47 +0200 Subject: [PATCH 04/36] fix where the tokens are reset (#175) --- faster_whisper/transcribe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 39d25d5..a685775 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -483,9 +483,6 @@ class WhisperModel: seek += segment_size - if not options.condition_on_previous_text or temperature > 0.5: - prompt_reset_since = len(all_tokens) - if options.word_timestamps: self.add_word_timestamps( current_segments, @@ -538,6 +535,9 @@ class WhisperModel: ), ) + if not options.condition_on_previous_text or temperature > 0.5: + prompt_reset_since = len(all_tokens) + def encode(self, features: np.ndarray) -> 
ctranslate2.StorageView: # When the model is running on multiple GPUs, the encoder output should be moved # to the CPU since we don't know which GPU will handle the next job. From e06511f96ba41b1117a55cf2542e59484632d403 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 24 Apr 2023 16:29:17 +0200 Subject: [PATCH 05/36] Rename AudioInfo to TranscriptionInfo (#174) --- faster_whisper/transcribe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index a685775..d89dfeb 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -62,7 +62,7 @@ class TranscriptionOptions(NamedTuple): append_punctuations: str -class AudioInfo(NamedTuple): +class TranscriptionInfo(NamedTuple): language: str language_probability: float duration: float @@ -176,7 +176,7 @@ class WhisperModel: append_punctuations: str = "\"'.。,,!!??::”)]}、", vad_filter: bool = False, vad_parameters: Optional[dict] = None, - ) -> Tuple[Iterable[Segment], AudioInfo]: + ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """Transcribes an input file. 
Arguments: @@ -226,7 +226,7 @@ class WhisperModel: A tuple with: - a generator over transcribed segments - - an instance of AudioInfo + - an instance of TranscriptionInfo """ sampling_rate = self.feature_extractor.sampling_rate @@ -323,14 +323,14 @@ class WhisperModel: if speech_chunks: segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate) - audio_info = AudioInfo( + info = TranscriptionInfo( language=language, language_probability=language_probability, duration=duration, transcription_options=options, ) - return segments, audio_info + return segments, info def generate_segments( self, From 32dc625f11ea6118a1e7d36de13674d22a310441 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 25 Apr 2023 15:47:38 +0200 Subject: [PATCH 06/36] Update README.md --- README.md | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 02a2be6..a7c6ece 100644 --- a/README.md +++ b/README.md @@ -64,8 +64,6 @@ GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be inst ## Usage -### Library - ```python from faster_whisper import WhisperModel @@ -94,7 +92,7 @@ segments, _ = model.transcribe("audio.mp3") segments = list(segments) # The transcription will actually run here. 
``` -#### Word-level timestamps +### Word-level timestamps ```python segments, _ = model.transcribe("audio.mp3", word_timestamps=True) @@ -104,7 +102,7 @@ for segment in segments: print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word)) ``` -#### VAD filter +### VAD filter The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech: @@ -118,13 +116,26 @@ The default behavior is conservative and only removes silence longer than 2 seco segments, _ = model.transcribe("audio.mp3", vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500)) ``` -#### Going further +### Logging + +The library logging level can be configured like this: + +```python +import logging + +logging.basicConfig() +logging.getLogger("faster_whisper").setLevel(logging.DEBUG) +``` + +### Going further See more model and transcription options in the [`WhisperModel`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. -### CLI +## Community integrations -You can use [jordimas/whisper-ctranslate2](https://github.com/jordimas/whisper-ctranslate2) to access `faster-whisper` through a CLI interface similar to what is offered by Whisper. +Here is a non exhaustive list of open-source projects using *faster-whisper*. Feel free to add your project to the list! + +* [whisper-ctranslate2](https://github.com/jordimas/whisper-ctranslate2) is a command line client based on `faster-whisper` and compatible with the original client from openai/whisper. 
## Model conversion From 8cf5d5a4b31b3b5b70168c56d679055b2cc82957 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 25 Apr 2023 15:54:22 +0200 Subject: [PATCH 07/36] Increase the default value of speech_pad_ms to 400 ms (#179) --- faster_whisper/vad.py | 2 +- tests/test_transcribe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index 080795d..cf14d5c 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -20,7 +20,7 @@ def get_speech_timestamps( max_speech_duration_s: float = float("inf"), min_silence_duration_ms: int = 2000, window_size_samples: int = 1024, - speech_pad_ms: int = 200, + speech_pad_ms: int = 400, ) -> List[dict]: """This method is used for splitting long audios into speech chunks using silero VAD. diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 5406535..8bebd2a 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -32,7 +32,7 @@ def test_vad(jfk_path): segments, _ = model.transcribe( jfk_path, vad_filter=True, - vad_parameters=dict(min_silence_duration_ms=500), + vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200), ) segments = list(segments) From 8340e04dc6e9d009080658e41f12fc3ac565ae7b Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 25 Apr 2023 15:54:31 +0200 Subject: [PATCH 08/36] Assign words to the speech chunk with the greatest coverage (#180) --- faster_whisper/transcribe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index d89dfeb..a54b9c6 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -796,7 +796,8 @@ def restore_speech_timestamps( words = [] for word in segment.words: # Ensure the word start and end times are resolved to the same chunk. 
- chunk_index = ts_map.get_chunk_index(word.start) + middle = (word.start + word.end) / 2 + chunk_index = ts_map.get_chunk_index(middle) word = word._replace( start=ts_map.get_original_time(word.start, chunk_index), end=ts_map.get_original_time(word.end, chunk_index), From 67cce3f55272a1774a8c77e7bebfa7d2e9692e2e Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 25 Apr 2023 17:00:41 +0200 Subject: [PATCH 09/36] Bump version to 0.5.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e3245db..83a1c65 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ conversion_requires = get_requirements( setup( name="faster-whisper", - version="0.4.1", + version="0.5.0", license="MIT", description="Faster Whisper transcription with CTranslate2", long_description=get_long_description(), From 68df3214ba78baf94a0e41e4c55e675e11444926 Mon Sep 17 00:00:00 2001 From: Jordi Mas Date: Wed, 26 Apr 2023 16:35:18 +0200 Subject: [PATCH 10/36] Use cache_dir instead of local_dir (#182) * Use cache_dir instead of local_dir * Fix unit test * Use cache_dir and preserve local_dir parameter * Remove blank line at the end * Disable ut * Implement download_root suggestion * Use cache_dir=download_root --- faster_whisper/transcribe.py | 4 +++- faster_whisper/utils.py | 5 +++++ tests/test_utils.py | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index a54b9c6..4d7f0f6 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -112,7 +112,9 @@ class WhisperModel: model_path = model_size_or_path else: model_path = download_model( - model_size_or_path, download_root, local_files_only + model_size_or_path, + local_files_only=local_files_only, + cache_dir=download_root, ) self.model = ctranslate2.models.Whisper( diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 649906a..34a310a 100644 --- a/faster_whisper/utils.py +++ 
b/faster_whisper/utils.py @@ -35,6 +35,7 @@ def download_model( size: str, output_dir: Optional[str] = None, local_files_only: Optional[bool] = False, + cache_dir: Optional[str] = None, ): """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. @@ -47,6 +48,7 @@ def download_model( the standard Hugging Face cache directory. local_files_only: If True, avoid downloading the file and return the path to the local cached file if it exists. + cache_dir: Path to the folder where cached files are stored. Returns: The path to the downloaded model. @@ -66,6 +68,9 @@ def download_model( kwargs["local_dir"] = output_dir kwargs["local_dir_use_symlinks"] = False + if cache_dir is not None: + kwargs["cache_dir"] = cache_dir + allow_patterns = [ "config.json", "model.bin", diff --git a/tests/test_utils.py b/tests/test_utils.py index 3e981f6..ee404bf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -15,3 +15,9 @@ def test_download_model(tmpdir): for filename in os.listdir(model_dir): path = os.path.join(model_dir, filename) assert not os.path.islink(path) + + +def test_download_model_in_cache(tmpdir): + cache_dir = str(tmpdir.join("model")) + download_model("tiny", cache_dir=cache_dir) + assert os.path.isdir(cache_dir) From 6f9d68dd6b94da6d3e84ce631f85804aadf1c24d Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 26 Apr 2023 17:36:24 +0200 Subject: [PATCH 11/36] Fix typing of local_files_only --- faster_whisper/transcribe.py | 2 +- faster_whisper/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 4d7f0f6..3973fe6 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -79,7 +79,7 @@ class WhisperModel: cpu_threads: int = 0, num_workers: int = 1, download_root: Optional[str] = None, - local_files_only: Optional[bool] = False, + local_files_only: bool = False, ): """Initializes the Whisper model. 
diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 34a310a..a052ad8 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -34,7 +34,7 @@ def get_logger(): def download_model( size: str, output_dir: Optional[str] = None, - local_files_only: Optional[bool] = False, + local_files_only: bool = False, cache_dir: Optional[str] = None, ): """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. From 89a4c7f1f051317267073fcd456123aa5e997b0c Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 26 Apr 2023 17:37:51 +0200 Subject: [PATCH 12/36] Update docstring to clarify download_root and output_dir --- faster_whisper/transcribe.py | 4 ++-- faster_whisper/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 3973fe6..80aade4 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -101,8 +101,8 @@ class WhisperModel: having multiple workers enables true parallelism when running the model (concurrent calls to self.model.generate() will run in parallel). This can improve the global throughput at the cost of increased memory usage. - download_root: Directory where the model should be saved. If not set, the model - is saved in the standard Hugging Face cache directory. + download_root: Directory where the models should be saved. If not set, the models + are saved in the standard Hugging Face cache directory. local_files_only: If True, avoid downloading the file and return the path to the local cached file if it exists. """ diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index a052ad8..fe56ab9 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -45,7 +45,7 @@ def download_model( size: Size of the model to download (tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, or large-v2). output_dir: Directory where the model should be saved. 
If not set, the model is saved in - the standard Hugging Face cache directory. + the cache directory. local_files_only: If True, avoid downloading the file and return the path to the local cached file if it exists. cache_dir: Path to the folder where cached files are stored. From a3dcb900816055c750eff0f4716821d36917c405 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 26 Apr 2023 17:38:16 +0200 Subject: [PATCH 13/36] Bump version to 0.5.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 83a1c65..7e85dea 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ conversion_requires = get_requirements( setup( name="faster-whisper", - version="0.5.0", + version="0.5.1", license="MIT", description="Faster Whisper transcription with CTranslate2", long_description=get_long_description(), From 5d203d27571bad0bd7beacc72116bd1cb5b36267 Mon Sep 17 00:00:00 2001 From: Jordi Mas Date: Thu, 27 Apr 2023 14:53:28 +0200 Subject: [PATCH 14/36] Update Github link to community project (#187) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a7c6ece..39a1c4a 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ See more model and transcription options in the [`WhisperModel`](https://github. Here is a non exhaustive list of open-source projects using *faster-whisper*. Feel free to add your project to the list! -* [whisper-ctranslate2](https://github.com/jordimas/whisper-ctranslate2) is a command line client based on `faster-whisper` and compatible with the original client from openai/whisper. +* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on `faster-whisper` and compatible with the original client from openai/whisper. 
## Model conversion From d889345e071de21a83bdae60ba4b07110cfd0696 Mon Sep 17 00:00:00 2001 From: Mahmoud Ashraf Date: Fri, 28 Apr 2023 10:56:13 +0200 Subject: [PATCH 15/36] added whisper-diarize (#193) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 39a1c4a..b367fd6 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,8 @@ Here is a non exhaustive list of open-source projects using *faster-whisper*. Fe * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on `faster-whisper` and compatible with the original client from openai/whisper. +* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on `faster-whisper` and nvidia nemo. + ## Model conversion When loading a model from its size such as `WhisperModel("large-v2")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln). 
From 5d8f3e2d905339b2d36ea4c73085daaf213fc548 Mon Sep 17 00:00:00 2001 From: FlippFuzz <41221030+FlippFuzz@users.noreply.github.com> Date: Tue, 9 May 2023 18:47:02 +0800 Subject: [PATCH 16/36] Implement VadOptions (#198) * Implement VadOptions * Fix line too long ./faster_whisper/transcribe.py:226:101: E501 line too long (111 > 100 characters) * Reformatted files with black * black .\faster_whisper\vad.py * black .\faster_whisper\transcribe.py * Fix import order with isort * isort .\faster_whisper\vad.py * isort .\faster_whisper\transcribe.py * Made recommended changes Recommended in https://github.com/guillaumekln/faster-whisper/pull/198 * Fix typing of vad_options argument --------- Co-authored-by: Guillaume Klein --- faster_whisper/transcribe.py | 16 +++++++---- faster_whisper/vad.py | 53 ++++++++++++++++++++++++------------ tests/test_transcribe.py | 5 +++- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 80aade4..06154f3 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -15,6 +15,7 @@ from faster_whisper.tokenizer import Tokenizer from faster_whisper.utils import download_model, format_timestamp, get_logger from faster_whisper.vad import ( SpeechTimestampsMap, + VadOptions, collect_chunks, get_speech_timestamps, ) @@ -67,6 +68,7 @@ class TranscriptionInfo(NamedTuple): language_probability: float duration: float transcription_options: TranscriptionOptions + vad_options: VadOptions class WhisperModel: @@ -177,7 +179,7 @@ class WhisperModel: prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", vad_filter: bool = False, - vad_parameters: Optional[dict] = None, + vad_parameters: Optional[Union[dict, VadOptions]] = None, ) -> Tuple[Iterable[Segment], TranscriptionInfo]: """Transcribes an input file. 
@@ -221,8 +223,8 @@ class WhisperModel: vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio without speech. This step is using the Silero VAD model https://github.com/snakers4/silero-vad. - vad_parameters: Dictionary of Silero VAD parameters (see available parameters and - default values in the function `get_speech_timestamps`). + vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available + parameters and default values in the class `VadOptions`). Returns: A tuple with: @@ -242,8 +244,11 @@ class WhisperModel: ) if vad_filter: - vad_parameters = {} if vad_parameters is None else vad_parameters - speech_chunks = get_speech_timestamps(audio, **vad_parameters) + if vad_parameters is None: + vad_parameters = VadOptions() + elif isinstance(vad_parameters, dict): + vad_parameters = VadOptions(**vad_parameters) + speech_chunks = get_speech_timestamps(audio, vad_parameters) audio = collect_chunks(audio, speech_chunks) self.logger.info( @@ -330,6 +335,7 @@ class WhisperModel: language_probability=language_probability, duration=duration, transcription_options=options, + vad_options=vad_parameters, ) return segments, info diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index cf14d5c..cf3b626 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -3,47 +3,64 @@ import functools import os import warnings -from typing import List, Optional +from typing import List, NamedTuple, Optional import numpy as np from faster_whisper.utils import get_assets_path + # The code below is adapted from https://github.com/snakers4/silero-vad. +class VadOptions(NamedTuple): + """VAD options. 
- -def get_speech_timestamps( - audio: np.ndarray, - *, - threshold: float = 0.5, - min_speech_duration_ms: int = 250, - max_speech_duration_s: float = float("inf"), - min_silence_duration_ms: int = 2000, - window_size_samples: int = 1024, - speech_pad_ms: int = 400, -) -> List[dict]: - """This method is used for splitting long audios into speech chunks using silero VAD. - - Args: - audio: One dimensional float array. + Attributes: threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that - lasts more than 100s (if any), to prevent agressive cutting. Otherwise, they will be + lasts more than 100s (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s. min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating it window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model. WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate. - Values other than these may affect model perfomance!! + Values other than these may affect model performance!! 
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side + """ + + threshold: float = 0.5 + min_speech_duration_ms: int = 250 + max_speech_duration_s: float = float("inf") + min_silence_duration_ms: int = 2000 + window_size_samples: int = 1024 + speech_pad_ms: int = 400 + + +def get_speech_timestamps( + audio: np.ndarray, vad_options: Optional[VadOptions] = None +) -> List[dict]: + """This method is used for splitting long audios into speech chunks using silero VAD. + + Args: + audio: One dimensional float array. + vad_options: Options for VAD processing. Returns: List of dicts containing begin and end samples of each speech chunk. """ + if vad_options is None: + vad_options = VadOptions() + + threshold = vad_options.threshold + min_speech_duration_ms = vad_options.min_speech_duration_ms + max_speech_duration_s = vad_options.max_speech_duration_s + min_silence_duration_ms = vad_options.min_silence_duration_ms + window_size_samples = vad_options.window_size_samples + speech_pad_ms = vad_options.speech_pad_ms + if window_size_samples not in [512, 1024, 1536]: warnings.warn( "Unusual window_size_samples! 
Supported window_size_samples:\n" diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 8bebd2a..f1c9572 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -29,7 +29,7 @@ def test_transcribe(jfk_path): def test_vad(jfk_path): model = WhisperModel("tiny") - segments, _ = model.transcribe( + segments, info = model.transcribe( jfk_path, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200), @@ -47,6 +47,9 @@ def test_vad(jfk_path): assert 0 < segment.start < 1 assert 10 < segment.end < 11 + assert info.vad_options.min_silence_duration_ms == 500 + assert info.vad_options.speech_pad_ms == 200 + def test_stereo_diarization(data_dir): model = WhisperModel("tiny") From 91f948b0d6dc546bca9ea97aefbcd8f6dfbce209 Mon Sep 17 00:00:00 2001 From: Ozan Caglayan Date: Tue, 9 May 2023 13:53:47 +0100 Subject: [PATCH 17/36] transcribe: return all language probabilities if requested (#210) * transcribe: return all language probabilities if requested If return_all_language_probs is True, TranscriptionInfo structure will have a list of tuples reflecting all language probabilities as returned by the model. 
* transcribe: fix docstring * transcribe: remove return_all_lang_probs parameter --- faster_whisper/transcribe.py | 13 ++++++++++--- tests/test_transcribe.py | 7 +++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 06154f3..8f0b354 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -67,6 +67,7 @@ class TranscriptionInfo(NamedTuple): language: str language_probability: float duration: float + all_language_probs: Optional[List[Tuple[str, float]]] transcription_options: TranscriptionOptions vad_options: VadOptions @@ -275,6 +276,7 @@ class WhisperModel: features = self.feature_extractor(audio) encoder_output = None + all_language_probs = None if language is None: if not self.model.is_multilingual: @@ -283,9 +285,13 @@ class WhisperModel: else: segment = features[:, : self.feature_extractor.nb_max_frames] encoder_output = self.encode(segment) - results = self.model.detect_language(encoder_output) - language_token, language_probability = results[0][0] - language = language_token[2:-2] + # results is a list of tuple[str, float] with language names and + # probabilities. 
+ results = self.model.detect_language(encoder_output)[0] + # Parse language names to strip out markers + all_language_probs = [(token[2:-2], prob) for (token, prob) in results] + # Get top language token and probability + language, language_probability = all_language_probs[0] self.logger.info( "Detected language '%s' with probability %.2f", @@ -336,6 +342,7 @@ class WhisperModel: duration=duration, transcription_options=options, vad_options=vad_parameters, + all_language_probs=all_language_probs, ) return segments, info diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index f1c9572..6ecf2c4 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -6,11 +6,18 @@ from faster_whisper import WhisperModel, decode_audio def test_transcribe(jfk_path): model = WhisperModel("tiny") segments, info = model.transcribe(jfk_path, word_timestamps=True) + assert info.all_language_probs is not None assert info.language == "en" assert info.language_probability > 0.9 assert info.duration == 11 + # Get top language info from all results, which should match the + # already existing metadata + top_lang, top_lang_score = info.all_language_probs[0] + assert info.language == top_lang + assert abs(info.language_probability - top_lang_score) < 1e-16 + segments = list(segments) assert len(segments) == 1 From 53d247b0bba992c65670b85a5054b3780df4c812 Mon Sep 17 00:00:00 2001 From: David Axelrod Date: Tue, 9 May 2023 11:20:22 -0400 Subject: [PATCH 18/36] retry model download locally if huggingface throws an http error. (#215) * retry model download locally if huggingface throws an http error. 
* appease the linter * key error fix * use non internal lib error Co-authored-by: Guillaume Klein --------- Co-authored-by: Guillaume Klein --- faster_whisper/utils.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index fe56ab9..bf274aa 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -77,13 +77,20 @@ def download_model( "tokenizer.json", "vocabulary.txt", ] + kwargs["allow_patterns"] = allow_patterns + kwargs["tqdm_class"] = disabled_tqdm - return huggingface_hub.snapshot_download( - repo_id, - allow_patterns=allow_patterns, - tqdm_class=disabled_tqdm, - **kwargs, - ) + try: + return huggingface_hub.snapshot_download( + repo_id, + **kwargs, + ) + except huggingface_hub.utils.HfHubHTTPError: + kwargs["local_files_only"] = True + return huggingface_hub.snapshot_download( + repo_id, + **kwargs, + ) def format_timestamp( From 32b962bed80f1adb15fabb95847c4cba80ad5a59 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Tue, 9 May 2023 19:20:41 +0100 Subject: [PATCH 19/36] Adds: whisper-standalone-win (#216) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b367fd6..2f0458a 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,8 @@ Here is a non exhaustive list of open-source projects using *faster-whisper*. Fe * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on `faster-whisper` and nvidia nemo. +* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of `faster-whisper` for Windows. + ## Model conversion When loading a model from its size such as `WhisperModel("large-v2")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln). 
From 8e5c747ab55d6c5172c142ee2670c6b86863d060 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 11 May 2023 12:15:41 +0200 Subject: [PATCH 20/36] Reformat list of community integrations --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2f0458a..61c7b9f 100644 --- a/README.md +++ b/README.md @@ -133,13 +133,11 @@ See more model and transcription options in the [`WhisperModel`](https://github. ## Community integrations -Here is a non exhaustive list of open-source projects using *faster-whisper*. Feel free to add your project to the list! +Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list! -* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on `faster-whisper` and compatible with the original client from openai/whisper. - -* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on `faster-whisper` and nvidia nemo. - -* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of `faster-whisper` for Windows. +* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper. +* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. +* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of faster-whisper for Windows. 
## Model conversion From 2d7c984bfc67d4f848f8b2844d2650f6028d9618 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 11 May 2023 14:47:22 +0200 Subject: [PATCH 21/36] Reformat function download_model for clarity --- faster_whisper/utils.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index bf274aa..94a203f 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -62,14 +62,6 @@ def download_model( ) repo_id = "guillaumekln/faster-whisper-%s" % size - kwargs = {} - kwargs["local_files_only"] = local_files_only - if output_dir is not None: - kwargs["local_dir"] = output_dir - kwargs["local_dir_use_symlinks"] = False - - if cache_dir is not None: - kwargs["cache_dir"] = cache_dir allow_patterns = [ "config.json", @@ -77,8 +69,19 @@ def download_model( "tokenizer.json", "vocabulary.txt", ] - kwargs["allow_patterns"] = allow_patterns - kwargs["tqdm_class"] = disabled_tqdm + + kwargs = { + "local_files_only": local_files_only, + "allow_patterns": allow_patterns, + "tqdm_class": disabled_tqdm, + } + + if output_dir is not None: + kwargs["local_dir"] = output_dir + kwargs["local_dir_use_symlinks"] = False + + if cache_dir is not None: + kwargs["cache_dir"] = cache_dir try: return huggingface_hub.snapshot_download( From 6a1d331d66fe4a7e46235d63a375b5f8fd2609a4 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 11 May 2023 15:06:46 +0200 Subject: [PATCH 22/36] Add CONTRIBUTING.md (#229) --- CONTRIBUTING.md | 31 +++++++++++++++++++++++++++++++ README.md | 4 ---- 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..379b9ad --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to faster-whisper + +Contributions are welcome! 
Here are some pointers to help you install the library for development and validate your changes before submitting a pull request. + +## Install the library for development + +We recommend installing the module in editable mode with the `dev` extra requirements: + +```bash +git clone https://github.com/guillaumekln/faster-whisper.git +cd faster-whisper/ +pip install -e .[dev] +``` + +## Validate the changes before creating a pull request + +1. Make sure the existing tests are still passing (and consider adding new tests as well!): + +```bash +pytest tests/ +``` + +2. Reformat and validate the code with the following tools: + +```bash +black . +isort . +flake8 . +``` + +These steps are also run automatically in the CI when you open the pull request. diff --git a/README.md b/README.md index 61c7b9f..c7c927b 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,6 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/ # Install a specific commit: pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" - -# Install for development: -git clone https://github.com/guillaumekln/faster-whisper.git -pip install -e faster-whisper/ ``` ### GPU support From 6a2da9a95cf807d529ea97b2ce1c46103a88e158 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 11 May 2023 15:07:15 +0200 Subject: [PATCH 23/36] Also catch client-side network exceptions when synchronizing models (#228) --- faster_whisper/utils.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 94a203f..4b5e290 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -4,6 +4,7 @@ import os from typing import Optional import huggingface_hub +import requests from tqdm.auto import tqdm @@ -84,16 +85,23 @@ def download_model( kwargs["cache_dir"] = cache_dir try: - return 
huggingface_hub.snapshot_download( + return huggingface_hub.snapshot_download(repo_id, **kwargs) + except ( + huggingface_hub.utils.HfHubHTTPError, + requests.exceptions.ConnectionError, + ) as exception: + logger = get_logger() + logger.warning( + "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s", repo_id, - **kwargs, + exception, ) - except huggingface_hub.utils.HfHubHTTPError: + logger.warning( + "Trying to load the model directly from the local cache, if it exists." + ) + kwargs["local_files_only"] = True - return huggingface_hub.snapshot_download( - repo_id, - **kwargs, - ) + return huggingface_hub.snapshot_download(repo_id, **kwargs) def format_timestamp( From 723cb9748374e764013342d4371b04f2445aa925 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 12:55:04 +0200 Subject: [PATCH 24/36] Fix occasional IndexError on empty segments (#227) --- faster_whisper/transcribe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 8f0b354..aee13b5 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -763,6 +763,8 @@ class WhisperModel: text_tokens + [tokenizer.eot] ) word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)) + if len(word_boundaries) <= 1: + return [] jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool) jump_times = time_indices[jumps] / self.tokens_per_second From c99feb22dc9b8c1772b584db207a097151849cd1 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 12:55:15 +0200 Subject: [PATCH 25/36] Include requirements files in sdist (#240) --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index e2fff83..6f6187c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,3 @@ include faster_whisper/assets/silero_vad.onnx +include requirements.txt +include requirements.conversion.txt From 
4db549b8007565b4c7f02ef07a4062217ecab1eb Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 15:49:36 +0200 Subject: [PATCH 26/36] Make get_speech_timestamps backward compatible with the previous usage (#259) --- faster_whisper/vad.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index cf3b626..a937812 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -40,19 +40,22 @@ class VadOptions(NamedTuple): def get_speech_timestamps( - audio: np.ndarray, vad_options: Optional[VadOptions] = None + audio: np.ndarray, + vad_options: Optional[VadOptions] = None, + **kwargs, ) -> List[dict]: """This method is used for splitting long audios into speech chunks using silero VAD. Args: audio: One dimensional float array. vad_options: Options for VAD processing. + kwargs: VAD options passed as keyword arguments for backward compatibility. Returns: List of dicts containing begin and end samples of each speech chunk. 
""" if vad_options is None: - vad_options = VadOptions() + vad_options = VadOptions(**kwargs) threshold = vad_options.threshold min_speech_duration_ms = vad_options.min_speech_duration_ms From cf7c02157301e050fd6ed9f24f8bdfc47da29951 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 15:50:37 +0200 Subject: [PATCH 27/36] Export __version__ at the module level (#258) --- faster_whisper/__init__.py | 2 ++ faster_whisper/version.py | 3 +++ setup.py | 10 +++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 faster_whisper/version.py diff --git a/faster_whisper/__init__.py b/faster_whisper/__init__.py index add677e..e2fe00d 100644 --- a/faster_whisper/__init__.py +++ b/faster_whisper/__init__.py @@ -1,10 +1,12 @@ from faster_whisper.audio import decode_audio from faster_whisper.transcribe import WhisperModel from faster_whisper.utils import download_model, format_timestamp +from faster_whisper.version import __version__ __all__ = [ "decode_audio", "WhisperModel", "download_model", "format_timestamp", + "__version__", ] diff --git a/faster_whisper/version.py b/faster_whisper/version.py new file mode 100644 index 0000000..9793929 --- /dev/null +++ b/faster_whisper/version.py @@ -0,0 +1,3 @@ +"""Version information.""" + +__version__ = "0.5.1" diff --git a/setup.py b/setup.py index 7e85dea..1deca3b 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,14 @@ def get_long_description(): return readme_file.read() +def get_project_version(): + version_path = os.path.join(base_dir, "faster_whisper", "version.py") + version = {} + with open(version_path, encoding="utf-8") as fp: + exec(fp.read(), version) + return version["__version__"] + + def get_requirements(path): with open(path, encoding="utf-8") as requirements: return [requirement.strip() for requirement in requirements] @@ -23,7 +31,7 @@ conversion_requires = get_requirements( setup( name="faster-whisper", - version="0.5.1", + version=get_project_version(), license="MIT", 
description="Faster Whisper transcription with CTranslate2", long_description=get_long_description(), From ae1e6d9883fdae52359e64be3c2121b5c7ab0cc1 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 15:56:03 +0200 Subject: [PATCH 28/36] Remove reference to the VAD function from the README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7c927b..ed40e09 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) segments, _ = model.transcribe("audio.mp3", vad_filter=True) ``` -The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the function [`get_speech_timestamps`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: +The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/vad.py). 
They can be customized with the dictionary argument `vad_parameters`: ```python segments, _ = model.transcribe("audio.mp3", vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500)) From a150adcc19fc9b19833f1371737c208cfedefe07 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 16:07:54 +0200 Subject: [PATCH 29/36] Enable onnxruntime dependency for Python 3.11 (#260) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 73c3b6d..4dd8bac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ av==10.* ctranslate2>=3.10,<4 huggingface_hub>=0.13 tokenizers==0.13.* -onnxruntime==1.14.* ; python_version < "3.11" +onnxruntime>=1.14,<2 From 2a0062156449d6e79b7399c78c499a2bfbaf3d23 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 16:15:01 +0200 Subject: [PATCH 30/36] Bump version to 0.6.0 --- faster_whisper/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/version.py b/faster_whisper/version.py index 9793929..bf288f0 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.5.1" +__version__ = "0.6.0" From 1bb7e33b933dde488a17fd556a1114ee0d58d34b Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 24 May 2023 18:22:44 +0200 Subject: [PATCH 31/36] Reformat code snippet in README --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed40e09..f32ed90 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,11 @@ segments, _ = model.transcribe("audio.mp3", vad_filter=True) The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/vad.py). 
They can be customized with the dictionary argument `vad_parameters`: ```python -segments, _ = model.transcribe("audio.mp3", vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500)) +segments, _ = model.transcribe( + "audio.mp3", + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500), +) ``` ### Logging From d4222da952fde2aa4064aad820e207d0c7a9de75 Mon Sep 17 00:00:00 2001 From: Antonio Zarauz Moreno <87022369+hedrergudene@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:30:53 +0200 Subject: [PATCH 32/36] Update README with community repo using FW (#284) * Update README with community repo using FW * Minor formatting change --------- Co-authored-by: Guillaume Klein --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f32ed90..1ba8152 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper. * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of faster-whisper for Windows. +* [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines. ## Model conversion From 20d4e9418b5efb69ec5aa4819a39e3fb0e772a2a Mon Sep 17 00:00:00 2001 From: zh-plus Date: Sat, 10 Jun 2023 14:22:29 +0800 Subject: [PATCH 33/36] Add Open-Lyrics as a community project. 
(#291) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1ba8152..daee860 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,7 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) contains the portable ready to run binaries of faster-whisper for Windows. * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines. +* [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT. ## Model conversion From ad58ba26ab8b3d871b8d4f9962cf8a669c3d41c1 Mon Sep 17 00:00:00 2001 From: kh Date: Fri, 16 Jun 2023 14:37:45 +0900 Subject: [PATCH 34/36] Fix typo (#304) https://github.com/snakers4/silero-vad/discussions/319#discussion-5081706 --- faster_whisper/vad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index a937812..487dfa0 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -21,7 +21,7 @@ class VadOptions(NamedTuple): min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that - lasts more than 100s (if any), to prevent aggressive cutting. Otherwise, they will be + lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s. 
min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating it From efc4f61d85e32cb3058ac3cd0d9541ea619c7014 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 20 Jun 2023 10:53:11 +0200 Subject: [PATCH 35/36] Do not specify the vocabulary file extension in the download pattern (#311) --- faster_whisper/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index 4b5e290..950b0da 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -68,7 +68,7 @@ def download_model( "config.json", "model.bin", "tokenizer.json", - "vocabulary.txt", + "vocabulary.*", ] kwargs = { From fee52c922904d18a143faa9d89f3b42caf029e9e Mon Sep 17 00:00:00 2001 From: FlippFuzz <41221030+FlippFuzz@users.noreply.github.com> Date: Wed, 21 Jun 2023 20:46:20 +0800 Subject: [PATCH 36/36] Allow users to input an Iterable of token ids into initial_prompt (#306) * Allow users to input an Iterable of token ids into initial_prompt * Need to check for String first because string is also an Iterable --- faster_whisper/transcribe.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index aee13b5..71b0ea1 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -52,7 +52,7 @@ class TranscriptionOptions(NamedTuple): compression_ratio_threshold: Optional[float] condition_on_previous_text: bool temperatures: List[float] - initial_prompt: Optional[str] + initial_prompt: Optional[Union[str, Iterable[int]]] prefix: Optional[str] suppress_blank: bool suppress_tokens: Optional[List[int]] @@ -170,7 +170,7 @@ class WhisperModel: log_prob_threshold: Optional[float] = -1.0, no_speech_threshold: Optional[float] = 0.6, condition_on_previous_text: bool = True, - initial_prompt: Optional[str] = None, + initial_prompt: Optional[Union[str, Iterable[int]]] = None, prefix: 
Optional[str] = None, suppress_blank: bool = True, suppress_tokens: Optional[List[int]] = [-1], @@ -208,7 +208,8 @@ class WhisperModel: as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. - initial_prompt: Optional text to provide as a prompt for the first window. + initial_prompt: Optional text string or iterable of token ids to provide as a + prompt for the first window. prefix: Optional text to provide as a prefix for the first window. suppress_blank: Suppress blank outputs at the beginning of the sampling. suppress_tokens: List of token IDs to suppress. -1 will suppress a default set @@ -361,9 +362,12 @@ class WhisperModel: prompt_reset_since = 0 if options.initial_prompt is not None: - initial_prompt = " " + options.initial_prompt.strip() - initial_prompt_tokens = tokenizer.encode(initial_prompt) - all_tokens.extend(initial_prompt_tokens) + if isinstance(options.initial_prompt, str): + initial_prompt = " " + options.initial_prompt.strip() + initial_prompt_tokens = tokenizer.encode(initial_prompt) + all_tokens.extend(initial_prompt_tokens) + else: + all_tokens.extend(options.initial_prompt) while seek < content_frames: time_offset = seek * self.feature_extractor.time_per_frame