format code

Ignore repeated prompt
Clarify that the returned segments value is a generator (#144 )
2023-04-20 02:00:57 +08:00 · 2023-04-20 01:49:10 +08:00 · 2023-04-13 09:50:53 +02:00 · 2023-04-08 10:02:36 +02:00 · 2023-04-06 19:13:09 +02:00 · 2023-04-06 11:54:40 +02:00
6 changed files with 126 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,15 @@
 # Byte-compiled / Optimized / DLL Files
 *.pyc
 *.pyo
 *.pyd
 __pycache__/
 # Distribution / Packaging
 venv/
 # Unit Test
 .pytest_cache/
 # Ignore IDE, Editor Files
 .idea/
 .vscode/
--- a/README.md
+++ b/README.md
@@ -87,6 +87,13 @@ for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
 ```
 **Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop:
 ```python
 segments, _ = model.transcribe("audio.mp3")
 segments = list(segments)  # The transcription will actually run here.
 ```
 #### Word-level timestamps
 ```python
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -125,19 +125,21 @@ class Tokenizer:
            current_tokens.append(token)
            decoded = self.decode_with_timestamps(current_tokens)
-            if (
+            try:
-                replacement_char not in decoded
+                replacement_char_index = decoded.index(replacement_char)
-                or decoded_full[unicode_offset + decoded.index(replacement_char)]
+                replacement_char_index += unicode_offset
-                == replacement_char
+            except ValueError:
                replacement_char_index = None
            if replacement_char_index is None or (
                replacement_char_index < len(decoded_full)
                and decoded_full[replacement_char_index] == replacement_char
            ):
                words.append(decoded)
                word_tokens.append(current_tokens)
                current_tokens = []
                unicode_offset += len(decoded)
                if unicode_offset >= len(decoded_full):
                    break
        return words, word_tokens
    def split_tokens_on_spaces(
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -1,4 +1,5 @@
 import itertools
 import logging
 import os
 import zlib
@@ -11,7 +12,7 @@ import tokenizers
 from faster_whisper.audio import decode_audio
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import Tokenizer
-from faster_whisper.utils import download_model
+from faster_whisper.utils import download_model, format_timestamp, get_logger
 from faster_whisper.vad import (
    SpeechTimestampsMap,
    collect_chunks,
@@ -71,6 +72,7 @@ class WhisperModel:
        compute_type: str = "default",
        cpu_threads: int = 0,
        num_workers: int = 1,
        download_root: Optional[str] = None,
    ):
        """Initializes the Whisper model.
@@ -92,11 +94,15 @@ class WhisperModel:
            having multiple workers enables true parallelism when running the model
            (concurrent calls to self.model.generate() will run in parallel).
            This can improve the global throughput at the cost of increased memory usage.
          download_root: Directory where the model should be saved. If not set, the model
            is saved in the standard Hugging Face cache directory.
        """
        self.logger = get_logger()
        if os.path.isdir(model_size_or_path):
            model_path = model_size_or_path
        else:
-            model_path = download_model(model_size_or_path)
+            model_path = download_model(model_size_or_path, download_root)
        self.model = ctranslate2.models.Whisper(
            model_path,
@@ -211,17 +217,40 @@ class WhisperModel:
            - a generator over transcribed segments
            - an instance of AudioInfo
        """
-        if not isinstance(audio, np.ndarray):
+        sampling_rate = self.feature_extractor.sampling_rate
            audio = decode_audio(
                audio, sampling_rate=self.feature_extractor.sampling_rate
            )
-        duration = audio.shape[0] / self.feature_extractor.sampling_rate
+        if not isinstance(audio, np.ndarray):
            audio = decode_audio(audio, sampling_rate=sampling_rate)
        duration = audio.shape[0] / sampling_rate
        self.logger.info(
            "Processing audio with duration %s", format_timestamp(duration)
        )
        if vad_filter:
            vad_parameters = {} if vad_parameters is None else vad_parameters
            speech_chunks = get_speech_timestamps(audio, **vad_parameters)
            audio = collect_chunks(audio, speech_chunks)
            self.logger.info(
                "VAD filter removed %s of audio",
                format_timestamp(duration - (audio.shape[0] / sampling_rate)),
            )
            if self.logger.isEnabledFor(logging.DEBUG):
                self.logger.debug(
                    "VAD filter kept the following audio segments: %s",
                    ", ".join(
                        "[%s -> %s]"
                        % (
                            format_timestamp(chunk["start"] / sampling_rate),
                            format_timestamp(chunk["end"] / sampling_rate),
                        )
                        for chunk in speech_chunks
                    ),
                )
        else:
            speech_chunks = None
@@ -239,6 +268,12 @@ class WhisperModel:
                results = self.model.detect_language(encoder_output)
                language_token, language_probability = results[0][0]
                language = language_token[2:-2]
                self.logger.info(
                    "Detected language '%s' with probability %.2f",
                    language,
                    language_probability,
                )
        else:
            language_probability = 1
@@ -275,9 +310,7 @@ class WhisperModel:
        segments = self.generate_segments(features, tokenizer, options, encoder_output)
        if speech_chunks:
-            segments = restore_speech_timestamps(
+            segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate)
                segments, speech_chunks, self.feature_extractor.sampling_rate
            )
        audio_info = AudioInfo(
            language=language,
@@ -297,6 +330,7 @@ class WhisperModel:
        content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
        seek = 0
        all_tokens = []
        all_prompt_text = []
        prompt_reset_since = 0
        if options.initial_prompt is not None:
@@ -312,6 +346,11 @@ class WhisperModel:
            )
            segment_duration = segment_size * self.feature_extractor.time_per_frame
            if self.logger.isEnabledFor(logging.DEBUG):
                self.logger.debug(
                    "Processing segment at %s", format_timestamp(time_offset)
                )
            previous_tokens = all_tokens[prompt_reset_since:]
            prompt = self.get_prompt(
                tokenizer,
@@ -339,6 +378,12 @@ class WhisperModel:
                    should_skip = False
                if should_skip:
                    self.logger.debug(
                        "No speech threshold is met (%f > %f)",
                        result.no_speech_prob,
                        options.no_speech_threshold,
                    )
                    # fast-forward to the next segment boundary
                    seek += segment_size
                    continue
@@ -457,7 +502,15 @@ class WhisperModel:
                if segment["start"] == segment["end"] or not text.strip():
                    continue
-                all_tokens.extend(tokens)
+                check_prompt_num = 1
                if all(
                    [
                        text.strip() != i.strip()
                        for i in all_prompt_text[-check_prompt_num:]
                    ]
                ):
                    all_tokens.extend(tokens)
                    all_prompt_text.append(text)
                yield Segment(
                    start=segment["start"],
@@ -543,12 +596,26 @@ class WhisperModel:
            ):
                needs_fallback = True  # too repetitive
                self.logger.debug(
                    "Compression ratio threshold is not met with temperature %.1f (%f > %f)",
                    temperature,
                    compression_ratio,
                    options.compression_ratio_threshold,
                )
            if (
                options.log_prob_threshold is not None
                and avg_log_prob < options.log_prob_threshold
            ):
                needs_fallback = True  # average log probability is too low
                self.logger.debug(
                    "Log probability threshold is not met with temperature %.1f (%f < %f)",
                    temperature,
                    avg_log_prob,
                    options.log_prob_threshold,
                )
            if not needs_fallback:
                break
@@ -721,14 +788,18 @@ def restore_speech_timestamps(
                    end=ts_map.get_original_time(word.end, chunk_index),
                )
                words.append(word)
        else:
            words = segment.words
-        segment = segment._replace(
+            segment = segment._replace(
-            start=ts_map.get_original_time(segment.start),
+                start=words[0].start,
-            end=ts_map.get_original_time(segment.end),
+                end=words[-1].end,
-            words=words,
+                words=words,
-        )
+            )
        else:
            segment = segment._replace(
                start=ts_map.get_original_time(segment.start),
                end=ts_map.get_original_time(segment.end),
            )
        yield segment
--- a/faster_whisper/utils.py
+++ b/faster_whisper/utils.py
@@ -1,3 +1,4 @@
 import logging
 import os
 from typing import Optional
@@ -25,6 +26,11 @@ def get_assets_path():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
 def get_logger():
    """Returns the module logger."""
    return logging.getLogger("faster_whisper")
 def download_model(size: str, output_dir: Optional[str] = None):
    """Downloads a CTranslate2 Whisper model from the Hugging Face Hub.
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ conversion_requires = get_requirements(
 setup(
    name="faster-whisper",
-    version="0.4.0",
+    version="0.4.1",
    license="MIT",
    description="Faster Whisper transcription with CTranslate2",
    long_description=get_long_description(),
Author	SHA1	Message	Date
heimoshuiyu	9a646b69e6	format code	2023-04-20 02:00:57 +08:00
heimoshuiyu	49af9564ab	Ignore repeated prompt	2023-04-20 01:49:10 +08:00
Guillaume Klein	3adcc12d0f	Clarify that the returned segments value is a generator (#144 ) * Clarify that the returned segments value is a generator * Update README.md	2023-04-13 09:50:53 +02:00
Ewald Enzinger	2b53dee6b6	Expose download location in WhisperModel constructor (#126 ) This increases compatibility with OpenAI Whisper's whisper.load_model() and is useful for downstream integrations	2023-04-08 10:02:36 +02:00
Bekir Bakar	06d24056e9	Configure ignore for more files. (#122 )	2023-04-06 19:13:09 +02:00
Guillaume Klein	e9a082dcf2	Keep segment timestamps aligned with words timestamps after VAD (#119 )	2023-04-06 11:54:40 +02:00
Guillaume Klein	051b3350e5	Add some info and debug logs (#113 )	2023-04-05 16:57:59 +02:00
Guillaume Klein	746f2698db	Bump version to 0.4.1	2023-04-04 12:16:23 +02:00
Guillaume Klein	a5d03e55fa	Prevent out of range error in method split_tokens_on_unicode (#111 )	2023-04-04 10:51:14 +02:00
Guillaume Klein	9fa1989073	Revert "Prevent out of range error in method split_tokens_on_unicode" This reverts commit `36160c1e7e`.	2023-04-04 10:25:41 +02:00