Merge branch 'master' into prompt

2024-02-23 10:52:53 +08:00
5 changed files with 167 additions and 24 deletions

README.md

@@ -170,7 +170,7 @@ segments, info = model.transcribe("audio.mp3", beam_size=5,
language="en", max_new_tokens=128, condition_on_previous_text=False) language="en", max_new_tokens=128, condition_on_previous_text=False)
``` ```
NOTE: emprically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too. NOTE: Empirically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too.
### Word-level timestamps ### Word-level timestamps
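For readers skimming the diff, a self-contained version of the snippet this hunk touches might look as follows; the `distil-large-v2` model name and the CUDA settings are assumptions for illustration, not part of the change:

```python
from faster_whisper import WhisperModel

# Model name assumed for illustration; any faster-distil-whisper
# conversion should behave the same way.
model = WhisperModel("distil-large-v2", device="cuda", compute_type="float16")

# Per the NOTE above: keep condition_on_previous_text=False for long audio,
# and expect initial_prompt to degrade the first chunk.
segments, info = model.transcribe(
    "audio.mp3",
    beam_size=5,
    language="en",
    max_new_tokens=128,
    condition_on_previous_text=False,
)
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```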
@@ -219,6 +219,8 @@ See more model and transcription options in the [`WhisperModel`](https://github.
 Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!

+* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment
 * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
 * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
 * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS.
@@ -228,10 +230,11 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
 * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
 * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
 * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
+* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.

 ## Model conversion

-When loading a model from its size such as `WhisperModel("large-v3")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).

 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
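As a hedged sketch of the conversion path this paragraph refers to, CTranslate2's Python converter can be driven directly; the class and arguments below come from the `ctranslate2` package and may differ between versions (`transformers` and `torch` must be installed):

```python
from ctranslate2.converters import TransformersConverter

# Sketch only: convert an original or fine-tuned Transformers Whisper
# checkpoint into a CTranslate2 model directory. Mirrors the
# ct2-transformers-converter CLI; arguments may vary across versions.
converter = TransformersConverter(
    "openai/whisper-large-v3",
    copy_files=["tokenizer.json", "preprocessor_config.json"],
)
converter.convert("whisper-large-v3-ct2", quantization="float16")
```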

faster_whisper/transcribe.py

@@ -14,7 +14,7 @@ import tokenizers
 from faster_whisper.audio import decode_audio
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
-from faster_whisper.utils import download_model, format_timestamp, get_logger
+from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger
 from faster_whisper.vad import (
     SpeechTimestampsMap,
     VadOptions,
@@ -67,6 +67,8 @@ class TranscriptionOptions(NamedTuple):
     prepend_punctuations: str
     append_punctuations: str
     max_new_tokens: Optional[int]
+    clip_timestamps: Union[str, List[float]]
+    hallucination_silence_threshold: Optional[float]


 class TranscriptionInfo(NamedTuple):
@@ -216,6 +218,8 @@ class WhisperModel:
         vad_parameters: Optional[Union[dict, VadOptions]] = None,
         max_new_tokens: Optional[int] = None,
         chunk_length: Optional[int] = None,
+        clip_timestamps: Union[str, List[float]] = "0",
+        hallucination_silence_threshold: Optional[float] = None,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """Transcribes an input file.
@@ -271,6 +275,12 @@ class WhisperModel:
             the maximum will be set by the default max_length.
           chunk_length: The length of audio segments. If it is not None, it will overwrite the
             default chunk_length of the FeatureExtractor.
+          clip_timestamps: Union[str, List[float]]
+            Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
+            process. The last end timestamp defaults to the end of the file.
+          hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this threshold
+            (in seconds) when a possible hallucination is detected

         Returns:
           A tuple with:
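A usage sketch combining the two options documented above (the file name and values are placeholders):

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3")

# Decode only 10s-30s and 60s-to-end of the file, and, with word
# timestamps enabled, skip silent gaps longer than 2s around
# suspected hallucinations.
segments, info = model.transcribe(
    "audio.mp3",
    word_timestamps=True,
    clip_timestamps="10,30,60",
    hallucination_silence_threshold=2.0,
)
```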
@@ -387,6 +397,8 @@ class WhisperModel:
             prepend_punctuations=prepend_punctuations,
             append_punctuations=append_punctuations,
             max_new_tokens=max_new_tokens,
+            clip_timestamps=clip_timestamps,
+            hallucination_silence_threshold=hallucination_silence_threshold,
         )

         segments = self.generate_segments(features, tokenizer, options, encoder_output)
@@ -414,8 +426,33 @@ class WhisperModel:
         encoder_output: Optional[ctranslate2.StorageView] = None,
     ) -> Iterable[Segment]:
         content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
+        content_duration = float(content_frames * self.feature_extractor.time_per_frame)
+
+        if isinstance(options.clip_timestamps, str):
+            TranscriptionOptions.clip_timestamps = [
+                float(ts)
+                for ts in (
+                    options.clip_timestamps.split(",")
+                    if options.clip_timestamps
+                    else []
+                )
+            ]
+        seek_points: List[int] = [
+            round(ts * self.frames_per_second) for ts in options.clip_timestamps
+        ]
+        if len(seek_points) == 0:
+            seek_points.append(0)
+        if len(seek_points) % 2 == 1:
+            seek_points.append(content_frames)
+        seek_clips: List[Tuple[int, int]] = list(
+            zip(seek_points[::2], seek_points[1::2])
+        )
+
+        punctuation = "\"'“¿([{-\"'.。,!?::”)]}、"
+
         idx = 0
-        seek = 0
+        clip_idx = 0
+        seek = seek_clips[clip_idx][0]
         all_tokens = []
         all_prompt_text = []
         prompt_reset_since = 0
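To make the clip parsing above easier to follow, here is a standalone re-derivation; `frames_per_second = 100` is an assumption based on the default 16 kHz / 160-sample-hop feature extractor, not part of the diff:

```python
from typing import List, Tuple

# Assumed: frames_per_second = sampling_rate // hop_length = 16000 // 160.
FRAMES_PER_SECOND = 100

def parse_clips(clip_timestamps: str, content_frames: int) -> List[Tuple[int, int]]:
    timestamps = [float(ts) for ts in clip_timestamps.split(",")] if clip_timestamps else []
    seek_points = [round(ts * FRAMES_PER_SECOND) for ts in timestamps]
    if not seek_points:
        seek_points.append(0)  # default: start at the beginning
    if len(seek_points) % 2 == 1:
        seek_points.append(content_frames)  # last end defaults to end of file
    return list(zip(seek_points[::2], seek_points[1::2]))

# "10,30,60" over 90s of audio -> [(1000, 3000), (6000, 9000)]
print(parse_clips("10,30,60", content_frames=9000))
# The default "0" -> [(0, content_frames)], i.e. the whole file.
print(parse_clips("0", content_frames=9000))
```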
@@ -429,12 +466,32 @@ class WhisperModel:
             all_tokens.extend(options.initial_prompt)

         last_speech_timestamp = 0.0
-        while seek < content_frames:
+        # NOTE: This loop is obscurely flattened to make the diff readable.
+        # A later commit should turn this into a simpler nested loop.
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end
+        while clip_idx < len(seek_clips):
+            seek_clip_start, seek_clip_end = seek_clips[clip_idx]
+            if seek_clip_end > content_frames:
+                seek_clip_end = content_frames
+            if seek < seek_clip_start:
+                seek = seek_clip_start
+            if seek >= seek_clip_end:
+                clip_idx += 1
+                if clip_idx < len(seek_clips):
+                    seek = seek_clips[clip_idx][0]
+                continue
             time_offset = seek * self.feature_extractor.time_per_frame
-            segment = features[:, seek : seek + self.feature_extractor.nb_max_frames]
-            segment_size = min(
-                self.feature_extractor.nb_max_frames, content_frames - seek
-            )
+            window_end_time = float(
+                (seek + self.feature_extractor.nb_max_frames)
+                * self.feature_extractor.time_per_frame
+            )
+            segment_size = min(
+                self.feature_extractor.nb_max_frames,
+                content_frames - seek,
+                seek_clip_end - seek,
+            )
+            segment = features[:, seek : seek + segment_size]
             segment_duration = segment_size * self.feature_extractor.time_per_frame

             if self.logger.isEnabledFor(logging.DEBUG):
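A worked example of the new windowing arithmetic, with fabricated values (`nb_max_frames = 3000`, i.e. a 30s window at 0.01s per frame):

```python
# Fabricated frame counts for illustration only.
seek, content_frames, seek_clip_end = 2500, 9000, 2800
nb_max_frames, time_per_frame = 3000, 0.01

window_end_time = float((seek + nb_max_frames) * time_per_frame)  # 55.0 s
# The clip end now caps the window, in addition to the usual limits:
segment_size = min(nb_max_frames, content_frames - seek, seek_clip_end - seek)  # 300
segment_duration = segment_size * time_per_frame  # 3.0 s
print(window_end_time, segment_size, segment_duration)
```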
@@ -487,10 +544,33 @@ class WhisperModel:
             previous_seek = seek
             current_segments = []

+            # anomalous words are very long/short/improbable
+            def word_anomaly_score(word: dict) -> float:
+                probability = word.get("probability", 0.0)
+                duration = word["end"] - word["start"]
+                score = 0.0
+                if probability < 0.15:
+                    score += 1.0
+                if duration < 0.133:
+                    score += (0.133 - duration) * 15
+                if duration > 2.0:
+                    score += duration - 2.0
+                return score
+
+            def is_segment_anomaly(segment: Optional[dict]) -> bool:
+                if segment is None or not segment["words"]:
+                    return False
+                words = [w for w in segment["words"] if w["word"] not in punctuation]
+                words = words[:8]
+                score = sum(word_anomaly_score(w) for w in words)
+                return score >= 3 or score + 0.01 >= len(words)
+
+            def next_words_segment(segments: List[dict]) -> Optional[dict]:
+                return next((s for s in segments if s["words"]), None)
+
             single_timestamp_ending = (
                 len(tokens) >= 2
-                and tokens[-2] < tokenizer.timestamp_begin
-                and tokens[-1] >= tokenizer.timestamp_begin
+                and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1]
             )

             consecutive_timestamps = [
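The anomaly heuristic above can be sanity-checked in isolation; the two words below are fabricated for illustration:

```python
def word_anomaly_score(word: dict) -> float:
    probability = word.get("probability", 0.0)
    duration = word["end"] - word["start"]
    score = 0.0
    if probability < 0.15:  # improbable word
        score += 1.0
    if duration < 0.133:  # suspiciously short word
        score += (0.133 - duration) * 15
    if duration > 2.0:  # suspiciously long word
        score += duration - 2.0
    return score

confident = {"word": " hello", "probability": 0.9, "start": 1.0, "end": 1.4}
suspicious = {"word": " the", "probability": 0.05, "start": 2.0, "end": 2.02}

print(word_anomaly_score(confident))   # 0.0
print(word_anomaly_score(suspicious))  # 1.0 + (0.133 - 0.02) * 15 = 2.695
```

A segment is then flagged when its first eight non-punctuation words accumulate a score of at least 3, or when nearly every inspected word contributes a full point.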
@@ -573,18 +653,70 @@ class WhisperModel:
                         last_speech_timestamp=last_speech_timestamp,
                     )

-                word_end_timestamps = [
-                    w["end"] for s in current_segments for w in s["words"]
-                ]
-                if len(word_end_timestamps) > 0:
-                    last_speech_timestamp = word_end_timestamps[-1]
-                if not single_timestamp_ending and len(word_end_timestamps) > 0:
-                    seek_shift = round(
-                        (word_end_timestamps[-1] - time_offset) * self.frames_per_second
-                    )
-                    if seek_shift > 0:
-                        seek = previous_seek + seek_shift
+                if not single_timestamp_ending:
+                    last_word_end = get_end(current_segments)
+                    if last_word_end is not None and last_word_end > time_offset:
+                        seek = round(last_word_end * self.frames_per_second)
+
+                # skip silence before possible hallucinations
+                if options.hallucination_silence_threshold is not None:
+                    threshold = options.hallucination_silence_threshold
+                    if not single_timestamp_ending:
+                        last_word_end = get_end(current_segments)
+                        if last_word_end is not None and last_word_end > time_offset:
+                            remaining_duration = window_end_time - last_word_end
+                            if remaining_duration > threshold:
+                                seek = round(last_word_end * self.frames_per_second)
+                            else:
+                                seek = previous_seek + segment_size
+
+                    # if first segment might be a hallucination, skip leading silence
+                    first_segment = next_words_segment(current_segments)
+                    if first_segment is not None and is_segment_anomaly(first_segment):
+                        gap = first_segment["start"] - time_offset
+                        if gap > threshold:
+                            seek = previous_seek + round(gap * self.frames_per_second)
+                            continue
+
+                    # skip silence before any possible hallucination that is surrounded
+                    # by silence or more hallucinations
+                    hal_last_end = last_speech_timestamp
+                    for si in range(len(current_segments)):
+                        segment = current_segments[si]
+                        if not segment["words"]:
+                            continue
+                        if is_segment_anomaly(segment):
+                            next_segment = next_words_segment(
+                                current_segments[si + 1 :]
+                            )
+                            if next_segment is not None:
+                                hal_next_start = next_segment["words"][0]["start"]
+                            else:
+                                hal_next_start = time_offset + segment_duration
+                            silence_before = (
+                                segment["start"] - hal_last_end > threshold
+                                or segment["start"] < threshold
+                                or segment["start"] - time_offset < 2.0
+                            )
+                            silence_after = (
+                                hal_next_start - segment["end"] > threshold
+                                or is_segment_anomaly(next_segment)
+                                or window_end_time - segment["end"] < 2.0
+                            )
+                            if silence_before and silence_after:
+                                seek = round(
+                                    max(time_offset + 1, segment["start"])
+                                    * self.frames_per_second
+                                )
+                                if content_duration - segment["end"] < threshold:
+                                    seek = content_frames
+                                current_segments[si:] = []
+                                break
+                        hal_last_end = segment["end"]
+
+                last_word_end = get_end(current_segments)
+                if last_word_end is not None:
+                    last_speech_timestamp = last_word_end

             for segment in current_segments:
                 tokens = segment["tokens"]
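The surrounded-by-silence test in this hunk, replayed with fabricated numbers; for self-containment the `is_segment_anomaly(next_segment)` clause of `silence_after` is omitted here:

```python
# Fabricated values: threshold = 2.0 s, window covering 30.0-60.0 s of audio.
threshold, time_offset, window_end_time = 2.0, 30.0, 60.0
hal_last_end = 31.0                     # end of last confirmed speech
segment = {"start": 36.0, "end": 37.0}  # anomalous candidate segment
hal_next_start = 41.5                   # start of the next segment with words

silence_before = (
    segment["start"] - hal_last_end > threshold  # 5.0 > 2.0 -> True
    or segment["start"] < threshold
    or segment["start"] - time_offset < 2.0
)
silence_after = (
    hal_next_start - segment["end"] > threshold  # 4.5 > 2.0 -> True
    or window_end_time - segment["end"] < 2.0
)
# Both hold, so decoding would re-seek to max(time_offset + 1, 36.0) seconds
# and drop this segment and everything after it in the window.
print(silence_before and silence_after)
```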
@@ -828,6 +960,7 @@ class WhisperModel:
         word_durations = np.array([word["end"] - word["start"] for word in alignment])
         word_durations = word_durations[word_durations.nonzero()]
         median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
+        median_duration = min(0.7, float(median_duration))
         max_duration = median_duration * 2

         # hack: truncate long words at sentence boundaries.
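The effect of the new clamp, on fabricated durations: without it, a few very long words would drag `max_duration` (and the truncation heuristics that use it) upward:

```python
import numpy as np

# Fabricated alignment durations with two very long words.
word_durations = np.array([0.3, 0.4, 2.5, 3.0])
median_duration = float(np.median(word_durations))  # 1.45 without the clamp
median_duration = min(0.7, median_duration)         # clamped to 0.7
max_duration = median_duration * 2                  # 1.4 s ceiling
print(median_duration, max_duration)
```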

faster_whisper/utils.py

@@ -146,3 +146,10 @@ class disabled_tqdm(tqdm):
     def __init__(self, *args, **kwargs):
         kwargs["disable"] = True
         super().__init__(*args, **kwargs)
+
+
+def get_end(segments: List[dict]) -> Optional[float]:
+    return next(
+        (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
+        segments[-1]["end"] if segments else None,
+    )
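The behavior of the new `get_end` helper, illustrated on fabricated segments: it returns the end of the last word when word timestamps exist, falling back to the last segment's end:

```python
from typing import List, Optional

def get_end(segments: List[dict]) -> Optional[float]:
    return next(
        (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
        segments[-1]["end"] if segments else None,
    )

segs = [
    {"end": 4.0, "words": [{"word": " hi", "end": 1.0}]},
    {"end": 9.0, "words": []},  # no word timestamps here
]
print(get_end(segs))                          # 1.0: last word end, from the back
print(get_end([{"end": 9.0, "words": []}]))   # 9.0: falls back to segment end
print(get_end([]))                            # None: nothing transcribed
```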

faster_whisper/version.py

@@ -1,3 +1,3 @@
"""Version information.""" """Version information."""
__version__ = "0.10.0" __version__ = "1.0.0"

requirements.txt

@@ -1,5 +1,5 @@
-av==10.*
-ctranslate2>=3.22,<4
+av==11.*
+ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<0.16
 onnxruntime>=1.14,<2
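A quick environment check against the new pins (a convenience sketch, not part of the diff; both packages expose `__version__`):

```python
import av
import ctranslate2

print(av.__version__)           # expect 11.x after this change
print(ctranslate2.__version__)  # expect >= 4.0, < 5
```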