Revert "Merge remote-tracking branch 'upstream/master' into prompt"

This reverts commit 6e42088656, reversing changes made to 4a59bb011d.
2024-09-12 00:49:31 +08:00
parent 6e42088656
commit 28a4d11a73
13 changed files with 423 additions and 1599 deletions
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -2,17 +2,9 @@ import bisect
 import functools
 import os

-from abc import ABC
-from collections.abc import Callable
-from typing import List, NamedTuple, Optional, Union
+from typing import List, NamedTuple, Optional

 import numpy as np
-import torch
-
-from pyannote.audio.core.io import AudioFile
-from pyannote.audio.pipelines import VoiceActivityDetection
-from pyannote.audio.pipelines.utils import PipelineModel
-from pyannote.core import Annotation, Segment, SlidingWindowFeature

 from faster_whisper.utils import get_assets_path

@@ -43,7 +35,7 @@ class VadOptions(NamedTuple):


 def get_speech_timestamps(
-    audio: torch.Tensor,
+    audio: np.ndarray,
    vad_options: Optional[VadOptions] = None,
    **kwargs,
 ) -> List[dict]:
@@ -184,12 +176,12 @@ def get_speech_timestamps(
    return speeches


-def collect_chunks(audio: torch.Tensor, chunks: List[dict]) -> torch.Tensor:
+def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
    """Collects and concatenates audio chunks."""
    if not chunks:
-        return torch.tensor([], dtype=torch.float32)
+        return np.array([], dtype=np.float32)

-    return torch.cat([audio[chunk["start"] : chunk["end"]] for chunk in chunks])
+    return np.concatenate([audio[chunk["start"] : chunk["end"]] for chunk in chunks])


 class SpeechTimestampsMap:
@@ -284,313 +276,3 @@ class SileroVADModel:
        context = x[..., -64:]

        return out, state, context
-
-
-# BSD 2-Clause License
-
-# Copyright (c) 2024, Max Bain
-
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-# The code below is copied from whisper-x (https://github.com/m-bain/whisperX)
-# and adapted for faster_whisper.
-class SegmentX:
-    def __init__(self, start, end, speaker=None):
-        self.start = start
-        self.end = end
-        self.speaker = speaker
-
-
-class VoiceActivitySegmentation(VoiceActivityDetection, ABC):
-    """Pipeline wrapper class for Voice Activity Segmentation based on VAD scores."""
-
-    def __init__(
-        self,
-        segmentation: PipelineModel = "pyannote/segmentation",
-        device: Optional[Union[str, torch.device]] = None,
-        fscore: bool = False,
-        use_auth_token: Optional[str] = None,
-        **inference_kwargs,
-    ):
-        """Initialize the pipeline with the model name and the optional device.
-
-        Args:
-            dict parameters of VoiceActivityDetection class from pyannote:
-            segmentation (PipelineModel): Loaded model name.
-            device (torch.device or None): Device to perform the segmentation.
-            fscore (bool): Flag indicating whether to compute F-score during inference.
-            use_auth_token (str or None): Optional authentication token for model access.
-            inference_kwargs (dict):  Additional arguments from VoiceActivityDetection pipeline.
-        """
-        super().__init__(
-            segmentation=segmentation,
-            device=device,
-            fscore=fscore,
-            use_auth_token=use_auth_token,
-            **inference_kwargs,
-        )
-
-    def apply(
-        self, file: AudioFile, hook: Optional[Callable] = None
-    ) -> SlidingWindowFeature:
-        """Apply voice activity detection on the audio file.
-
-        Args:
-            file (AudioFile): Processed file.
-            hook (callable): Hook called with signature: hook("step_name", step_artefact, file=file)
-
-        Returns:
-            segmentations (SlidingWindowFeature): Voice activity segmentation.
-        """
-        # setup hook (e.g. for debugging purposes)
-        hook = self.setup_hook(file, hook=hook)
-
-        # apply segmentation model if needed
-        # output shape is (num_chunks, num_frames, 1)
-        if self.training:
-            if self.CACHED_SEGMENTATION in file:
-                segmentations = file[self.CACHED_SEGMENTATION]
-            else:
-                segmentations = self._segmentation(file)
-                file[self.CACHED_SEGMENTATION] = segmentations
-        else:
-            segmentations: SlidingWindowFeature = self._segmentation(file)
-
-        return segmentations
-
-
-class BinarizeVadScores:
-    """Binarize detection scores using hysteresis thresholding.
-
-    Reference:
-        Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
-        RNN-based Voice Activity Detection", InterSpeech 2015.
-
-        Modified by Max Bain to include WhisperX's min-cut operation
-        https://arxiv.org/abs/2303.00747
-
-    """
-
-    def __init__(
-        self,
-        onset: float = 0.5,
-        offset: Optional[float] = None,
-        min_duration_on: float = 0.0,
-        min_duration_off: float = 0.0,
-        pad_onset: float = 0.0,
-        pad_offset: float = 0.0,
-        max_duration: float = float("inf"),
-    ):
-        """Initializes the parameters for Binarizing the VAD scores.
-
-        Args:
-            onset (float, optional):
-                Onset threshold. Defaults to 0.5.
-            offset (float, optional):
-                Offset threshold. Defaults to `onset`.
-            min_duration_on (float, optional):
-                Remove active regions shorter than that many seconds. Defaults to 0s.
-            min_duration_off (float, optional):
-                Fill inactive regions shorter than that many seconds. Defaults to 0s.
-            pad_onset (float, optional):
-                Extend active regions by moving their start time by that many seconds.
-                Defaults to 0s.
-            pad_offset (float, optional):
-                Extend active regions by moving their end time by that many seconds.
-                Defaults to 0s.
-            max_duration (float):
-                The maximum length of an active segment.
-        """
-        super().__init__()
-
-        self.onset = onset
-        self.offset = offset or onset
-
-        self.pad_onset = pad_onset
-        self.pad_offset = pad_offset
-
-        self.min_duration_on = min_duration_on
-        self.min_duration_off = min_duration_off
-
-        self.max_duration = max_duration
-
-    def __get_active_regions(self, scores: SlidingWindowFeature) -> Annotation:
-        """Extract active regions from VAD scores.
-
-        Args:
-            scores (SlidingWindowFeature): Detection scores.
-
-        Returns:
-            active (Annotation): Active regions.
-        """
-        num_frames, num_classes = scores.data.shape
-        frames = scores.sliding_window
-        timestamps = [frames[i].middle for i in range(num_frames)]
-        # annotation meant to store 'active' regions
-        active = Annotation()
-        for k, k_scores in enumerate(scores.data.T):
-            label = k if scores.labels is None else scores.labels[k]
-
-            # initial state
-            start = timestamps[0]
-            is_active = k_scores[0] > self.onset
-            curr_scores = [k_scores[0]]
-            curr_timestamps = [start]
-            t = start
-            # optionally add `strict=False` for python 3.10 or later
-            for t, y in zip(timestamps[1:], k_scores[1:]):
-                # currently active
-                if is_active:
-                    curr_duration = t - start
-                    if curr_duration > self.max_duration:
-                        search_after = len(curr_scores) // 2
-                        # divide segment
-                        min_score_div_idx = search_after + np.argmin(
-                            curr_scores[search_after:]
-                        )
-                        min_score_t = curr_timestamps[min_score_div_idx]
-                        region = Segment(
-                            start - self.pad_onset, min_score_t + self.pad_offset
-                        )
-                        active[region, k] = label
-                        start = curr_timestamps[min_score_div_idx]
-                        curr_scores = curr_scores[min_score_div_idx + 1 :]
-                        curr_timestamps = curr_timestamps[min_score_div_idx + 1 :]
-                    # switching from active to inactive
-                    elif y < self.offset:
-                        region = Segment(start - self.pad_onset, t + self.pad_offset)
-                        active[region, k] = label
-                        start = t
-                        is_active = False
-                        curr_scores = []
-                        curr_timestamps = []
-                    curr_scores.append(y)
-                    curr_timestamps.append(t)
-                # currently inactive
-                else:
-                    # switching from inactive to active
-                    if y > self.onset:
-                        start = t
-                        is_active = True
-
-            # if active at the end, add final region
-            if is_active:
-                region = Segment(start - self.pad_onset, t + self.pad_offset)
-                active[region, k] = label
-
-        return active
-
-    def __call__(self, scores: SlidingWindowFeature) -> Annotation:
-        """Binarize detection scores.
-
-        Args:
-            scores (SlidingWindowFeature): Detection scores.
-
-        Returns:
-            active (Annotation): Binarized scores.
-        """
-        active = self.__get_active_regions(scores)
-        # because of padding, some active regions might be overlapping: merge them.
-        # also: fill same speaker gaps shorter than min_duration_off
-        if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
-            if self.max_duration < float("inf"):
-                raise NotImplementedError("This would break current max_duration param")
-            active = active.support(collar=self.min_duration_off)
-
-        # remove tracks shorter than min_duration_on
-        if self.min_duration_on > 0:
-            for segment, track in list(active.itertracks()):
-                if segment.duration < self.min_duration_on:
-                    del active[segment, track]
-
-        return active
-
-
-def merge_chunks(
-    segments,
-    chunk_length,
-    onset: float = 0.5,
-    offset: Optional[float] = None,
-    edge_padding: float = 0.1,
-):
-    """
-    Merge operation described in whisper-x paper
-    """
-    curr_end = 0
-    merged_segments = []
-    seg_idxs = []
-    speaker_idxs = []
-
-    assert chunk_length > 0
-    binarize = BinarizeVadScores(max_duration=chunk_length, onset=onset, offset=offset)
-    segments = binarize(segments)
-    segments_list = []
-    for speech_turn in segments.get_timeline():
-        segments_list.append(
-            SegmentX(
-                max(0.0, speech_turn.start - edge_padding),
-                speech_turn.end + edge_padding,
-                "UNKNOWN",
-            )
-        )  # 100ms edge padding to account for edge errors
-
-    if len(segments_list) == 0:
-        print("No active speech found in audio")
-        return []
-
-    # Make sur the starting point is the start of the segment.
-    curr_start = segments_list[0].start
-
-    for idx, seg in enumerate(segments_list):
-        # if any segment start timing is less than previous segment end timing,
-        # reset the edge padding. Similarly for end timing.
-        if idx > 0:
-            if seg.start < segments_list[idx - 1].end:
-                seg.start += edge_padding
-        if idx < len(segments_list) - 1:
-            if seg.end > segments_list[idx + 1].start:
-                seg.end -= edge_padding
-
-        if seg.end - curr_start > chunk_length and curr_end - curr_start > 0:
-            merged_segments.append(
-                {
-                    "start": curr_start,
-                    "end": curr_end,
-                    "segments": seg_idxs,
-                }
-            )
-            curr_start = seg.start
-            seg_idxs = []
-            speaker_idxs = []
-        curr_end = seg.end
-        seg_idxs.append((seg.start, seg.end))
-        speaker_idxs.append(seg.speaker)
-    # add final
-    merged_segments.append(
-        {
-            "start": curr_start,
-            "end": curr_end,
-            "segments": seg_idxs,
-        }
-    )
-    return merged_segments