Merge branch 'master' into prompt
@@ -170,7 +170,7 @@ segments, info = model.transcribe("audio.mp3", beam_size=5,
     language="en", max_new_tokens=128, condition_on_previous_text=False)

 ```
-NOTE: emprically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too.
+NOTE: Empirically, `condition_on_previous_text=True` will degrade the performance of `faster-distil-whisper` for long audio. Degradation on the first chunk was observed with `initial_prompt` too.

 ### Word-level timestamps

@@ -219,6 +219,8 @@ See more model and transcription options in the [`WhisperModel`](https://github.

 Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!

+
+* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment
 * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
 * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
 * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS.
@@ -228,10 +230,11 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
 * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
 * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
 * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
+* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.

 ## Model conversion

-When loading a model from its size such as `WhisperModel("large-v3")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).

 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.

@@ -14,7 +14,7 @@ import tokenizers
 from faster_whisper.audio import decode_audio
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
-from faster_whisper.utils import download_model, format_timestamp, get_logger
+from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger
 from faster_whisper.vad import (
     SpeechTimestampsMap,
     VadOptions,
@@ -67,6 +67,8 @@ class TranscriptionOptions(NamedTuple):
     prepend_punctuations: str
     append_punctuations: str
     max_new_tokens: Optional[int]
+    clip_timestamps: Union[str, List[float]]
+    hallucination_silence_threshold: Optional[float]


 class TranscriptionInfo(NamedTuple):
@@ -216,6 +218,8 @@ class WhisperModel:
         vad_parameters: Optional[Union[dict, VadOptions]] = None,
         max_new_tokens: Optional[int] = None,
         chunk_length: Optional[int] = None,
+        clip_timestamps: Union[str, List[float]] = "0",
+        hallucination_silence_threshold: Optional[float] = None,
     ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
         """Transcribes an input file.

@@ -271,6 +275,12 @@ class WhisperModel:
             the maximum will be set by the default max_length.
           chunk_length: The length of audio segments. If it is not None, it will overwrite the
             default chunk_length of the FeatureExtractor.
+          clip_timestamps: Union[str, List[float]]
+            Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
+            process. The last end timestamp defaults to the end of the file.
+          hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this threshold
+            (in seconds) when a possible hallucination is detected

         Returns:
           A tuple with:
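As a quick illustration of the two arguments documented above, here is a minimal usage sketch; the file name, clip ranges and threshold value are placeholders for the example, not values taken from this commit:

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3")

# Decode only 0s-30s and 60s-90s of the file, and skip long silences
# (here 2 seconds) around windows that look like hallucinations.
segments, info = model.transcribe(
    "audio.mp3",                          # placeholder path
    word_timestamps=True,                 # hallucination_silence_threshold needs word timestamps
    clip_timestamps="0,30,60,90",         # start,end,start,end,... in seconds
    hallucination_silence_threshold=2.0,  # example value, not a recommended default
)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```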
@@ -387,6 +397,8 @@ class WhisperModel:
             prepend_punctuations=prepend_punctuations,
             append_punctuations=append_punctuations,
             max_new_tokens=max_new_tokens,
+            clip_timestamps=clip_timestamps,
+            hallucination_silence_threshold=hallucination_silence_threshold,
         )

         segments = self.generate_segments(features, tokenizer, options, encoder_output)
@@ -414,8 +426,33 @@ class WhisperModel:
         encoder_output: Optional[ctranslate2.StorageView] = None,
     ) -> Iterable[Segment]:
         content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
+        content_duration = float(content_frames * self.feature_extractor.time_per_frame)
+
+        if isinstance(options.clip_timestamps, str):
+            TranscriptionOptions.clip_timestamps = [
+                float(ts)
+                for ts in (
+                    options.clip_timestamps.split(",")
+                    if options.clip_timestamps
+                    else []
+                )
+            ]
+        seek_points: List[int] = [
+            round(ts * self.frames_per_second) for ts in options.clip_timestamps
+        ]
+        if len(seek_points) == 0:
+            seek_points.append(0)
+        if len(seek_points) % 2 == 1:
+            seek_points.append(content_frames)
+        seek_clips: List[Tuple[int, int]] = list(
+            zip(seek_points[::2], seek_points[1::2])
+        )
+
+        punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
+
         idx = 0
-        seek = 0
+        clip_idx = 0
+        seek = seek_clips[clip_idx][0]
         all_tokens = []
         all_prompt_text = []
         prompt_reset_since = 0
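To make the new clip bookkeeping concrete, here is a small standalone sketch of how a `clip_timestamps` string becomes `(start, end)` frame clips; `frames_per_second=100` is assumed purely for illustration (in the model it is derived from the feature extractor settings). Note that the hunk itself stores the parsed list by assigning to the `TranscriptionOptions` class attribute; the sketch keeps it local only so it can run on its own.

```python
from typing import List, Tuple


def parse_seek_clips(
    clip_timestamps: str, content_frames: int, frames_per_second: float = 100.0
) -> List[Tuple[int, int]]:
    # "30,90" -> [30.0, 90.0]; "" -> []
    timestamps = [float(ts) for ts in clip_timestamps.split(",")] if clip_timestamps else []
    seek_points = [round(ts * frames_per_second) for ts in timestamps]
    if len(seek_points) == 0:
        seek_points.append(0)  # default clip starts at the beginning of the audio
    if len(seek_points) % 2 == 1:
        seek_points.append(content_frames)  # an open-ended clip runs to the end of the file
    return list(zip(seek_points[::2], seek_points[1::2]))


print(parse_seek_clips("0", 30000))          # [(0, 30000)] -- the default "0" covers everything
print(parse_seek_clips("30,90,120", 30000))  # [(3000, 9000), (12000, 30000)]
```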
@@ -429,12 +466,32 @@ class WhisperModel:
                 all_tokens.extend(options.initial_prompt)

         last_speech_timestamp = 0.0
-        while seek < content_frames:
+        # NOTE: This loop is obscurely flattened to make the diff readable.
+        # A later commit should turn this into a simpler nested loop.
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end
+        while clip_idx < len(seek_clips):
+            seek_clip_start, seek_clip_end = seek_clips[clip_idx]
+            if seek_clip_end > content_frames:
+                seek_clip_end = content_frames
+            if seek < seek_clip_start:
+                seek = seek_clip_start
+            if seek >= seek_clip_end:
+                clip_idx += 1
+                if clip_idx < len(seek_clips):
+                    seek = seek_clips[clip_idx][0]
+                continue
             time_offset = seek * self.feature_extractor.time_per_frame
-            segment = features[:, seek : seek + self.feature_extractor.nb_max_frames]
-            segment_size = min(
-                self.feature_extractor.nb_max_frames, content_frames - seek
+            window_end_time = float(
+                (seek + self.feature_extractor.nb_max_frames)
+                * self.feature_extractor.time_per_frame
             )
+            segment_size = min(
+                self.feature_extractor.nb_max_frames,
+                content_frames - seek,
+                seek_clip_end - seek,
+            )
+            segment = features[:, seek : seek + segment_size]
             segment_duration = segment_size * self.feature_extractor.time_per_frame

             if self.logger.isEnabledFor(logging.DEBUG):
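The NOTE in the hunk above says the loop was deliberately flattened and sketches the nested shape it stands in for. A runnable toy version of that nested shape, with invented frame counts and none of the actual encoding or decoding, could look like this:

```python
# Toy version of the nested loop the NOTE describes: iterate clips, then slide
# a window inside each clip.  All numbers are invented for illustration.
nb_max_frames = 3000  # e.g. a 30 s window at 100 frames per second
content_frames = 7000
seek_clips = [(0, 4000), (5000, 7000)]

seek = seek_clips[0][0]
for seek_clip_start, seek_clip_end in seek_clips:
    seek_clip_end = min(seek_clip_end, content_frames)
    seek = max(seek, seek_clip_start)
    while seek < seek_clip_end:
        segment_size = min(nb_max_frames, content_frames - seek, seek_clip_end - seek)
        print(f"decode frames [{seek}, {seek + segment_size})")
        # the real loop advances `seek` from decoded timestamps; the toy just
        # jumps to the end of the current window
        seek += segment_size
```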
@@ -487,10 +544,33 @@ class WhisperModel:
             previous_seek = seek
             current_segments = []

+            # anomalous words are very long/short/improbable
+            def word_anomaly_score(word: dict) -> float:
+                probability = word.get("probability", 0.0)
+                duration = word["end"] - word["start"]
+                score = 0.0
+                if probability < 0.15:
+                    score += 1.0
+                if duration < 0.133:
+                    score += (0.133 - duration) * 15
+                if duration > 2.0:
+                    score += duration - 2.0
+                return score
+
+            def is_segment_anomaly(segment: Optional[dict]) -> bool:
+                if segment is None or not segment["words"]:
+                    return False
+                words = [w for w in segment["words"] if w["word"] not in punctuation]
+                words = words[:8]
+                score = sum(word_anomaly_score(w) for w in words)
+                return score >= 3 or score + 0.01 >= len(words)
+
+            def next_words_segment(segments: List[dict]) -> Optional[dict]:
+                return next((s for s in segments if s["words"]), None)
+
             single_timestamp_ending = (
                 len(tokens) >= 2
-                and tokens[-2] < tokenizer.timestamp_begin
-                and tokens[-1] >= tokenizer.timestamp_begin
+                and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1]
             )

             consecutive_timestamps = [
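To see how the new word-level heuristic behaves, here is a tiny self-contained example; the probabilities and timings are invented:

```python
def word_anomaly_score(word: dict) -> float:
    # same scoring rules as the hunk above
    probability = word.get("probability", 0.0)
    duration = word["end"] - word["start"]
    score = 0.0
    if probability < 0.15:
        score += 1.0
    if duration < 0.133:
        score += (0.133 - duration) * 15
    if duration > 2.0:
        score += duration - 2.0
    return score


# an improbable, suspiciously short word scores high ...
print(word_anomaly_score({"probability": 0.05, "start": 10.00, "end": 10.05}))  # ~2.245
# ... while a confident word of ordinary length scores 0
print(word_anomaly_score({"probability": 0.90, "start": 10.00, "end": 10.40}))  # 0.0
```

`is_segment_anomaly` then flags a segment when its first non-punctuation words (at most eight) accumulate a score of 3 or more, or when the score roughly reaches one point per scored word.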
@@ -573,18 +653,70 @@ class WhisperModel:
                     last_speech_timestamp=last_speech_timestamp,
                 )

-                word_end_timestamps = [
-                    w["end"] for s in current_segments for w in s["words"]
-                ]
-                if len(word_end_timestamps) > 0:
-                    last_speech_timestamp = word_end_timestamps[-1]
-                if not single_timestamp_ending and len(word_end_timestamps) > 0:
-                    seek_shift = round(
-                        (word_end_timestamps[-1] - time_offset) * self.frames_per_second
-                    )
+                if not single_timestamp_ending:
+                    last_word_end = get_end(current_segments)
+                    if last_word_end is not None and last_word_end > time_offset:
+                        seek = round(last_word_end * self.frames_per_second)

-                    if seek_shift > 0:
-                        seek = previous_seek + seek_shift
+                # skip silence before possible hallucinations
+                if options.hallucination_silence_threshold is not None:
+                    threshold = options.hallucination_silence_threshold
+                    if not single_timestamp_ending:
+                        last_word_end = get_end(current_segments)
+                        if last_word_end is not None and last_word_end > time_offset:
+                            remaining_duration = window_end_time - last_word_end
+                            if remaining_duration > threshold:
+                                seek = round(last_word_end * self.frames_per_second)
+                            else:
+                                seek = previous_seek + segment_size
+
+                    # if first segment might be a hallucination, skip leading silence
+                    first_segment = next_words_segment(current_segments)
+                    if first_segment is not None and is_segment_anomaly(first_segment):
+                        gap = first_segment["start"] - time_offset
+                        if gap > threshold:
+                            seek = previous_seek + round(gap * self.frames_per_second)
+                            continue
+
+                    # skip silence before any possible hallucination that is surrounded
+                    # by silence or more hallucinations
+                    hal_last_end = last_speech_timestamp
+                    for si in range(len(current_segments)):
+                        segment = current_segments[si]
+                        if not segment["words"]:
+                            continue
+                        if is_segment_anomaly(segment):
+                            next_segment = next_words_segment(
+                                current_segments[si + 1 :]
+                            )
+                            if next_segment is not None:
+                                hal_next_start = next_segment["words"][0]["start"]
+                            else:
+                                hal_next_start = time_offset + segment_duration
+                            silence_before = (
+                                segment["start"] - hal_last_end > threshold
+                                or segment["start"] < threshold
+                                or segment["start"] - time_offset < 2.0
+                            )
+                            silence_after = (
+                                hal_next_start - segment["end"] > threshold
+                                or is_segment_anomaly(next_segment)
+                                or window_end_time - segment["end"] < 2.0
+                            )
+                            if silence_before and silence_after:
+                                seek = round(
+                                    max(time_offset + 1, segment["start"])
+                                    * self.frames_per_second
+                                )
+                                if content_duration - segment["end"] < threshold:
+                                    seek = content_frames
+                                current_segments[si:] = []
+                                break
+                        hal_last_end = segment["end"]
+
+                last_word_end = get_end(current_segments)
+                if last_word_end is not None:
+                    last_speech_timestamp = last_word_end

             for segment in current_segments:
                 tokens = segment["tokens"]
@@ -828,6 +960,7 @@ class WhisperModel:
         word_durations = np.array([word["end"] - word["start"] for word in alignment])
         word_durations = word_durations[word_durations.nonzero()]
         median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
+        median_duration = min(0.7, float(median_duration))
         max_duration = median_duration * 2

         # hack: truncate long words at sentence boundaries.
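The single added line above caps the median word duration that the truncation heuristic is based on; a toy calculation with an invented median:

```python
median_duration = min(0.7, float(1.3))  # invented 1.3 s median is clamped to 0.7 s
max_duration = median_duration * 2      # 1.4 s instead of 2.6 s
print(median_duration, max_duration)
```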
@@ -146,3 +146,10 @@ class disabled_tqdm(tqdm):
     def __init__(self, *args, **kwargs):
         kwargs["disable"] = True
         super().__init__(*args, **kwargs)
+
+
+def get_end(segments: List[dict]) -> Optional[float]:
+    return next(
+        (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
+        segments[-1]["end"] if segments else None,
+    )
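A quick illustration of the new `get_end` helper; the segment dictionaries are trimmed to the fields it reads, with invented timestamps:

```python
from typing import List, Optional


def get_end(segments: List[dict]) -> Optional[float]:
    # same helper as added above
    return next(
        (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
        segments[-1]["end"] if segments else None,
    )


segments = [
    {"end": 4.0, "words": [{"end": 3.2}, {"end": 3.9}]},
    {"end": 7.5, "words": []},  # last segment has no word timestamps
]
print(get_end(segments))  # 3.9 -- the end of the last word, scanning from the back
print(get_end([]))        # None
```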
@@ -1,3 +1,3 @@
 """Version information."""

-__version__ = "0.10.0"
+__version__ = "1.0.0"
@@ -1,5 +1,5 @@
-av==10.*
-ctranslate2>=3.22,<4
+av==11.*
+ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<0.16
 onnxruntime>=1.14,<2