Merge remote-tracking branch 'upstream/master' into prompt
@@ -160,7 +160,7 @@ segments, _ = model.transcribe("audio.mp3")
 segments = list(segments) # The transcription will actually run here.
 ```
 ### Faster-distil-whisper
-For usage of `faster-ditil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533
+For usage of `faster-distil-whisper`, please refer to: https://github.com/guillaumekln/faster-whisper/issues/533
 
 ```python
 model_size = "distil-large-v2"
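As context for the README hunk above, a minimal usage sketch of a distil model with faster-whisper. The model name and the `condition_on_previous_text=False` recommendation follow the linked issue #533; treat the exact arguments as illustrative rather than prescribed.

```python
from faster_whisper import WhisperModel

# Sketch only: model size and options are assumptions based on issue #533.
model_size = "distil-large-v2"
model = WhisperModel(model_size, device="cuda", compute_type="float16")

segments, info = model.transcribe(
    "audio.mp3",
    beam_size=5,
    language="en",
    condition_on_previous_text=False,  # recommended for distil checkpoints
)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```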
@@ -102,3 +102,18 @@ def _resample_frames(frames, resampler):
     # Add None to flush the resampler.
     for frame in itertools.chain(frames, [None]):
         yield from resampler.resample(frame)
+
+
+def pad_or_trim(array, length: int, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if array.shape[axis] > length:
+        array = array.take(indices=range(length), axis=axis)
+
+    if array.shape[axis] < length:
+        pad_widths = [(0, 0)] * array.ndim
+        pad_widths[axis] = (0, length - array.shape[axis])
+        array = np.pad(array, pad_widths)
+
+    return array
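The hunk above adds the `pad_or_trim` helper. As a quick standalone illustration of its behavior (the 80 x 3000 log-Mel shape is the usual Whisper encoder input and is used here only as an example):

```python
import numpy as np

from faster_whisper.audio import pad_or_trim  # importable from this module after the change

# A log-Mel segment shorter than the encoder's expected 3000 frames.
mel = np.zeros((80, 1234), dtype=np.float32)

padded = pad_or_trim(mel, 3000)     # zero-pads on the right along the last axis
trimmed = pad_or_trim(padded, 100)  # trims along the last axis

assert padded.shape == (80, 3000)
assert trimmed.shape == (80, 100)
```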
@@ -11,7 +11,7 @@ import ctranslate2
 import numpy as np
 import tokenizers
 
-from faster_whisper.audio import decode_audio
+from faster_whisper.audio import decode_audio, pad_or_trim
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
 from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger
@@ -493,6 +493,7 @@ class WhisperModel:
             )
             segment = features[:, seek : seek + segment_size]
             segment_duration = segment_size * self.feature_extractor.time_per_frame
+            segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames)
 
             if self.logger.isEnabledFor(logging.DEBUG):
                 self.logger.debug(
@@ -661,14 +662,6 @@ class WhisperModel:
                 # skip silence before possible hallucinations
                 if options.hallucination_silence_threshold is not None:
                     threshold = options.hallucination_silence_threshold
-                    if not single_timestamp_ending:
-                        last_word_end = get_end(current_segments)
-                        if last_word_end is not None and last_word_end > time_offset:
-                            remaining_duration = window_end_time - last_word_end
-                            if remaining_duration > threshold:
-                                seek = round(last_word_end * self.frames_per_second)
-                            else:
-                                seek = previous_seek + segment_size
 
                     # if first segment might be a hallucination, skip leading silence
                     first_segment = next_words_segment(current_segments)
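For reference, the option this hunk touches is exposed as a `transcribe()` argument. A hedged sketch of how it is typically enabled (it only takes effect when word timestamps are computed; the threshold value below is illustrative):

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="float16")

# Segments surrounded by long stretches of silence are treated as possible
# hallucinations and skipped; requires word-level timestamps.
segments, info = model.transcribe(
    "audio.mp3",
    word_timestamps=True,
    hallucination_silence_threshold=2.0,  # seconds of silence (illustrative value)
)

for segment in segments:
    print(segment.start, segment.end, segment.text)
```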
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"