From 19c294f978be4991ba303da6af358f6acd251a25 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Mon, 3 Jul 2023 10:20:20 +0200 Subject: [PATCH] Squash long words at window and sentence boundaries (#226) Port commit https://github.com/openai/whisper/commit/255887f219e6b632bc1a6aac1caf28eecfca1bac --- faster_whisper/transcribe.py | 40 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 71b0ea1..b88686e 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -732,9 +732,19 @@ class WhisperModel: word_index += 1 if len(words) > 0: - # adjust the segment-level timestamps based on the word-level timestamps segment["start"] = words[0]["start"] - segment["end"] = words[-1]["end"] + + # hack: prefer the segment-level end timestamp if the last word is too long. + # a better segmentation algorithm based on VAD should be able to replace this. + if ( + segment["end"] > words[-1]["start"] + and segment["end"] + 0.5 < words[-1]["end"] + ): + # adjust the word-level timestamps based on the segment-level timestamps + words[-1]["end"] = segment["end"] + else: + # adjust the segment-level timestamps based on the word-level timestamps + segment["end"] = words[-1]["end"] segment["words"] = words @@ -779,20 +789,30 @@ class WhisperModel: for i, j in zip(word_boundaries[:-1], word_boundaries[1:]) ] - # hack: ensure the first and second word is not longer than twice the median word duration. + # hack: truncate long words at the start of a window and the start of a sentence. # a better segmentation algorithm based on VAD should be able to replace this. word_durations = end_times - start_times word_durations = word_durations[word_durations.nonzero()] if len(word_durations) > 0: median_duration = np.median(word_durations) max_duration = median_duration * 2 - if len(word_durations) >= 2 and word_durations[1] > max_duration: - boundary = max(end_times[2] / 2, end_times[2] - max_duration) - end_times[0] = start_times[1] = boundary - if ( - len(word_durations) >= 1 - and end_times[0] - start_times[0] > max_duration - ): + sentence_end_marks = ".。!!??" + # ensure words at sentence boundaries are not longer than twice the median + # word duration. + for i in range(1, len(start_times)): + if end_times[i] - start_times[i] > max_duration: + if words[i] in sentence_end_marks: + end_times[i] = start_times[i] + max_duration + elif words[i - 1] in sentence_end_marks: + start_times[i] = end_times[i] - max_duration + # ensure the first and second word is not longer than twice the median word duration. + if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration: + if ( + len(start_times) > 1 + and end_times[1] - start_times[1] > max_duration + ): + boundary = max(end_times[1] / 2, end_times[1] - max_duration) + end_times[0] = start_times[1] = boundary start_times[0] = max(0, end_times[0] - max_duration) return [