From 19c294f978be4991ba303da6af358f6acd251a25 Mon Sep 17 00:00:00 2001
From: Guillaume Klein <guillaumekln@users.noreply.github.com>
Date: Mon, 3 Jul 2023 10:20:20 +0200
Subject: [PATCH] Squash long words at window and sentence boundaries (#226)

Port commit https://github.com/openai/whisper/commit/255887f219e6b632bc1a6aac1caf28eecfca1bac
---
 faster_whisper/transcribe.py | 40 +++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 71b0ea1..b88686e 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -732,9 +732,19 @@ class WhisperModel:
                 word_index += 1
 
             if len(words) > 0:
-                # adjust the segment-level timestamps based on the word-level timestamps
                 segment["start"] = words[0]["start"]
-                segment["end"] = words[-1]["end"]
+
+                # hack: prefer the segment-level end timestamp if the last word is too long.
+                # a better segmentation algorithm based on VAD should be able to replace this.
+                if (
+                    segment["end"] > words[-1]["start"]
+                    and segment["end"] + 0.5 < words[-1]["end"]
+                ):
+                    # adjust the word-level timestamps based on the segment-level timestamps
+                    words[-1]["end"] = segment["end"]
+                else:
+                    # adjust the segment-level timestamps based on the word-level timestamps
+                    segment["end"] = words[-1]["end"]
 
             segment["words"] = words
 
@@ -779,20 +789,30 @@ class WhisperModel:
             for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
         ]
 
-        # hack: ensure the first and second word is not longer than twice the median word duration.
+        # hack: truncate long words at the start of a window and the start of a sentence.
         # a better segmentation algorithm based on VAD should be able to replace this.
         word_durations = end_times - start_times
         word_durations = word_durations[word_durations.nonzero()]
         if len(word_durations) > 0:
             median_duration = np.median(word_durations)
             max_duration = median_duration * 2
-            if len(word_durations) >= 2 and word_durations[1] > max_duration:
-                boundary = max(end_times[2] / 2, end_times[2] - max_duration)
-                end_times[0] = start_times[1] = boundary
-            if (
-                len(word_durations) >= 1
-                and end_times[0] - start_times[0] > max_duration
-            ):
+            sentence_end_marks = ".。!！?？"
+            # ensure words at sentence boundaries are not longer than twice the median
+            # word duration.
+            for i in range(1, len(start_times)):
+                if end_times[i] - start_times[i] > max_duration:
+                    if words[i] in sentence_end_marks:
+                        end_times[i] = start_times[i] + max_duration
+                    elif words[i - 1] in sentence_end_marks:
+                        start_times[i] = end_times[i] - max_duration
+            # ensure the first and second words are not longer than twice the median word duration.
+            if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration:
+                if (
+                    len(start_times) > 1
+                    and end_times[1] - start_times[1] > max_duration
+                ):
+                    boundary = max(end_times[1] / 2, end_times[1] - max_duration)
+                    end_times[0] = start_times[1] = boundary
                 start_times[0] = max(0, end_times[0] - max_duration)
 
         return [