Squash long words at window and sentence boundaries (#226)
Port commit 255887f219
This commit is contained in:
@@ -732,9 +732,19 @@ class WhisperModel:
|
|||||||
word_index += 1
|
word_index += 1
|
||||||
|
|
||||||
if len(words) > 0:
|
if len(words) > 0:
|
||||||
# adjust the segment-level timestamps based on the word-level timestamps
|
|
||||||
segment["start"] = words[0]["start"]
|
segment["start"] = words[0]["start"]
|
||||||
segment["end"] = words[-1]["end"]
|
|
||||||
|
# hack: prefer the segment-level end timestamp if the last word is too long.
|
||||||
|
# a better segmentation algorithm based on VAD should be able to replace this.
|
||||||
|
if (
|
||||||
|
segment["end"] > words[-1]["start"]
|
||||||
|
and segment["end"] + 0.5 < words[-1]["end"]
|
||||||
|
):
|
||||||
|
# adjust the word-level timestamps based on the segment-level timestamps
|
||||||
|
words[-1]["end"] = segment["end"]
|
||||||
|
else:
|
||||||
|
# adjust the segment-level timestamps based on the word-level timestamps
|
||||||
|
segment["end"] = words[-1]["end"]
|
||||||
|
|
||||||
segment["words"] = words
|
segment["words"] = words
|
||||||
|
|
||||||
@@ -779,20 +789,30 @@ class WhisperModel:
|
|||||||
for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
|
for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
|
||||||
]
|
]
|
||||||
|
|
||||||
# hack: ensure the first and second words are not longer than twice the median word duration.
|
# hack: truncate long words at the start of a window and the start of a sentence.
|
||||||
# a better segmentation algorithm based on VAD should be able to replace this.
|
# a better segmentation algorithm based on VAD should be able to replace this.
|
||||||
word_durations = end_times - start_times
|
word_durations = end_times - start_times
|
||||||
word_durations = word_durations[word_durations.nonzero()]
|
word_durations = word_durations[word_durations.nonzero()]
|
||||||
if len(word_durations) > 0:
|
if len(word_durations) > 0:
|
||||||
median_duration = np.median(word_durations)
|
median_duration = np.median(word_durations)
|
||||||
max_duration = median_duration * 2
|
max_duration = median_duration * 2
|
||||||
if len(word_durations) >= 2 and word_durations[1] > max_duration:
|
sentence_end_marks = ".。!!??"
|
||||||
boundary = max(end_times[2] / 2, end_times[2] - max_duration)
|
# ensure words at sentence boundaries are not longer than twice the median
|
||||||
end_times[0] = start_times[1] = boundary
|
# word duration.
|
||||||
if (
|
for i in range(1, len(start_times)):
|
||||||
len(word_durations) >= 1
|
if end_times[i] - start_times[i] > max_duration:
|
||||||
and end_times[0] - start_times[0] > max_duration
|
if words[i] in sentence_end_marks:
|
||||||
):
|
end_times[i] = start_times[i] + max_duration
|
||||||
|
elif words[i - 1] in sentence_end_marks:
|
||||||
|
start_times[i] = end_times[i] - max_duration
|
||||||
|
# ensure the first and second words are not longer than twice the median word duration.
|
||||||
|
if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration:
|
||||||
|
if (
|
||||||
|
len(start_times) > 1
|
||||||
|
and end_times[1] - start_times[1] > max_duration
|
||||||
|
):
|
||||||
|
boundary = max(end_times[1] / 2, end_times[1] - max_duration)
|
||||||
|
end_times[0] = start_times[1] = boundary
|
||||||
start_times[0] = max(0, end_times[0] - max_duration)
|
start_times[0] = max(0, end_times[0] - max_duration)
|
||||||
|
|
||||||
return [
|
return [
|
||||||
|
|||||||
Reference in New Issue
Block a user