fix all_tokens handling that caused more repetitions and discrepancy in JSON (#1060)

This commit is contained in:
Jong Wook Kim
2023-03-08 18:34:07 -05:00
committed by GitHub
parent aac47c9834
commit 38f2f4d99d
3 changed files with 14 additions and 11 deletions

View File

@@ -290,7 +290,7 @@ def add_word_timestamps(
if len(segments) == 0:
return
-    text_tokens = [t for segment in segments for t in segment["tokens"]]
+    text_tokens = [t for s in segments for t in s["tokens"] if t < tokenizer.eot]
alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs)
merge_punctuations(alignment, prepend_punctuations, append_punctuations)