From 19329a361150bab0596c9f35486a57c94ae6f78e Mon Sep 17 00:00:00 2001 From: trungkienbkhn Date: Wed, 13 Dec 2023 18:38:44 +0700 Subject: [PATCH] Word timing tweaks (#616) --- faster_whisper/transcribe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index e0525b9..c082546 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -908,6 +908,13 @@ class WhisperModel: words, word_tokens = tokenizer.split_to_word_tokens( text_tokens + [tokenizer.eot] ) + if len(word_tokens) <= 1: + # return on eot only + # >>> np.pad([], (1, 0)) + # array([0.]) + # This results in crashes when we lookup jump_times with float, like + # IndexError: arrays used as indices must be of integer (or boolean) type + return [] word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)) if len(word_boundaries) <= 1: return []