Prevent out-of-range error in method split_tokens_on_unicode
@@ -135,6 +135,9 @@ class Tokenizer:
                 current_tokens = []
                 unicode_offset += len(decoded)
 
+                if unicode_offset >= len(decoded_full):
+                    break
+
         return words, word_tokens
 
     def split_tokens_on_spaces(
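The guard addresses an IndexError in the word-splitting loop: the loop indexes decoded_full at unicode_offset plus the position of a replacement character, and once unicode_offset has advanced to (or past) the end of decoded_full, that index can run off the end of the string. Below is a minimal, self-contained sketch of the loop with the guard in place. It is modeled on the upstream openai/whisper tokenizer rather than the exact file contents, and the standalone decode parameter (standing in for the tokenizer's own decode call) is an assumption for illustration.

# A minimal sketch of the guarded loop, assuming the method works like the
# upstream openai/whisper tokenizer: it decodes a growing token prefix and
# treats U+FFFD in the partial decoding as a dangling, incomplete UTF-8
# sequence. The `decode` parameter is a hypothetical stand-in for the
# tokenizer's own decode call.
from typing import Callable, List, Tuple


def split_tokens_on_unicode(
    tokens: List[int],
    decode: Callable[[List[int]], str],
) -> Tuple[List[str], List[List[int]]]:
    decoded_full = decode(tokens)
    replacement_char = "\ufffd"  # U+FFFD marks an incomplete byte sequence

    words: List[str] = []
    word_tokens: List[List[int]] = []
    current_tokens: List[int] = []
    unicode_offset = 0

    for token in tokens:
        current_tokens.append(token)
        decoded = decode(current_tokens)

        # Flush a word when the partial decoding has no dangling U+FFFD,
        # or when the U+FFFD is genuinely present at the same position
        # in the full decoding.
        if (
            replacement_char not in decoded
            or decoded_full[unicode_offset + decoded.index(replacement_char)]
            == replacement_char
        ):
            words.append(decoded)
            word_tokens.append(current_tokens)
            current_tokens = []
            unicode_offset += len(decoded)

            # The added guard: once the offset has consumed all of
            # decoded_full, the index expression above could run past
            # the end of the string on a later iteration, so stop early.
            if unicode_offset >= len(decoded_full):
                break

    return words, word_tokens

Breaking out of the loop, rather than clamping the index, keeps the words and word_tokens collected so far intact and simply stops consuming tokens that can no longer map to any remaining text.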