Prevent out-of-range error in method split_tokens_on_unicode

This commit is contained in:
Guillaume Klein
2023-04-04 10:17:56 +02:00
parent 2f266eb844
commit 36160c1e7e

View File

@@ -135,6 +135,9 @@ class Tokenizer:
current_tokens = []
unicode_offset += len(decoded)
if unicode_offset >= len(decoded_full):
break
return words, word_tokens
def split_tokens_on_spaces(