Prevent out-of-range error in method split_tokens_on_unicode
@@ -135,6 +135,9 @@ class Tokenizer:
                 current_tokens = []
                 unicode_offset += len(decoded)
 
+                if unicode_offset >= len(decoded_full):
+                    break
+
         return words, word_tokens
 
     def split_tokens_on_spaces(
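The guard addresses an IndexError in the word-splitting loop: the loop indexes decoded_full at unicode_offset plus the position of a replacement character, and once unicode_offset has advanced to (or past) the end of decoded_full, that index can run off the end of the string. Below is a minimal, self-contained sketch of the loop with the guard in place. It is modeled on the upstream openai/whisper tokenizer rather than the exact file contents, and the standalone decode parameter (standing in for the tokenizer's own decode call) is an assumption for illustration.

# A minimal sketch of the guarded loop, assuming the method works like the
# upstream openai/whisper tokenizer: it decodes a growing token prefix and
# treats U+FFFD in the partial decoding as a dangling, incomplete UTF-8
# sequence. The `decode` parameter is a hypothetical stand-in for the
# tokenizer's own decode call.
from typing import Callable, List, Tuple


def split_tokens_on_unicode(
    tokens: List[int],
    decode: Callable[[List[int]], str],
) -> Tuple[List[str], List[List[int]]]:
    decoded_full = decode(tokens)
    replacement_char = "\ufffd"  # U+FFFD marks an incomplete byte sequence

    words: List[str] = []
    word_tokens: List[List[int]] = []
    current_tokens: List[int] = []
    unicode_offset = 0

    for token in tokens:
        current_tokens.append(token)
        decoded = decode(current_tokens)

        # Flush a word when the partial decoding has no dangling U+FFFD,
        # or when the U+FFFD is genuinely present at the same position
        # in the full decoding.
        if (
            replacement_char not in decoded
            or decoded_full[unicode_offset + decoded.index(replacement_char)]
            == replacement_char
        ):
            words.append(decoded)
            word_tokens.append(current_tokens)
            current_tokens = []
            unicode_offset += len(decoded)

            # The added guard: once the offset has consumed all of
            # decoded_full, the index expression above could run past
            # the end of the string on a later iteration, so stop early.
            if unicode_offset >= len(decoded_full):
                break

    return words, word_tokens

Breaking out of the loop, rather than clamping the index, keeps the words and word_tokens collected so far intact and simply stops consuming tokens that can no longer map to any remaining text.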