Prevent out-of-range error in method split_tokens_on_unicode
@@ -135,6 +135,9 @@ class Tokenizer:
                 current_tokens = []
                 unicode_offset += len(decoded)
 
+            if unicode_offset >= len(decoded_full):
+                break
+
         return words, word_tokens
 
     def split_tokens_on_spaces(
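For context, a minimal sketch of how the added guard fits into the surrounding method, assuming the implementation resembles openai/whisper's Tokenizer.split_tokens_on_unicode; the decode_with_timestamps helper and the overall loop structure are assumptions drawn from that codebase, not from this diff, and the sketch is written as a free function for self-containment:

from typing import List, Tuple

REPLACEMENT_CHAR = "\ufffd"  # appears while a multi-byte character is still incomplete

def split_tokens_on_unicode(
    tokenizer, tokens: List[int]
) -> Tuple[List[str], List[List[int]]]:
    # Assumed structure, modeled on openai/whisper's Tokenizer.
    decoded_full = tokenizer.decode_with_timestamps(tokens)

    words: List[str] = []
    word_tokens: List[List[int]] = []
    current_tokens: List[int] = []
    unicode_offset = 0

    for token in tokens:
        current_tokens.append(token)
        decoded = tokenizer.decode_with_timestamps(current_tokens)

        # A word boundary is reached once the accumulated tokens decode
        # cleanly, i.e. without a dangling replacement character.
        if (
            REPLACEMENT_CHAR not in decoded
            or decoded_full[unicode_offset + decoded.index(REPLACEMENT_CHAR)]
            == REPLACEMENT_CHAR
        ):
            words.append(decoded)
            word_tokens.append(current_tokens)
            current_tokens = []
            unicode_offset += len(decoded)

        # The fix from this commit: stop once the offset reaches the end of
        # decoded_full, so the indexing expression above cannot run past it
        # on a later iteration.
        if unicode_offset >= len(decoded_full):
            break

    return words, word_tokens

Without the guard, a mismatch between the accumulated offset and the length of decoded_full (for example, from unusual token sequences) lets unicode_offset + decoded.index(REPLACEMENT_CHAR) index beyond the string and raise an IndexError; breaking early trades the remaining tokens for a crash-free result.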