Prevent out-of-range error in method split_tokens_on_unicode (#111)

This commit is contained in:
Guillaume Klein
2023-04-04 10:51:14 +02:00
committed by GitHub
parent 9fa1989073
commit a5d03e55fa

View File

@@ -125,10 +125,15 @@ class Tokenizer:
             current_tokens.append(token)
             decoded = self.decode_with_timestamps(current_tokens)
-            if (
-                replacement_char not in decoded
-                or decoded_full[unicode_offset + decoded.index(replacement_char)]
-                == replacement_char
+            try:
+                replacement_char_index = decoded.index(replacement_char)
+                replacement_char_index += unicode_offset
+            except ValueError:
+                replacement_char_index = None
+
+            if replacement_char_index is None or (
+                replacement_char_index < len(decoded_full)
+                and decoded_full[replacement_char_index] == replacement_char
             ):
                 words.append(decoded)
                 word_tokens.append(current_tokens)