Prevent out-of-range error in method split_tokens_on_unicode (#111)

Author:    Guillaume Klein
Date:      2023-04-04 10:51:14 +02:00
Committer: GitHub
Parent:    9fa1989073
Commit:    a5d03e55fa


@@ -125,10 +125,15 @@ class Tokenizer:
             current_tokens.append(token)
             decoded = self.decode_with_timestamps(current_tokens)
-            if (
-                replacement_char not in decoded
-                or decoded_full[unicode_offset + decoded.index(replacement_char)]
-                == replacement_char
+
+            try:
+                replacement_char_index = decoded.index(replacement_char)
+                replacement_char_index += unicode_offset
+            except ValueError:
+                replacement_char_index = None
+
+            if replacement_char_index is None or (
+                replacement_char_index < len(decoded_full)
+                and decoded_full[replacement_char_index] == replacement_char
             ):
                 words.append(decoded)
                 word_tokens.append(current_tokens)
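
For context, a minimal standalone sketch of the guard this commit introduces. The helper name is_word_boundary and its parameters are hypothetical, for illustration only; the real logic lives inline in Tokenizer.split_tokens_on_unicode. The idea: resolve the replacement character's position once with try/except, then bounds-check it before indexing decoded_full, which is what prevents the out-of-range access named in the commit title.

# Hypothetical sketch of the commit's guarded-lookup pattern;
# not the actual faster-whisper API.
REPLACEMENT_CHAR = "\ufffd"  # U+FFFD, produced when a multi-byte character is split


def is_word_boundary(decoded: str, decoded_full: str, unicode_offset: int) -> bool:
    """Return True when the partial decode is safe to split into a word.

    A replacement character in the partial decode only signals an
    incomplete multi-byte sequence if the fully decoded text does not
    have a real U+FFFD at the same position.
    """
    try:
        # New code: resolve the index once, instead of calling
        # decoded.index() inside the condition as the old code did.
        replacement_char_index = decoded.index(REPLACEMENT_CHAR) + unicode_offset
    except ValueError:
        return True  # no replacement character at all: safe to split

    # The bounds check added by this commit: the computed index can land
    # past the end of decoded_full, which made the old expression
    # decoded_full[unicode_offset + decoded.index(replacement_char)]
    # raise IndexError.
    return (
        replacement_char_index < len(decoded_full)
        and decoded_full[replacement_char_index] == REPLACEMENT_CHAR
    )


print(is_word_boundary("ab", "abc", unicode_offset=0))      # True: no U+FFFD
print(is_word_boundary("a\ufffd", "ab", unicode_offset=0))  # False: mid-character
print(is_word_boundary("a\ufffd", "a", unicode_offset=5))   # False, and no IndexError

The last call is the case the commit fixes: the resolved index (6) exceeds len(decoded_full) (1), so the new code returns False where the old expression would have raised IndexError.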