Prevent out of range error in method split_tokens_on_unicode (#111)
This commit is contained in:
@@ -125,10 +125,15 @@ class Tokenizer:
 current_tokens.append(token)
 decoded = self.decode_with_timestamps(current_tokens)
 
-if (
-    replacement_char not in decoded
-    or decoded_full[unicode_offset + decoded.index(replacement_char)]
-    == replacement_char
+try:
+    replacement_char_index = decoded.index(replacement_char)
+    replacement_char_index += unicode_offset
+except ValueError:
+    replacement_char_index = None
+
+if replacement_char_index is None or (
+    replacement_char_index < len(decoded_full)
+    and decoded_full[replacement_char_index] == replacement_char
 ):
     words.append(decoded)
     word_tokens.append(current_tokens)
Reference in New Issue
Block a user