From a5d03e55fa4f977ed58757055a05f9ab0d5e749f Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Tue, 4 Apr 2023 10:51:14 +0200 Subject: [PATCH] Prevent out of range error in method split_tokens_on_unicode (#111) --- faster_whisper/tokenizer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index efe22a3..b040044 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -125,10 +125,15 @@ class Tokenizer: current_tokens.append(token) decoded = self.decode_with_timestamps(current_tokens) - if ( - replacement_char not in decoded - or decoded_full[unicode_offset + decoded.index(replacement_char)] - == replacement_char + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char ): words.append(decoded) word_tokens.append(current_tokens)