Add V3 Support (#578)

* Add V3 Support
* update conversion example

---------

Co-authored-by: oscaarjs <oscar.johansson@conversy.se>
2023-11-24 23:16:12 +01:00
parent 5a0541ea7d
commit 3084409633
5 changed files with 48 additions and 26 deletions
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -108,7 +108,7 @@ class Tokenizer:
    def split_to_word_tokens(
        self, tokens: List[int]
    ) -> Tuple[List[str], List[List[int]]]:
-        if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
            # These languages don't typically use spaces, so it is difficult to split words
            # without morpheme analysis. Here, we instead split words at any
            # position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ _LANGUAGE_CODES = (
    "yi",
    "yo",
    "zh",
+    "yue",
 )