Add V3 Support (#578)

* Add V3 Support

* Update conversion example

---------

Co-authored-by: oscaarjs <oscar.johansson@conversy.se>
This commit is contained in:
Oscaarjs
2023-11-24 23:16:12 +01:00
committed by GitHub
parent 5a0541ea7d
commit 3084409633
5 changed files with 48 additions and 26 deletions

View File

@@ -108,7 +108,7 @@ class Tokenizer:
def split_to_word_tokens(
self, tokens: List[int]
) -> Tuple[List[str], List[List[int]]]:
if self.language_code in {"zh", "ja", "th", "lo", "my"}:
if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
# These languages don't typically use spaces, so it is difficult to split words
# without morpheme analysis. Here, we instead split words at any
# position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ _LANGUAGE_CODES = (
"yi",
"yo",
"zh",
"yue",
)