Suppress some special tokens when the default set is not used

This commit is contained in:
Guillaume Klein
2023-03-30 12:42:29 +02:00
parent eda840f8ff
commit 39fddba886
2 changed files with 34 additions and 1 deletions

View File

@@ -33,10 +33,22 @@ class Tokenizer:
self.language = None
self.language_code = "en"
@cached_property
def transcribe(self) -> int:
    """Return the id that the wrapped tokenizer assigns to ``<|transcribe|>``."""
    # Resolve the literal special-token string through the underlying tokenizer.
    token_id = self.tokenizer.token_to_id("<|transcribe|>")
    return token_id
@cached_property
def translate(self) -> int:
    """Return the id that the wrapped tokenizer assigns to ``<|translate|>``."""
    # Resolve the literal special-token string through the underlying tokenizer.
    token_id = self.tokenizer.token_to_id("<|translate|>")
    return token_id
@cached_property
def sot(self) -> int:
    """Return the id that the wrapped tokenizer assigns to ``<|startoftranscript|>``."""
    # Resolve the literal special-token string through the underlying tokenizer.
    token_id = self.tokenizer.token_to_id("<|startoftranscript|>")
    return token_id
@cached_property
def sot_lm(self) -> int:
    """Return the id that the wrapped tokenizer assigns to ``<|startoflm|>``."""
    # Resolve the literal special-token string through the underlying tokenizer.
    token_id = self.tokenizer.token_to_id("<|startoflm|>")
    return token_id
@cached_property
def sot_prev(self) -> int:
    """Return the id that the wrapped tokenizer assigns to ``<|startofprev|>``."""
    # Resolve the literal special-token string through the underlying tokenizer.
    token_id = self.tokenizer.token_to_id("<|startofprev|>")
    return token_id