Use tiktoken (#1044)
* use tiktoken==0.3.0
* formatting
* tuple should be safer
* Update whisper/tokenizer.py

  Co-authored-by: Ruhollah Majdoddin <r.majdodin@gmail.com>
* use tiktoken 0.3.1
* reflecting suggestions
* cleanup
* bypassing load_tiktoken_bpe to avoid blobfile dep

---------

Co-authored-by: Ruhollah Majdoddin <r.majdodin@gmail.com>
This commit is contained in:
@@ -4,6 +4,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
import whisper
|
||||
from whisper.tokenizer import get_tokenizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", whisper.available_models())
|
||||
@@ -24,6 +25,11 @@ def test_transcribe(model_name: str):
|
||||
assert "your country" in transcription
|
||||
assert "do for you" in transcription
|
||||
|
||||
tokenizer = get_tokenizer(model.is_multilingual)
|
||||
all_tokens = [t for s in result["segments"] for t in s["tokens"]]
|
||||
assert tokenizer.decode(all_tokens) == result["text"]
|
||||
assert tokenizer.decode_with_timestamps(all_tokens).startswith("<|0.00|>")
|
||||
|
||||
timing_checked = False
|
||||
for segment in result["segments"]:
|
||||
for timing in segment["words"]:
|
||||
@@ -31,7 +37,6 @@ def test_transcribe(model_name: str):
|
||||
if timing["word"].strip(" ,") == "Americans":
|
||||
assert timing["start"] <= 1.8
|
||||
assert timing["end"] >= 1.8
|
||||
print(timing)
|
||||
timing_checked = True
|
||||
|
||||
assert timing_checked
|
||||
|
||||
Reference in New Issue
Block a user