Use tiktoken (#1044)

* use tiktoken==0.3.0
* formatting
* tuple should be safer
* Update whisper/tokenizer.py

  Co-authored-by: Ruhollah Majdoddin <r.majdodin@gmail.com>
* use tiktoken 0.3.1
* reflecting suggestions
* cleanup
* bypassing load_tiktoken_bpe to avoid blobfile dep

---------

Co-authored-by: Ruhollah Majdoddin <r.majdodin@gmail.com>
2023-03-13 05:34:16 -04:00
parent ad3250a846
commit 839639a223
15 changed files with 100601 additions and 100096 deletions
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -4,6 +4,7 @@ import pytest
 import torch

 import whisper
+from whisper.tokenizer import get_tokenizer


@pytest.mark.parametrize("model_name", whisper.available_models())
@@ -24,6 +25,11 @@ def test_transcribe(model_name: str):
    assert "your country" in transcription
    assert "do for you" in transcription

+    tokenizer = get_tokenizer(model.is_multilingual)
+    all_tokens = [t for s in result["segments"] for t in s["tokens"]]
+    assert tokenizer.decode(all_tokens) == result["text"]
+    assert tokenizer.decode_with_timestamps(all_tokens).startswith("<|0.00|>")
+
    timing_checked = False
    for segment in result["segments"]:
        for timing in segment["words"]:
@@ -31,7 +37,6 @@ def test_transcribe(model_name: str):
            if timing["word"].strip(" ,") == "Americans":
                assert timing["start"] <= 1.8
                assert timing["end"] >= 1.8
-                print(timing)
                timing_checked = True

    assert timing_checked