From 834f00a0ea7521239df548009567b3a507c9b166 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Thu, 22 Sep 2022 02:45:03 +0900
Subject: [PATCH] making small model the default

---
 README.md             | 10 +++++-----
 whisper/transcribe.py |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index aa7c209..fa98376 100644
--- a/README.md
+++ b/README.md
@@ -37,17 +37,17 @@ choco install ffmpeg
 
 ## Command-line usage
 
-The following command will transcribe speech in audio files
+The following command will transcribe speech in audio files, using the `medium` model:
 
-    whisper audio.flac audio.mp3 audio.wav
+    whisper audio.flac audio.mp3 audio.wav --model medium
 
-The default setting works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
+The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
 
-    whisper ~/japanese.wav --language Japanese
+    whisper japanese.wav --language Japanese
 
 Adding `--task translate` will translate the speech into English:
 
-    whisper ~/japanese.wav --language Japanese --task translate
+    whisper japanese.wav --language Japanese --task translate
 
 Run the following to view all available transcription options:
 
diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index d995248..37d34d6 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -229,13 +229,13 @@ def cli():
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
-    parser.add_argument("--model", default="base", choices=available_models(), help="name of the Whisper model to use")
+    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
     parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
     parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
     parser.add_argument("--verbose", type=str2bool, default=True, help="Whether to print out the progress and debug messages")
 
     parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
-    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted(TO_LANGUAGE_CODE.keys()), help="language spoken in the audio, specify None to perform language detection")
+    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
 
     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
     parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")