From 4a5ba38f5e359e53688cdec0741b95d5b2cebb20 Mon Sep 17 00:00:00 2001
From: heimoshuiyu
Date: Fri, 15 Nov 2024 01:04:49 +0800
Subject: [PATCH] upgrade with faster-whisper upstream

---
 README.md                |  8 ++--
 requirements.txt         |  2 +-
 requirements_version.txt | 26 +++++------
 whisper_fastapi.py       | 96 ++++++++++++++++++----------------------
 4 files changed, 61 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index 4639837..0c9543c 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
 ## Features
 
 - **Translation and Transcription**: The application provides an API for the konele service, where translations and transcriptions can be obtained by connecting over websockets or POST requests.
-- **Language Support**: If the target language is English, then the application will translate any source language to English.
+- **Language Support**: If no language is specified, the language will be automatically detected from the first 30 seconds of audio.
 - **Websocket and POST Method Support**: The project supports a websocket (`/konele/ws`) and a POST method to `/konele/post`.
 - **Audio Transcriptions**: The `/v1/audio/transcriptions` endpoint allows users to upload an audio file and receive a transcription in response, with an optional `response_type` parameter. The `response_type` can be 'json', 'text', 'tsv', 'srt', or 'vtt'.
 - **Simplified Chinese**: Traditional Chinese will be automatically converted to Simplified Chinese for konele using the `opencc` library.
@@ -16,10 +16,10 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
 
 For konele voice typing, you can use either the websocket endpoint or the POST method endpoint.
 
-- **Websocket**: Connect to the websocket at `/konele/ws` and send audio data. The server will respond with the transcription or translation.
-- **POST Method**: Send a POST request to `/konele/post` with the audio data in the body. The server will respond with the transcription or translation.
+- **Websocket**: Connect to the websocket at `/konele/ws` (or `/v1/konele/ws`) and send audio data. The server will respond with the transcription or translation.
+- **POST Method**: Send a POST request to `/konele/post` (or `/v1/konele/post`) with the audio data in the body. The server will respond with the transcription or translation, as sketched below.
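+
+For a quick smoke test of the POST endpoint, here is a minimal Python sketch (not part of this repository). It assumes the server listens on `localhost:8000`, that the `requests` package is installed, and that `speech.wav` holds audio in a format the server accepts:
+
+```python
+# Hypothetical smoke test for the konele POST endpoint; adjust host, port,
+# and audio format to match your deployment.
+import requests
+
+with open("speech.wav", "rb") as f:
+    resp = requests.post("http://localhost:8000/konele/post", data=f.read())
+resp.raise_for_status()
+body = resp.json()
+# Response shape returned by whisper_fastapi.py:
+# {"status": 0, "hypotheses": [{"utterance": "..."}], "id": "..."}
+print(body["hypotheses"][0]["utterance"])
+```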
-You can also use the demo I have created to quickly test the effect at and
+You can also use the demo I have created to quickly test the effect at and
 
 ### OpenAI Whisper Service
 
diff --git a/requirements.txt b/requirements.txt
index 8719b01..e5cd4a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,5 @@ uvicorn[standard]
 whisper_ctranslate2
 opencc
 prometheus-fastapi-instrumentator
-git+https://github.com/heimoshuiyu/faster-whisper@prompt
+git+https://github.com/SYSTRAN/faster-whisper@3e0ba86571b9fe93bab2a25b3ff5af1be41014ec
 pydub
diff --git a/requirements_version.txt b/requirements_version.txt
index 4403b05..08263d2 100644
--- a/requirements_version.txt
+++ b/requirements_version.txt
@@ -1,6 +1,6 @@
 annotated-types==0.7.0
 anyio==4.6.2.post1
-av==12.3.0
+av==13.1.0
 certifi==2024.8.30
 cffi==1.17.1
 charset-normalizer==3.4.0
@@ -8,21 +8,21 @@ click==8.1.7
 coloredlogs==15.0.1
 ctranslate2==4.5.0
 exceptiongroup==1.2.2
-fastapi==0.115.3
-faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@8563f889cb23c2f8d64568d7a4a1c7beea28618b
+fastapi==0.115.5
+faster-whisper @ git+https://github.com/SYSTRAN/faster-whisper@3e0ba86571b9fe93bab2a25b3ff5af1be41014ec
 filelock==3.16.1
 flatbuffers==24.3.25
 fsspec==2024.10.0
 h11==0.14.0
 httptools==0.6.4
-huggingface-hub==0.26.1
+huggingface-hub==0.26.2
 humanfriendly==10.0
 idna==3.10
 mpmath==1.3.0
-numpy==2.1.2
-onnxruntime==1.19.2
+numpy==2.1.3
+onnxruntime==1.20.0
 OpenCC==1.1.9
-packaging==24.1
+packaging==24.2
 prometheus-fastapi-instrumentator==7.0.0
 prometheus_client==0.21.0
 protobuf==5.28.3
@@ -31,19 +31,19 @@ pydantic==2.9.2
 pydantic_core==2.23.4
 pydub==0.25.1
 python-dotenv==1.0.1
-python-multipart==0.0.12
+python-multipart==0.0.17
 PyYAML==6.0.2
 requests==2.32.3
 sniffio==1.3.1
 sounddevice==0.5.1
-starlette==0.41.0
+starlette==0.41.2
 sympy==1.13.3
-tokenizers==0.20.1
-tqdm==4.66.5
+tokenizers==0.20.3
+tqdm==4.67.0
 typing_extensions==4.12.2
 urllib3==2.2.3
 uvicorn==0.32.0
 uvloop==0.21.0
 watchfiles==0.24.0
-websockets==13.1
-whisper-ctranslate2==0.4.6
+websockets==14.1
+whisper-ctranslate2==0.4.7
diff --git a/whisper_fastapi.py b/whisper_fastapi.py
index 77786e3..3cdc336 100644
--- a/whisper_fastapi.py
+++ b/whisper_fastapi.py
@@ -1,3 +1,5 @@
+import dataclasses
+import faster_whisper
 import tqdm
 import json
 from fastapi.responses import StreamingResponse
@@ -7,7 +9,7 @@ import io
 import hashlib
 import argparse
 import uvicorn
-from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable
+from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable, Union
 from fastapi import (
     File,
     HTTPException,
@@ -40,16 +42,13 @@ Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
 ccc = opencc.OpenCC("t2s.json")
 
 print(f"Loading model to device {args.device}...")
-transcriber = Transcribe(
-    model_path=args.model,
+model = faster_whisper.WhisperModel(
+    model_size_or_path=args.model,
     device=args.device,
-    device_index=0,
-    compute_type="default",
-    threads=args.threads,
-    cache_directory=args.cache_dir,
+    cpu_threads=args.threads,
     local_files_only=args.local_files_only,
 )
-print(f"Model loaded to device {transcriber.model.model.device}")
+print(f"Model loaded to device {model.model.device}")
 
 
 # allow all cors
@@ -62,56 +61,62 @@ app.add_middleware(
 )
 
 
-def stream_writer(generator: Generator[dict[str, Any], Any, None]):
+def stream_writer(generator: Generator[Segment, Any, None]):
     for segment in generator:
-        yield "data: " + json.dumps(segment, ensure_ascii=False) + "\n\n"
+        # Segment is a dataclass upstream now (it is no longer a plain dict),
+        # so convert it before JSON encoding.
+        yield "data: " + json.dumps(dataclasses.asdict(segment), ensure_ascii=False) + "\n\n"
     yield "data: [DONE]\n\n"
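+
+
+# stream_writer frames the output as Server-Sent Events: one "data: <json>"
+# frame per segment, terminated by a "data: [DONE]" frame. A client could
+# consume the stream along these lines (illustrative sketch only; the exact
+# request parameters depend on how the endpoint is called):
+#
+#     import json, requests
+#     with requests.post(url, files={"file": fh}, stream=True) as r:
+#         for line in r.iter_lines(decode_unicode=True):
+#             if line.startswith("data: ") and line != "data: [DONE]":
+#                 print(json.loads(line[len("data: "):])["text"])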
-def text_writer(generator: Generator[dict[str, Any], Any, None]):
+def text_writer(generator: Generator[Segment, Any, None]):
     for segment in generator:
-        yield segment["text"].strip() + "\n"
+        yield segment.text.strip() + "\n"
 
 
-def tsv_writer(generator: Generator[dict[str, Any], Any, None]):
+def tsv_writer(generator: Generator[Segment, Any, None]):
     yield "start\tend\ttext\n"
     for i, segment in enumerate(generator):
-        start_time = str(round(1000 * segment["start"]))
-        end_time = str(round(1000 * segment["end"]))
-        text = segment["text"].strip()
+        start_time = str(round(1000 * segment.start))
+        end_time = str(round(1000 * segment.end))
+        text = segment.text.strip()
         yield f"{start_time}\t{end_time}\t{text}\n"
 
 
-def srt_writer(generator: Generator[dict[str, Any], Any, None]):
+def srt_writer(generator: Generator[Segment, Any, None]):
     for i, segment in enumerate(generator):
         start_time = format_timestamp(
-            segment["start"], decimal_marker=",", always_include_hours=True
+            segment.start, decimal_marker=",", always_include_hours=True
         )
         end_time = format_timestamp(
-            segment["end"], decimal_marker=",", always_include_hours=True
+            segment.end, decimal_marker=",", always_include_hours=True
         )
-        text = segment["text"].strip()
-        yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
+        text = segment.text.strip()
+        # SRT cue numbers are 1-based.
+        yield f"{i + 1}\n{start_time} --> {end_time}\n{text}\n\n"
 
 
-def vtt_writer(generator: Generator[dict[str, Any], Any, None]):
+def vtt_writer(generator: Generator[Segment, Any, None]):
     yield "WEBVTT\n\n"
     for i, segment in enumerate(generator):
-        start_time = format_timestamp(segment["start"])
-        end_time = format_timestamp(segment["end"])
-        text = segment["text"].strip()
+        start_time = format_timestamp(segment.start)
+        end_time = format_timestamp(segment.end)
+        text = segment.text.strip()
         yield f"{start_time} --> {end_time}\n{text}\n\n"
 
 
+@dataclasses.dataclass
+class JsonResult(TranscriptionInfo):
+    segments: list[Segment]
+    text: str
+
+
 def build_json_result(
-    generator: Iterable[dict],
-    info: dict,
-) -> dict[str, Any]:
+    generator: Iterable[Segment],
+    info: TranscriptionInfo,
+) -> JsonResult:
-    segments = [i for i in generator]
+    segments = list(generator)
-    return info | {
-        "text": "\n".join(i["text"] for i in segments),
-        "segments": segments,
-    }
+    return JsonResult(
+        text="\n".join(i.text for i in segments),
+        segments=segments,
+        **dataclasses.asdict(info),
+    )
 
 
 def stream_builder(
@@ -121,8 +126,8 @@ def stream_builder(
     language: str | None,
     initial_prompt: str = "",
     repetition_penalty: float = 1.0,
-) -> Tuple[Generator[dict, None, None], dict]:
-    segments, info = transcriber.model.transcribe(
+) -> Tuple[Generator[Segment, None, None], TranscriptionInfo]:
+    segments, info = model.transcribe(
         audio=audio,
         language=language,
         task=task,
@@ -142,20 +147,9 @@ def stream_builder(
             start, end, text = segment.start, segment.end, segment.text
             pbar.update(end - last_pos)
             last_pos = end
-            data = segment._asdict()
-            if data.get('words') is not None:
-                data["words"] = [i._asdict() for i in data["words"]]
-            if info.language == "zh":
-                data["text"] = ccc.convert(data["text"])
-            yield data
+            if info.language == "zh":
+                # Keep the Traditional-to-Simplified conversion advertised in
+                # the README; Segment is a mutable dataclass now.
+                segment.text = ccc.convert(segment.text)
+            yield segment
 
-    info_dict = info._asdict()
-    if info_dict['transcription_options'] is not None:
-        info_dict['transcription_options'] = info_dict['transcription_options']._asdict()
-    if info_dict['vad_options'] is not None:
-        info_dict['vad_options'] = info_dict['vad_options']._asdict()
-
-    return wrap(), info_dict
+    return wrap(), info
 
 
 @app.websocket("/k6nele/status")
@@ -223,13 +217,11 @@ async def konele_ws(
     )
 
     result = build_json_result(generator, info)
-    text = result.get("text", "")
-
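+    # Frames sent back to the konele client look like:
+    #   {"status": 0, "segment": 0,
+    #    "result": {"hypotheses": [{"transcript": "..."}], "final": True},
+    #    "id": "<md5>"}
+    # A client sketch using the `websockets` package (illustrative only; the
+    # handshake and audio framing must match what this handler expects):
+    #
+    #     import asyncio, json, websockets
+    #     async def demo(ws_url, audio_bytes):
+    #         async with websockets.connect(ws_url) as ws:
+    #             await ws.send(audio_bytes)
+    #             print(json.loads(await ws.recv()))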
     await websocket.send_json(
         {
             "status": 0,
             "segment": 0,
-            "result": {"hypotheses": [{"transcript": text}], "final": True},
+            "result": {"hypotheses": [{"transcript": result.text}], "final": True},
             "id": md5,
         }
     )
@@ -286,17 +278,15 @@ async def translateapi(
     )
 
     result = build_json_result(generator, info)
-    text = result.get("text", "")
-
     return {
         "status": 0,
-        "hypotheses": [{"utterance": text}],
+        "hypotheses": [{"utterance": result.text}],
         "id": md5,
     }
 
 
-@app.post("/v1/audio/transcriptions")
-@app.post("/v1/audio/translations")
+@app.post("/v1/audio/transcriptions", response_model=Union[JsonResult, str])
+@app.post("/v1/audio/translations", response_model=Union[JsonResult, str])
 async def transcription(
     request: Request,
     file: UploadFile = File(...),