From 4a5ba38f5e359e53688cdec0741b95d5b2cebb20 Mon Sep 17 00:00:00 2001
From: heimoshuiyu
Date: Fri, 15 Nov 2024 01:04:49 +0800
Subject: [PATCH] upgrade with faster-whisper upstream

---
 README.md                |  8 ++--
 requirements.txt         |  2 +-
 requirements_version.txt | 26 +++++------
 whisper_fastapi.py       | 96 ++++++++++++++++++----------------------
 4 files changed, 61 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index 4639837..0c9543c 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
 ## Features
 
 - **Translation and Transcription**: The application provides an API for the konele service, where translations and transcriptions can be obtained by connecting over websockets or POST requests.
-- **Language Support**: If the target language is English, then the application will translate any source language to English.
+- **Language Support**: If no language is specified, the language will be automatically detected from the first 30 seconds of audio.
 - **Websocket and POST Method Support**: The project supports a websocket (`/konele/ws`) and a POST method to `/konele/post`.
 - **Audio Transcriptions**: The `/v1/audio/transcriptions` endpoint allows users to upload an audio file and receive a transcription in response, with an optional `response_type` parameter. The `response_type` can be 'json', 'text', 'tsv', 'srt', or 'vtt'.
 - **Simplified Chinese**: Traditional Chinese will be automatically converted to Simplified Chinese for konele using the `opencc` library.
@@ -16,10 +16,10 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
 
 For konele voice typing, you can use either the websocket endpoint or the POST method endpoint.
 
-- **Websocket**: Connect to the websocket at `/konele/ws` and send audio data. The server will respond with the transcription or translation.
-- **POST Method**: Send a POST request to `/konele/post` with the audio data in the body. The server will respond with the transcription or translation.
+- **Websocket**: Connect to the websocket at `/konele/ws` (or `/v1/konele/ws`) and send audio data. The server will respond with the transcription or translation.
+- **POST Method**: Send a POST request to `/konele/post` (or `/v1/konele/post`) with the audio data in the body. The server will respond with the transcription or translation, as sketched below.
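+
+For a quick smoke test of the POST endpoint, here is a minimal Python sketch (not part of this repository). It assumes the server listens on `localhost:8000`, that the `requests` package is installed, and that `speech.wav` holds audio in a format the server accepts:
+
+```python
+# Hypothetical smoke test for the konele POST endpoint; adjust host, port,
+# and audio format to match your deployment.
+import requests
+
+with open("speech.wav", "rb") as f:
+    resp = requests.post("http://localhost:8000/konele/post", data=f.read())
+resp.raise_for_status()
+body = resp.json()
+# Response shape returned by whisper_fastapi.py:
+# {"status": 0, "hypotheses": [{"utterance": "..."}], "id": "..."}
+print(body["hypotheses"][0]["utterance"])
+```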
-You can also use the demo I have created to quickly test the effect at and
+You can also use the demo I have created to quickly test the effect at and
 
 ### OpenAI Whisper Service
 
diff --git a/requirements.txt b/requirements.txt
index 8719b01..e5cd4a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,5 @@ uvicorn[standard]
 whisper_ctranslate2
 opencc
 prometheus-fastapi-instrumentator
-git+https://github.com/heimoshuiyu/faster-whisper@prompt
+git+https://github.com/SYSTRAN/faster-whisper@3e0ba86571b9fe93bab2a25b3ff5af1be41014ec
 pydub
diff --git a/requirements_version.txt b/requirements_version.txt
index 4403b05..08263d2 100644
--- a/requirements_version.txt
+++ b/requirements_version.txt
@@ -1,6 +1,6 @@
 annotated-types==0.7.0
 anyio==4.6.2.post1
-av==12.3.0
+av==13.1.0
 certifi==2024.8.30
 cffi==1.17.1
 charset-normalizer==3.4.0
@@ -8,21 +8,21 @@ click==8.1.7
 coloredlogs==15.0.1
 ctranslate2==4.5.0
 exceptiongroup==1.2.2
-fastapi==0.115.3
-faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@8563f889cb23c2f8d64568d7a4a1c7beea28618b
+fastapi==0.115.5
+faster-whisper @ git+https://github.com/SYSTRAN/faster-whisper@3e0ba86571b9fe93bab2a25b3ff5af1be41014ec
 filelock==3.16.1
 flatbuffers==24.3.25
 fsspec==2024.10.0
 h11==0.14.0
 httptools==0.6.4
-huggingface-hub==0.26.1
+huggingface-hub==0.26.2
 humanfriendly==10.0
 idna==3.10
 mpmath==1.3.0
-numpy==2.1.2
-onnxruntime==1.19.2
+numpy==2.1.3
+onnxruntime==1.20.0
 OpenCC==1.1.9
-packaging==24.1
+packaging==24.2
 prometheus-fastapi-instrumentator==7.0.0
 prometheus_client==0.21.0
 protobuf==5.28.3
@@ -31,19 +31,19 @@ pydantic==2.9.2
 pydantic_core==2.23.4
 pydub==0.25.1
 python-dotenv==1.0.1
-python-multipart==0.0.12
+python-multipart==0.0.17
 PyYAML==6.0.2
 requests==2.32.3
 sniffio==1.3.1
 sounddevice==0.5.1
-starlette==0.41.0
+starlette==0.41.2
 sympy==1.13.3
-tokenizers==0.20.1
-tqdm==4.66.5
+tokenizers==0.20.3
+tqdm==4.67.0
 typing_extensions==4.12.2
 urllib3==2.2.3
 uvicorn==0.32.0
 uvloop==0.21.0
 watchfiles==0.24.0
-websockets==13.1
-whisper-ctranslate2==0.4.6
+websockets==14.1
+whisper-ctranslate2==0.4.7
diff --git a/whisper_fastapi.py b/whisper_fastapi.py
index 77786e3..3cdc336 100644
--- a/whisper_fastapi.py
+++ b/whisper_fastapi.py
@@ -1,3 +1,5 @@
+import dataclasses
+import faster_whisper
 import tqdm
 import json
 from fastapi.responses import StreamingResponse
@@ -7,7 +9,7 @@ import io
 import hashlib
 import argparse
 import uvicorn
-from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable
+from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable, Union
 from fastapi import (
     File,
     HTTPException,
@@ -40,16 +42,13 @@ Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
 ccc = opencc.OpenCC("t2s.json")
 
 print(f"Loading model to device {args.device}...")
-transcriber = Transcribe(
-    model_path=args.model,
+model = faster_whisper.WhisperModel(
+    model_size_or_path=args.model,
     device=args.device,
-    device_index=0,
-    compute_type="default",
-    threads=args.threads,
-    cache_directory=args.cache_dir,
+    cpu_threads=args.threads,
     local_files_only=args.local_files_only,
 )
-print(f"Model loaded to device {transcriber.model.model.device}")
+print(f"Model loaded to device {model.model.device}")
 
 
 # allow all cors
@@ -62,56 +61,62 @@ app.add_middleware(
 )
 
 
-def stream_writer(generator: Generator[dict[str, Any], Any, None]):
+def stream_writer(generator: Generator[Segment, Any, None]):
     for segment in generator:
-        yield "data: " + json.dumps(segment, ensure_ascii=False) + "\n\n"
+        # Segment is a dataclass upstream now (it is no longer a plain dict),
+        # so convert it before JSON encoding.
+        yield "data: " + json.dumps(dataclasses.asdict(segment), ensure_ascii=False) + "\n\n"
     yield "data: [DONE]\n\n"
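+
+
+# stream_writer frames the output as Server-Sent Events: one "data: <json>"
+# frame per segment, terminated by a "data: [DONE]" frame. A client could
+# consume the stream along these lines (illustrative sketch only; the exact
+# request parameters depend on how the endpoint is called):
+#
+#     import json, requests
+#     with requests.post(url, files={"file": fh}, stream=True) as r:
+#         for line in r.iter_lines(decode_unicode=True):
+#             if line.startswith("data: ") and line != "data: [DONE]":
+#                 print(json.loads(line[len("data: "):])["text"])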
-def text_writer(generator: Generator[dict[str, Any], Any, None]):
+def text_writer(generator: Generator[Segment, Any, None]):
     for segment in generator:
-        yield segment["text"].strip() + "\n"
+        yield segment.text.strip() + "\n"
 
 
-def tsv_writer(generator: Generator[dict[str, Any], Any, None]):
+def tsv_writer(generator: Generator[Segment, Any, None]):
     yield "start\tend\ttext\n"
     for i, segment in enumerate(generator):
-        start_time = str(round(1000 * segment["start"]))
-        end_time = str(round(1000 * segment["end"]))
-        text = segment["text"].strip()
+        start_time = str(round(1000 * segment.start))
+        end_time = str(round(1000 * segment.end))
+        text = segment.text.strip()
         yield f"{start_time}\t{end_time}\t{text}\n"
 
 
-def srt_writer(generator: Generator[dict[str, Any], Any, None]):
+def srt_writer(generator: Generator[Segment, Any, None]):
     for i, segment in enumerate(generator):
         start_time = format_timestamp(
-            segment["start"], decimal_marker=",", always_include_hours=True
+            segment.start, decimal_marker=",", always_include_hours=True
         )
         end_time = format_timestamp(
-            segment["end"], decimal_marker=",", always_include_hours=True
+            segment.end, decimal_marker=",", always_include_hours=True
         )
-        text = segment["text"].strip()
-        yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
+        text = segment.text.strip()
+        # SRT cue numbers are 1-based.
+        yield f"{i + 1}\n{start_time} --> {end_time}\n{text}\n\n"
 
 
-def vtt_writer(generator: Generator[dict[str, Any], Any, None]):
+def vtt_writer(generator: Generator[Segment, Any, None]):
     yield "WEBVTT\n\n"
     for i, segment in enumerate(generator):
-        start_time = format_timestamp(segment["start"])
-        end_time = format_timestamp(segment["end"])
-        text = segment["text"].strip()
+        start_time = format_timestamp(segment.start)
+        end_time = format_timestamp(segment.end)
+        text = segment.text.strip()
         yield f"{start_time} --> {end_time}\n{text}\n\n"
 
 
+@dataclasses.dataclass
+class JsonResult(TranscriptionInfo):
+    segments: list[Segment]
+    text: str
+
+
 def build_json_result(
-    generator: Iterable[dict],
-    info: dict,
-) -> dict[str, Any]:
+    generator: Iterable[Segment],
+    info: TranscriptionInfo,
+) -> JsonResult:
-    segments = [i for i in generator]
+    segments = list(generator)
-    return info | {
-        "text": "\n".join(i["text"] for i in segments),
-        "segments": segments,
-    }
+    return JsonResult(
+        text="\n".join(i.text for i in segments),
+        segments=segments,
+        **dataclasses.asdict(info),
+    )
 
 
 def stream_builder(
@@ -121,8 +126,8 @@ def stream_builder(
     language: str | None,
     initial_prompt: str = "",
     repetition_penalty: float = 1.0,
-) -> Tuple[Generator[dict, None, None], dict]:
-    segments, info = transcriber.model.transcribe(
+) -> Tuple[Generator[Segment, None, None], TranscriptionInfo]:
+    segments, info = model.transcribe(
         audio=audio,
         language=language,
         task=task,
@@ -142,20 +147,9 @@ def stream_builder(
             start, end, text = segment.start, segment.end, segment.text
             pbar.update(end - last_pos)
             last_pos = end
-            data = segment._asdict()
-            if data.get('words') is not None:
-                data["words"] = [i._asdict() for i in data["words"]]
-            if info.language == "zh":
-                data["text"] = ccc.convert(data["text"])
-            yield data
+            if info.language == "zh":
+                # Keep the Traditional-to-Simplified conversion advertised in
+                # the README; Segment is a mutable dataclass now.
+                segment.text = ccc.convert(segment.text)
+            yield segment
 
-    info_dict = info._asdict()
-    if info_dict['transcription_options'] is not None:
-        info_dict['transcription_options'] = info_dict['transcription_options']._asdict()
-    if info_dict['vad_options'] is not None:
-        info_dict['vad_options'] = info_dict['vad_options']._asdict()
-
-    return wrap(), info_dict
+    return wrap(), info
 
 
 @app.websocket("/k6nele/status")
@@ -223,13 +217,11 @@ async def konele_ws(
     )
 
     result = build_json_result(generator, info)
-    text = result.get("text", "")
-
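+    # Frames sent back to the konele client look like:
+    #   {"status": 0, "segment": 0,
+    #    "result": {"hypotheses": [{"transcript": "..."}], "final": True},
+    #    "id": "<md5>"}
+    # A client sketch using the `websockets` package (illustrative only; the
+    # handshake and audio framing must match what this handler expects):
+    #
+    #     import asyncio, json, websockets
+    #     async def demo(ws_url, audio_bytes):
+    #         async with websockets.connect(ws_url) as ws:
+    #             await ws.send(audio_bytes)
+    #             print(json.loads(await ws.recv()))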
     await websocket.send_json(
         {
             "status": 0,
             "segment": 0,
-            "result": {"hypotheses": [{"transcript": text}], "final": True},
+            "result": {"hypotheses": [{"transcript": result.text}], "final": True},
             "id": md5,
         }
     )
@@ -286,17 +278,15 @@ async def translateapi(
     )
 
     result = build_json_result(generator, info)
-    text = result.get("text", "")
-
     return {
         "status": 0,
-        "hypotheses": [{"utterance": text}],
+        "hypotheses": [{"utterance": result.text}],
         "id": md5,
     }
 
 
-@app.post("/v1/audio/transcriptions")
-@app.post("/v1/audio/translations")
+@app.post("/v1/audio/transcriptions", response_model=Union[JsonResult, str])
+@app.post("/v1/audio/translations", response_model=Union[JsonResult, str])
 async def transcription(
     request: Request,
     file: UploadFile = File(...),