Compare commits
13 Commits
d86ed9be69
...
v1.0.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
72a8c736e3
|
|||
|
b4fb0f217b
|
|||
|
1a5dbc65e0
|
|||
|
ea8fc74ed2
|
|||
|
c6948654a4
|
|||
|
ffefb2f09e
|
|||
|
1c8a685e9e
|
|||
|
ed1e51fefa
|
|||
|
042800721d
|
|||
|
f71ef945db
|
|||
|
1c93201250
|
|||
|
2ecdc4e607
|
|||
|
204ccb8f3d
|
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@@ -0,0 +1 @@
|
||||
/venv
|
||||
19
Dockerfile
Normal file
19
Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
||||
FROM docker.io/nvidia/cuda:12.0.0-cudnn8-runtime-ubuntu22.04

# ffmpeg is needed for audio decoding; git is needed because requirements.txt
# installs a package directly from a git URL.
RUN apt-get update && \
    apt-get install -y ffmpeg python3 python3-pip git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements.txt alone first so the dependency-install layer is cached
# across source-code changes.
COPY requirements.txt .

RUN pip3 install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 5000

# Start whisper_fastapi.py
ENTRYPOINT ["python3", "whisper_fastapi.py"]
|
||||
@@ -4,4 +4,5 @@ uvicorn[standard]
|
||||
whisper_ctranslate2
|
||||
opencc
|
||||
prometheus-fastapi-instrumentator
|
||||
git+https://github.com/heimoshuiyu/faster-whisper@prompt
|
||||
pydub
|
||||
|
||||
@@ -1,48 +1,49 @@
|
||||
annotated-types==0.7.0
|
||||
anyio==4.4.0
|
||||
av==12.2.0
|
||||
certifi==2024.7.4
|
||||
cffi==1.16.0
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
coloredlogs==15.0.1
|
||||
ctranslate2==4.3.1
|
||||
fastapi==0.111.0
|
||||
faster-whisper==1.0.3
|
||||
filelock==3.15.4
|
||||
flatbuffers==24.3.25
|
||||
fsspec==2024.6.1
|
||||
h11==0.14.0
|
||||
httptools==0.6.1
|
||||
huggingface-hub==0.23.4
|
||||
humanfriendly==10.0
|
||||
idna==3.7
|
||||
mpmath==1.3.0
|
||||
numpy==1.26.4
|
||||
onnxruntime==1.18.1
|
||||
OpenCC==1.1.7
|
||||
packaging==24.1
|
||||
prometheus-client==0.18.0
|
||||
prometheus-fastapi-instrumentator==7.0.0
|
||||
protobuf==5.27.2
|
||||
pycparser==2.22
|
||||
pydantic==2.8.2
|
||||
pydantic_core==2.20.1
|
||||
pydub==0.25.1
|
||||
python-dotenv==1.0.1
|
||||
python-multipart==0.0.9
|
||||
PyYAML==6.0.1
|
||||
requests==2.32.3
|
||||
sniffio==1.3.1
|
||||
sounddevice==0.4.7
|
||||
starlette==0.37.2
|
||||
sympy==1.12.1
|
||||
tokenizers==0.19.1
|
||||
tqdm==4.66.4
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.2
|
||||
uvicorn==0.30.1
|
||||
uvloop==0.19.0
|
||||
watchfiles==0.22.0
|
||||
websockets==12.0
|
||||
whisper-ctranslate2==0.4.5
|
||||
annotated-types==0.7.0
|
||||
anyio==4.4.0
|
||||
av==12.3.0
|
||||
certifi==2024.8.30
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
coloredlogs==15.0.1
|
||||
ctranslate2==4.4.0
|
||||
exceptiongroup==1.2.2
|
||||
fastapi==0.114.1
|
||||
faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@28a4d11a736d8cdeb4655ee5d7e4b4e7ae5ec8e0
|
||||
filelock==3.16.0
|
||||
flatbuffers==24.3.25
|
||||
fsspec==2024.9.0
|
||||
h11==0.14.0
|
||||
httptools==0.6.1
|
||||
huggingface-hub==0.24.6
|
||||
humanfriendly==10.0
|
||||
idna==3.8
|
||||
mpmath==1.3.0
|
||||
numpy==2.1.1
|
||||
onnxruntime==1.19.2
|
||||
OpenCC==1.1.9
|
||||
packaging==24.1
|
||||
prometheus-fastapi-instrumentator==7.0.0
|
||||
prometheus_client==0.20.0
|
||||
protobuf==5.28.0
|
||||
pycparser==2.22
|
||||
pydantic==2.9.1
|
||||
pydantic_core==2.23.3
|
||||
pydub==0.25.1
|
||||
python-dotenv==1.0.1
|
||||
python-multipart==0.0.9
|
||||
PyYAML==6.0.2
|
||||
requests==2.32.3
|
||||
sniffio==1.3.1
|
||||
sounddevice==0.5.0
|
||||
starlette==0.38.5
|
||||
sympy==1.13.2
|
||||
tokenizers==0.20.0
|
||||
tqdm==4.66.5
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.2
|
||||
uvicorn==0.30.6
|
||||
uvloop==0.20.0
|
||||
watchfiles==0.24.0
|
||||
websockets==13.0.1
|
||||
whisper-ctranslate2==0.4.5
|
||||
|
||||
10
start-docker.sh
Executable file
10
start-docker.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
# Launch the whisper-fastapi container on the NVIDIA GPU, exposing port 5000.
# The HuggingFace cache directory is mounted so downloaded models persist
# across container restarts.
# Fix: the original passed --name whisper-fastapi twice; the duplicate is removed.

docker run -d --name whisper-fastapi \
	--restart unless-stopped \
	-v ~/.cache/huggingface:/root/.cache/huggingface \
	--gpus all \
	-p 5000:5000 \
	docker.io/heimoshuiyu/whisper-fastapi:latest \
	--model large-v2
|
||||
11
start-podman.sh
Executable file
11
start-podman.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Launch the whisper-fastapi container with podman, exposing port 5000.
# --device nvidia.com/gpu=all uses CDI to pass GPUs through; the SELinux
# label is disabled so the container can access the mounted cache.
# Fix: the original passed --name whisper-fastapi twice; the duplicate is removed.
# NOTE(review): --gpus all looks redundant with the CDI --device flag
# (podman's --gpus is a docker-compat alias) — kept for behavior parity,
# but confirm whether one of the two can be dropped.

podman run -d --name whisper-fastapi \
	--restart unless-stopped \
	-v ~/.cache/huggingface:/root/.cache/huggingface \
	--device nvidia.com/gpu=all --security-opt=label=disable \
	--gpus all \
	-p 5000:5000 \
	docker.io/heimoshuiyu/whisper-fastapi:latest \
	--model large-v2
|
||||
@@ -22,6 +22,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
from src.whisper_ctranslate2.whisper_ctranslate2 import Transcribe
|
||||
from src.whisper_ctranslate2.writers import format_timestamp
|
||||
from faster_whisper.transcribe import Segment, TranscriptionInfo
|
||||
import opencc
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
@@ -36,8 +37,9 @@ args = parser.parse_args()
|
||||
app = FastAPI()
|
||||
# Instrument your app with default metrics and expose the metrics
|
||||
Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
|
||||
ccc = opencc.OpenCC("t2s.json")
|
||||
|
||||
print("Loading model...")
|
||||
print(f"Loading model to device {args.device}...")
|
||||
transcriber = Transcribe(
|
||||
model_path=args.model,
|
||||
device=args.device,
|
||||
@@ -47,7 +49,7 @@ transcriber = Transcribe(
|
||||
cache_directory=args.cache_dir,
|
||||
local_files_only=args.local_files_only,
|
||||
)
|
||||
print("Model loaded!")
|
||||
print(f"Model loaded to device {transcriber.model.model.device}")
|
||||
|
||||
|
||||
# allow all cors
|
||||
@@ -76,7 +78,7 @@ def tsv_writer(generator: Generator[dict[str, Any], Any, None]):
|
||||
for i, segment in enumerate(generator):
|
||||
start_time = str(round(1000 * segment["start"]))
|
||||
end_time = str(round(1000 * segment["end"]))
|
||||
text = segment["text"]
|
||||
text = segment["text"].strip()
|
||||
yield f"{start_time}\t{end_time}\t{text}\n"
|
||||
|
||||
|
||||
@@ -88,7 +90,7 @@ def srt_writer(generator: Generator[dict[str, Any], Any, None]):
|
||||
end_time = format_timestamp(
|
||||
segment["end"], decimal_marker=",", always_include_hours=True
|
||||
)
|
||||
text = segment["text"]
|
||||
text = segment["text"].strip()
|
||||
yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
|
||||
|
||||
|
||||
@@ -97,7 +99,7 @@ def vtt_writer(generator: Generator[dict[str, Any], Any, None]):
|
||||
for i, segment in enumerate(generator):
|
||||
start_time = format_timestamp(segment["start"])
|
||||
end_time = format_timestamp(segment["end"])
|
||||
text = segment["text"]
|
||||
text = segment["text"].strip()
|
||||
yield f"{start_time} --> {end_time}\n{text}\n\n"
|
||||
|
||||
|
||||
@@ -142,6 +144,8 @@ def stream_builder(
|
||||
data = segment._asdict()
|
||||
if data.get('words') is not None:
|
||||
data["words"] = [i._asdict() for i in data["words"]]
|
||||
if info.language == "zh":
|
||||
data["text"] = ccc.convert(data["text"])
|
||||
yield data
|
||||
|
||||
info_dict = info._asdict()
|
||||
@@ -155,6 +159,8 @@ def stream_builder(
|
||||
|
||||
@app.websocket("/k6nele/status")
|
||||
@app.websocket("/konele/status")
|
||||
@app.websocket("/v1/k6nele/status")
|
||||
@app.websocket("/v1/konele/status")
|
||||
async def konele_status(
|
||||
websocket: WebSocket,
|
||||
):
|
||||
@@ -165,6 +171,8 @@ async def konele_status(
|
||||
|
||||
@app.websocket("/k6nele/ws")
|
||||
@app.websocket("/konele/ws")
|
||||
@app.websocket("/v1/k6nele/ws")
|
||||
@app.websocket("/v1/konele/ws")
|
||||
async def konele_ws(
|
||||
websocket: WebSocket,
|
||||
task: Literal["transcribe", "translate"] = "transcribe",
|
||||
@@ -178,15 +186,11 @@ async def konele_ws(
|
||||
# convert lang code format (eg. en-US to en)
|
||||
lang = lang.split("-")[0]
|
||||
|
||||
print("WebSocket client connected, lang is", lang)
|
||||
print("content-type is", content_type)
|
||||
data = b""
|
||||
while True:
|
||||
try:
|
||||
data += await websocket.receive_bytes()
|
||||
print("Received data:", len(data), data[-10:])
|
||||
if data[-3:] == b"EOS":
|
||||
print("End of speech")
|
||||
break
|
||||
except:
|
||||
break
|
||||
@@ -219,7 +223,6 @@ async def konele_ws(
|
||||
result = build_json_result(generator, info)
|
||||
|
||||
text = result.get("text", "")
|
||||
print("result", text)
|
||||
|
||||
await websocket.send_json(
|
||||
{
|
||||
@@ -234,6 +237,8 @@ async def konele_ws(
|
||||
|
||||
@app.post("/k6nele/post")
|
||||
@app.post("/konele/post")
|
||||
@app.post("/v1/k6nele/post")
|
||||
@app.post("/v1/konele/post")
|
||||
async def translateapi(
|
||||
request: Request,
|
||||
task: Literal["transcribe", "translate"] = "transcribe",
|
||||
@@ -242,14 +247,12 @@ async def translateapi(
|
||||
vad_filter: bool = False,
|
||||
):
|
||||
content_type = request.headers.get("Content-Type", "")
|
||||
print("downloading request file", content_type)
|
||||
|
||||
# convert lang code format (eg. en-US to en)
|
||||
lang = lang.split("-")[0]
|
||||
|
||||
splited = [i.strip() for i in content_type.split(",") if "=" in i]
|
||||
info = {k: v for k, v in (i.split("=") for i in splited)}
|
||||
print(info)
|
||||
|
||||
channels = int(info.get("channels", "1"))
|
||||
rate = int(info.get("rate", "16000"))
|
||||
@@ -283,7 +286,6 @@ async def translateapi(
|
||||
result = build_json_result(generator, info)
|
||||
|
||||
text = result.get("text", "")
|
||||
print("result", text)
|
||||
|
||||
return {
|
||||
"status": 0,
|
||||
|
||||
Reference in New Issue
Block a user