Compare commits

...

11 Commits

7 changed files with 107 additions and 66 deletions

1
.dockerignore Normal file
View File

@@ -0,0 +1 @@
/venv

19
Dockerfile Normal file
View File

@@ -0,0 +1,19 @@
# CUDA 12.0 runtime base with cuDNN 8 on Ubuntu 22.04, for GPU inference.
FROM docker.io/nvidia/cuda:12.0.0-cudnn8-runtime-ubuntu22.04
# ffmpeg is needed for audio decoding; git for pip VCS requirements.
# Clean apt caches in the same layer to keep the image small.
RUN apt-get update && \
apt-get install -y ffmpeg python3 python3-pip git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first so the dependency layer is cached across code-only changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
COPY . .
# FastAPI app listens on port 5000.
EXPOSE 5000
# Start whisper_fastapi.py
ENTRYPOINT ["python3", "whisper_fastapi.py"]

View File

@@ -4,4 +4,5 @@ uvicorn[standard]
whisper_ctranslate2 whisper_ctranslate2
opencc opencc
prometheus-fastapi-instrumentator prometheus-fastapi-instrumentator
git+https://github.com/heimoshuiyu/faster-whisper@prompt
pydub pydub

View File

@@ -1,48 +1,49 @@
annotated-types==0.7.0 annotated-types==0.7.0
anyio==4.4.0 anyio==4.4.0
av==12.2.0 av==12.3.0
certifi==2024.7.4 certifi==2024.8.30
cffi==1.16.0 cffi==1.17.1
charset-normalizer==3.3.2 charset-normalizer==3.3.2
click==8.1.7 click==8.1.7
coloredlogs==15.0.1 coloredlogs==15.0.1
ctranslate2==4.3.1 ctranslate2==4.4.0
fastapi==0.111.0 exceptiongroup==1.2.2
faster-whisper==1.0.3 fastapi==0.114.1
filelock==3.15.4 faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@28a4d11a736d8cdeb4655ee5d7e4b4e7ae5ec8e0
flatbuffers==24.3.25 filelock==3.16.0
fsspec==2024.6.1 flatbuffers==24.3.25
h11==0.14.0 fsspec==2024.9.0
httptools==0.6.1 h11==0.14.0
huggingface-hub==0.23.4 httptools==0.6.1
humanfriendly==10.0 huggingface-hub==0.24.6
idna==3.7 humanfriendly==10.0
mpmath==1.3.0 idna==3.8
numpy==1.26.4 mpmath==1.3.0
onnxruntime==1.18.1 numpy==2.1.1
OpenCC==1.1.7 onnxruntime==1.19.2
packaging==24.1 OpenCC==1.1.9
prometheus-client==0.18.0 packaging==24.1
prometheus-fastapi-instrumentator==7.0.0 prometheus-fastapi-instrumentator==7.0.0
protobuf==5.27.2 prometheus_client==0.20.0
pycparser==2.22 protobuf==5.28.0
pydantic==2.8.2 pycparser==2.22
pydantic_core==2.20.1 pydantic==2.9.1
pydub==0.25.1 pydantic_core==2.23.3
python-dotenv==1.0.1 pydub==0.25.1
python-multipart==0.0.9 python-dotenv==1.0.1
PyYAML==6.0.1 python-multipart==0.0.9
requests==2.32.3 PyYAML==6.0.2
sniffio==1.3.1 requests==2.32.3
sounddevice==0.4.7 sniffio==1.3.1
starlette==0.37.2 sounddevice==0.5.0
sympy==1.12.1 starlette==0.38.5
tokenizers==0.19.1 sympy==1.13.2
tqdm==4.66.4 tokenizers==0.20.0
typing_extensions==4.12.2 tqdm==4.66.5
urllib3==2.2.2 typing_extensions==4.12.2
uvicorn==0.30.1 urllib3==2.2.2
uvloop==0.19.0 uvicorn==0.30.6
watchfiles==0.22.0 uvloop==0.20.0
websockets==12.0 watchfiles==0.24.0
whisper-ctranslate2==0.4.5 websockets==13.0.1
whisper-ctranslate2==0.4.5

10
start-docker.sh Executable file
View File

@@ -0,0 +1,10 @@
#!/bin/bash
# Launch the whisper-fastapi container under Docker with GPU access.
# The HuggingFace cache directory is mounted so downloaded models persist
# across container restarts. Trailing args (--model ...) are passed to the
# image's ENTRYPOINT (whisper_fastapi.py).
# NOTE: the original script passed --name twice; the duplicate is removed.
docker run -d --name whisper-fastapi \
--restart unless-stopped \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--gpus all \
-p 5000:5000 \
docker.io/heimoshuiyu/whisper-fastapi:latest \
--model large-v2

11
start-podman.sh Executable file
View File

@@ -0,0 +1,11 @@
#!/bin/bash
# Launch the whisper-fastapi container under Podman with GPU access.
# GPUs are requested via the CDI device spec (nvidia.com/gpu=all), which
# requires the NVIDIA Container Toolkit CDI configuration to be generated
# on the host. --security-opt=label=disable avoids SELinux labeling issues
# with the mounted HuggingFace cache.
# NOTE: the original script passed --name twice and additionally passed the
# docker-style "--gpus all", which podman maps to the same CDI device and
# can fail as a duplicate request; both duplicates are removed.
podman run -d --name whisper-fastapi \
--restart unless-stopped \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--device nvidia.com/gpu=all --security-opt=label=disable \
-p 5000:5000 \
docker.io/heimoshuiyu/whisper-fastapi:latest \
--model large-v2

View File

@@ -39,7 +39,7 @@ app = FastAPI()
Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics") Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
ccc = opencc.OpenCC("t2s.json") ccc = opencc.OpenCC("t2s.json")
print("Loading model...") print(f"Loading model to device {args.device}...")
transcriber = Transcribe( transcriber = Transcribe(
model_path=args.model, model_path=args.model,
device=args.device, device=args.device,
@@ -49,7 +49,7 @@ transcriber = Transcribe(
cache_directory=args.cache_dir, cache_directory=args.cache_dir,
local_files_only=args.local_files_only, local_files_only=args.local_files_only,
) )
print("Model loaded!") print(f"Model loaded to device {transcriber.model.model.device}")
# allow all cors # allow all cors
@@ -78,7 +78,7 @@ def tsv_writer(generator: Generator[dict[str, Any], Any, None]):
for i, segment in enumerate(generator): for i, segment in enumerate(generator):
start_time = str(round(1000 * segment["start"])) start_time = str(round(1000 * segment["start"]))
end_time = str(round(1000 * segment["end"])) end_time = str(round(1000 * segment["end"]))
text = segment["text"] text = segment["text"].strip()
yield f"{start_time}\t{end_time}\t{text}\n" yield f"{start_time}\t{end_time}\t{text}\n"
@@ -90,7 +90,7 @@ def srt_writer(generator: Generator[dict[str, Any], Any, None]):
end_time = format_timestamp( end_time = format_timestamp(
segment["end"], decimal_marker=",", always_include_hours=True segment["end"], decimal_marker=",", always_include_hours=True
) )
text = segment["text"] text = segment["text"].strip()
yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n" yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
@@ -99,7 +99,7 @@ def vtt_writer(generator: Generator[dict[str, Any], Any, None]):
for i, segment in enumerate(generator): for i, segment in enumerate(generator):
start_time = format_timestamp(segment["start"]) start_time = format_timestamp(segment["start"])
end_time = format_timestamp(segment["end"]) end_time = format_timestamp(segment["end"])
text = segment["text"] text = segment["text"].strip()
yield f"{start_time} --> {end_time}\n{text}\n\n" yield f"{start_time} --> {end_time}\n{text}\n\n"
@@ -159,6 +159,8 @@ def stream_builder(
@app.websocket("/k6nele/status") @app.websocket("/k6nele/status")
@app.websocket("/konele/status") @app.websocket("/konele/status")
@app.websocket("/v1/k6nele/status")
@app.websocket("/v1/konele/status")
async def konele_status( async def konele_status(
websocket: WebSocket, websocket: WebSocket,
): ):
@@ -169,6 +171,8 @@ async def konele_status(
@app.websocket("/k6nele/ws") @app.websocket("/k6nele/ws")
@app.websocket("/konele/ws") @app.websocket("/konele/ws")
@app.websocket("/v1/k6nele/ws")
@app.websocket("/v1/konele/ws")
async def konele_ws( async def konele_ws(
websocket: WebSocket, websocket: WebSocket,
task: Literal["transcribe", "translate"] = "transcribe", task: Literal["transcribe", "translate"] = "transcribe",
@@ -182,15 +186,11 @@ async def konele_ws(
# convert lang code format (eg. en-US to en) # convert lang code format (eg. en-US to en)
lang = lang.split("-")[0] lang = lang.split("-")[0]
print("WebSocket client connected, lang is", lang)
print("content-type is", content_type)
data = b"" data = b""
while True: while True:
try: try:
data += await websocket.receive_bytes() data += await websocket.receive_bytes()
print("Received data:", len(data), data[-10:])
if data[-3:] == b"EOS": if data[-3:] == b"EOS":
print("End of speech")
break break
except: except:
break break
@@ -213,17 +213,16 @@ async def konele_ws(
file_obj.seek(0) file_obj.seek(0)
generator = stream_builder( generator, info = stream_builder(
audio=file_obj, audio=file_obj,
task=task, task=task,
vad_filter=vad_filter, vad_filter=vad_filter,
language=None if lang == "und" else lang, language=None if lang == "und" else lang,
initial_prompt=initial_prompt, initial_prompt=initial_prompt,
) )
result = build_json_result(generator) result = build_json_result(generator, info)
text = result.get("text", "") text = result.get("text", "")
print("result", text)
await websocket.send_json( await websocket.send_json(
{ {
@@ -238,6 +237,8 @@ async def konele_ws(
@app.post("/k6nele/post") @app.post("/k6nele/post")
@app.post("/konele/post") @app.post("/konele/post")
@app.post("/v1/k6nele/post")
@app.post("/v1/konele/post")
async def translateapi( async def translateapi(
request: Request, request: Request,
task: Literal["transcribe", "translate"] = "transcribe", task: Literal["transcribe", "translate"] = "transcribe",
@@ -246,14 +247,12 @@ async def translateapi(
vad_filter: bool = False, vad_filter: bool = False,
): ):
content_type = request.headers.get("Content-Type", "") content_type = request.headers.get("Content-Type", "")
print("downloading request file", content_type)
# convert lang code format (eg. en-US to en) # convert lang code format (eg. en-US to en)
lang = lang.split("-")[0] lang = lang.split("-")[0]
splited = [i.strip() for i in content_type.split(",") if "=" in i] splited = [i.strip() for i in content_type.split(",") if "=" in i]
info = {k: v for k, v in (i.split("=") for i in splited)} info = {k: v for k, v in (i.split("=") for i in splited)}
print(info)
channels = int(info.get("channels", "1")) channels = int(info.get("channels", "1"))
rate = int(info.get("rate", "16000")) rate = int(info.get("rate", "16000"))
@@ -277,17 +276,16 @@ async def translateapi(
file_obj.seek(0) file_obj.seek(0)
generator = stream_builder( generator, info = stream_builder(
audio=file_obj, audio=file_obj,
task=task, task=task,
vad_filter=vad_filter, vad_filter=vad_filter,
language=None if lang == "und" else lang, language=None if lang == "und" else lang,
initial_prompt=initial_prompt, initial_prompt=initial_prompt,
) )
result = build_json_result(generator) result = build_json_result(generator, info)
text = result.get("text", "") text = result.get("text", "")
print("result", text)
return { return {
"status": 0, "status": 0,
@@ -327,7 +325,7 @@ async def transcription(
media_type="text/event-stream", media_type="text/event-stream",
) )
elif response_format == "json": elif response_format == "json":
return build_json_result(generator) return build_json_result(generator, info)
elif response_format == "text": elif response_format == "text":
return StreamingResponse(text_writer(generator), media_type="text/plain") return StreamingResponse(text_writer(generator), media_type="text/plain")
elif response_format == "tsv": elif response_format == "tsv":