Compare commits

...

12 Commits

Author SHA1 Message Date
bd2c6b95cf update faster-whisper 2024-11-28 18:52:00 +08:00
0e46bd91d4 format code 2024-11-21 22:45:02 +08:00
99272b230f Upgrade Dependency 2024-11-21 22:44:49 +08:00
3c01a76405 Convert Traditional Chinese to Simplified Chinese 2024-11-21 22:44:27 +08:00
3401c59c4b update faster-whisper 2024-11-18 11:04:00 +08:00
76b32bc9c4 update faster-whisper 2024-11-15 08:33:43 +08:00
4a5ba38f5e upgrade with faster-whisper upstream 2024-11-15 01:04:49 +08:00
8ae81a124d update dependencies 2024-10-25 22:06:05 +08:00
0faaf0f301 support translate endpoint 2024-09-13 16:21:35 +08:00
fab1ec9d03 fix: initial_prompt params 2024-09-13 16:13:19 +08:00
71bde08b17 fix: vad_filter params 2024-09-13 16:10:38 +08:00
a53a2fb80e fix typing hint 2024-09-13 16:09:37 +08:00
5 changed files with 103 additions and 98 deletions

View File

@@ -1,4 +1,4 @@
FROM docker.io/nvidia/cuda:12.0.0-cudnn8-runtime-ubuntu22.04
FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
RUN apt-get update && \
apt-get install -y ffmpeg python3 python3-pip git && \

View File

@@ -5,7 +5,7 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
## Features
- **Translation and Transcription**: The application provides an API for konele service, where translations and transcriptions can be obtained by connecting over websockets or POST requests.
- **Language Support**: If the target language is English, then the application will translate any source language to English.
- **Language Support**: If no language is specified, the language will be automatically recognized from the first 30 seconds.
- **Websocket and POST Method Support**: The project supports a websocket (`/konele/ws`) and a POST method to `/konele/post`.
- **Audio Transcriptions**: The `/v1/audio/transcriptions` endpoint allows users to upload an audio file and receive a transcription in response, with an optional `response_format` parameter. The `response_format` can be 'json', 'text', 'tsv', 'srt', or 'vtt'.
- **Simplified Chinese**: Traditional Chinese is automatically converted to Simplified Chinese for konele using the `opencc` library.
@@ -16,10 +16,10 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
For konele voice typing, you can use either the websocket endpoint or the POST method endpoint.
- **Websocket**: Connect to the websocket at `/konele/ws` and send audio data. The server will respond with the transcription or translation.
- **POST Method**: Send a POST request to `/konele/post` with the audio data in the body. The server will respond with the transcription or translation.
- **Websocket**: Connect to the websocket at `/konele/ws` (or `/v1/konele/ws`) and send audio data. The server will respond with the transcription or translation.
- **POST Method**: Send a POST request to `/konele/post` (or `/v1/konele/post`) with the audio data in the body. The server will respond with the transcription or translation.
You can also use the demo I have created to quickly test the effect at <https://yongyuancv.cn/konele/ws> and <https://yongyuancv.cn/konele/post>
You can also use the demo I have created to quickly test the effect at <https://yongyuancv.cn/v1/konele/ws> and <https://yongyuancv.cn/v1/konele/post>
### OpenAI Whisper Service

View File

@@ -4,5 +4,5 @@ uvicorn[standard]
whisper_ctranslate2
opencc
prometheus-fastapi-instrumentator
git+https://github.com/heimoshuiyu/faster-whisper@prompt
git+https://github.com/heimoshuiyu/faster-whisper@a759f5f48f5ef5b79461a6461966eafe9df088a9
pydub

View File

@@ -1,49 +1,49 @@
annotated-types==0.7.0
anyio==4.4.0
av==12.3.0
anyio==4.6.2.post1
av==13.1.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.3.2
charset-normalizer==3.4.0
click==8.1.7
coloredlogs==15.0.1
ctranslate2==4.4.0
ctranslate2==4.5.0
exceptiongroup==1.2.2
fastapi==0.114.1
faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@28a4d11a736d8cdeb4655ee5d7e4b4e7ae5ec8e0
filelock==3.16.0
fastapi==0.115.5
faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@a759f5f48f5ef5b79461a6461966eafe9df088a9
filelock==3.16.1
flatbuffers==24.3.25
fsspec==2024.9.0
fsspec==2024.10.0
h11==0.14.0
httptools==0.6.1
huggingface-hub==0.24.6
httptools==0.6.4
huggingface-hub==0.26.2
humanfriendly==10.0
idna==3.8
idna==3.10
mpmath==1.3.0
numpy==2.1.1
onnxruntime==1.19.2
numpy==2.1.3
onnxruntime==1.20.1
OpenCC==1.1.9
packaging==24.1
packaging==24.2
prometheus-fastapi-instrumentator==7.0.0
prometheus_client==0.20.0
protobuf==5.28.0
prometheus_client==0.21.0
protobuf==5.28.3
pycparser==2.22
pydantic==2.9.1
pydantic_core==2.23.3
pydantic==2.10.1
pydantic_core==2.27.1
pydub==0.25.1
python-dotenv==1.0.1
python-multipart==0.0.9
python-multipart==0.0.17
PyYAML==6.0.2
requests==2.32.3
sniffio==1.3.1
sounddevice==0.5.0
starlette==0.38.5
sympy==1.13.2
tokenizers==0.20.0
tqdm==4.66.5
sounddevice==0.5.1
starlette==0.41.3
sympy==1.13.3
tokenizers==0.20.3
tqdm==4.67.0
typing_extensions==4.12.2
urllib3==2.2.2
uvicorn==0.30.6
uvloop==0.20.0
urllib3==2.2.3
uvicorn==0.32.1
uvloop==0.21.0
watchfiles==0.24.0
websockets==13.0.1
whisper-ctranslate2==0.4.5
websockets==14.1
whisper-ctranslate2==0.4.8

View File

@@ -1,4 +1,6 @@
import tqdm
import sys
import dataclasses
import faster_whisper
import json
from fastapi.responses import StreamingResponse
import wave
@@ -7,7 +9,7 @@ import io
import hashlib
import argparse
import uvicorn
from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable
from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable, Union
from fastapi import (
File,
HTTPException,
@@ -19,12 +21,16 @@ from fastapi import (
WebSocket,
)
from fastapi.middleware.cors import CORSMiddleware
from src.whisper_ctranslate2.whisper_ctranslate2 import Transcribe
from src.whisper_ctranslate2.writers import format_timestamp
from faster_whisper.transcribe import Segment, TranscriptionInfo
import opencc
from prometheus_fastapi_instrumentator import Instrumentator
# redirect print to stderr
_print = print


def print(*args, **kwargs):
    """Print like the builtin, but write to ``sys.stderr`` by default.

    All positional and keyword arguments are forwarded to the builtin
    ``print``. An explicit ``file=`` argument is honoured; without one the
    output goes to ``sys.stderr`` (looked up at call time).
    """
    # setdefault instead of a hard-coded file= keyword: passing file= twice
    # would raise TypeError for callers that choose their own stream.
    kwargs.setdefault("file", sys.stderr)
    _print(*args, **kwargs)
parser = argparse.ArgumentParser()
parser.add_argument("--host", default="0.0.0.0", type=str)
parser.add_argument("--port", default=5000, type=int)
@@ -40,16 +46,13 @@ Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
ccc = opencc.OpenCC("t2s.json")
print(f"Loading model to device {args.device}...")
transcriber = Transcribe(
model_path=args.model,
model = faster_whisper.WhisperModel(
model_size_or_path=args.model,
device=args.device,
device_index=0,
compute_type="default",
threads=args.threads,
cache_directory=args.cache_dir,
cpu_threads=args.threads,
local_files_only=args.local_files_only,
)
print(f"Model loaded to device {transcriber.model.model.device}")
print(f"Model loaded to device {model.model.device}")
# allow all cors
@@ -62,56 +65,63 @@ app.add_middleware(
)
def stream_writer(generator: Generator["Segment", Any, None]):
    """Format segments as server-sent-event lines.

    Yields one ``data: <json>`` line (followed by a blank line) per segment
    and a final ``data: [DONE]`` marker once the generator is exhausted.

    Args:
        generator: Iterable of segments. Dataclass instances are converted
            to dictionaries first; anything else is handed to ``json.dumps``
            unchanged, so plain dict segments keep working.
    """
    for segment in generator:
        # json.dumps cannot encode dataclass instances directly.
        if dataclasses.is_dataclass(segment) and not isinstance(segment, type):
            payload = dataclasses.asdict(segment)
        else:
            payload = segment
        yield "data: " + json.dumps(payload, ensure_ascii=False) + "\n\n"
    yield "data: [DONE]\n\n"
def text_writer(generator: Generator["Segment", Any, None]):
    """Yield the stripped text of every segment, one segment per line.

    Args:
        generator: Iterable of segments exposing a ``text`` attribute.
    """
    for segment in generator:
        yield segment.text.strip() + "\n"
def tsv_writer(generator: Generator["Segment", Any, None]):
    """Yield a tab-separated table of segments with a header row.

    Each data row holds the segment start and end, multiplied by 1000 and
    rounded to an integer, followed by the stripped segment text.

    Args:
        generator: Iterable of segments exposing ``start``, ``end`` and
            ``text`` attributes.
    """
    yield "start\tend\ttext\n"
    for segment in generator:
        start_time = str(round(1000 * segment.start))
        end_time = str(round(1000 * segment.end))
        # A tab inside the text would be read as an extra column.
        text = segment.text.strip().replace("\t", " ")
        yield f"{start_time}\t{end_time}\t{text}\n"
def srt_writer(generator: Generator["Segment", Any, None]):
    """Yield SubRip (SRT) cues for the given segments.

    Each cue consists of a counter line, a ``start --> end`` timing line
    formatted by ``format_timestamp`` (comma decimal marker, hours always
    included), the stripped text and a blank separator line.

    Args:
        generator: Iterable of segments exposing ``start``, ``end`` and
            ``text`` attributes.
    """
    # SubRip cue counters are 1-based, hence start=1.
    for index, segment in enumerate(generator, start=1):
        start_time = format_timestamp(
            segment.start, decimal_marker=",", always_include_hours=True
        )
        end_time = format_timestamp(
            segment.end, decimal_marker=",", always_include_hours=True
        )
        # "-->" is the timing separator; keep it out of the cue text.
        text = segment.text.strip().replace("-->", "->")
        yield f"{index}\n{start_time} --> {end_time}\n{text}\n\n"
def vtt_writer(generator: Generator["Segment", Any, None]):
    """Yield a WebVTT document for the given segments.

    Emits the ``WEBVTT`` header first, then one cue per segment: a
    ``start --> end`` timing line formatted by ``format_timestamp``, the
    stripped text and a blank separator line.

    Args:
        generator: Iterable of segments exposing ``start``, ``end`` and
            ``text`` attributes.
    """
    yield "WEBVTT\n\n"
    for segment in generator:
        start_time = format_timestamp(segment.start)
        end_time = format_timestamp(segment.end)
        # WebVTT cue text must not contain the "-->" timing separator.
        text = segment.text.strip().replace("-->", "->")
        yield f"{start_time} --> {end_time}\n{text}\n\n"
@dataclasses.dataclass
class JsonResult(TranscriptionInfo):
    """Transcription info extended with the decoded segments and full text."""

    # All segments produced by the transcription, in generator order.
    segments: list[Segment]
    # Segment texts joined with newlines.
    text: str


def build_json_result(
    generator: Iterable[Segment],
    info: TranscriptionInfo,
) -> JsonResult:
    """Drain the segment generator and bundle it with the transcription info.

    Args:
        generator: Iterable of segments; it is consumed completely.
        info: Transcription info whose fields are copied into the result.

    Returns:
        A ``JsonResult`` carrying every field of ``info`` plus the collected
        segments and their newline-joined text.
    """
    segments = list(generator)
    # Copy the fields shallowly. dataclasses.asdict() would recurse into
    # nested dataclass values and replace them with plain dicts, which no
    # longer match the field types JsonResult inherits from TranscriptionInfo.
    info_fields = {
        field.name: getattr(info, field.name) for field in dataclasses.fields(info)
    }
    return JsonResult(
        text="\n".join(segment.text for segment in segments),
        segments=segments,
        **info_fields,
    )
def stream_builder(
@@ -121,12 +131,13 @@ def stream_builder(
language: str | None,
initial_prompt: str = "",
repetition_penalty: float = 1.0,
) -> Tuple[Iterable[dict], dict]:
segments, info = transcriber.model.transcribe(
) -> Tuple[Generator[Segment, None, None], TranscriptionInfo]:
segments, info = model.transcribe(
audio=audio,
language=language,
task=task,
initial_prompt=initial_prompt,
vad_filter=vad_filter,
initial_prompt=initial_prompt if initial_prompt else None,
word_timestamps=True,
repetition_penalty=repetition_penalty,
)
@@ -134,27 +145,14 @@ def stream_builder(
"Detected language '%s' with probability %f"
% (info.language, info.language_probability)
)
def wrap():
last_pos = 0
with tqdm.tqdm(total=info.duration, unit="seconds", disable=True) as pbar:
for segment in segments:
start, end, text = segment.start, segment.end, segment.text
pbar.update(end - last_pos)
last_pos = end
data = segment._asdict()
if data.get('words') is not None:
data["words"] = [i._asdict() for i in data["words"]]
if info.language == "zh":
data["text"] = ccc.convert(data["text"])
yield data
info_dict = info._asdict()
if info_dict['transcription_options'] is not None:
info_dict['transcription_options'] = info_dict['transcription_options']._asdict()
if info_dict['vad_options'] is not None:
info_dict['vad_options'] = info_dict['vad_options']._asdict()
return wrap(), info_dict
def wrap():
for segment in segments:
if info.language == "zh":
segment.text = ccc.convert(segment.text)
yield segment
return wrap(), info
@app.websocket("/k6nele/status")
@@ -222,13 +220,11 @@ async def konele_ws(
)
result = build_json_result(generator, info)
text = result.get("text", "")
await websocket.send_json(
{
"status": 0,
"segment": 0,
"result": {"hypotheses": [{"transcript": text}], "final": True},
"result": {"hypotheses": [{"transcript": result.text}], "final": True},
"id": md5,
}
)
@@ -285,21 +281,21 @@ async def translateapi(
)
result = build_json_result(generator, info)
text = result.get("text", "")
return {
"status": 0,
"hypotheses": [{"utterance": text}],
"hypotheses": [{"utterance": result.text}],
"id": md5,
}
@app.post("/v1/audio/transcriptions")
@app.post("/v1/audio/transcriptions", response_model=Union[JsonResult, str])
@app.post("/v1/audio/translations", response_model=Union[JsonResult, str])
async def transcription(
request: Request,
file: UploadFile = File(...),
prompt: str = Form(""),
response_format: str = Form("json"),
task: str = Form("transcribe"),
task: str = Form(""),
language: str = Form("und"),
vad_filter: bool = Form(False),
repetition_penalty: float = Form(1.0),
@@ -309,11 +305,20 @@ async def transcription(
User upload audio file in multipart/form-data format and receive transcription in response
"""
if not task:
if request.url.path == "/v1/audio/transcriptions":
task = "transcribe"
elif request.url.path == "/v1/audio/translations":
task = "translate"
else:
raise HTTPException(400, "task parameter is required")
# timestamp as filename, keep original extension
generator, info = stream_builder(
audio=io.BytesIO(file.file.read()),
task=task,
vad_filter=vad_filter,
initial_prompt=prompt,
language=None if language == "und" else language,
repetition_penalty=repetition_penalty,
)