Compare commits: v1.0.3 ... 3401c59c4b

8 Commits

- 3401c59c4b
- 76b32bc9c4
- 4a5ba38f5e
- 8ae81a124d
- 0faaf0f301
- fab1ec9d03
- 71bde08b17
- a53a2fb80e
```diff
@@ -1,4 +1,4 @@
-FROM docker.io/nvidia/cuda:12.0.0-cudnn8-runtime-ubuntu22.04
+FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
 
 RUN apt-get update && \
     apt-get install -y ffmpeg python3 python3-pip git && \
```
```diff
@@ -5,7 +5,7 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
 ## Features
 
 - **Translation and Transcription**: The application provides an API for konele service, where translations and transcriptions can be obtained by connecting over websockets or POST requests.
-- **Language Support**: If the target language is English, then the application will translate any source language to English.
+- **Language Support**: If no language is specified, the language will be automatically recognized from the first 30 seconds.
 - **Websocket and POST Method Support**: The project supports a websocket (`/konele/ws`) and a POST method to `/konele/post`.
 - **Audio Transcriptions**: The `/v1/audio/transcriptions` endpoint allows users to upload an audio file and receive transcription in response, with an optional `response_type` parameter. The `response_type` can be 'json', 'text', 'tsv', 'srt', and 'vtt'.
 - **Simplified Chinese**: The traditional Chinese will be automatically convert to simplified Chinese for konele using `opencc` library.
@@ -16,10 +16,10 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
 
 For konele voice typing, you can use either the websocket endpoint or the POST method endpoint.
 
-- **Websocket**: Connect to the websocket at `/konele/ws` and send audio data. The server will respond with the transcription or translation.
-- **POST Method**: Send a POST request to `/konele/post` with the audio data in the body. The server will respond with the transcription or translation.
+- **Websocket**: Connect to the websocket at `/konele/ws` (or `/v1/konele/ws`) and send audio data. The server will respond with the transcription or translation.
+- **POST Method**: Send a POST request to `/konele/post` (or `/v1/konele/post`) with the audio data in the body. The server will respond with the transcription or translation.
 
-You can also use the demo I have created to quickly test the effect at <https://yongyuancv.cn/konele/ws> and <https://yongyuancv.cn/konele/post>
+You can also use the demo I have created to quickly test the effect at <https://yongyuancv.cn/v1/konele/ws> and <https://yongyuancv.cn/v1/konele/post>
 
 ### OpenAI Whisper Service
 
```
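As a quick reference while reviewing the endpoint rename above, here is a minimal client sketch for the konele POST route. It is not part of this PR: the base URL, file name, and use of `requests` are assumptions, and the response fields mirror the `translateapi` handler later in this diff.

```python
# Hypothetical client for the konele POST endpoint (sketch only).
# Assumes a server at http://localhost:8000 that accepts raw audio bytes
# in the request body; host, port, and file name are illustrative.
import requests

with open("sample.wav", "rb") as f:
    audio_bytes = f.read()

resp = requests.post("http://localhost:8000/v1/konele/post", data=audio_bytes)
resp.raise_for_status()
payload = resp.json()

# Per the handler in this diff, the response looks like
# {"status": 0, "hypotheses": [{"utterance": ...}], "id": ...}.
print(payload["hypotheses"][0]["utterance"])
```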
```diff
@@ -4,5 +4,5 @@ uvicorn[standard]
 whisper_ctranslate2
 opencc
 prometheus-fastapi-instrumentator
-git+https://github.com/heimoshuiyu/faster-whisper@prompt
+git+https://github.com/SYSTRAN/faster-whisper@be9fb36ed356b9e299b125de6ee91862e0ac9038
 pydub
```
```diff
@@ -1,49 +1,49 @@
 annotated-types==0.7.0
-anyio==4.4.0
-av==12.3.0
+anyio==4.6.2.post1
+av==13.1.0
 certifi==2024.8.30
 cffi==1.17.1
-charset-normalizer==3.3.2
+charset-normalizer==3.4.0
 click==8.1.7
 coloredlogs==15.0.1
-ctranslate2==4.4.0
+ctranslate2==4.5.0
 exceptiongroup==1.2.2
-fastapi==0.114.1
-faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@28a4d11a736d8cdeb4655ee5d7e4b4e7ae5ec8e0
-filelock==3.16.0
+fastapi==0.115.5
+faster-whisper @ git+https://github.com/SYSTRAN/faster-whisper@be9fb36ed356b9e299b125de6ee91862e0ac9038
+filelock==3.16.1
 flatbuffers==24.3.25
-fsspec==2024.9.0
+fsspec==2024.10.0
 h11==0.14.0
-httptools==0.6.1
-huggingface-hub==0.24.6
+httptools==0.6.4
+huggingface-hub==0.26.2
 humanfriendly==10.0
-idna==3.8
+idna==3.10
 mpmath==1.3.0
-numpy==2.1.1
-onnxruntime==1.19.2
+numpy==2.1.3
+onnxruntime==1.20.0
 OpenCC==1.1.9
-packaging==24.1
+packaging==24.2
 prometheus-fastapi-instrumentator==7.0.0
-prometheus_client==0.20.0
-protobuf==5.28.0
+prometheus_client==0.21.0
+protobuf==5.28.3
 pycparser==2.22
-pydantic==2.9.1
-pydantic_core==2.23.3
+pydantic==2.9.2
+pydantic_core==2.23.4
 pydub==0.25.1
 python-dotenv==1.0.1
-python-multipart==0.0.9
+python-multipart==0.0.17
 PyYAML==6.0.2
 requests==2.32.3
 sniffio==1.3.1
-sounddevice==0.5.0
-starlette==0.38.5
-sympy==1.13.2
-tokenizers==0.20.0
-tqdm==4.66.5
+sounddevice==0.5.1
+starlette==0.41.2
+sympy==1.13.3
+tokenizers==0.20.3
+tqdm==4.67.0
 typing_extensions==4.12.2
-urllib3==2.2.2
-uvicorn==0.30.6
-uvloop==0.20.0
+urllib3==2.2.3
+uvicorn==0.32.0
+uvloop==0.21.0
 watchfiles==0.24.0
-websockets==13.0.1
-whisper-ctranslate2==0.4.5
+websockets==14.1
+whisper-ctranslate2==0.4.7
```
```diff
@@ -1,3 +1,5 @@
+import dataclasses
+import faster_whisper
 import tqdm
 import json
 from fastapi.responses import StreamingResponse
@@ -7,7 +9,7 @@ import io
 import hashlib
 import argparse
 import uvicorn
-from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable
+from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable, Union
 from fastapi import (
     File,
     HTTPException,
@@ -40,16 +42,13 @@ Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
 ccc = opencc.OpenCC("t2s.json")
 
 print(f"Loading model to device {args.device}...")
-transcriber = Transcribe(
-    model_path=args.model,
+model = faster_whisper.WhisperModel(
+    model_size_or_path=args.model,
     device=args.device,
-    device_index=0,
-    compute_type="default",
-    threads=args.threads,
-    cache_directory=args.cache_dir,
+    cpu_threads=args.threads,
     local_files_only=args.local_files_only,
 )
-print(f"Model loaded to device {transcriber.model.model.device}")
+print(f"Model loaded to device {model.model.device}")
 
 
 # allow all cors
@@ -62,56 +61,62 @@ app.add_middleware(
 )
 
 
-def stream_writer(generator: Generator[dict[str, Any], Any, None]):
+def stream_writer(generator: Generator[Segment, Any, None]):
     for segment in generator:
         yield "data: " + json.dumps(segment, ensure_ascii=False) + "\n\n"
     yield "data: [DONE]\n\n"
 
 
-def text_writer(generator: Generator[dict[str, Any], Any, None]):
+def text_writer(generator: Generator[Segment, Any, None]):
     for segment in generator:
-        yield segment["text"].strip() + "\n"
+        yield segment.text.strip() + "\n"
 
 
-def tsv_writer(generator: Generator[dict[str, Any], Any, None]):
+def tsv_writer(generator: Generator[Segment, Any, None]):
     yield "start\tend\ttext\n"
     for i, segment in enumerate(generator):
-        start_time = str(round(1000 * segment["start"]))
-        end_time = str(round(1000 * segment["end"]))
-        text = segment["text"].strip()
+        start_time = str(round(1000 * segment.start))
+        end_time = str(round(1000 * segment.end))
+        text = segment.text.strip()
         yield f"{start_time}\t{end_time}\t{text}\n"
 
 
-def srt_writer(generator: Generator[dict[str, Any], Any, None]):
+def srt_writer(generator: Generator[Segment, Any, None]):
     for i, segment in enumerate(generator):
         start_time = format_timestamp(
-            segment["start"], decimal_marker=",", always_include_hours=True
+            segment.start, decimal_marker=",", always_include_hours=True
        )
         end_time = format_timestamp(
-            segment["end"], decimal_marker=",", always_include_hours=True
+            segment.end, decimal_marker=",", always_include_hours=True
        )
-        text = segment["text"].strip()
+        text = segment.text.strip()
         yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
 
 
-def vtt_writer(generator: Generator[dict[str, Any], Any, None]):
+def vtt_writer(generator: Generator[Segment, Any, None]):
     yield "WEBVTT\n\n"
     for i, segment in enumerate(generator):
-        start_time = format_timestamp(segment["start"])
-        end_time = format_timestamp(segment["end"])
-        text = segment["text"].strip()
+        start_time = format_timestamp(segment.start)
+        end_time = format_timestamp(segment.end)
+        text = segment.text.strip()
         yield f"{start_time} --> {end_time}\n{text}\n\n"
 
 
+@dataclasses.dataclass
+class JsonResult(TranscriptionInfo):
+    segments: list[Segment]
+    text: str
+
 def build_json_result(
     generator: Iterable[Segment],
-    info: dict,
-) -> dict[str, Any]:
+    info: TranscriptionInfo,
+) -> JsonResult:
     segments = [i for i in generator]
-    return info | {
-        "text": "\n".join(i["text"] for i in segments),
-        "segments": segments,
-    }
+    return JsonResult(
+        text="\n".join(i.text for i in segments),
+        segments=segments,
+        **dataclasses.asdict(info)
+    )
 
 
 def stream_builder(
```
```diff
@@ -121,12 +126,13 @@ def stream_builder(
     language: str | None,
     initial_prompt: str = "",
     repetition_penalty: float = 1.0,
-) -> Tuple[Iterable[dict], dict]:
-    segments, info = transcriber.model.transcribe(
+) -> Tuple[Generator[Segment, None, None], TranscriptionInfo]:
+    segments, info = model.transcribe(
         audio=audio,
         language=language,
         task=task,
-        initial_prompt=initial_prompt,
         vad_filter=vad_filter,
+        initial_prompt=initial_prompt if initial_prompt else None,
+        word_timestamps=True,
         repetition_penalty=repetition_penalty,
     )
@@ -141,20 +147,9 @@ def stream_builder(
             start, end, text = segment.start, segment.end, segment.text
             pbar.update(end - last_pos)
             last_pos = end
-            data = segment._asdict()
-            if data.get('words') is not None:
-                data["words"] = [i._asdict() for i in data["words"]]
-            if info.language == "zh":
-                data["text"] = ccc.convert(data["text"])
-            yield data
+            yield segment
 
-    info_dict = info._asdict()
-    if info_dict['transcription_options'] is not None:
-        info_dict['transcription_options'] = info_dict['transcription_options']._asdict()
-    if info_dict['vad_options'] is not None:
-        info_dict['vad_options'] = info_dict['vad_options']._asdict()
-
-    return wrap(), info_dict
+    return wrap(), info
 
 
 @app.websocket("/k6nele/status")
```
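To make the API switch easier to review, here is a standalone sketch of the faster-whisper calls the new code relies on, using only the constructor and `transcribe` arguments visible in this diff. The model size, device, thread count, and audio path are placeholders, not values taken from the PR.

```python
# Standalone sketch of the faster-whisper usage introduced by this PR.
# "small", "cpu", cpu_threads=4, and "sample.wav" are illustrative only.
import faster_whisper

model = faster_whisper.WhisperModel(
    model_size_or_path="small",
    device="cpu",
    cpu_threads=4,
    local_files_only=False,
)

segments, info = model.transcribe(
    audio="sample.wav",
    language=None,          # None lets the model detect the language
    task="transcribe",
    vad_filter=False,
    initial_prompt=None,
    word_timestamps=True,
    repetition_penalty=1.0,
)

print(f"Detected language: {info.language}")
for segment in segments:    # segments is a lazy generator of Segment objects
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text.strip()}")
```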
```diff
@@ -222,13 +217,11 @@ async def konele_ws(
         )
         result = build_json_result(generator, info)
 
-        text = result.get("text", "")
-
         await websocket.send_json(
             {
                 "status": 0,
                 "segment": 0,
-                "result": {"hypotheses": [{"transcript": text}], "final": True},
+                "result": {"hypotheses": [{"transcript": result.text}], "final": True},
                 "id": md5,
             }
         )
```
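A companion sketch for the websocket route touched above. The URL and single-shot send are assumptions (the real konele client streams audio in chunks, which this sketch does not reproduce); the response fields mirror the `send_json` call in this hunk.

```python
# Rough websocket client sketch for /v1/konele/ws.
# Assumptions: local server on port 8000, whole audio sent as one binary message.
import asyncio
import json
import websockets  # pip install websockets

async def main() -> None:
    with open("sample.wav", "rb") as f:
        audio_bytes = f.read()

    async with websockets.connect("ws://localhost:8000/v1/konele/ws") as ws:
        await ws.send(audio_bytes)
        reply = json.loads(await ws.recv())
        # Per the handler above: {"status": 0, "segment": 0,
        #   "result": {"hypotheses": [{"transcript": ...}], "final": true}, "id": ...}
        print(reply["result"]["hypotheses"][0]["transcript"])

asyncio.run(main())
```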
```diff
@@ -285,21 +278,21 @@ async def translateapi(
     )
     result = build_json_result(generator, info)
 
-    text = result.get("text", "")
-
     return {
         "status": 0,
-        "hypotheses": [{"utterance": text}],
+        "hypotheses": [{"utterance": result.text}],
         "id": md5,
     }
 
 
-@app.post("/v1/audio/transcriptions")
+@app.post("/v1/audio/transcriptions", response_model=Union[JsonResult, str])
+@app.post("/v1/audio/translations", response_model=Union[JsonResult, str])
 async def transcription(
+    request: Request,
     file: UploadFile = File(...),
     prompt: str = Form(""),
     response_format: str = Form("json"),
-    task: str = Form("transcribe"),
+    task: str = Form(""),
     language: str = Form("und"),
     vad_filter: bool = Form(False),
     repetition_penalty: float = Form(1.0),
@@ -309,11 +302,20 @@ async def transcription(
     User upload audio file in multipart/form-data format and receive transcription in response
     """
+
+    if not task:
+        if request.url.path == '/v1/audio/transcriptions':
+            task = "transcribe"
+        elif request.url.path == '/v1/audio/translations':
+            task = "translate"
+        else:
+            raise HTTPException(400, "task parameter is required")
+
     # timestamp as filename, keep original extension
     generator, info = stream_builder(
         audio=io.BytesIO(file.file.read()),
         task=task,
         vad_filter=vad_filter,
         initial_prompt=prompt,
         language=None if language == "und" else language,
         repetition_penalty=repetition_penalty,
     )
```
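Finally, a minimal sketch of exercising the transcription/translation endpoints with the form fields declared above. The base URL and file are assumptions, and omitting `task` relies on the new path-based defaulting shown in this hunk.

```python
# Hypothetical multipart upload against the endpoints declared in this diff.
# Base URL and file name are placeholders.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("sample.wav", f, "audio/wav")},
        data={
            "response_format": "srt",   # json, text, tsv, srt or vtt per the README
            "language": "und",          # "und" lets the server auto-detect
            "vad_filter": "false",
            "repetition_penalty": "1.0",
            # "task" omitted: the handler now derives it from the URL path
        },
    )

resp.raise_for_status()
print(resp.text)  # non-JSON formats such as srt come back as plain text
```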