upgrade with faster-whisper upstream

This commit is contained in:
2024-11-15 01:04:49 +08:00
parent 8ae81a124d
commit 4a5ba38f5e
4 changed files with 61 additions and 71 deletions

View File

@@ -5,7 +5,7 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
## Features ## Features
- **Translation and Transcription**: The application provides an API for konele service, where translations and transcriptions can be obtained by connecting over websockets or POST requests. - **Translation and Transcription**: The application provides an API for konele service, where translations and transcriptions can be obtained by connecting over websockets or POST requests.
- **Language Support**: If the target language is English, then the application will translate any source language to English. - **Language Support**: If no language is specified, the language will be automatically recognized from the first 30 seconds.
- **Websocket and POST Method Support**: The project supports a websocket (`/konele/ws`) and a POST method to `/konele/post`. - **Websocket and POST Method Support**: The project supports a websocket (`/konele/ws`) and a POST method to `/konele/post`.
- **Audio Transcriptions**: The `/v1/audio/transcriptions` endpoint allows users to upload an audio file and receive transcription in response, with an optional `response_type` parameter. The `response_type` can be 'json', 'text', 'tsv', 'srt', and 'vtt'. - **Audio Transcriptions**: The `/v1/audio/transcriptions` endpoint allows users to upload an audio file and receive transcription in response, with an optional `response_type` parameter. The `response_type` can be 'json', 'text', 'tsv', 'srt', and 'vtt'.
- **Simplified Chinese**: The traditional Chinese will be automatically converted to simplified Chinese for konele using the `opencc` library. - **Simplified Chinese**: The traditional Chinese will be automatically converted to simplified Chinese for konele using the `opencc` library.
@@ -16,10 +16,10 @@ Whisper-FastAPI is a very simple Python FastAPI interface for konele and OpenAI
For konele voice typing, you can use either the websocket endpoint or the POST method endpoint. For konele voice typing, you can use either the websocket endpoint or the POST method endpoint.
- **Websocket**: Connect to the websocket at `/konele/ws` and send audio data. The server will respond with the transcription or translation. - **Websocket**: Connect to the websocket at `/konele/ws` (or `/v1/konele/ws`) and send audio data. The server will respond with the transcription or translation.
- **POST Method**: Send a POST request to `/konele/post` with the audio data in the body. The server will respond with the transcription or translation. - **POST Method**: Send a POST request to `/konele/post` (or `/v1/konele/post`) with the audio data in the body. The server will respond with the transcription or translation.
You can also use the demo I have created to quickly test the effect at <https://yongyuancv.cn/konele/ws> and <https://yongyuancv.cn/konele/post> You can also use the demo I have created to quickly test the effect at <https://yongyuancv.cn/v1/konele/ws> and <https://yongyuancv.cn/v1/konele/post>
### OpenAI Whisper Service ### OpenAI Whisper Service

View File

@@ -4,5 +4,5 @@ uvicorn[standard]
whisper_ctranslate2 whisper_ctranslate2
opencc opencc
prometheus-fastapi-instrumentator prometheus-fastapi-instrumentator
git+https://github.com/heimoshuiyu/faster-whisper@prompt git+https://github.com/SYSTRAN/faster-whisper@3e0ba86571b9fe93bab2a25b3ff5af1be41014ec
pydub pydub

View File

@@ -1,6 +1,6 @@
annotated-types==0.7.0 annotated-types==0.7.0
anyio==4.6.2.post1 anyio==4.6.2.post1
av==12.3.0 av==13.1.0
certifi==2024.8.30 certifi==2024.8.30
cffi==1.17.1 cffi==1.17.1
charset-normalizer==3.4.0 charset-normalizer==3.4.0
@@ -8,21 +8,21 @@ click==8.1.7
coloredlogs==15.0.1 coloredlogs==15.0.1
ctranslate2==4.5.0 ctranslate2==4.5.0
exceptiongroup==1.2.2 exceptiongroup==1.2.2
fastapi==0.115.3 fastapi==0.115.5
faster-whisper @ git+https://github.com/heimoshuiyu/faster-whisper@8563f889cb23c2f8d64568d7a4a1c7beea28618b faster-whisper @ git+https://github.com/SYSTRAN/faster-whisper@3e0ba86571b9fe93bab2a25b3ff5af1be41014ec
filelock==3.16.1 filelock==3.16.1
flatbuffers==24.3.25 flatbuffers==24.3.25
fsspec==2024.10.0 fsspec==2024.10.0
h11==0.14.0 h11==0.14.0
httptools==0.6.4 httptools==0.6.4
huggingface-hub==0.26.1 huggingface-hub==0.26.2
humanfriendly==10.0 humanfriendly==10.0
idna==3.10 idna==3.10
mpmath==1.3.0 mpmath==1.3.0
numpy==2.1.2 numpy==2.1.3
onnxruntime==1.19.2 onnxruntime==1.20.0
OpenCC==1.1.9 OpenCC==1.1.9
packaging==24.1 packaging==24.2
prometheus-fastapi-instrumentator==7.0.0 prometheus-fastapi-instrumentator==7.0.0
prometheus_client==0.21.0 prometheus_client==0.21.0
protobuf==5.28.3 protobuf==5.28.3
@@ -31,19 +31,19 @@ pydantic==2.9.2
pydantic_core==2.23.4 pydantic_core==2.23.4
pydub==0.25.1 pydub==0.25.1
python-dotenv==1.0.1 python-dotenv==1.0.1
python-multipart==0.0.12 python-multipart==0.0.17
PyYAML==6.0.2 PyYAML==6.0.2
requests==2.32.3 requests==2.32.3
sniffio==1.3.1 sniffio==1.3.1
sounddevice==0.5.1 sounddevice==0.5.1
starlette==0.41.0 starlette==0.41.2
sympy==1.13.3 sympy==1.13.3
tokenizers==0.20.1 tokenizers==0.20.3
tqdm==4.66.5 tqdm==4.67.0
typing_extensions==4.12.2 typing_extensions==4.12.2
urllib3==2.2.3 urllib3==2.2.3
uvicorn==0.32.0 uvicorn==0.32.0
uvloop==0.21.0 uvloop==0.21.0
watchfiles==0.24.0 watchfiles==0.24.0
websockets==13.1 websockets==14.1
whisper-ctranslate2==0.4.6 whisper-ctranslate2==0.4.7

View File

@@ -1,3 +1,5 @@
import dataclasses
import faster_whisper
import tqdm import tqdm
import json import json
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
@@ -7,7 +9,7 @@ import io
import hashlib import hashlib
import argparse import argparse
import uvicorn import uvicorn
from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable, Union
from fastapi import ( from fastapi import (
File, File,
HTTPException, HTTPException,
@@ -40,16 +42,13 @@ Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
ccc = opencc.OpenCC("t2s.json") ccc = opencc.OpenCC("t2s.json")
print(f"Loading model to device {args.device}...") print(f"Loading model to device {args.device}...")
transcriber = Transcribe( model = faster_whisper.WhisperModel(
model_path=args.model, model_size_or_path=args.model,
device=args.device, device=args.device,
device_index=0, cpu_threads=args.threads,
compute_type="default",
threads=args.threads,
cache_directory=args.cache_dir,
local_files_only=args.local_files_only, local_files_only=args.local_files_only,
) )
print(f"Model loaded to device {transcriber.model.model.device}") print(f"Model loaded to device {model.model.device}")
# allow all cors # allow all cors
@@ -62,56 +61,62 @@ app.add_middleware(
) )
def stream_writer(generator: Generator[dict[str, Any], Any, None]): def stream_writer(generator: Generator[Segment, Any, None]):
for segment in generator: for segment in generator:
yield "data: " + json.dumps(segment, ensure_ascii=False) + "\n\n" yield "data: " + json.dumps(segment, ensure_ascii=False) + "\n\n"
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"
def text_writer(generator: Generator[dict[str, Any], Any, None]): def text_writer(generator: Generator[Segment, Any, None]):
for segment in generator: for segment in generator:
yield segment["text"].strip() + "\n" yield segment.text.strip() + "\n"
def tsv_writer(generator: Generator[dict[str, Any], Any, None]): def tsv_writer(generator: Generator[Segment, Any, None]):
yield "start\tend\ttext\n" yield "start\tend\ttext\n"
for i, segment in enumerate(generator): for i, segment in enumerate(generator):
start_time = str(round(1000 * segment["start"])) start_time = str(round(1000 * segment.start))
end_time = str(round(1000 * segment["end"])) end_time = str(round(1000 * segment.end))
text = segment["text"].strip() text = segment.text.strip()
yield f"{start_time}\t{end_time}\t{text}\n" yield f"{start_time}\t{end_time}\t{text}\n"
def srt_writer(generator: Generator[dict[str, Any], Any, None]): def srt_writer(generator: Generator[Segment, Any, None]):
for i, segment in enumerate(generator): for i, segment in enumerate(generator):
start_time = format_timestamp( start_time = format_timestamp(
segment["start"], decimal_marker=",", always_include_hours=True segment.start, decimal_marker=",", always_include_hours=True
) )
end_time = format_timestamp( end_time = format_timestamp(
segment["end"], decimal_marker=",", always_include_hours=True segment.end, decimal_marker=",", always_include_hours=True
) )
text = segment["text"].strip() text = segment.text.strip()
yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n" yield f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
def vtt_writer(generator: Generator[dict[str, Any], Any, None]): def vtt_writer(generator: Generator[Segment, Any, None]):
yield "WEBVTT\n\n" yield "WEBVTT\n\n"
for i, segment in enumerate(generator): for i, segment in enumerate(generator):
start_time = format_timestamp(segment["start"]) start_time = format_timestamp(segment.start)
end_time = format_timestamp(segment["end"]) end_time = format_timestamp(segment.end)
text = segment["text"].strip() text = segment.text.strip()
yield f"{start_time} --> {end_time}\n{text}\n\n" yield f"{start_time} --> {end_time}\n{text}\n\n"
@dataclasses.dataclass
class JsonResult(TranscriptionInfo):
segments: list[Segment]
text: str
def build_json_result( def build_json_result(
generator: Iterable[dict], generator: Iterable[Segment],
info: dict, info: TranscriptionInfo,
) -> dict[str, Any]: ) -> JsonResult:
segments = [i for i in generator] segments = [i for i in generator]
return info | { return JsonResult(
"text": "\n".join(i["text"] for i in segments), text="\n".join(i.text for i in segments),
"segments": segments, segments=segments,
} **dataclasses.asdict(info)
)
def stream_builder( def stream_builder(
@@ -121,8 +126,8 @@ def stream_builder(
language: str | None, language: str | None,
initial_prompt: str = "", initial_prompt: str = "",
repetition_penalty: float = 1.0, repetition_penalty: float = 1.0,
) -> Tuple[Generator[dict, None, None], dict]: ) -> Tuple[Generator[Segment, None, None], TranscriptionInfo]:
segments, info = transcriber.model.transcribe( segments, info = model.transcribe(
audio=audio, audio=audio,
language=language, language=language,
task=task, task=task,
@@ -142,20 +147,9 @@ def stream_builder(
start, end, text = segment.start, segment.end, segment.text start, end, text = segment.start, segment.end, segment.text
pbar.update(end - last_pos) pbar.update(end - last_pos)
last_pos = end last_pos = end
data = segment._asdict() yield segment
if data.get('words') is not None:
data["words"] = [i._asdict() for i in data["words"]]
if info.language == "zh":
data["text"] = ccc.convert(data["text"])
yield data
info_dict = info._asdict() return wrap(), info
if info_dict['transcription_options'] is not None:
info_dict['transcription_options'] = info_dict['transcription_options']._asdict()
if info_dict['vad_options'] is not None:
info_dict['vad_options'] = info_dict['vad_options']._asdict()
return wrap(), info_dict
@app.websocket("/k6nele/status") @app.websocket("/k6nele/status")
@@ -223,13 +217,11 @@ async def konele_ws(
) )
result = build_json_result(generator, info) result = build_json_result(generator, info)
text = result.get("text", "")
await websocket.send_json( await websocket.send_json(
{ {
"status": 0, "status": 0,
"segment": 0, "segment": 0,
"result": {"hypotheses": [{"transcript": text}], "final": True}, "result": {"hypotheses": [{"transcript": result.text}], "final": True},
"id": md5, "id": md5,
} }
) )
@@ -286,17 +278,15 @@ async def translateapi(
) )
result = build_json_result(generator, info) result = build_json_result(generator, info)
text = result.get("text", "")
return { return {
"status": 0, "status": 0,
"hypotheses": [{"utterance": text}], "hypotheses": [{"utterance": result.text}],
"id": md5, "id": md5,
} }
@app.post("/v1/audio/transcriptions") @app.post("/v1/audio/transcriptions", response_model=Union[JsonResult, str])
@app.post("/v1/audio/translations") @app.post("/v1/audio/translations", response_model=Union[JsonResult, str])
async def transcription( async def transcription(
request: Request, request: Request,
file: UploadFile = File(...), file: UploadFile = File(...),