Compare commits
10 Commits
939663863b
...
d86ed9be69
| Author | SHA1 | Date | |
|---|---|---|---|
|
d86ed9be69
|
|||
|
e8ae8bf9c5
|
|||
|
931b578899
|
|||
|
4ed1c695fe
|
|||
|
8b766d0ce1
|
|||
|
b22fea55ac
|
|||
|
6decabefae
|
|||
|
bc5fdec819
|
|||
|
47f9e7e873
|
|||
|
f7b5e8dc69
|
@@ -1,48 +1,48 @@
|
||||
annotated-types==0.6.0
|
||||
anyio==3.7.1
|
||||
av==10.0.0
|
||||
certifi==2023.7.22
|
||||
annotated-types==0.7.0
|
||||
anyio==4.4.0
|
||||
av==12.2.0
|
||||
certifi==2024.7.4
|
||||
cffi==1.16.0
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
coloredlogs==15.0.1
|
||||
ctranslate2==3.21.0
|
||||
fastapi==0.104.1
|
||||
faster-whisper==0.9.0
|
||||
filelock==3.13.1
|
||||
flatbuffers==23.5.26
|
||||
fsspec==2023.10.0
|
||||
ctranslate2==4.3.1
|
||||
fastapi==0.111.0
|
||||
faster-whisper==1.0.3
|
||||
filelock==3.15.4
|
||||
flatbuffers==24.3.25
|
||||
fsspec==2024.6.1
|
||||
h11==0.14.0
|
||||
httptools==0.6.1
|
||||
huggingface-hub==0.17.3
|
||||
huggingface-hub==0.23.4
|
||||
humanfriendly==10.0
|
||||
idna==3.4
|
||||
idna==3.7
|
||||
mpmath==1.3.0
|
||||
numpy==1.26.2
|
||||
onnxruntime==1.16.2
|
||||
numpy==1.26.4
|
||||
onnxruntime==1.18.1
|
||||
OpenCC==1.1.7
|
||||
packaging==23.2
|
||||
packaging==24.1
|
||||
prometheus-client==0.18.0
|
||||
prometheus-fastapi-instrumentator==6.1.0
|
||||
protobuf==4.25.0
|
||||
pycparser==2.21
|
||||
pydantic==2.5.0
|
||||
pydantic_core==2.14.1
|
||||
prometheus-fastapi-instrumentator==7.0.0
|
||||
protobuf==5.27.2
|
||||
pycparser==2.22
|
||||
pydantic==2.8.2
|
||||
pydantic_core==2.20.1
|
||||
pydub==0.25.1
|
||||
python-dotenv==1.0.0
|
||||
python-multipart==0.0.6
|
||||
python-dotenv==1.0.1
|
||||
python-multipart==0.0.9
|
||||
PyYAML==6.0.1
|
||||
requests==2.31.0
|
||||
sniffio==1.3.0
|
||||
sounddevice==0.4.6
|
||||
starlette==0.27.0
|
||||
sympy==1.12
|
||||
tokenizers==0.14.1
|
||||
tqdm==4.66.1
|
||||
typing_extensions==4.8.0
|
||||
urllib3==2.1.0
|
||||
uvicorn==0.24.0.post1
|
||||
requests==2.32.3
|
||||
sniffio==1.3.1
|
||||
sounddevice==0.4.7
|
||||
starlette==0.37.2
|
||||
sympy==1.12.1
|
||||
tokenizers==0.19.1
|
||||
tqdm==4.66.4
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.2
|
||||
uvicorn==0.30.1
|
||||
uvloop==0.19.0
|
||||
watchfiles==0.21.0
|
||||
watchfiles==0.22.0
|
||||
websockets==12.0
|
||||
whisper-ctranslate2==0.3.2
|
||||
whisper-ctranslate2==0.4.5
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
from faster_whisper import vad
|
||||
import tqdm
|
||||
import json
|
||||
from fastapi.responses import StreamingResponse
|
||||
@@ -8,7 +7,7 @@ import io
|
||||
import hashlib
|
||||
import argparse
|
||||
import uvicorn
|
||||
from typing import Annotated, Any, BinaryIO, Literal, Generator
|
||||
from typing import Annotated, Any, BinaryIO, Literal, Generator, Tuple, Iterable
|
||||
from fastapi import (
|
||||
File,
|
||||
HTTPException,
|
||||
@@ -22,20 +21,21 @@ from fastapi import (
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from src.whisper_ctranslate2.whisper_ctranslate2 import Transcribe
|
||||
from src.whisper_ctranslate2.writers import format_timestamp
|
||||
import opencc
|
||||
from faster_whisper.transcribe import Segment, TranscriptionInfo
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", default="0.0.0.0", type=str)
|
||||
parser.add_argument("--port", default=5000, type=int)
|
||||
parser.add_argument("--model", default="large-v2", type=str)
|
||||
parser.add_argument("--model", default="large-v3", type=str)
|
||||
parser.add_argument("--device", default="auto", type=str)
|
||||
parser.add_argument("--cache_dir", default=None, type=str)
|
||||
parser.add_argument("--local_files_only", default=False, type=bool)
|
||||
parser.add_argument("--threads", default=4, type=int)
|
||||
args = parser.parse_args()
|
||||
app = FastAPI()
|
||||
# Instrument your app with default metrics and expose the metrics
|
||||
Instrumentator().instrument(app).expose(app, endpoint="/konele/metrics")
|
||||
ccc = opencc.OpenCC("t2s.json")
|
||||
|
||||
print("Loading model...")
|
||||
transcriber = Transcribe(
|
||||
@@ -43,9 +43,9 @@ transcriber = Transcribe(
|
||||
device=args.device,
|
||||
device_index=0,
|
||||
compute_type="default",
|
||||
threads=1,
|
||||
threads=args.threads,
|
||||
cache_directory=args.cache_dir,
|
||||
local_files_only=False,
|
||||
local_files_only=args.local_files_only,
|
||||
)
|
||||
print("Model loaded!")
|
||||
|
||||
@@ -102,10 +102,11 @@ def vtt_writer(generator: Generator[dict[str, Any], Any, None]):
|
||||
|
||||
|
||||
def build_json_result(
|
||||
generator: Generator[dict[str, Any], Any, None]
|
||||
generator: Iterable[Segment],
|
||||
info: dict,
|
||||
) -> dict[str, Any]:
|
||||
segments = [i for i in generator]
|
||||
return {
|
||||
return info | {
|
||||
"text": "\n".join(i["text"] for i in segments),
|
||||
"segments": segments,
|
||||
}
|
||||
@@ -117,46 +118,39 @@ def stream_builder(
|
||||
vad_filter: bool,
|
||||
language: str | None,
|
||||
initial_prompt: str = "",
|
||||
):
|
||||
repetition_penalty: float = 1.0,
|
||||
) -> Tuple[Iterable[dict], dict]:
|
||||
segments, info = transcriber.model.transcribe(
|
||||
audio=audio,
|
||||
language=language,
|
||||
task=task,
|
||||
beam_size=5,
|
||||
best_of=5,
|
||||
patience=1.0,
|
||||
length_penalty=-1.0,
|
||||
repetition_penalty=1.0,
|
||||
no_repeat_ngram_size=0,
|
||||
temperature=[0.0, 1.0 + 1e-6, 0.2],
|
||||
compression_ratio_threshold=2.4,
|
||||
log_prob_threshold=-1.0,
|
||||
no_speech_threshold=0.6,
|
||||
condition_on_previous_text=True,
|
||||
prompt_reset_on_temperature=False,
|
||||
initial_prompt=initial_prompt,
|
||||
suppress_blank=False,
|
||||
suppress_tokens=[],
|
||||
word_timestamps=True,
|
||||
prepend_punctuations="\"'“¿([{-",
|
||||
append_punctuations="\"'.。,,!!??::”)]}、",
|
||||
vad_filter=vad_filter,
|
||||
vad_parameters=None,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
print(
|
||||
"Detected language '%s' with probability %f"
|
||||
% (info.language, info.language_probability)
|
||||
)
|
||||
last_pos = 0
|
||||
with tqdm.tqdm(total=info.duration, unit="seconds", disable=True) as pbar:
|
||||
for segment in segments:
|
||||
start, end, text = segment.start, segment.end, segment.text
|
||||
pbar.update(end - last_pos)
|
||||
last_pos = end
|
||||
data = segment._asdict()
|
||||
data["total"] = info.duration
|
||||
data["text"] = ccc.convert(data["text"])
|
||||
yield data
|
||||
def wrap():
|
||||
last_pos = 0
|
||||
with tqdm.tqdm(total=info.duration, unit="seconds", disable=True) as pbar:
|
||||
for segment in segments:
|
||||
start, end, text = segment.start, segment.end, segment.text
|
||||
pbar.update(end - last_pos)
|
||||
last_pos = end
|
||||
data = segment._asdict()
|
||||
if data.get('words') is not None:
|
||||
data["words"] = [i._asdict() for i in data["words"]]
|
||||
yield data
|
||||
|
||||
info_dict = info._asdict()
|
||||
if info_dict['transcription_options'] is not None:
|
||||
info_dict['transcription_options'] = info_dict['transcription_options']._asdict()
|
||||
if info_dict['vad_options'] is not None:
|
||||
info_dict['vad_options'] = info_dict['vad_options']._asdict()
|
||||
|
||||
return wrap(), info_dict
|
||||
|
||||
|
||||
@app.websocket("/k6nele/status")
|
||||
@@ -215,14 +209,14 @@ async def konele_ws(
|
||||
|
||||
file_obj.seek(0)
|
||||
|
||||
generator = stream_builder(
|
||||
generator, info = stream_builder(
|
||||
audio=file_obj,
|
||||
task=task,
|
||||
vad_filter=vad_filter,
|
||||
language=None if lang == "und" else lang,
|
||||
initial_prompt=initial_prompt,
|
||||
)
|
||||
result = build_json_result(generator)
|
||||
result = build_json_result(generator, info)
|
||||
|
||||
text = result.get("text", "")
|
||||
print("result", text)
|
||||
@@ -279,14 +273,14 @@ async def translateapi(
|
||||
|
||||
file_obj.seek(0)
|
||||
|
||||
generator = stream_builder(
|
||||
generator, info = stream_builder(
|
||||
audio=file_obj,
|
||||
task=task,
|
||||
vad_filter=vad_filter,
|
||||
language=None if lang == "und" else lang,
|
||||
initial_prompt=initial_prompt,
|
||||
)
|
||||
result = build_json_result(generator)
|
||||
result = build_json_result(generator, info)
|
||||
|
||||
text = result.get("text", "")
|
||||
print("result", text)
|
||||
@@ -306,6 +300,7 @@ async def transcription(
|
||||
task: str = Form("transcribe"),
|
||||
language: str = Form("und"),
|
||||
vad_filter: bool = Form(False),
|
||||
repetition_penalty: float = Form(1.0),
|
||||
):
|
||||
"""Transcription endpoint
|
||||
|
||||
@@ -313,11 +308,12 @@ async def transcription(
|
||||
"""
|
||||
|
||||
# timestamp as filename, keep original extension
|
||||
generator = stream_builder(
|
||||
generator, info = stream_builder(
|
||||
audio=io.BytesIO(file.file.read()),
|
||||
task=task,
|
||||
vad_filter=vad_filter,
|
||||
language=None if language == "und" else language,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
|
||||
# special function for streaming response (OpenAI API does not have this)
|
||||
@@ -327,7 +323,7 @@ async def transcription(
|
||||
media_type="text/event-stream",
|
||||
)
|
||||
elif response_format == "json":
|
||||
return build_json_result(generator)
|
||||
return build_json_result(generator, info)
|
||||
elif response_format == "text":
|
||||
return StreamingResponse(text_writer(generator), media_type="text/plain")
|
||||
elif response_format == "tsv":
|
||||
|
||||
Reference in New Issue
Block a user