使用 title or content 作为embedding内容

This commit is contained in:
2024-10-08 14:35:00 +08:00
parent 16f4469365
commit a49db1b71b

View File

@@ -1,9 +1,7 @@
import traceback import traceback
import datetime import datetime
import asyncio import asyncio
from sqlalchemy.sql.ddl import exc
import tqdm import tqdm
import os
from tokenizers import Tokenizer from tokenizers import Tokenizer
import openai import openai
import hashlib import hashlib
@@ -65,9 +63,8 @@ async def get_embeddings(
- quiet: 是否关闭输出 - quiet: 是否关闭输出
""" """
# 针对 acge-large-zh 模型,需要将文本截断 1024 - 2 # 针对 大多数 模型,需要将文本截断 1024 - 2
if model == "acge-large-zh": texts = [truncate_text(model, text, 1024 - 2) for text in texts]
texts = [truncate_text(model, text, 1024 - 2) for text in texts]
# 构建任务列表 # 构建任务列表
ids = list(range(len(texts))) ids = list(range(len(texts)))
@@ -192,7 +189,7 @@ async def do_update():
print(datetime.datetime.now(), "No data to update") print(datetime.datetime.now(), "No data to update")
break break
embeddings = await get_embeddings([doc[1] + " " + doc[2] for doc in docs], "acge-large-zh") embeddings = await get_embeddings([doc[1] or doc[2] for doc in docs], "acge-large-zh", threads=10)
async with get_cur() as cur: async with get_cur() as cur:
for doc, embedding in tqdm.tqdm(zip(docs, embeddings), total=min(len(docs), len(embeddings)), desc="Update embeddings"): for doc, embedding in tqdm.tqdm(zip(docs, embeddings), total=min(len(docs), len(embeddings)), desc="Update embeddings"):
await cur.execute("UPDATE risk_news SET embedding = %s, embedding_updated_at = now() where id = %s", (embedding, doc[0])) await cur.execute("UPDATE risk_news SET embedding = %s, embedding_updated_at = now() where id = %s", (embedding, doc[0]))