使用 title or content 作为 embedding 内容
This commit is contained in:
@@ -1,9 +1,7 @@
|
|||||||
import traceback
|
import traceback
|
||||||
import datetime
|
import datetime
|
||||||
import asyncio
|
import asyncio
|
||||||
from sqlalchemy.sql.ddl import exc
|
|
||||||
import tqdm
|
import tqdm
|
||||||
import os
|
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
import openai
|
import openai
|
||||||
import hashlib
|
import hashlib
|
||||||
@@ -65,9 +63,8 @@ async def get_embeddings(
|
|||||||
- quiet: 是否关闭输出
|
- quiet: 是否关闭输出
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# 针对 acge-large-zh 模型,需要将文本截断 1024 - 2
|
# 针对 大多数 模型,需要将文本截断 1024 - 2
|
||||||
if model == "acge-large-zh":
|
texts = [truncate_text(model, text, 1024 - 2) for text in texts]
|
||||||
texts = [truncate_text(model, text, 1024 - 2) for text in texts]
|
|
||||||
|
|
||||||
# 构建任务列表
|
# 构建任务列表
|
||||||
ids = list(range(len(texts)))
|
ids = list(range(len(texts)))
|
||||||
@@ -192,7 +189,7 @@ async def do_update():
|
|||||||
print(datetime.datetime.now(), "No data to update")
|
print(datetime.datetime.now(), "No data to update")
|
||||||
break
|
break
|
||||||
|
|
||||||
embeddings = await get_embeddings([doc[1] + " " + doc[2] for doc in docs], "acge-large-zh")
|
embeddings = await get_embeddings([doc[1] or doc[2] for doc in docs], "acge-large-zh", threads=10)
|
||||||
async with get_cur() as cur:
|
async with get_cur() as cur:
|
||||||
for doc, embedding in tqdm.tqdm(zip(docs, embeddings), total=min(len(docs), len(embeddings)), desc="Update embeddings"):
|
for doc, embedding in tqdm.tqdm(zip(docs, embeddings), total=min(len(docs), len(embeddings)), desc="Update embeddings"):
|
||||||
await cur.execute("UPDATE risk_news SET embedding = %s, embedding_updated_at = now() where id = %s", (embedding, doc[0]))
|
await cur.execute("UPDATE risk_news SET embedding = %s, embedding_updated_at = now() where id = %s", (embedding, doc[0]))
|
||||||
|
|||||||
Reference in New Issue
Block a user