Files
crawler-toturial/6. 大模型分析.ipynb
2024-10-24 14:06:37 +08:00

1041 lines
47 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "71c6ef2b-6795-4526-9c80-86e2ecbd0210",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: openai in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (1.52.2)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (4.4.0)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (1.9.0)\n",
"Requirement already satisfied: httpx<1,>=0.23.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (0.27.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (0.6.1)\n",
"Requirement already satisfied: pydantic<3,>=1.9.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (2.9.2)\n",
"Requirement already satisfied: sniffio in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (1.3.1)\n",
"Requirement already satisfied: tqdm>4 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (4.66.4)\n",
"Requirement already satisfied: typing-extensions<5,>=4.11 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from openai) (4.12.2)\n",
"Requirement already satisfied: idna>=2.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from anyio<5,>=3.5.0->openai) (3.7)\n",
"Requirement already satisfied: certifi in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (2024.7.4)\n",
"Requirement already satisfied: httpcore==1.* in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (1.0.5)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.23.4 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (2.23.4)\n"
]
}
],
"source": [
"!pip install openai\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "d94f5b51-7b5e-476e-a375-f084dfd573dd",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"client = OpenAI()"
]
},
{
"cell_type": "markdown",
"id": "ce020525-4956-498c-a68c-498c68fab04b",
"metadata": {},
"source": [
"## llm 分析"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8a3e7594-9959-4fa0-b4b0-77ffb8ef5ab3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>标题</th>\n",
" <th>时间</th>\n",
" <th>来源</th>\n",
" <th>正文</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AI创作新风潮影视业拥抱AI新机遇</td>\n",
" <td>2023年08月28日 04:03</td>\n",
" <td>人民日报海外版</td>\n",
" <td>前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生</td>\n",
" <td>2023年07月26日 02:23</td>\n",
" <td>扬子晚报</td>\n",
" <td>7月21日在上海某漫展场馆外一名年轻男子突然在雨中晕厥倒地这一幕恰好被一位穿cosp...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>首批AI克隆明星上线不只是娱乐业“躺赚”</td>\n",
" <td>2023年06月05日 01:40</td>\n",
" <td>新京报</td>\n",
" <td>现实中偶像与粉丝互动被AI复制到虚拟空间中虚实边界被进一步打破。\\n花30元就可以和网红明...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>“10后”的流行密语你能对上几个</td>\n",
" <td>2023年06月01日 09:51</td>\n",
" <td>羊城晚报</td>\n",
" <td>羊城晚报记者 秦小杰\\n作为互联网新生代“10后”的小学生有哪些流行“密语”喜欢什么样的...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(经济观察)虚拟数字人“现身”各行各业 释放可观商业价值</td>\n",
" <td>2023年05月20日 09:37</td>\n",
" <td>中国新闻网</td>\n",
" <td>中新社上海5月20日电 (谢梦圆)近期多个品牌启用虚拟形象作为代言人、社交平台AI博主大受...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>网络热梗也能成为热门IP IP如何吸引Z世代</td>\n",
" <td>2022年12月15日 01:00</td>\n",
" <td>北京青年报</td>\n",
" <td>随着网络文化的发展新时代IP内容也随之扩展创新不仅涵盖动漫、影视、游戏、潮玩甚至一个符...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>玩具市场迎来多元需求 成年人“入坑”潮流玩具</td>\n",
" <td>2022年10月26日 15:31</td>\n",
" <td>北京青年报</td>\n",
" <td>一年一度的双11来临潮流玩具市场再度成为各大电商平台必争之地玩具市场迎来更多元的市场需求...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>越来越多场景应用 “数字人”走进大众生活</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示中国虚拟人爱好者中19岁至30岁之间的年轻...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>唱歌跳舞的“皮套人”?这个千亿级生意没那么简单</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 标题 时间 来源 \\\n",
"0 AI创作新风潮影视业拥抱AI新机遇 2023年08月28日 04:03 人民日报海外版 \n",
"1 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生 2023年07月26日 02:23 扬子晚报 \n",
"2 首批AI克隆明星上线不只是娱乐业“躺赚” 2023年06月05日 01:40 新京报 \n",
"3 “10后”的流行密语你能对上几个 2023年06月01日 09:51 羊城晚报 \n",
"4 (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值 2023年05月20日 09:37 中国新闻网 \n",
"5 网络热梗也能成为热门IP IP如何吸引Z世代 2022年12月15日 01:00 北京青年报 \n",
"6 玩具市场迎来多元需求 成年人“入坑”潮流玩具 2022年10月26日 15:31 北京青年报 \n",
"7 越来越多场景应用 “数字人”走进大众生活 2022年09月07日 19:55 中国新闻网 \n",
"8 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉” 2022年09月07日 19:55 中国新闻网 \n",
"9 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单 2022年09月07日 19:55 中国新闻网 \n",
"\n",
" 正文 \n",
"0 前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越... \n",
"1 7月21日在上海某漫展场馆外一名年轻男子突然在雨中晕厥倒地这一幕恰好被一位穿cosp... \n",
"2 现实中偶像与粉丝互动被AI复制到虚拟空间中虚实边界被进一步打破。\\n花30元就可以和网红明... \n",
"3 羊城晚报记者 秦小杰\\n作为互联网新生代“10后”的小学生有哪些流行“密语”喜欢什么样的... \n",
"4 中新社上海5月20日电 (谢梦圆)近期多个品牌启用虚拟形象作为代言人、社交平台AI博主大受... \n",
"5 随着网络文化的发展新时代IP内容也随之扩展创新不仅涵盖动漫、影视、游戏、潮玩甚至一个符... \n",
"6 一年一度的双11来临潮流玩具市场再度成为各大电商平台必争之地玩具市场迎来更多元的市场需求... \n",
"7 中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“... \n",
"8 虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示中国虚拟人爱好者中19岁至30岁之间的年轻... \n",
"9 近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸... "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 读入数据\n",
"import pandas\n",
"df = pandas.read_csv('news.csv', index_col=0)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0cd5cc64-10a8-48aa-911d-684afac56a74",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"否 AI创作新风潮影视业拥抱AI新机遇\n",
"是 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生\n",
"是 首批AI克隆明星上线不只是娱乐业“躺赚”\n",
"是 “10后”的流行密语你能对上几个\n",
"否 (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值\n",
"否 网络热梗也能成为热门IP IP如何吸引Z世代\n",
"否 玩具市场迎来多元需求 成年人“入坑”潮流玩具\n",
"否 越来越多场景应用 “数字人”走进大众生活\n",
"是 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”\n",
"是 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单\n"
]
}
],
"source": [
"for index, row in df.iterrows():\n",
" completion = client.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" temperature=0,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"你是一个新闻标题判定器,你要分析我提供新闻标题是否属于'标题党'。你只能回答是或否\"},\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"标题:{row.标题}\"\n",
" }\n",
" ]\n",
" )\n",
" \n",
" print(completion.choices[0].message.content, row.标题)"
]
},
{
"cell_type": "markdown",
"id": "1b141ff4-cdfe-47f2-a267-a6ab04bfcd4f",
"metadata": {},
"source": [
"## 情感分析"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "f95c82f0-df80-42b0-8253-db12aae4756d",
"metadata": {},
"outputs": [],
"source": [
"response = client.embeddings.create(\n",
" input=['情感标签:积极', '情感标签:消极'],\n",
" model=\"text-embedding-3-small\"\n",
")\n",
"positive = response.data[0].embedding\n",
"negative = response.data[1].embedding"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "5bcdb775-7f5e-4d05-bb97-6cbc5d182415",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"余弦相似度: 0.7307260589905838\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"def cosine_similarity(vec1, vec2):\n",
" \"\"\"\n",
" 计算两个向量之间的余弦相似度。\n",
"\n",
" 参数:\n",
" vec1 -- 第一个向量列表或numpy数组\n",
" vec2 -- 第二个向量列表或numpy数组\n",
"\n",
" 返回:\n",
" 余弦相似度,介于-1和1之间\n",
" \"\"\"\n",
" # 将输入转换为numpy数组\n",
" vec1 = np.array(vec1)\n",
" vec2 = np.array(vec2)\n",
" \n",
" # 计算向量的点积\n",
" dot_product = np.dot(vec1, vec2)\n",
" \n",
" # 计算向量的范数(模)\n",
" norm_vec1 = np.linalg.norm(vec1)\n",
" norm_vec2 = np.linalg.norm(vec2)\n",
" \n",
" # 计算余弦相似度\n",
" if norm_vec1 == 0 or norm_vec2 == 0:\n",
" return 0.0 # 如果其中一个向量是零向量则相似度为0\n",
" else:\n",
" return dot_product / (norm_vec1 * norm_vec2)\n",
"\n",
"# 示例用法\n",
"similarity = cosine_similarity(positive, negative)\n",
"print(f\"余弦相似度: {similarity}\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "f56e8468-5bff-4b38-b480-acb619edc602",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>标题</th>\n",
" <th>时间</th>\n",
" <th>来源</th>\n",
" <th>正文</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AI创作新风潮影视业拥抱AI新机遇</td>\n",
" <td>2023年08月28日 04:03</td>\n",
" <td>人民日报海外版</td>\n",
" <td>前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生</td>\n",
" <td>2023年07月26日 02:23</td>\n",
" <td>扬子晚报</td>\n",
" <td>7月21日在上海某漫展场馆外一名年轻男子突然在雨中晕厥倒地这一幕恰好被一位穿cosp...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>首批AI克隆明星上线不只是娱乐业“躺赚”</td>\n",
" <td>2023年06月05日 01:40</td>\n",
" <td>新京报</td>\n",
" <td>现实中偶像与粉丝互动被AI复制到虚拟空间中虚实边界被进一步打破。\\n花30元就可以和网红明...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>“10后”的流行密语你能对上几个</td>\n",
" <td>2023年06月01日 09:51</td>\n",
" <td>羊城晚报</td>\n",
" <td>羊城晚报记者 秦小杰\\n作为互联网新生代“10后”的小学生有哪些流行“密语”喜欢什么样的...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(经济观察)虚拟数字人“现身”各行各业 释放可观商业价值</td>\n",
" <td>2023年05月20日 09:37</td>\n",
" <td>中国新闻网</td>\n",
" <td>中新社上海5月20日电 (谢梦圆)近期多个品牌启用虚拟形象作为代言人、社交平台AI博主大受...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>网络热梗也能成为热门IP IP如何吸引Z世代</td>\n",
" <td>2022年12月15日 01:00</td>\n",
" <td>北京青年报</td>\n",
" <td>随着网络文化的发展新时代IP内容也随之扩展创新不仅涵盖动漫、影视、游戏、潮玩甚至一个符...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>玩具市场迎来多元需求 成年人“入坑”潮流玩具</td>\n",
" <td>2022年10月26日 15:31</td>\n",
" <td>北京青年报</td>\n",
" <td>一年一度的双11来临潮流玩具市场再度成为各大电商平台必争之地玩具市场迎来更多元的市场需求...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>越来越多场景应用 “数字人”走进大众生活</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示中国虚拟人爱好者中19岁至30岁之间的年轻...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>唱歌跳舞的“皮套人”?这个千亿级生意没那么简单</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 标题 时间 来源 \\\n",
"0 AI创作新风潮影视业拥抱AI新机遇 2023年08月28日 04:03 人民日报海外版 \n",
"1 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生 2023年07月26日 02:23 扬子晚报 \n",
"2 首批AI克隆明星上线不只是娱乐业“躺赚” 2023年06月05日 01:40 新京报 \n",
"3 “10后”的流行密语你能对上几个 2023年06月01日 09:51 羊城晚报 \n",
"4 (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值 2023年05月20日 09:37 中国新闻网 \n",
"5 网络热梗也能成为热门IP IP如何吸引Z世代 2022年12月15日 01:00 北京青年报 \n",
"6 玩具市场迎来多元需求 成年人“入坑”潮流玩具 2022年10月26日 15:31 北京青年报 \n",
"7 越来越多场景应用 “数字人”走进大众生活 2022年09月07日 19:55 中国新闻网 \n",
"8 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉” 2022年09月07日 19:55 中国新闻网 \n",
"9 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单 2022年09月07日 19:55 中国新闻网 \n",
"\n",
" 正文 \n",
"0 前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越... \n",
"1 7月21日在上海某漫展场馆外一名年轻男子突然在雨中晕厥倒地这一幕恰好被一位穿cosp... \n",
"2 现实中偶像与粉丝互动被AI复制到虚拟空间中虚实边界被进一步打破。\\n花30元就可以和网红明... \n",
"3 羊城晚报记者 秦小杰\\n作为互联网新生代“10后”的小学生有哪些流行“密语”喜欢什么样的... \n",
"4 中新社上海5月20日电 (谢梦圆)近期多个品牌启用虚拟形象作为代言人、社交平台AI博主大受... \n",
"5 随着网络文化的发展新时代IP内容也随之扩展创新不仅涵盖动漫、影视、游戏、潮玩甚至一个符... \n",
"6 一年一度的双11来临潮流玩具市场再度成为各大电商平台必争之地玩具市场迎来更多元的市场需求... \n",
"7 中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“... \n",
"8 虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示中国虚拟人爱好者中19岁至30岁之间的年轻... \n",
"9 近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸... "
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 载入数据\n",
"import pandas\n",
"df = pandas.read_csv('news.csv', index_col = 0)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "622dbddf-618a-44b2-a1eb-22a06ac71eb4",
"metadata": {},
"outputs": [],
"source": [
"response = client.embeddings.create(\n",
" input=df.标题 ,\n",
" model=\"text-embedding-3-small\"\n",
")\n",
"embeddings = [i.embedding for i in response.data]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "866ae4a7-a396-4f34-8b8b-dd974b6ed668",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 73,
"id": "08e569b3-6f90-4189-b4f4-574f8f7863d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"positive AI创作新风潮影视业拥抱AI新机遇\n",
"positive 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生\n",
"positive 首批AI克隆明星上线不只是娱乐业“躺赚”\n",
"negative “10后”的流行密语你能对上几个\n",
"positive (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值\n",
"positive 网络热梗也能成为热门IP IP如何吸引Z世代\n",
"positive 玩具市场迎来多元需求 成年人“入坑”潮流玩具\n",
"positive 越来越多场景应用 “数字人”走进大众生活\n",
"positive 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”\n",
"positive 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单\n"
]
},
{
"data": {
"text/plain": [
"['positive',\n",
" 'positive',\n",
" 'positive',\n",
" 'negative',\n",
" 'positive',\n",
" 'positive',\n",
" 'positive',\n",
" 'positive',\n",
" 'positive',\n",
" 'positive']"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentiment = []\n",
"for title, emb in zip(df.标题, embeddings):\n",
" if cosine_similarity(emb, positive) > cosine_similarity(emb, negative):\n",
" result = 'positive'\n",
" else:\n",
" result = 'negative'\n",
" print(result, title)\n",
" sentiment.append(result)\n",
"sentiment"
]
},
{
"cell_type": "markdown",
"id": "6d4ef2b4-f028-4908-bdde-dda719d91c19",
"metadata": {},
"source": [
"## 聚类"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "e64290fe-3550-4255-87a6-e7f7dc2f16da",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>时间</th>\n",
" <th>标题</th>\n",
" <th>URL</th>\n",
" <th>摘要</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2024-09-28 23:34:43</td>\n",
" <td>MMD初音未来UNDEAD</td>\n",
" <td>https://www.sohu.com/a/812565548_120122317</td>\n",
" <td>MMD初音未来UNDEAD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2024-09-06 15:57:15</td>\n",
" <td>初音未来17周年</td>\n",
" <td>https://www.sohu.com/a/806861635_532686</td>\n",
" <td>角色名初音未来CN安凉公主殿下生日快乐wwෆ(˶''ᵕ''˶)ෆ图片授权来源:次元岛…</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2024-10-23 21:41:45</td>\n",
" <td>MMD初音未来五人组Unveiled</td>\n",
" <td>https://www.sohu.com/a/819573079_120122317</td>\n",
" <td>MMD初音未来五人组Unveiled</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2024-09-25 20:18:35</td>\n",
" <td>MMD初音未来单色骑士</td>\n",
" <td>https://www.sohu.com/a/811660644_120122317</td>\n",
" <td>MMD初音未来单色骑士</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2024-08-10 14:30:53</td>\n",
" <td>MMD初音未来SHOW</td>\n",
" <td>https://www.sohu.com/a/799888119_120122317</td>\n",
" <td>MMD初音未来SHOW</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>2024-09-01 10:29:15</td>\n",
" <td>初音未来大折扣PSN港服《ProjectDIVAFTDX》5折优惠音游迷怎能错过</td>\n",
" <td>https://www.sohu.com/a/805408718_362225</td>\n",
" <td>《初音未来ProjectDIVAFutureToneDX》不仅是一款音乐游戏更是一次全面的...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>2024-10-08 19:10:47</td>\n",
" <td>无视一切的恋爱,宅男当年曾娶初音未来领结婚证,如今怎么样了?</td>\n",
" <td>https://www.sohu.com/a/814623848_121166535</td>\n",
" <td>在日本,一男子却做出更加惊人之举,因为他的爱情已经跨越了种族、肤色甚至是虚实,和一位虚拟人物...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>2024-07-30 15:19:15</td>\n",
" <td>《世界计划:破碎的世界与不能唱歌的未来》官宣制作初音未来主演</td>\n",
" <td>https://www.sohu.com/a/797235003_211762</td>\n",
" <td>初音未来主演的剧场版动画《世界计划:破碎的世界与不能唱歌的未来》现已正式开始制作,并且预告和...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>2024-09-22 19:56:47</td>\n",
" <td>还记得和初音未来结婚的男子吗?一年过去了,他们的现状怎么样了?</td>\n",
" <td>https://www.sohu.com/a/810770579_121166539</td>\n",
" <td>现在社会,科技已经深入到生活每一个细节,大到国家重量级项目,小到家中一个小小的扫地机器人,不...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>2024-09-02 08:39:04</td>\n",
" <td>神っぽいな(像神一样呐)|初音ミク(初音未来)|揉揉酱自制小提琴谱|五线谱|乐谱</td>\n",
" <td>https://www.sohu.com/a/805585736_120879343</td>\n",
" <td>这首歌曲是二次元音乐人匹诺曹P的《神芽na(像神一样呐)》,歌词讽刺现代社会中对“神性”的盲...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>75 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" 时间 标题 \\\n",
"0 2024-09-28 23:34:43 MMD初音未来UNDEAD \n",
"1 2024-09-06 15:57:15 初音未来17周年 \n",
"2 2024-10-23 21:41:45 MMD初音未来五人组Unveiled \n",
"3 2024-09-25 20:18:35 MMD初音未来单色骑士 \n",
"4 2024-08-10 14:30:53 MMD初音未来SHOW \n",
".. ... ... \n",
"70 2024-09-01 10:29:15 初音未来大折扣PSN港服《ProjectDIVAFTDX》5折优惠音游迷怎能错过 \n",
"71 2024-10-08 19:10:47 无视一切的恋爱,宅男当年曾娶初音未来领结婚证,如今怎么样了? \n",
"72 2024-07-30 15:19:15 《世界计划:破碎的世界与不能唱歌的未来》官宣制作初音未来主演 \n",
"73 2024-09-22 19:56:47 还记得和初音未来结婚的男子吗?一年过去了,他们的现状怎么样了? \n",
"74 2024-09-02 08:39:04 神っぽいな(像神一样呐)|初音ミク(初音未来)|揉揉酱自制小提琴谱|五线谱|乐谱 \n",
"\n",
" URL \\\n",
"0 https://www.sohu.com/a/812565548_120122317 \n",
"1 https://www.sohu.com/a/806861635_532686 \n",
"2 https://www.sohu.com/a/819573079_120122317 \n",
"3 https://www.sohu.com/a/811660644_120122317 \n",
"4 https://www.sohu.com/a/799888119_120122317 \n",
".. ... \n",
"70 https://www.sohu.com/a/805408718_362225 \n",
"71 https://www.sohu.com/a/814623848_121166535 \n",
"72 https://www.sohu.com/a/797235003_211762 \n",
"73 https://www.sohu.com/a/810770579_121166539 \n",
"74 https://www.sohu.com/a/805585736_120879343 \n",
"\n",
" 摘要 \n",
"0 MMD初音未来UNDEAD \n",
"1 角色名初音未来CN安凉公主殿下生日快乐wwෆ(˶''ᵕ''˶)ෆ图片授权来源:次元岛… \n",
"2 MMD初音未来五人组Unveiled \n",
"3 MMD初音未来单色骑士 \n",
"4 MMD初音未来SHOW \n",
".. ... \n",
"70 《初音未来ProjectDIVAFutureToneDX》不仅是一款音乐游戏更是一次全面的... \n",
"71 在日本,一男子却做出更加惊人之举,因为他的爱情已经跨越了种族、肤色甚至是虚实,和一位虚拟人物... \n",
"72 初音未来主演的剧场版动画《世界计划:破碎的世界与不能唱歌的未来》现已正式开始制作,并且预告和... \n",
"73 现在社会,科技已经深入到生活每一个细节,大到国家重量级项目,小到家中一个小小的扫地机器人,不... \n",
"74 这首歌曲是二次元音乐人匹诺曹P的《神芽na(像神一样呐)》,歌词讽刺现代社会中对“神性”的盲... \n",
"\n",
"[75 rows x 4 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 读入数据\n",
"df = pandas.read_csv('souhu.csv', index_col=0)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "7e71a802-4e89-47d8-ad95-902d40aa50a4",
"metadata": {},
"outputs": [],
"source": [
"response = client.embeddings.create(\n",
" input=df.标题,\n",
" model=\"text-embedding-3-small\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "ea38f9a0-3942-4576-982e-ccf4b3898222",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scikit-learn in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (1.5.2)\n",
"Requirement already satisfied: numpy>=1.19.5 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from scikit-learn) (1.26.4)\n",
"Requirement already satisfied: scipy>=1.6.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from scikit-learn) (1.14.1)\n",
"Requirement already satisfied: joblib>=1.2.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from scikit-learn) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from scikit-learn) (3.5.0)\n"
]
}
],
"source": [
"!pip install scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "088f830c-d741-46b8-8add-f908347e6b0f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"聚类标签: [2 3 1 2 2 2 2 2 2 2 2 1 2 2 5 2 3 2 6 2 2 2 7 4 2 3 4 4 0 3 9 3 7 3 9 3 4\n",
" 4 6 6 6 4 3 7 5 5 4 9 9 3 8 9 9 4 6 4 0 6 5 9 4 5 5 3 9 8 4 9 4 0 7 5 9 3\n",
" 4]\n"
]
}
],
"source": [
"embeddings = [i.embedding for i in response.data]\n",
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"\n",
"# 假设你有一个嵌入的列表或数组\n",
"# 这里我们用随机数据来模拟嵌入\n",
"# 每个嵌入是一个长度为128的向量\n",
"np.random.seed(42) # 为了结果可重复\n",
"\n",
"# 定义聚类的数量\n",
"num_clusters =10\n",
"\n",
"# 创建KMeans模型\n",
"kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
"\n",
"# 训练模型\n",
"kmeans.fit(embeddings)\n",
"\n",
"# 获取每个嵌入的聚类标签\n",
"labels = kmeans.labels_\n",
"\n",
"# 输出结果\n",
"print(\"聚类标签:\", labels)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "c8bc0b81-1eb3-4f9a-b0af-c65434d2a6f8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>时间</th>\n",
" <th>标题</th>\n",
" <th>URL</th>\n",
" <th>摘要</th>\n",
" <th>kmeans</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2024-09-28 23:34:43</td>\n",
" <td>MMD初音未来UNDEAD</td>\n",
" <td>https://www.sohu.com/a/812565548_120122317</td>\n",
" <td>MMD初音未来UNDEAD</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2024-09-25 20:18:35</td>\n",
" <td>MMD初音未来单色骑士</td>\n",
" <td>https://www.sohu.com/a/811660644_120122317</td>\n",
" <td>MMD初音未来单色骑士</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2024-08-10 14:30:53</td>\n",
" <td>MMD初音未来SHOW</td>\n",
" <td>https://www.sohu.com/a/799888119_120122317</td>\n",
" <td>MMD初音未来SHOW</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2024-08-11 22:54:33</td>\n",
" <td>MMD初音未来卡哇伊</td>\n",
" <td>https://www.sohu.com/a/800129858_120122317</td>\n",
" <td>MMD初音未来卡哇伊</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2024-10-13 23:22:15</td>\n",
" <td>MMD初音未来MelticHell</td>\n",
" <td>https://www.sohu.com/a/816244183_120122317</td>\n",
" <td>MMD初音未来MelticHell</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2024-09-01 15:28:21</td>\n",
" <td>MMD初音未来情感风车</td>\n",
" <td>https://www.sohu.com/a/805468248_120122317</td>\n",
" <td>MMD初音未来情感风车</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2024-08-18 23:20:16</td>\n",
" <td>MMD初音未来RPG</td>\n",
" <td>https://www.sohu.com/a/801797229_120122317</td>\n",
" <td>MMD初音未来RPG</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2024-09-14 21:55:59</td>\n",
" <td>MMD初音未来Go-Getters</td>\n",
" <td>https://www.sohu.com/a/809054528_120122317</td>\n",
" <td>MMD初音未来Go-Getters</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2024-10-18 23:58:48</td>\n",
" <td>MMD初音未来叭噗</td>\n",
" <td>https://www.sohu.com/a/817974411_120122317</td>\n",
" <td>MMD初音未来叭噗</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>2024-08-13 22:59:23</td>\n",
" <td>MMD初音未来倾诉迷魂</td>\n",
" <td>https://www.sohu.com/a/800642732_120122317</td>\n",
" <td>MMD初音未来倾诉迷魂</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2024-09-20 21:21:34</td>\n",
" <td>MMD初音未来拜拜呀呆</td>\n",
" <td>https://www.sohu.com/a/810397011_120122317</td>\n",
" <td>MMD初音未来拜拜呀呆</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>2024-07-28 22:04:25</td>\n",
" <td>MMD初音未来恋爱哲学</td>\n",
" <td>https://www.sohu.com/a/796798684_120122317</td>\n",
" <td>MMD初音未来恋爱哲学</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2024-07-31 21:43:59</td>\n",
" <td>MMD25时初音未来心灵烙印</td>\n",
" <td>https://www.sohu.com/a/797613538_120122317</td>\n",
" <td>MMD25时初音未来心灵烙印</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>2024-08-24 23:33:03</td>\n",
" <td>MMD初音未来猫尾草之歌</td>\n",
" <td>https://www.sohu.com/a/803409803_120122317</td>\n",
" <td>MMD初音未来猫尾草之歌</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>2024-10-12 16:23:07</td>\n",
" <td>MMD初音未来在黎明前一跃而起</td>\n",
" <td>https://www.sohu.com/a/815933102_120122317</td>\n",
" <td>MMD初音未来在黎明前一跃而起</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>2024-09-08 12:28:58</td>\n",
" <td>MMD初音未来甜甜圈洞short</td>\n",
" <td>https://www.sohu.com/a/807255356_120122317</td>\n",
" <td>MMD初音未来甜甜圈洞short</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>2024-08-28 22:55:49</td>\n",
" <td>MMD初音未来晓山瑞希宵崎奏孜然炉火</td>\n",
" <td>https://www.sohu.com/a/804480099_120122317</td>\n",
" <td>MMD初音未来晓山瑞希宵崎奏孜然炉火</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 时间 标题 \\\n",
"0 2024-09-28 23:34:43 MMD初音未来UNDEAD \n",
"3 2024-09-25 20:18:35 MMD初音未来单色骑士 \n",
"4 2024-08-10 14:30:53 MMD初音未来SHOW \n",
"5 2024-08-11 22:54:33 MMD初音未来卡哇伊 \n",
"6 2024-10-13 23:22:15 MMD初音未来MelticHell \n",
"7 2024-09-01 15:28:21 MMD初音未来情感风车 \n",
"8 2024-08-18 23:20:16 MMD初音未来RPG \n",
"9 2024-09-14 21:55:59 MMD初音未来Go-Getters \n",
"10 2024-10-18 23:58:48 MMD初音未来叭噗 \n",
"12 2024-08-13 22:59:23 MMD初音未来倾诉迷魂 \n",
"13 2024-09-20 21:21:34 MMD初音未来拜拜呀呆 \n",
"15 2024-07-28 22:04:25 MMD初音未来恋爱哲学 \n",
"17 2024-07-31 21:43:59 MMD25时初音未来心灵烙印 \n",
"19 2024-08-24 23:33:03 MMD初音未来猫尾草之歌 \n",
"20 2024-10-12 16:23:07 MMD初音未来在黎明前一跃而起 \n",
"21 2024-09-08 12:28:58 MMD初音未来甜甜圈洞short \n",
"24 2024-08-28 22:55:49 MMD初音未来晓山瑞希宵崎奏孜然炉火 \n",
"\n",
" URL 摘要 kmeans \n",
"0 https://www.sohu.com/a/812565548_120122317 MMD初音未来UNDEAD 2 \n",
"3 https://www.sohu.com/a/811660644_120122317 MMD初音未来单色骑士 2 \n",
"4 https://www.sohu.com/a/799888119_120122317 MMD初音未来SHOW 2 \n",
"5 https://www.sohu.com/a/800129858_120122317 MMD初音未来卡哇伊 2 \n",
"6 https://www.sohu.com/a/816244183_120122317 MMD初音未来MelticHell 2 \n",
"7 https://www.sohu.com/a/805468248_120122317 MMD初音未来情感风车 2 \n",
"8 https://www.sohu.com/a/801797229_120122317 MMD初音未来RPG 2 \n",
"9 https://www.sohu.com/a/809054528_120122317 MMD初音未来Go-Getters 2 \n",
"10 https://www.sohu.com/a/817974411_120122317 MMD初音未来叭噗 2 \n",
"12 https://www.sohu.com/a/800642732_120122317 MMD初音未来倾诉迷魂 2 \n",
"13 https://www.sohu.com/a/810397011_120122317 MMD初音未来拜拜呀呆 2 \n",
"15 https://www.sohu.com/a/796798684_120122317 MMD初音未来恋爱哲学 2 \n",
"17 https://www.sohu.com/a/797613538_120122317 MMD25时初音未来心灵烙印 2 \n",
"19 https://www.sohu.com/a/803409803_120122317 MMD初音未来猫尾草之歌 2 \n",
"20 https://www.sohu.com/a/815933102_120122317 MMD初音未来在黎明前一跃而起 2 \n",
"21 https://www.sohu.com/a/807255356_120122317 MMD初音未来甜甜圈洞short 2 \n",
"24 https://www.sohu.com/a/804480099_120122317 MMD初音未来晓山瑞希宵崎奏孜然炉火 2 "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 把分类结果加入df\n",
"df['kmeans'] = labels\n",
"df[df.kmeans == 2]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}