423 lines
18 KiB
Plaintext
423 lines
18 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "bc76e623-3b53-459c-83a7-1c190ef8486e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Requirement already satisfied: selenium in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (4.25.0)\n",
|
||
"Requirement already satisfied: urllib3<3,>=1.26 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.2)\n",
|
||
"Requirement already satisfied: trio~=0.17 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.26.2)\n",
|
||
"Requirement already satisfied: trio-websocket~=0.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.11.1)\n",
|
||
"Requirement already satisfied: certifi>=2021.10.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (2024.7.4)\n",
|
||
"Requirement already satisfied: typing_extensions~=4.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (4.12.2)\n",
|
||
"Requirement already satisfied: websocket-client~=1.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (1.8.0)\n",
|
||
"Requirement already satisfied: attrs>=23.2.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (23.2.0)\n",
|
||
"Requirement already satisfied: sortedcontainers in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (2.4.0)\n",
|
||
"Requirement already satisfied: idna in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (3.7)\n",
|
||
"Requirement already satisfied: outcome in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.0.post0)\n",
|
||
"Requirement already satisfied: sniffio>=1.3.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.1)\n",
|
||
"Requirement already satisfied: wsproto>=0.14 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio-websocket~=0.9->selenium) (1.2.0)\n",
|
||
"Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n",
|
||
"Requirement already satisfied: h11<1,>=0.9.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 安装依赖(如果已经安装过了可以跳过)\n",
|
||
"!pip install selenium"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"id": "1fd79faf-f138-41fa-9519-7bc72b407afb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from selenium import webdriver\n",
|
||
"from selenium.webdriver.common.by import By\n",
|
||
"import time\n",
|
||
"\n",
|
||
"# 创建一个新的 Chrome 浏览器会话\n",
|
||
"driver = webdriver.Chrome()\n",
|
||
"\n",
|
||
"# 让浏览器打开一个网页\n",
|
||
"driver.get('https://sou.chinanews.com/')\n",
|
||
"driver.implicitly_wait(3) # 设置隐式等待时间为3秒\n",
|
||
"\n",
|
||
"# 找到搜索框,输入法文本\n",
|
||
"input = driver.find_element(By.XPATH, '//*[@id=\"q\"]')\n",
|
||
"input.send_keys('初音未来')\n",
|
||
"\n",
|
||
"# 找到搜索按钮,点击按钮\n",
|
||
"search = driver.find_element(By.XPATH, '//button[@class=\"searchBtn\"]')\n",
|
||
"search.click()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "876319e9-a10c-47ee-9ef9-ac0e03a39d82",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['https://www.chinanews.com.cn/cj/2023/08-28/10068093.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/sh/2023/07-26/10049675.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/sh/2023/06-05/10019224.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/sh/2023/06-01/10017432.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/cj/2023/05-20/10010862.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/cul/2022/12-15/9915069.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/cj/2022/10-26/9880366.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/cj/2022/09-07/9847169.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml',\n",
|
||
" 'https://www.chinanews.com.cn/cj/2022/07-05/9795607.shtml']"
|
||
]
|
||
},
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# 遍历符合条件的 XPATH,寻找所有 新闻标题 的 URL,并保存在 links 列表中\n",
|
||
"links = []\n",
|
||
"for element in driver.find_elements(By.XPATH, '//div[@class=\"news_title\"]/a'):\n",
|
||
" links.append(element.get_attribute('href'))\n",
|
||
"links"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "e0823aa9-1aa9-43d1-bdb0-d8a9a130a70f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 跳转到该网页\n",
|
||
"driver.get(links[0])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "1cc5210c-a61a-48ba-9938-3f10449ff784",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"AI创作新风潮:影视业,拥抱AI新机遇\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 解析标题\n",
|
||
"title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1').text.strip()\n",
|
||
"print(title)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "9877f965-f606-49bd-bd8e-ee578b9b90cb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 解析正文\n",
|
||
"content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]').text.strip()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "f001d905-abef-47df-9c43-8d87267553c1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"2023年08月28日 04:03\n",
|
||
"人民日报海外版\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 解析 时间 和 来源\n",
|
||
"text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
|
||
"tuples = text.split('\\n', 1)[0].split('来源:')\n",
|
||
"date = tuples[0].strip()\n",
|
||
"source = tuples[1].strip()\n",
|
||
"print(date)\n",
|
||
"print(source)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "1ffb2586-753c-4fb8-8f42-30c122a2e8b4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"2022年09月07日 19:55 中国新闻网\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 针对 旧版 的网页\n",
|
||
"title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
|
||
"content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
|
||
"text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
|
||
"date = tuples[0].strip()\n",
|
||
"source = tuples[1].strip()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e53bc9b7-a103-48ae-ab6f-d574ecd7687b",
|
||
"metadata": {},
|
||
"source": [
|
||
"---"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "866f8acc-37d7-4f25-ba2b-62bb8baae94f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml'"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"driver.current_url"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b7c28cdf-40f2-466d-a684-cefc0e34a1e4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 合并以上的代码\n",
|
||
"data = []\n",
|
||
"for link in links:\n",
|
||
" # 跳转到该网页\n",
|
||
" driver.get(link)\n",
|
||
" try:\n",
|
||
" # 解析标题\n",
|
||
" title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1').text.strip()\n",
|
||
" # 解析正文\n",
|
||
" content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]').text.strip()\n",
|
||
" # 解析 时间 和 来源\n",
|
||
" text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
|
||
" tuples = text.split('\\n', 1)[0].split('来源:')\n",
|
||
" date = tuples[0].strip()\n",
|
||
" source = tuples[1].strip()\n",
|
||
" except Exception as e:\n",
|
||
" # 如果上面的代码报错了,说明可能是旧版网页,使用以下的代码进行解析\n",
|
||
" title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
|
||
" content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
|
||
" text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
|
||
" date = tuples[0].strip()\n",
|
||
" source = tuples[1].strip()\n",
|
||
" \n",
|
||
" # 保存所有数据到一个二维列表\n",
|
||
" data.append([title, date, source, content])\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"id": "a6d8861c-6b1d-494d-bc8a-bf9310317776",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>标题</th>\n",
|
||
" <th>时间</th>\n",
|
||
" <th>来源</th>\n",
|
||
" <th>正文</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>AI创作新风潮:影视业,拥抱AI新机遇</td>\n",
|
||
" <td>2023年08月28日 04:03</td>\n",
|
||
" <td>人民日报海外版</td>\n",
|
||
" <td>前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生</td>\n",
|
||
" <td>2023年07月26日 02:23</td>\n",
|
||
" <td>扬子晚报</td>\n",
|
||
" <td>7月21日,在上海某漫展场馆外,一名年轻男子突然在雨中晕厥倒地,这一幕,恰好被一位穿cosp...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>首批AI克隆明星上线,不只是娱乐业“躺赚”</td>\n",
|
||
" <td>2023年06月05日 01:40</td>\n",
|
||
" <td>新京报</td>\n",
|
||
" <td>现实中偶像与粉丝互动被AI复制到虚拟空间中,虚实边界被进一步打破。\\n花30元就可以和网红明...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>“10后”的流行密语你能对上几个?</td>\n",
|
||
" <td>2023年06月01日 09:51</td>\n",
|
||
" <td>羊城晚报</td>\n",
|
||
" <td>羊城晚报记者 秦小杰\\n作为互联网新生代,“10后”的小学生有哪些流行“密语”?喜欢什么样的...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>(经济观察)虚拟数字人“现身”各行各业 释放可观商业价值</td>\n",
|
||
" <td>2023年05月20日 09:37</td>\n",
|
||
" <td>中国新闻网</td>\n",
|
||
" <td>中新社上海5月20日电 (谢梦圆)近期,多个品牌启用虚拟形象作为代言人、社交平台AI博主大受...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>网络热梗也能成为热门IP IP如何吸引Z世代?</td>\n",
|
||
" <td>2022年12月15日 01:00</td>\n",
|
||
" <td>北京青年报</td>\n",
|
||
" <td>随着网络文化的发展,新时代IP内容也随之扩展创新,不仅涵盖动漫、影视、游戏、潮玩,甚至一个符...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>玩具市场迎来多元需求 成年人“入坑”潮流玩具</td>\n",
|
||
" <td>2022年10月26日 15:31</td>\n",
|
||
" <td>北京青年报</td>\n",
|
||
" <td>一年一度的双11来临,潮流玩具市场再度成为各大电商平台必争之地,玩具市场迎来更多元的市场需求...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>越来越多场景应用 “数字人”走进大众生活</td>\n",
|
||
" <td>2022年09月07日 19:55</td>\n",
|
||
" <td>中国新闻网</td>\n",
|
||
" <td>中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”</td>\n",
|
||
" <td>2022年09月07日 19:55</td>\n",
|
||
" <td>中国新闻网</td>\n",
|
||
" <td>虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示,中国虚拟人爱好者中,19岁至30岁之间的年轻...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>唱歌跳舞的“皮套人”?这个千亿级生意没那么简单</td>\n",
|
||
" <td>2022年09月07日 19:55</td>\n",
|
||
" <td>中国新闻网</td>\n",
|
||
" <td>近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" 标题 时间 来源 \\\n",
|
||
"0 AI创作新风潮:影视业,拥抱AI新机遇 2023年08月28日 04:03 人民日报海外版 \n",
|
||
"1 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生 2023年07月26日 02:23 扬子晚报 \n",
|
||
"2 首批AI克隆明星上线,不只是娱乐业“躺赚” 2023年06月05日 01:40 新京报 \n",
|
||
"3 “10后”的流行密语你能对上几个? 2023年06月01日 09:51 羊城晚报 \n",
|
||
"4 (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值 2023年05月20日 09:37 中国新闻网 \n",
|
||
"5 网络热梗也能成为热门IP IP如何吸引Z世代? 2022年12月15日 01:00 北京青年报 \n",
|
||
"6 玩具市场迎来多元需求 成年人“入坑”潮流玩具 2022年10月26日 15:31 北京青年报 \n",
|
||
"7 越来越多场景应用 “数字人”走进大众生活 2022年09月07日 19:55 中国新闻网 \n",
|
||
"8 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉” 2022年09月07日 19:55 中国新闻网 \n",
|
||
"9 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单 2022年09月07日 19:55 中国新闻网 \n",
|
||
"\n",
|
||
" 正文 \n",
|
||
"0 前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越... \n",
|
||
"1 7月21日,在上海某漫展场馆外,一名年轻男子突然在雨中晕厥倒地,这一幕,恰好被一位穿cosp... \n",
|
||
"2 现实中偶像与粉丝互动被AI复制到虚拟空间中,虚实边界被进一步打破。\\n花30元就可以和网红明... \n",
|
||
"3 羊城晚报记者 秦小杰\\n作为互联网新生代,“10后”的小学生有哪些流行“密语”?喜欢什么样的... \n",
|
||
"4 中新社上海5月20日电 (谢梦圆)近期,多个品牌启用虚拟形象作为代言人、社交平台AI博主大受... \n",
|
||
"5 随着网络文化的发展,新时代IP内容也随之扩展创新,不仅涵盖动漫、影视、游戏、潮玩,甚至一个符... \n",
|
||
"6 一年一度的双11来临,潮流玩具市场再度成为各大电商平台必争之地,玩具市场迎来更多元的市场需求... \n",
|
||
"7 中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“... \n",
|
||
"8 虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示,中国虚拟人爱好者中,19岁至30岁之间的年轻... \n",
|
||
"9 近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸... "
|
||
]
|
||
},
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# 循环结束后,将data转为 DataFrame 并保存到 csv\n",
|
||
"import pandas\n",
|
||
"df = pandas.DataFrame(data, columns=['标题', '时间', '来源', '正文'])\n",
|
||
"df.to_csv('news.csv')\n",
|
||
"\n",
|
||
"# 显示df\n",
|
||
"df"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|