Files
crawler-toturial/4. 遍历网页.ipynb
2024-10-24 14:06:37 +08:00

423 lines
18 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bc76e623-3b53-459c-83a7-1c190ef8486e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: selenium in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (4.25.0)\n",
"Requirement already satisfied: urllib3<3,>=1.26 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.2)\n",
"Requirement already satisfied: trio~=0.17 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.26.2)\n",
"Requirement already satisfied: trio-websocket~=0.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.11.1)\n",
"Requirement already satisfied: certifi>=2021.10.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (2024.7.4)\n",
"Requirement already satisfied: typing_extensions~=4.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (4.12.2)\n",
"Requirement already satisfied: websocket-client~=1.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (1.8.0)\n",
"Requirement already satisfied: attrs>=23.2.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (23.2.0)\n",
"Requirement already satisfied: sortedcontainers in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (2.4.0)\n",
"Requirement already satisfied: idna in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (3.7)\n",
"Requirement already satisfied: outcome in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.0.post0)\n",
"Requirement already satisfied: sniffio>=1.3.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.1)\n",
"Requirement already satisfied: wsproto>=0.14 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio-websocket~=0.9->selenium) (1.2.0)\n",
"Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n",
"Requirement already satisfied: h11<1,>=0.9.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n"
]
}
],
"source": [
"# Install dependencies (skip this cell if they are already installed).\n",
"# %pip is used instead of !pip so the package is installed into the environment of the running kernel.\n",
"%pip install selenium"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "1fd79faf-f138-41fa-9519-7bc72b407afb",
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"import time\n",
"\n",
"# Start a new Chrome browser session\n",
"driver = webdriver.Chrome()\n",
"\n",
"# Open the search page\n",
"driver.get('https://sou.chinanews.com/')\n",
"driver.implicitly_wait(3)  # set the implicit wait to 3 seconds\n",
"\n",
"# Locate the search box and type the query text\n",
"# (named search_box so that the built-in input() is not shadowed)\n",
"search_box = driver.find_element(By.XPATH, '//*[@id=\"q\"]')\n",
"search_box.send_keys('初音未来')\n",
"\n",
"# Locate the search button and click it\n",
"search = driver.find_element(By.XPATH, '//button[@class=\"searchBtn\"]')\n",
"search.click()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "876319e9-a10c-47ee-9ef9-ac0e03a39d82",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['https://www.chinanews.com.cn/cj/2023/08-28/10068093.shtml',\n",
" 'https://www.chinanews.com.cn/sh/2023/07-26/10049675.shtml',\n",
" 'https://www.chinanews.com.cn/sh/2023/06-05/10019224.shtml',\n",
" 'https://www.chinanews.com.cn/sh/2023/06-01/10017432.shtml',\n",
" 'https://www.chinanews.com.cn/cj/2023/05-20/10010862.shtml',\n",
" 'https://www.chinanews.com.cn/cul/2022/12-15/9915069.shtml',\n",
" 'https://www.chinanews.com.cn/cj/2022/10-26/9880366.shtml',\n",
" 'https://www.chinanews.com.cn/cj/2022/09-07/9847169.shtml',\n",
" 'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml',\n",
" 'https://www.chinanews.com.cn/cj/2022/07-05/9795607.shtml']"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gather the URL of every news headline: each element matching the XPath\n",
"# contributes its href attribute to the links list\n",
"headline_xpath = '//div[@class=\"news_title\"]/a'\n",
"links = [anchor.get_attribute('href') for anchor in driver.find_elements(By.XPATH, headline_xpath)]\n",
"links"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e0823aa9-1aa9-43d1-bdb0-d8a9a130a70f",
"metadata": {},
"outputs": [],
"source": [
"# Navigate the browser to the first collected link\n",
"first_link = links[0]\n",
"driver.get(first_link)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "1cc5210c-a61a-48ba-9938-3f10449ff784",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AI创作新风潮影视业拥抱AI新机遇\n"
]
}
],
"source": [
"# Extract the article title\n",
"title_element = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1')\n",
"title = title_element.text.strip()\n",
"print(title)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9877f965-f606-49bd-bd8e-ee578b9b90cb",
"metadata": {},
"outputs": [],
"source": [
"# Extract the article body text\n",
"content_element = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]')\n",
"content = content_element.text.strip()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f001d905-abef-47df-9c43-8d87267553c1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023年08月28日 04:03\n",
"人民日报海外版\n"
]
}
],
"source": [
"# Extract the publication time and the source\n",
"text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
"# Only the first line is used; it is cut around the '来源:' marker\n",
"first_line = text.split('\\n', 1)[0]\n",
"tuples = first_line.split('来源:')\n",
"date, source = tuples[0].strip(), tuples[1].strip()\n",
"print(date, source, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "1ffb2586-753c-4fb8-8f42-30c122a2e8b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022年09月07日 19:55 中国新闻网\n"
]
}
],
"source": [
"# Parsing for pages that use the legacy layout\n",
"title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
"content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
"text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
"\n",
"# Parse date and source from the text fetched above, not from a `tuples`\n",
"# value left behind by another cell (that value describes a different page).\n",
"first_line = text.split('\\n', 1)[0].replace(':', ':')  # accept a full-width colon as well\n",
"if '来源:' in first_line:\n",
"    date, source = first_line.split('来源:', 1)\n",
"else:\n",
"    # No '来源:' marker: take the last whitespace-separated token as the source\n",
"    # (assumes a '<date> <time> <source>' line -- TODO confirm against the page).\n",
"    # Padding with empty strings keeps the unpacking valid for short or empty lines.\n",
"    date, source = (first_line.rsplit(None, 1) + ['', ''])[:2]\n",
"date = date.strip()\n",
"source = source.strip()"
]
},
{
"cell_type": "markdown",
"id": "e53bc9b7-a103-48ae-ab6f-d574ecd7687b",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "866f8acc-37d7-4f25-ba2b-62bb8baae94f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml'"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"driver.current_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7c28cdf-40f2-466d-a684-cefc0e34a1e4",
"metadata": {},
"outputs": [],
"source": [
"# Combined version of the cells above: visit every collected link and extract its fields\n",
"def _split_date_source(header_text):\n",
"    \"\"\"Split an article's header text into a (date, source) pair.\n",
"\n",
"    Only the first line of header_text is used. If it contains the '来源:'\n",
"    marker (with a full-width or an ASCII colon), the text before the marker\n",
"    is the date and the text after it is the source. Otherwise the last\n",
"    whitespace-separated token is taken as the source and the rest as the\n",
"    date (assumes a '<date> <time> <source>' line -- TODO confirm against\n",
"    the legacy pages). Parts that are missing come back as empty strings.\n",
"    \"\"\"\n",
"    first_line = header_text.split('\\n', 1)[0].replace(':', ':')\n",
"    if '来源:' in first_line:\n",
"        date_part, source_part = first_line.split('来源:', 1)\n",
"    else:\n",
"        # Pad with empty strings so short or empty lines still unpack into two parts\n",
"        date_part, source_part = (first_line.rsplit(None, 1) + ['', ''])[:2]\n",
"    return date_part.strip(), source_part.strip()\n",
"\n",
"\n",
"data = []\n",
"for link in links:\n",
"    # Navigate to the article page\n",
"    driver.get(link)\n",
"    try:\n",
"        # New layout: title, body, and the header text holding time and source\n",
"        title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1').text.strip()\n",
"        content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]').text.strip()\n",
"        text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
"    except Exception:\n",
"        # The new-layout lookups failed, so this is probably a legacy page;\n",
"        # retry with the legacy XPaths\n",
"        title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
"        content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
"        text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
"\n",
"    # Always parse the header text of the page just visited, so a legacy page\n",
"    # never inherits the date/source of a previously visited article\n",
"    date, source = _split_date_source(text)\n",
"\n",
"    # Store one row per article in the two-dimensional list\n",
"    data.append([title, date, source, content])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "a6d8861c-6b1d-494d-bc8a-bf9310317776",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>标题</th>\n",
" <th>时间</th>\n",
" <th>来源</th>\n",
" <th>正文</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AI创作新风潮影视业拥抱AI新机遇</td>\n",
" <td>2023年08月28日 04:03</td>\n",
" <td>人民日报海外版</td>\n",
" <td>前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生</td>\n",
" <td>2023年07月26日 02:23</td>\n",
" <td>扬子晚报</td>\n",
" <td>7月21日在上海某漫展场馆外一名年轻男子突然在雨中晕厥倒地这一幕恰好被一位穿cosp...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>首批AI克隆明星上线不只是娱乐业“躺赚”</td>\n",
" <td>2023年06月05日 01:40</td>\n",
" <td>新京报</td>\n",
" <td>现实中偶像与粉丝互动被AI复制到虚拟空间中虚实边界被进一步打破。\\n花30元就可以和网红明...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>“10后”的流行密语你能对上几个</td>\n",
" <td>2023年06月01日 09:51</td>\n",
" <td>羊城晚报</td>\n",
" <td>羊城晚报记者 秦小杰\\n作为互联网新生代“10后”的小学生有哪些流行“密语”喜欢什么样的...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(经济观察)虚拟数字人“现身”各行各业 释放可观商业价值</td>\n",
" <td>2023年05月20日 09:37</td>\n",
" <td>中国新闻网</td>\n",
" <td>中新社上海5月20日电 (谢梦圆)近期多个品牌启用虚拟形象作为代言人、社交平台AI博主大受...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>网络热梗也能成为热门IP IP如何吸引Z世代</td>\n",
" <td>2022年12月15日 01:00</td>\n",
" <td>北京青年报</td>\n",
" <td>随着网络文化的发展新时代IP内容也随之扩展创新不仅涵盖动漫、影视、游戏、潮玩甚至一个符...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>玩具市场迎来多元需求 成年人“入坑”潮流玩具</td>\n",
" <td>2022年10月26日 15:31</td>\n",
" <td>北京青年报</td>\n",
" <td>一年一度的双11来临潮流玩具市场再度成为各大电商平台必争之地玩具市场迎来更多元的市场需求...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>越来越多场景应用 “数字人”走进大众生活</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示中国虚拟人爱好者中19岁至30岁之间的年轻...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>唱歌跳舞的“皮套人”?这个千亿级生意没那么简单</td>\n",
" <td>2022年09月07日 19:55</td>\n",
" <td>中国新闻网</td>\n",
" <td>近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 标题 时间 来源 \\\n",
"0 AI创作新风潮影视业拥抱AI新机遇 2023年08月28日 04:03 人民日报海外版 \n",
"1 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生 2023年07月26日 02:23 扬子晚报 \n",
"2 首批AI克隆明星上线不只是娱乐业“躺赚” 2023年06月05日 01:40 新京报 \n",
"3 “10后”的流行密语你能对上几个 2023年06月01日 09:51 羊城晚报 \n",
"4 (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值 2023年05月20日 09:37 中国新闻网 \n",
"5 网络热梗也能成为热门IP IP如何吸引Z世代 2022年12月15日 01:00 北京青年报 \n",
"6 玩具市场迎来多元需求 成年人“入坑”潮流玩具 2022年10月26日 15:31 北京青年报 \n",
"7 越来越多场景应用 “数字人”走进大众生活 2022年09月07日 19:55 中国新闻网 \n",
"8 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉” 2022年09月07日 19:55 中国新闻网 \n",
"9 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单 2022年09月07日 19:55 中国新闻网 \n",
"\n",
" 正文 \n",
"0 前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越... \n",
"1 7月21日在上海某漫展场馆外一名年轻男子突然在雨中晕厥倒地这一幕恰好被一位穿cosp... \n",
"2 现实中偶像与粉丝互动被AI复制到虚拟空间中虚实边界被进一步打破。\\n花30元就可以和网红明... \n",
"3 羊城晚报记者 秦小杰\\n作为互联网新生代“10后”的小学生有哪些流行“密语”喜欢什么样的... \n",
"4 中新社上海5月20日电 (谢梦圆)近期多个品牌启用虚拟形象作为代言人、社交平台AI博主大受... \n",
"5 随着网络文化的发展新时代IP内容也随之扩展创新不仅涵盖动漫、影视、游戏、潮玩甚至一个符... \n",
"6 一年一度的双11来临潮流玩具市场再度成为各大电商平台必争之地玩具市场迎来更多元的市场需求... \n",
"7 中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“... \n",
"8 虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示中国虚拟人爱好者中19岁至30岁之间的年轻... \n",
"9 近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸... "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Once the loop has finished, turn data into a DataFrame and write it to a CSV file\n",
"import pandas\n",
"column_names = ['标题', '时间', '来源', '正文']\n",
"df = pandas.DataFrame(data, columns=column_names)\n",
"df.to_csv('news.csv')\n",
"\n",
"# Display the DataFrame as the cell output\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}