{ "cells": [
 { "cell_type": "code", "execution_count": 1, "id": "bc76e623-3b53-459c-83a7-1c190ef8486e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "Requirement already satisfied: selenium in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (4.25.0)\n",
  "Requirement already satisfied: urllib3<3,>=1.26 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.2)\n",
  "Requirement already satisfied: trio~=0.17 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.26.2)\n",
  "Requirement already satisfied: trio-websocket~=0.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.11.1)\n",
  "Requirement already satisfied: certifi>=2021.10.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (2024.7.4)\n",
  "Requirement already satisfied: typing_extensions~=4.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (4.12.2)\n",
  "Requirement already satisfied: websocket-client~=1.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (1.8.0)\n",
  "Requirement already satisfied: attrs>=23.2.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (23.2.0)\n",
  "Requirement already satisfied: sortedcontainers in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (2.4.0)\n",
  "Requirement already satisfied: idna in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (3.7)\n",
  "Requirement already satisfied: outcome in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.0.post0)\n",
  "Requirement already satisfied: sniffio>=1.3.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.1)\n",
  "Requirement already satisfied: wsproto>=0.14 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio-websocket~=0.9->selenium) (1.2.0)\n",
  "Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n",
  "Requirement already satisfied: h11<1,>=0.9.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n"
 ] } ], "source": [
  "# Install dependencies (skip this cell if selenium is already installed).\n",
  "# %pip installs into the environment of the running kernel; the version is\n",
  "# pinned to the one this notebook was last executed with.\n",
  "%pip install selenium==4.25.0"
 ] },
 { "cell_type": "code", "execution_count": 40, "id": "1fd79faf-f138-41fa-9519-7bc72b407afb", "metadata": {}, "outputs": [], "source": [
  "import time\n",
  "\n",
  "from selenium import webdriver\n",
  "from selenium.webdriver.common.by import By\n",
  "\n",
  "# Search page of chinanews.com and the keyword to look up\n",
  "SEARCH_URL = 'https://sou.chinanews.com/'\n",
  "KEYWORD = '初音未来'\n",
  "\n",
  "# Start a new Chrome browser session\n",
  "driver = webdriver.Chrome()\n",
  "# Implicit wait: element lookups keep retrying for up to 3 seconds before failing\n",
  "driver.implicitly_wait(3)\n",
  "\n",
  "# Open the search page\n",
  "driver.get(SEARCH_URL)\n",
  "\n",
  "# Locate the search box and type the keyword\n",
  "# (named search_box so the builtin input() is not shadowed)\n",
  "search_box = driver.find_element(By.XPATH, '//*[@id=\"q\"]')\n",
  "search_box.send_keys(KEYWORD)\n",
  "\n",
  "# Locate the search button and click it to submit the query\n",
  "search_button = driver.find_element(By.XPATH, '//button[@class=\"searchBtn\"]')\n",
  "search_button.click()"
 ] },
 { "cell_type": "code", "execution_count": 41, "id": "876319e9-a10c-47ee-9ef9-ac0e03a39d82", "metadata": {}, "outputs": [ { "data": { "text/plain": [
  "['https://www.chinanews.com.cn/cj/2023/08-28/10068093.shtml',\n",
  " 'https://www.chinanews.com.cn/sh/2023/07-26/10049675.shtml',\n",
  " 'https://www.chinanews.com.cn/sh/2023/06-05/10019224.shtml',\n",
  " 'https://www.chinanews.com.cn/sh/2023/06-01/10017432.shtml',\n",
  " 'https://www.chinanews.com.cn/cj/2023/05-20/10010862.shtml',\n",
  " 'https://www.chinanews.com.cn/cul/2022/12-15/9915069.shtml',\n",
  " 'https://www.chinanews.com.cn/cj/2022/10-26/9880366.shtml',\n",
  " 'https://www.chinanews.com.cn/cj/2022/09-07/9847169.shtml',\n",
  " 'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml',\n", "
'https://www.chinanews.com.cn/cj/2022/07-05/9795607.shtml']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Collect the article URL behind every headline on the search-result page\n",
  "headline_anchors = driver.find_elements(By.XPATH, '//div[@class=\"news_title\"]/a')\n",
  "links = [anchor.get_attribute('href') for anchor in headline_anchors]\n",
  "links"
 ] },
 { "cell_type": "code", "execution_count": 8, "id": "e0823aa9-1aa9-43d1-bdb0-d8a9a130a70f", "metadata": {}, "outputs": [], "source": [
  "# Open the first search result\n",
  "driver.get(links[0])"
 ] },
 { "cell_type": "code", "execution_count": 26, "id": "1cc5210c-a61a-48ba-9938-3f10449ff784", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AI创作新风潮:影视业,拥抱AI新机遇\n" ] } ], "source": [
  "# Extract the headline (new page layout)\n",
  "title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1').text.strip()\n",
  "print(title)"
 ] },
 { "cell_type": "code", "execution_count": 12, "id": "9877f965-f606-49bd-bd8e-ee578b9b90cb", "metadata": {}, "outputs": [], "source": [
  "# Extract the article body (new page layout)\n",
  "content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]').text.strip()"
 ] },
 { "cell_type": "code", "execution_count": 21, "id": "f001d905-abef-47df-9c43-8d87267553c1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2023年08月28日 04:03\n", "人民日报海外版\n" ] } ], "source": [
  "# Extract publication time and source (new page layout)\n",
  "text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
  "# Only the first line is used; it is split at the '来源:' marker into time and source\n",
  "tuples = text.split('\\n', 1)[0].split('来源:')\n",
  "date = tuples[0].strip()\n",
  "source = tuples[1].strip()\n",
  "print(date)\n",
  "print(source)"
 ] },
 { "cell_type": "code", "execution_count": 37, "id": "1ffb2586-753c-4fb8-8f42-30c122a2e8b4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2022年09月07日 19:55 中国新闻网\n" ] } ], "source": [
  "# Old page layout: same fields, different XPaths.\n",
  "# links[0] uses the new layout, so load an old-layout article first.\n",
  "# NOTE(review): this article used the old layout when the notebook was last run - confirm it still does\n",
  "driver.get('https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml')\n",
  "title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
  "content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
  "text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
  "# Split the line just read from this page; without this step date/source\n",
  "# would silently reuse the values parsed from the previous article.\n",
  "# NOTE(review): assumes the old layout also separates time and source with '来源:' - confirm\n",
  "tuples = text.split('\\n', 1)[0].split('来源:')\n",
  "date = tuples[0].strip()\n",
  "source = tuples[1].strip() if len(tuples) > 1 else ''"
 ] },
 { "cell_type": "markdown", "id": "e53bc9b7-a103-48ae-ab6f-d574ecd7687b", "metadata": {}, "source": [ "---" ] },
 { "cell_type": "code", "execution_count": 31, "id": "866f8acc-37d7-4f25-ba2b-62bb8baae94f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "driver.current_url" ] },
 { "cell_type": "code", "execution_count": null, "id": "b7c28cdf-40f2-466d-a684-cefc0e34a1e4", "metadata": {}, "outputs": [], "source": [
  "# Combine the steps above: open every search result and collect its fields\n",
  "from selenium.common.exceptions import NoSuchElementException\n",
  "\n",
  "# (title, body, date/source line) XPaths for each known page layout, newest first\n",
  "# NOTE(review): the old-layout date/source line is assumed to use the same '来源:' marker - confirm\n",
  "LAYOUT_XPATHS = [\n",
  "    ('//*[@id=\"cont_1_1_2\"]/div[2]/h1',\n",
  "     '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]',\n",
  "     '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]'),\n",
  "    ('//*[@id=\"cont_1_1_2\"]/h1',\n",
  "     '//*[@id=\"cont_1_1_2\"]/div[5]',\n",
  "     '//*[@id=\"cont_1_1_2\"]/div[3]/div'),\n",
  "]\n",
  "\n",
  "\n",
  "def split_date_source(text):\n",
  "    \"\"\"Split a date/source line into a (date, source) pair.\n",
  "\n",
  "    Only the first line of ``text`` is used. It is split at the '来源:'\n",
  "    marker: the part before it is the date, the part after it the source.\n",
  "    If the marker is absent, the whole line is the date and source is ''.\n",
  "    \"\"\"\n",
  "    parts = text.split('\\n', 1)[0].split('来源:')\n",
  "    date = parts[0].strip()\n",
  "    source = parts[1].strip() if len(parts) > 1 else ''\n",
  "    return date, source\n",
  "\n",
  "\n",
  "def parse_article(driver):\n",
  "    \"\"\"Parse the article currently loaded in ``driver``.\n",
  "\n",
  "    Tries each entry of LAYOUT_XPATHS in order and returns\n",
  "    ``[title, date, source, content]`` for the first layout whose three\n",
  "    elements are all found.\n",
  "\n",
  "    Raises:\n",
  "        NoSuchElementException: if no known layout matches the page.\n",
  "    \"\"\"\n",
  "    for title_xpath, content_xpath, meta_xpath in LAYOUT_XPATHS:\n",
  "        try:\n",
  "            title = driver.find_element(By.XPATH, title_xpath).text.strip()\n",
  "            content = driver.find_element(By.XPATH, content_xpath).text.strip()\n",
  "            text = driver.find_element(By.XPATH, meta_xpath).text.strip()\n",
  "        except NoSuchElementException:\n",
  "            # This layout does not match the page; try the next one\n",
  "            continue\n",
  "        # Always split the line read from *this* page, so date/source can\n",
  "        # never be carried over from a previously parsed article\n",
  "        date, source = split_date_source(text)\n",
  "        return [title, date, source, content]\n",
  "    raise NoSuchElementException('no known article layout matched ' + driver.current_url)\n",
  "\n",
  "\n",
  "# One [title, date, source, content] row per search result\n",
  "data = []\n",
  "for link in links:\n",
  "    driver.get(link)\n",
  "    data.append(parse_article(driver))"
 ] },
 { "cell_type": "code", "execution_count": 44, "id": "a6d8861c-6b1d-494d-bc8a-bf9310317776", "metadata": {}, "outputs": [ {
"data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
标题时间来源正文
0AI创作新风潮:影视业,拥抱AI新机遇2023年08月28日 04:03人民日报海外版前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越...
1雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生2023年07月26日 02:23扬子晚报7月21日,在上海某漫展场馆外,一名年轻男子突然在雨中晕厥倒地,这一幕,恰好被一位穿cosp...
2首批AI克隆明星上线,不只是娱乐业“躺赚”2023年06月05日 01:40新京报现实中偶像与粉丝互动被AI复制到虚拟空间中,虚实边界被进一步打破。\\n花30元就可以和网红明...
3“10后”的流行密语你能对上几个?2023年06月01日 09:51羊城晚报羊城晚报记者 秦小杰\\n作为互联网新生代,“10后”的小学生有哪些流行“密语”?喜欢什么样的...
4(经济观察)虚拟数字人“现身”各行各业 释放可观商业价值2023年05月20日 09:37中国新闻网中新社上海5月20日电 (谢梦圆)近期,多个品牌启用虚拟形象作为代言人、社交平台AI博主大受...
5网络热梗也能成为热门IP IP如何吸引Z世代?2022年12月15日 01:00北京青年报随着网络文化的发展,新时代IP内容也随之扩展创新,不仅涵盖动漫、影视、游戏、潮玩,甚至一个符...
6玩具市场迎来多元需求 成年人“入坑”潮流玩具2022年10月26日 15:31北京青年报一年一度的双11来临,潮流玩具市场再度成为各大电商平台必争之地,玩具市场迎来更多元的市场需求...
7越来越多场景应用 “数字人”走进大众生活2022年09月07日 19:55中国新闻网中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“...
8爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉”2022年09月07日 19:55中国新闻网虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示,中国虚拟人爱好者中,19岁至30岁之间的年轻...
9唱歌跳舞的“皮套人”?这个千亿级生意没那么简单2022年09月07日 19:55中国新闻网近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸...
\n", "
" ], "text/plain": [
  " 标题 时间 来源 \\\n",
  "0 AI创作新风潮:影视业,拥抱AI新机遇 2023年08月28日 04:03 人民日报海外版 \n",
  "1 雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生 2023年07月26日 02:23 扬子晚报 \n",
  "2 首批AI克隆明星上线,不只是娱乐业“躺赚” 2023年06月05日 01:40 新京报 \n",
  "3 “10后”的流行密语你能对上几个? 2023年06月01日 09:51 羊城晚报 \n",
  "4 (经济观察)虚拟数字人“现身”各行各业 释放可观商业价值 2023年05月20日 09:37 中国新闻网 \n",
  "5 网络热梗也能成为热门IP IP如何吸引Z世代? 2022年12月15日 01:00 北京青年报 \n",
  "6 玩具市场迎来多元需求 成年人“入坑”潮流玩具 2022年10月26日 15:31 北京青年报 \n",
  "7 越来越多场景应用 “数字人”走进大众生活 2022年09月07日 19:55 中国新闻网 \n",
  "8 爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉” 2022年09月07日 19:55 中国新闻网 \n",
  "9 唱歌跳舞的“皮套人”?这个千亿级生意没那么简单 2022年09月07日 19:55 中国新闻网 \n",
  "\n",
  " 正文 \n",
  "0 前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越... \n",
  "1 7月21日,在上海某漫展场馆外,一名年轻男子突然在雨中晕厥倒地,这一幕,恰好被一位穿cosp... \n",
  "2 现实中偶像与粉丝互动被AI复制到虚拟空间中,虚实边界被进一步打破。\\n花30元就可以和网红明... \n",
  "3 羊城晚报记者 秦小杰\\n作为互联网新生代,“10后”的小学生有哪些流行“密语”?喜欢什么样的... \n",
  "4 中新社上海5月20日电 (谢梦圆)近期,多个品牌启用虚拟形象作为代言人、社交平台AI博主大受... \n",
  "5 随着网络文化的发展,新时代IP内容也随之扩展创新,不仅涵盖动漫、影视、游戏、潮玩,甚至一个符... \n",
  "6 一年一度的双11来临,潮流玩具市场再度成为各大电商平台必争之地,玩具市场迎来更多元的市场需求... \n",
  "7 中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“... \n",
  "8 虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示,中国虚拟人爱好者中,19岁至30岁之间的年轻... \n",
  "9 近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸... "
 ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Once the loop has finished, turn the collected rows into a DataFrame\n",
  "# and write it to news.csv in the working directory\n",
  "import pandas\n",
  "\n",
  "# Column headers, in the same order as the fields of each row in data\n",
  "column_names = ['标题', '时间', '来源', '正文']\n",
  "df = pandas.DataFrame(data, columns=column_names)\n",
  "df.to_csv('news.csv')\n",
  "\n",
  "# Display the resulting table\n",
  "df"
 ] } ],
 "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }