{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "bc76e623-3b53-459c-83a7-1c190ef8486e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: selenium in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (4.25.0)\n", "Requirement already satisfied: urllib3<3,>=1.26 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.2)\n", "Requirement already satisfied: trio~=0.17 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.26.2)\n", "Requirement already satisfied: trio-websocket~=0.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (0.11.1)\n", "Requirement already satisfied: certifi>=2021.10.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (2024.7.4)\n", "Requirement already satisfied: typing_extensions~=4.9 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (4.12.2)\n", "Requirement already satisfied: websocket-client~=1.8 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from selenium) (1.8.0)\n", "Requirement already satisfied: attrs>=23.2.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (23.2.0)\n", "Requirement already satisfied: sortedcontainers in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (2.4.0)\n", "Requirement already satisfied: idna in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (3.7)\n", "Requirement already satisfied: outcome in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.0.post0)\n", "Requirement already satisfied: sniffio>=1.3.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.1)\n", "Requirement already satisfied: wsproto>=0.14 in 
/home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from trio-websocket~=0.9->selenium) (1.2.0)\n", "Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n", "Requirement already satisfied: h11<1,>=0.9.0 in /home/hmsy/.conda/envs/python311/lib/python3.11/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n" ] } ], "source": [
 "# Install dependencies (skip this cell if selenium is already installed).\n",
 "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
 "%pip install selenium"
] }, { "cell_type": "code", "execution_count": 40, "id": "1fd79faf-f138-41fa-9519-7bc72b407afb", "metadata": {}, "outputs": [], "source": [
 "from selenium import webdriver\n",
 "from selenium.webdriver.common.by import By\n",
 "import time\n",
 "\n",
 "# Start a new Chrome browser session.\n",
 "driver = webdriver.Chrome()\n",
 "\n",
 "# Open the chinanews search page.\n",
 "driver.get('https://sou.chinanews.com/')\n",
 "# Implicit wait: element lookups may poll for up to 3 seconds before failing.\n",
 "driver.implicitly_wait(3)\n",
 "\n",
 "# Locate the search box and type the query text.\n",
 "# Named search_box (not input) so the builtin input() is not shadowed.\n",
 "search_box = driver.find_element(By.XPATH, '//*[@id=\"q\"]')\n",
 "search_box.send_keys('初音未来')\n",
 "\n",
 "# Locate the search button and click it to submit the query.\n",
 "search = driver.find_element(By.XPATH, '//button[@class=\"searchBtn\"]')\n",
 "search.click()"
] }, { "cell_type": "code", "execution_count": 41, "id": "876319e9-a10c-47ee-9ef9-ac0e03a39d82", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['https://www.chinanews.com.cn/cj/2023/08-28/10068093.shtml',\n", " 'https://www.chinanews.com.cn/sh/2023/07-26/10049675.shtml',\n", " 'https://www.chinanews.com.cn/sh/2023/06-05/10019224.shtml',\n", " 'https://www.chinanews.com.cn/sh/2023/06-01/10017432.shtml',\n", " 'https://www.chinanews.com.cn/cj/2023/05-20/10010862.shtml',\n", " 'https://www.chinanews.com.cn/cul/2022/12-15/9915069.shtml',\n", " 'https://www.chinanews.com.cn/cj/2022/10-26/9880366.shtml',\n", " 'https://www.chinanews.com.cn/cj/2022/09-07/9847169.shtml',\n", " 'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml',\n", " 
'https://www.chinanews.com.cn/cj/2022/07-05/9795607.shtml']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Collect the URL of every news-title link on the results page into links.\n",
 "links = [\n",
 "    anchor.get_attribute('href')\n",
 "    for anchor in driver.find_elements(By.XPATH, '//div[@class=\"news_title\"]/a')\n",
 "]\n",
 "links"
] }, { "cell_type": "code", "execution_count": 8, "id": "e0823aa9-1aa9-43d1-bdb0-d8a9a130a70f", "metadata": {}, "outputs": [], "source": [
 "# Navigate to the first article in links.\n",
 "driver.get(links[0])"
] }, { "cell_type": "code", "execution_count": 26, "id": "1cc5210c-a61a-48ba-9938-3f10449ff784", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AI创作新风潮:影视业,拥抱AI新机遇\n" ] } ], "source": [
 "# Parse the headline.\n",
 "title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1').text.strip()\n",
 "print(title)"
] }, { "cell_type": "code", "execution_count": 12, "id": "9877f965-f606-49bd-bd8e-ee578b9b90cb", "metadata": {}, "outputs": [], "source": [
 "# Parse the article body.\n",
 "content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]').text.strip()"
] }, { "cell_type": "code", "execution_count": 21, "id": "f001d905-abef-47df-9c43-8d87267553c1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2023年08月28日 04:03\n", "人民日报海外版\n" ] } ], "source": [
 "# Parse the publication time and the source.\n",
 "# Only the first line of the header is used; it is split on the '来源:' marker.\n",
 "text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
 "tuples = text.split('\\n', 1)[0].split('来源:')\n",
 "date = tuples[0].strip()\n",
 "source = tuples[1].strip()\n",
 "print(date)\n",
 "print(source)"
] }, { "cell_type": "code", "execution_count": 37, "id": "1ffb2586-753c-4fb8-8f42-30c122a2e8b4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2022年09月07日 19:55 中国新闻网\n" ] } ], "source": [
 "# Legacy page layout: the same fields live under different XPaths.\n",
 "title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
 "content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
 "text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
 "# Re-split the header that was just read; otherwise date/source would come from\n",
 "# whatever tuples an earlier cell left behind in the kernel.\n",
 "tuples = text.split('\\n', 1)[0].split('来源:')\n",
 "date = tuples[0].strip()\n",
 "# NOTE(review): assumes the legacy header also carries the '来源:' marker - confirm.\n",
 "# If it is absent, source is left empty instead of raising IndexError.\n",
 "source = tuples[1].strip() if len(tuples) > 1 else ''"
] }, { "cell_type": "markdown", "id": "e53bc9b7-a103-48ae-ab6f-d574ecd7687b", "metadata": {}, "source": [ "---" ] }, { "cell_type": "code", "execution_count": 31, "id": "866f8acc-37d7-4f25-ba2b-62bb8baae94f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://www.chinanews.com.cn/cj/2022/07-05/9795608.shtml'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "driver.current_url" ] }, { "cell_type": "code", "execution_count": null, "id": "b7c28cdf-40f2-466d-a684-cefc0e34a1e4", "metadata": {}, "outputs": [], "source": [
 "# Scrape every search result by combining the parsing steps above.\n",
 "data = []\n",
 "for link in links:\n",
 "    # Navigate to the article page.\n",
 "    driver.get(link)\n",
 "    try:\n",
 "        # First attempt: headline, body and header XPaths of the new layout.\n",
 "        title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/h1').text.strip()\n",
 "        content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[4]/div[2]').text.strip()\n",
 "        text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[2]/div[2]').text.strip()\n",
 "        tuples = text.split('\\n', 1)[0].split('来源:')\n",
 "        date = tuples[0].strip()\n",
 "        source = tuples[1].strip()\n",
 "    except Exception:\n",
 "        # Any failure above is taken to mean the page may use the legacy layout,\n",
 "        # so every field is parsed again with the legacy XPaths.\n",
 "        title = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/h1').text.strip()\n",
 "        content = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[5]').text.strip()\n",
 "        text = driver.find_element(By.XPATH, '//*[@id=\"cont_1_1_2\"]/div[3]/div').text.strip()\n",
 "        # Re-split the header that was just read; otherwise date/source would be\n",
 "        # carried over from the tuples of a previous article.\n",
 "        tuples = text.split('\\n', 1)[0].split('来源:')\n",
 "        date = tuples[0].strip()\n",
 "        # NOTE(review): assumes the legacy header also carries '来源:' - confirm.\n",
 "        # If it is absent, source is left empty instead of raising IndexError.\n",
 "        source = tuples[1].strip() if len(tuples) > 1 else ''\n",
 "\n",
 "    # Append one row per article: [title, date, source, content].\n",
 "    data.append([title, date, source, content])\n"
] }, { "cell_type": "code", "execution_count": 44, "id": "a6d8861c-6b1d-494d-bc8a-bf9310317776", "metadata": {}, "outputs": [ {
"data": { "text/html": [ "
| \n", " | 标题 | \n", "时间 | \n", "来源 | \n", "正文 | \n", "
|---|---|---|---|---|
| 0 | \n", "AI创作新风潮:影视业,拥抱AI新机遇 | \n", "2023年08月28日 04:03 | \n", "人民日报海外版 | \n", "前不久,一部以元宇宙为概念的国潮微短剧《神女杂货铺》在某视频平台播出,讲述了一个现代女孩穿越... | \n", "
| 1 | \n", "雨中跪地救人的“二次元小姐姐” 是位喜欢动漫的苏州医生 | \n", "2023年07月26日 02:23 | \n", "扬子晚报 | \n", "7月21日,在上海某漫展场馆外,一名年轻男子突然在雨中晕厥倒地,这一幕,恰好被一位穿cosp... | \n", "
| 2 | \n", "首批AI克隆明星上线,不只是娱乐业“躺赚” | \n", "2023年06月05日 01:40 | \n", "新京报 | \n", "现实中偶像与粉丝互动被AI复制到虚拟空间中,虚实边界被进一步打破。\\n花30元就可以和网红明... | \n", "
| 3 | \n", "“10后”的流行密语你能对上几个? | \n", "2023年06月01日 09:51 | \n", "羊城晚报 | \n", "羊城晚报记者 秦小杰\\n作为互联网新生代,“10后”的小学生有哪些流行“密语”?喜欢什么样的... | \n", "
| 4 | \n", "(经济观察)虚拟数字人“现身”各行各业 释放可观商业价值 | \n", "2023年05月20日 09:37 | \n", "中国新闻网 | \n", "中新社上海5月20日电 (谢梦圆)近期,多个品牌启用虚拟形象作为代言人、社交平台AI博主大受... | \n", "
| 5 | \n", "网络热梗也能成为热门IP IP如何吸引Z世代? | \n", "2022年12月15日 01:00 | \n", "北京青年报 | \n", "随着网络文化的发展,新时代IP内容也随之扩展创新,不仅涵盖动漫、影视、游戏、潮玩,甚至一个符... | \n", "
| 6 | \n", "玩具市场迎来多元需求 成年人“入坑”潮流玩具 | \n", "2022年10月26日 15:31 | \n", "北京青年报 | \n", "一年一度的双11来临,潮流玩具市场再度成为各大电商平台必争之地,玩具市场迎来更多元的市场需求... | \n", "
| 7 | \n", "越来越多场景应用 “数字人”走进大众生活 | \n", "2022年09月07日 19:55 | \n", "中国新闻网 | \n", "中新网北京9月7日电 (中新财经 吴家驹)从“初音未来”到“洛天依”再到“嘉然”,近年来,“... | \n", "
| 8 | \n", "爱的是“皮”还是“魂”?虚拟偶像凭什么“圈粉” | \n", "2022年09月07日 19:55 | \n", "中国新闻网 | \n", "虚拟偶像深受当下年轻人的欢迎。艾媒咨询调研显示,中国虚拟人爱好者中,19岁至30岁之间的年轻... | \n", "
| 9 | \n", "唱歌跳舞的“皮套人”?这个千亿级生意没那么简单 | \n", "2022年09月07日 19:55 | \n", "中国新闻网 | \n", "近日,一条微博热搜将人们的视线拉回到了虚拟偶像的身上,一名来自美国的虚拟主播在短短两小时内吸... | \n", "