import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Chrome driver setup (make sure the path is correct).
driver_path = 'chromedriver-mac-arm64/chromedriver'  # replace with your own chromedriver path

# Listing page the crawl starts from.
LIST_URL = 'https://tyj.sc.gov.cn/sctyj/qzty/xwzx_tygl.shtml'
# Years to collect, in the order they are fetched and written to disk.
TARGET_YEARS = (2024, 2023)
# Fixed pause after every navigation so the page has time to load.
PAGE_LOAD_WAIT_SECONDS = 3

# A "19xx"/"20xx" group that begins a run of digits, so both "2024-05-01"
# and a compact "20240501" yield 2024.
_YEAR_PATTERN = re.compile(r"(?<!\d)(?:19|20)\d{2}")


def extract_year(publish_time):
    """Return the first four-digit year found in *publish_time*.

    Args:
        publish_time: Text of a publish-time element; ``None`` or an empty
            string is accepted.

    Returns:
        The year as an ``int``, or ``None`` when no year is present.
    """
    match = _YEAR_PATTERN.search(publish_time or "")
    return int(match.group()) if match else None


def should_stop_paging(years, target_years=TARGET_YEARS):
    """Tell whether paging through the listing should end.

    Paging ends when the page carries no dated item at all, or when every
    dated item is older than the oldest target year.  A page that only holds
    news *newer* than the targets does not end the crawl.

    NOTE(review): this assumes the listing is ordered newest first -- confirm
    against the site.

    Args:
        years: Years parsed from one listing page; ``None`` marks an item
            whose publish time had no recognisable year.
        target_years: Years being collected.

    Returns:
        ``True`` to stop paging, ``False`` to go on to the next page.
    """
    dated = [year for year in years if year is not None]
    if not dated:
        return True
    oldest_target = min(target_years)
    return all(year < oldest_target for year in dated)


def _collect_news_links(driver, target_years=TARGET_YEARS):
    """Walk the listing pages and gather the news entries of the target years.

    Args:
        driver: Selenium WebDriver already showing the first listing page.
        target_years: Years being collected.

    Returns:
        A dict mapping each target year to a list of
        ``(title, link, publish_time)`` tuples in listing order.
    """
    news_by_year = {year: [] for year in target_years}
    current_page = 1

    while True:
        # Titles/links and publish times are looked up separately and paired
        # by position.
        news_items = driver.find_elements(By.CSS_SELECTOR, ".p_wz_list a")
        time_items = driver.find_elements(By.CSS_SELECTOR, ".p_wz_list .fr")

        page_years = []
        # zip() stops at the shorter list, so a count mismatch between the two
        # lookups cannot raise IndexError.
        for item, time_item in zip(news_items, time_items):
            publish_time = time_item.text
            year = extract_year(publish_time)
            page_years.append(year)
            if year in news_by_year:
                news_by_year[year].append(
                    (item.text, item.get_attribute('href'), publish_time)
                )

        if should_stop_paging(page_years, target_years):
            print("未找到2024或2023年度的新闻,结束翻页。")
            break

        # Read the highest page number shown by the pager; non-numeric labels
        # are ignored, and a missing pager means the current page is the last.
        pagination = driver.find_elements(By.CSS_SELECTOR, "#page_div .pagination-num")
        page_labels = (element.text.strip() for element in pagination)
        page_numbers = [int(label) for label in page_labels if label.isdecimal()]
        last_page = max(page_numbers, default=current_page)
        if current_page >= last_page:
            print("已到达最后一页,结束翻页。")
            break

        # Go to the next page by clicking the link whose text is its number.
        current_page += 1
        next_page_links = driver.find_elements(By.LINK_TEXT, str(current_page))
        if not next_page_links:
            print(f"未找到第{current_page}页的链接,结束翻页。")
            break
        next_page_links[0].click()
        time.sleep(PAGE_LOAD_WAIT_SECONDS)  # wait for the new page to load

    return news_by_year


def _fetch_contents(driver, year, news_entries):
    """Open every entry's link, print the article and return what was read.

    Args:
        driver: Selenium WebDriver used to open the article pages.
        year: Year label used in the printed output.
        news_entries: Iterable of ``(title, link, publish_time)`` tuples.

    Returns:
        A list of ``(title, link, publish_time, content)`` tuples for the
        articles whose body could be read.
    """
    contents = []
    for title, link, publish_time in news_entries:
        try:
            # Navigation sits inside the try so that one broken link (or an
            # anchor without href) skips that article instead of ending the run.
            driver.get(link)
            time.sleep(PAGE_LOAD_WAIT_SECONDS)
            content = driver.find_element(By.CLASS_NAME, "p_tpxl").text
        except Exception:
            # Deliberate best-effort boundary: report the article and move on.
            print(f"无法获取内容: {link}")
            continue

        print(f"{year}年 - 标题: {title}")
        print(f"链接: {link}")
        print(f"时间: {publish_time}")
        print(f"内容: {content}\n")
        contents.append((title, link, publish_time, content))
    return contents


def _save_contents(year, contents):
    """Write the fetched articles of *year* to ``all_news_<year>.txt`` (UTF-8).

    The file is created (or overwritten) even when *contents* is empty.
    """
    with open(f"all_news_{year}.txt", "w", encoding="utf-8") as file:
        file.writelines(
            f"标题: {title}\n链接: {link}\n时间: {publish_time}\n内容:\n{content}\n\n"
            for title, link, publish_time, content in contents
        )


def main():
    """Crawl the listing, then fetch and save the articles of each target year."""
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        # Open the listing page and wait for it to load.
        driver.get(LIST_URL)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)

        news_by_year = _collect_news_links(driver)

        # Handle one year at a time: fetch its articles, save them, report.
        for year in TARGET_YEARS:
            contents = _fetch_contents(driver, year, news_by_year[year])
            _save_contents(year, contents)
            print(f"所有{year}年度新闻爬取并保存完成。")
    finally:
        # Always close the browser, even when the crawl fails.
        driver.quit()


if __name__ == "__main__":
    main()