Sharing Selenium Web Scraper Code

This script drives Chrome with Selenium to page through the news list on tyj.sc.gov.cn (the Sichuan Provincial Sports Bureau site), collect the 2024 and 2023 entries, open each article, and save the title, link, publish time, and body text to local txt files.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Set up the Chrome driver (make sure the path is correct)
driver_path = 'chromedriver-mac-arm64/chromedriver'  # replace with your own chromedriver path
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

try:
    # Open the target page
    url = 'https://tyj.sc.gov.cn/sctyj/qzty/xwzx_tygl.shtml'
    driver.get(url)

    # Wait for the page to load
    time.sleep(3)

    # Initialize empty lists to hold the 2024 and 2023 news entries
    all_news_2024 = []
    all_news_2023 = []

    # Current page number
    current_page = 1

    # Page through the list and collect news entries
    while True:
        # Grab the news titles and links
        news_items = driver.find_elements(By.CSS_SELECTOR, ".p_wz_list a")
        # Grab the publish dates
        time_items = driver.find_elements(By.CSS_SELECTOR, ".p_wz_list .fr")

        # Flags marking whether any 2024 or 2023 news was found on this page
        found_2024 = False
        found_2023 = False
        
        # Extract title, link, and publish time for each entry
        for i, item in enumerate(news_items):
            title = item.text
            link = item.get_attribute('href')
            publish_time = time_items[i].text
            
            # Check the publish date
            if "2024" in publish_time:
                all_news_2024.append((title, link, publish_time))
                found_2024 = True
            
            if "2023" in publish_time:
                all_news_2023.append((title, link, publish_time))
                found_2023 = True

        # If the current page has no 2024 or 2023 news, stop paging
        if not found_2024 and not found_2023:
            print("No 2024 or 2023 news found; stopping pagination.")
            break

        # Read the pagination controls and check whether there is a next page
        pagination = driver.find_elements(By.CSS_SELECTOR, "#page_div .pagination-num")
        last_page = int(pagination[-1].text)  # largest page number
        
        if current_page >= last_page:
            print("已到达最后一页,结束翻页。")
            break
        else:
            # 点击下一页
            current_page += 1
            next_page_link = f"xwzx_tygl_{current_page}.shtml"
            next_page_element = driver.find_element(By.LINK_TEXT, str(current_page))
            next_page_element.click()
            time.sleep(3)  # 等待新页面加载

    # Open each 2024 link in turn and grab the article body
    content_list_2024 = []
    for title, link, publish_time in all_news_2024:
        driver.get(link)
        time.sleep(3)
        
        try:
            content_element = driver.find_element(By.CLASS_NAME, "p_tpxl")
            content = content_element.text
            
            print(f"2024年 - 标题: {title}")
            print(f"链接: {link}")
            print(f"时间: {publish_time}")
            print(f"内容: {content}\n")
            
            content_list_2024.append((title, link, publish_time, content))
        except Exception as e:
            print(f"Failed to fetch content for {link}: {e}")

    # Save the 2024 news entries to a txt file
    with open("all_news_2024.txt", "w", encoding="utf-8") as file:
        for title, link, publish_time, content in content_list_2024:
            file.write(f"标题: {title}\n链接: {link}\n时间: {publish_time}\n内容:\n{content}\n\n")

    print("所有2024年度新闻爬取并保存完成。")

    # Open each 2023 link in turn and grab the article body
    content_list_2023 = []
    for title, link, publish_time in all_news_2023:
        driver.get(link)
        time.sleep(3)
        
        try:
            content_element = driver.find_element(By.CLASS_NAME, "p_tpxl")
            content = content_element.text
            
            print(f"2023年 - 标题: {title}")
            print(f"链接: {link}")
            print(f"时间: {publish_time}")
            print(f"内容: {content}\n")
            
            content_list_2023.append((title, link, publish_time, content))
        except Exception as e:
            print(f"Failed to fetch content for {link}: {e}")

    # Save the 2023 news entries to a txt file
    with open("all_news_2023.txt", "w", encoding="utf-8") as file:
        for title, link, publish_time, content in content_list_2023:
            file.write(f"标题: {title}\n链接: {link}\n时间: {publish_time}\n内容:\n{content}\n\n")

    print("所有2023年度新闻爬取并保存完成。")

finally:
    # Close the browser
    driver.quit()
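
A note on the fixed time.sleep(3) pauses: they work, but they waste time on fast loads and can be too short on slow ones. A more reliable option is Selenium's built-in explicit waits. Below is a minimal sketch of the idea, assuming a 10-second timeout (an arbitrary choice, not something the target site requires); it would replace the sleep after driver.get(url) before reading the news list.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds (assumed timeout) for the news list items to appear,
# instead of sleeping a fixed 3 seconds after loading the page.
wait = WebDriverWait(driver, 10)
news_items = wait.until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".p_wz_list a"))
)

The same pattern can be applied after each pagination click and after opening each article link; the wait returns as soon as the elements are present, so it is usually both faster and more robust than a fixed sleep.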
