
Dify + Firecrawl + Playwright: Semi-Automatic Scraping and LLM Sentiment Analysis

Original
Author: happywei
Published 2025-05-27 13:56:41

Building on the locally deployed Firecrawl from the previous article, this post combines Dify, Firecrawl, and Playwright to implement semi-automatic multi-page scraping followed by LLM sentiment analysis, with the final results output in Markdown format.

Pipeline overview:

First, here is the complete workflow:

Problem analysis:

1. The previous article only covered single-page scraping and deep crawling with Firecrawl, which cannot scrape multiple pages in one run.

2. The target pages load their content asynchronously via JavaScript (AJAX) rather than embedding it in the initial HTML source. Firecrawl defaults to a static crawling strategy (it only fetches the initial HTML), so it may never see the actual announcement content.

Solutions:

I. Semi-Automatic Scraping with Playwright

Two scraping approaches are possible:

1. On the announcement list page, collect each announcement's detail link and assemble it into a directly downloadable PDF URL.

Code language: python
import asyncio
from urllib.parse import urlparse, parse_qs
from playwright.async_api import async_playwright


async def main():
    url = "https://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(url)

        all_pdf_urls = []
        total_pages = 6  # current page + the next 5 pages

        for page_num in range(total_pages):
            # Wait for the announcement table rendered by the page's JS
            await page.wait_for_selector(".el-table__row")
            rows = await page.locator(".el-table__row").all()
            print(f"\nPage {page_num + 1}: {len(rows)} rows")

            for idx, row in enumerate(rows, start=1):
                try:
                    # The third column holds the link to the announcement detail page
                    a_tag = row.locator("td:nth-child(3) a")
                    href = await a_tag.get_attribute("href")
                    if href:
                        # Parse the URL query parameters
                        parsed = urlparse(href)
                        query_params = parse_qs(parsed.query)
                        announcement_id = query_params.get("announcementId", [""])[0]
                        announcement_time = query_params.get("announcementTime", [""])[0]

                        if announcement_id and announcement_time:
                            # Assemble the static PDF URL from the two parameters
                            pdf_url = f"https://static.cninfo.com.cn/finalpage/{announcement_time}/{announcement_id}.PDF"
                            print(f"Row {idx} PDF link: {pdf_url}")
                            all_pdf_urls.append(pdf_url)
                        else:
                            print(f"Row {idx} is missing required parameters, skipping")

                except Exception as e:
                    print(f"Failed to parse row {idx}: {e}")

            if page_num < total_pages - 1:
                try:
                    # Click the "next page" arrow of the Element UI pager
                    next_button = page.locator("button >> i.el-icon-arrow-right")
                    await next_button.first.click()
                    await page.wait_for_timeout(1500)
                except Exception as e:
                    print(f"Failed to click next page: {e}")
                    break

        await browser.close()

        # Print the total number of links collected
        print("\nTotal PDF links generated:", len(all_pdf_urls))

        # Optional: save them to a txt file
        with open("pdf_urls.txt", "w", encoding="utf-8") as f:
            for url in all_pdf_urls:
                f.write(url + "\n")

asyncio.run(main())
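
Each generated link follows the fixed pattern https://static.cninfo.com.cn/finalpage/{announcementTime}/{announcementId}.PDF; for example, the first detail URL used in approach 2 below (announcementId=1223675981, announcementTime=2025-05-26) maps to https://static.cninfo.com.cn/finalpage/2025-05-26/1223675981.PDF.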

2. Scrape each announcement detail page directly and extract the concrete PDF URL from it, with no string assembly required.

Code language: python
from playwright.sync_api import sync_playwright
import requests
import time

url_list = [
    "https://www.cninfo.com.cn/new/disclosure/detail?stockCode=688041&announcementId=1223675981&orgId=9900048365&announcementTime=2025-05-26",
    "https://www.cninfo.com.cn/new/disclosure/detail?stockCode=688525&announcementId=1223676024&orgId=9900047412&announcementTime=2025-05-26",
    "https://www.cninfo.com.cn/new/disclosure/detail?stockCode=300972&announcementId=1223676983&orgId=gfbj0833260&announcementTime=2025-05-26",
]

def download_pdf(pdf_url, filename):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(pdf_url, headers=headers)
        if response.status_code == 200:
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"PDF saved as: {filename}")
        else:
            print(f"Download failed (status {response.status_code}): {pdf_url}")
    except Exception as e:
        print("Error downloading PDF:", e)

def process_url(page, url, index):
    print(f"\nProcessing link {index + 1}: {url}")
    try:
        page.goto(url, timeout=60000)
        page.wait_for_timeout(2000)  # give the page's JS a moment to load

        pdf_url = None

        # Method 1: look for an <embed> directly on the page
        embed = page.locator("embed.pdfobject")
        if embed.count() > 0:
            pdf_url = embed.first.get_attribute("src")
            print("Method 1: found the PDF directly on the page.")
        else:
            # Method 2: the PDF viewer may live inside an iframe
            iframes = page.locator("iframe")
            if iframes.count() == 0:
                print("No iframe and no embed on the page, skipping.")
                return

            print("Looking for the PDF inside an iframe...")
            try:
                # FrameLocator lets us query inside the first iframe directly
                frame = page.frame_locator("iframe").first
                embed_in_iframe = frame.locator("embed.pdfobject")
                if embed_in_iframe.count() > 0:
                    pdf_url = embed_in_iframe.first.get_attribute("src")
                    print("Method 2: found the PDF inside the iframe.")
                else:
                    print("No embed tag inside the iframe either.")
                    return
            except Exception as e:
                print("Failed to load or parse the iframe:", e)
                return

        if pdf_url:
            # Normalize relative src values into absolute URLs
            if pdf_url.startswith("/"):
                pdf_url = "https://www.cninfo.com.cn" + pdf_url
            elif not pdf_url.startswith("http"):
                pdf_url = "https://static.cninfo.com.cn" + pdf_url
            print("Extracted PDF URL:", pdf_url)
            filename = f"announcement_{index + 1}.pdf"
            download_pdf(pdf_url, filename)
        else:
            print("Failed to extract a PDF URL")
    except Exception as e:
        print("Error while processing the page:", e)

def batch_process(urls):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        page = browser.new_page()

        for i, url in enumerate(urls):
            process_url(page, url, i)
            time.sleep(0.5)  # avoid rate limiting / IP bans

        browser.close()

batch_process(url_list)
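
Note that this variant launches the browser with headless=False so you can watch the pages load while debugging; switch it to True for unattended runs.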

Here we take approach 1. With the PDF URLs in hand, we can start building the Dify workflow.

II. Multi-Page Scraping with Dify's Iteration Node

1. Edit the Start node

Add a paragraph-type variable to hold the list of URLs.

2. Add a Code node

It converts the input String variable into an Array[String]; a sketch follows.
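
A minimal sketch of what this Code node might look like, assuming the Start variable is named pdf_urls and contains one URL per line (the variable names are illustrative, not taken from the original workflow):

Code language: python

def main(pdf_urls: str) -> dict:
    # Split the newline-separated string into a list of URLs,
    # dropping blank lines and stray whitespace.
    url_list = [line.strip() for line in pdf_urls.splitlines() if line.strip()]
    # Dify Code nodes return their output variables as a dict
    return {"result": url_list}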

Output:

3. Configure the Iteration node

Be sure to define the input and output types explicitly: the input is the Array[String] produced by the Code node, and each iteration item is a single String.

4. Firecrawl single-page scrape

The URL to scrape is simply the iteration item; the sketch below shows roughly what the tool node does per item.
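
For reference, a minimal sketch of the per-item request, assuming the self-hosted Firecrawl from the previous article listens on http://localhost:3002 (endpoint, port, and API key here are assumptions; adapt them to your deployment):

Code language: python

import requests

def scrape_one(item_url: str) -> str:
    """Scrape one URL via the Firecrawl v1 API and return the page as Markdown."""
    resp = requests.post(
        "http://localhost:3002/v1/scrape",                # assumed self-hosted endpoint
        headers={"Authorization": "Bearer fc-YOUR-KEY"},  # placeholder API key
        json={"url": item_url, "formats": ["markdown"]},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["data"]["markdown"]

# Example: scrape one of the PDF links generated in part I
print(scrape_one("https://static.cninfo.com.cn/finalpage/2025-05-26/1223675981.PDF")[:300])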

5. Deploy the first LLM

It summarizes each scraped announcement's title and content.
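
As a purely illustrative prompt (not the author's original), something like "Extract the announcement title from the following Markdown and summarize its key content in three sentences" works for this node.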

6. Deploy the second LLM

It produces the overall summary and the sentiment analysis.
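
Again as an illustration only, a prompt along the lines of "Based on the per-announcement summaries, judge the sentiment of each (positive / neutral / negative) and output the analysis as a Markdown table" matches the Markdown output described at the start.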

Run result:

Success!

ref:

https://docs.dify.ai/zh-hans/guides/workflow/node/iteration

Original statement: This article was published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.

For infringement concerns, please contact cloudcommunity@tencent.com for removal.

