首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >使用Apify和Puppeteer抓取URL

使用Apify和Puppeteer抓取URL
EN

Stack Overflow用户
提问于 2020-02-28 10:59:27
回答 1查看 968关注 0票数 1

我正在尝试使用名为 "web-scraper" 的 Apify actor (https://apify.com/apify/web-scraper) 从 https://en.wikipedia.org/wiki/List_of_hedge_funds 中抓取URL。

具体地说,我尝试使用以下Apify pageFunction来抓取目标页面,并从HTML中出现的锚标记返回URL列表。

pageFunction

代码语言:javascript
复制
/**
 * Page function that collects the page title and the anchors matching a
 * table-cell selector from the page currently being processed.
 *
 * @param {object} context - Object supplied by the caller; this function reads
 *   `context.jQuery` (used as `$`) and `context.request.url`.
 * @returns {Promise<object>} Object with `url`, `pageTitle` and `anchorTag`.
 *   `anchorTag` is the jQuery collection itself, not a list of href strings.
 */
async function pageFunction( context ) {
    // NOTE: `url` is declared but never used below; the returned url comes from context.request.url.
    const url = 'https://en.wikipedia.org/wiki/List_of_hedge_funds';
    // Matches <a> elements that are direct children of a <td> inside a <tr>.
    const cssSelector = 'tr > td > a';

    const $ = context.jQuery;
    // Text of the first <title> element on the page.
    const pageTitle = $('title').first().text();
    // jQuery collection of matched elements; no href attribute is read here.
    const anchorTag = $( cssSelector );

    return {
      url: context.request.url,
      // anchorTag is returned as the raw jQuery object.
      pageTitle, anchorTag,
    };
}

在我的控制台中,我希望在一个名为anchorTag的属性中看到存在于目标页面上的一个或多个锚标记的href属性的值。我还希望在名为pageTitle的属性和url属性中看到页面标题。如下所示:

我希望看到的是:

代码语言:javascript
复制
{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
    "1": "http://example1.com",
    "2": "http://example2.com",
    "3": "http://example3.com",
    ...
    "39": "http://example39.com",
}}

但该 actor 返回的不是URL列表,而是以下数据集:

我实际看到的是:

代码语言:javascript
复制
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": {},
    "1": {},
    "2": {},
    "3": {},
    "4": {},
    "5": {},
    "6": {},
    "7": {},
    "8": {},
    "9": {},
    "10": {},
    "11": {},
    "12": {},
    "13": {},
    "14": {},
    "15": {},
    "16": {},
    "17": {},
    "18": {},
    "19": {},
    "20": {},
    "21": {},
    "22": {},
    "23": {},
    "24": {},
    "25": {},
    "26": {},
    "27": {},
    "28": {},
    "29": {},
    "30": {},
    "31": {},
    "32": {},
    "33": {},
    "34": {},
    "35": {},
    "36": {},
    "37": {},
    "38": {},
    "39": {},
    "length": 40,
    "prevObject": {
      "0": {
        "location": {
          "href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
          "ancestorOrigins": {},
          "origin": "https://en.wikipedia.org",
          "protocol": "https:",
          "host": "en.wikipedia.org",
          "hostname": "en.wikipedia.org",
          "port": "",
          "pathname": "/wiki/List_of_hedge_funds",
          "search": "",
          "hash": "",
          "assign": {},
          "reload": {},
          "toString": {},
          "replace": {}
        },
        "write": {},
        "writeln": {},
        "jQuery3410461525655351679551": {
          "events": {
            "mmv-setup-overlay": [
              {
                "type": "mmv-setup-overlay",
                "origType": "mmv-setup-overlay",
                "handler": {
                  "guid": 21
                },
                "guid": 21,
                "namespace": ""
              }
            ],
            "mmv-cleanup-overlay": [
              {
                "type": "mmv-cleanup-overlay",
                "origType": "mmv-cleanup-overlay",
                "handler": {
                  "guid": 22
                },
                "guid": 22,
                "namespace": ""
              }
            ],
            "keyup": [
              {
                "type": "keyup",
                "origType": "keyup",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseover": [
              {
                "type": "mouseover",
                "origType": "mouseover",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "focusout": [
              {
                "type": "focusout",
                "origType": "blur",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseout": [
              {
                "type": "mouseout",
                "origType": "mouseout",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "click": [
              {
                "type": "click",
                "origType": "click",
                "handler": {
                  "guid": 26
                },
                "guid": 26,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ]
          },
          "handle": {},
          "focusin": 1,
          "focusout": 1
        }
      },
      "length": 1
    }
  }
}]

我做错了什么?

EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2020-02-28 22:30:52

您必须访问a标记的href属性才能获得URL。此外,您还需要遍历所有a标记以将它们放入一个数组中。

代码语言:javascript
复制
// ...
const anchorTag = $( cssSelector );
const links = [];

// anchorTag is a jQuery handle, not a plain JavaScript value, so it exposes jQuery methods such as .each()
anchorTag.each((index, el) => {
    // Read the href attribute of the current element; only truthy values are kept.
    const link = $(el).attr('href');
    if (link) {
         links.push(link);
    }
})

// Return the collected href strings instead of the jQuery object.
return {
   url: context.request.url,
   pageTitle,
   links,
};
票数 2
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/60444638

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档