第一次尝试在 Node.js 中做网页抓取。Fandango 上的电影列表嵌套在好几层 div 中,这是否意味着我需要写类似 `$('div[id="page"]').find('div > div > div > div > ul > li').each(...)` 这样的代码?另外,当我在控制台打印 HTML 时,内容似乎和在 Chrome 中检查元素时看到的不一样:有些电影缺失,ul 的类名在日志里也不一样。这是正常的吗?
const axios = require('axios');
const cheerio = require('cheerio');
const url = 'https://www.fandango.com/movies-in-theaters';

// Fetch the listing page's static HTML and try to extract movie titles with cheerio.
// NOTE(review): the movie list appears to be rendered client-side, so the HTML
// returned by a plain HTTP request may not contain `ul.browse-movielist` at all;
// in that case `movies` is empty and the loop body never runs.
// The promise chain is intentionally left open here: the rejection handler
// follows on the next line of the file.
axios(url)
  .then(response => {
    const html = response.data;
    console.log(html);
    const $ = cheerio.load(html);
    const movies = $('ul.browse-movielist > li');
    const openingThisWeek = [];
    movies.each(function () {
      console.log("Found the list"); // this doesn't get called
      // Several classes on ONE element must be chained with dots. Separating
      // them with spaces would instead look for nested <browse-movielist--title>
      // and <poster-card--title> *tags*, which never match.
      const title = $(this)
        .find('.heading-style-1.browse-movielist--title.poster-card--title')
        .text();
      openingThisWeek.push({
        title,
      });
    });
    console.log(openingThisWeek);
  })
.catch(console.error);
发布于 2019-11-17 14:47:06
Fandango 对新上映的电影列表使用客户端渲染,所以仅靠 axios 请求无法获取到这些内容。
另一种方法是使用无头浏览器来抓取数据。我用的是 Puppeteer:
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
(async () => {
// Load the page in a headless browser so that its client-side scripts run,
// then hand the rendered DOM to cheerio for querying.
const url = "https://www.fandango.com/movies-in-theaters";
const browser = await puppeteer.launch();
let body;
try {
  const page = await browser.newPage();
  await page.goto(url);
  // Serialize the DOM as it exists in the browser at this point.
  body = await page.evaluate(() => document.body.outerHTML);
} finally {
  // Always shut the browser down, even if navigation or evaluation throws;
  // otherwise the spawned browser process is left running.
  await browser.close();
}
const $ = cheerio.load(body);
const movies = [];
// Collect the title text of every entry in the movie list.
$(".browse-movielist > li").each((i, item) => {
  const title = $(item).find(".poster-card--title").text();
  movies.push({ title });
});
console.log(movies);
})();
https://stackoverflow.com/questions/58897041
复制相似问题