以下程序的目的是抓取 CNN 网站,并将其所有文本写入单个文件(使用了两个第三方模块)。
运行时我得到了如下错误:
RangeError: Maximum call stack size exceeded
如何排查此问题,又该如何绕过它?有什么办法可以“释放”内存吗?该怎么做?
//----------Configuration--------------
var startingUrl = "http://cnn.com"; // keep the http/https or www prefix
var crawlingDepth = 50; // passed as the crawler's `depth` option, so it is kept as a number rather than a string
var outputFileName = "cnn.txt"; // the text of every crawled page is appended to this file
//-------------------------------------
var Crawler = require("js-crawler");
var sanitizeHtml = require('sanitize-html');
var htmlToText = require('html-to-text');
var fs = require('fs');
var index = 0;
new Crawler().configure({depth: crawlingDepth})
.crawl(startingUrl, function onSuccess(page) {
var text = htmlToText.fromString(page.body, {
wordwrap: false,
hideLinkHrefIfSameAsText: true,
ignoreHref: true,
ignoreImage: true
});
index++;
console.log(index + " pages were crawled");
fs.appendFile(outputFileName, text, function (err) {
if (err) {
console.log(err);
};
console.log('It\'s saved! in same location.');
});
});发布于 2015-07-26 19:34:10
1)这是一个递归深度问题。
2)必须避免这种过深的递归。
3)下面的代码仅用于说明思路(概念示例):
var crawlingDepth = 3; // number of levels to crawl
var Urls = [
  ["http://cnn.com/"]
]; // URLs to crawl, one inner array per level; level 0 holds the start URL
var crawledUrls = {}; // URLs that were already crawled, stored as keys
var depth = 0; // level currently being crawled
var index = 0; // position inside the current level
var Crawler = require("js-crawler");
/**
 * Crawls the URL at Urls[depth][index], queues the same-site links found on
 * that page for the next level, and then schedules itself for the next URL.
 *
 * Takes no arguments: it reads and updates the surrounding variables Urls,
 * crawledUrls, depth, index and crawlingDepth. The next step is scheduled
 * with setTimeout(crawling, 0) rather than called directly, so each step
 * starts from a fresh call stack.
 *
 * Stops when crawlingDepth levels have been processed or when the next
 * level contains no URLs.
 */
function crawling() {
  // Nothing queued at this position (for example an empty level): stop.
  if (typeof Urls[depth] === "undefined" || index >= Urls[depth].length) {
    return;
  }

  var currentUrl = Urls[depth][index];
  console.log(depth, index, currentUrl);

  // Prepare next level
  if (typeof Urls[depth + 1] === "undefined") Urls.push([]);
  var nextLevel = Urls[depth + 1];

  // Already crawled flag
  crawledUrls[currentUrl] = true;

  new Crawler().configure({depth: 1}).crawl({
    url: currentUrl,
    success: function(page) {
      // Do something with the crawled page here.

      // Collect the URLs found on the crawled page.
      // NOTE(review): _getAllUrls looks like a private js-crawler helper
      // (underscore prefix) - confirm it exists in the installed version.
      var urls = Crawler.prototype._getAllUrls(page.url, page.body);
      for (var j = 0; j < urls.length; j++) {
        var candidate = urls[j];
        // Keep only URLs that start with the start URL and were not seen yet.
        if (typeof crawledUrls[candidate] === "undefined"
            && candidate.indexOf(Urls[0][0]) === 0) {
          // Mark at queue time, so a link that appears several times (on
          // this page or on other pages) is queued and crawled only once.
          crawledUrls[candidate] = true;
          nextLevel.push(candidate);
        }
      }
    },
    failure: function(page) {
      // Report the failure instead of dropping it silently; crawling goes on.
      console.log("Failed to crawl " + currentUrl);
    },
    finished: function(crawled) {
      index++;
      if (index < Urls[depth].length) {
        // More URLs remain on the current level.
        setTimeout(crawling, 0);
        return;
      }
      // Current level is done: move on to the next one.
      depth++;
      index = 0;
      // Continue only while the depth limit allows it and there is something
      // queued; otherwise the crawl is finished.
      if (depth < crawlingDepth && Urls[depth].length > 0) {
        setTimeout(crawling, 0);
      }
    }
  });
}
crawling();
https://stackoverflow.com/questions/31640599
复制相似问题