I have been using Scrapy for a bit over a year now; the script was written for me by someone else. It ran fine the whole time until 6-8 weeks ago, when it started giving me the error below whenever I try to download. Does anyone have any ideas?
I am running this on Ubuntu 14.04 LTS.
Command: scrapy crawl googleplay
2015-08-30 13:10:31-0400 [googleplay] ERROR: Spider error processing <GET https://accounts.google.com/ServiceLogin?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&followup=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&passive=1209600&service=googleplay>
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 595, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 472, in _oneWorkUnit
result = self._iterator.next()
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 129, in extract_links
links = self._extract_links(body, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 29, in _extract_links
self.feed(response_text)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 174, in goahead
k = self.parse_declaration(i)
File "/usr/lib/python2.7/markupbase.py", line 98, in parse_declaration
decltype, j = self._scan_name(j, i)
File "/usr/lib/python2.7/markupbase.py", line 392, in _scan_name
% rawdata[declstartpos:declstartpos+20])
File "/usr/lib/python2.7/sgmllib.py", line 111, in error
raise SGMLParseError(message)
sgmllib.SGMLParseError: expected name token at '<!\\\\])/g,"\\\\$1").rep'

Below is my GooglePlay spider (after updating it), along with the error message I am now getting:
import re        # used by the re.match calls below; missing from the original post
import string
import requests

from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

from scraper.items import ApkItem
from play import parse_app


class GooglePlaySpider(CrawlSpider):
    name = 'googleplay'
    start_urls = [
        'https://play.google.com/store/apps'
    ]

    # Note: the import above was changed to HtmlParserLinkExtractor, but the
    # rules still reference SgmlLinkExtractor; this mismatch is what raises
    # the NameError shown below.
    rules = (
        Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
        Rule(SgmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
        Rule(SgmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
    )

    def parse_category_group(self, response):
        sel = Selector(response)
        category_groups = sel.xpath('//div[@class="padded-content3 app-home-nav"]')

        for category_group in category_groups:
            category_group_name = category_group.xpath('h2/a/text()').extract()
            categories = category_group.xpath('ul/li')
            for category in categories:
                category_name = category.xpath('a/text()').extract()
                category_url = category.xpath('a/@href').extract()[0]

        # Enumerate search queries of one, two and three characters.
        chars = string.ascii_uppercase + string.digits
        for x in chars:
            yield Request('https://play.google.com/store/search?q=' + x + '&c=apps', callback=self.parse_search)

        for x in chars:
            for y in chars:
                yield Request('https://play.google.com/store/search?q=' + x + y + '&c=apps', callback=self.parse_search)

        for x in chars:
            for y in chars:
                for z in chars:
                    yield Request('https://play.google.com/store/search?q=' + x + y + z + '&c=apps', callback=self.parse_search)

        return

    def parse_category(self, response):
        base_path = response.url.split('?')[0]

        if '/collection/' in response.url:
            sel = Selector(response)
            apps = sel.xpath('//a[@class="title"]')
            has_app = False

            for app in apps:
                has_app = True
                app_name = app.xpath('text()').extract()
                app_url = app.xpath('@href').extract()
                yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)

            # Follow the next page of the collection, 24 apps at a time.
            if has_app:
                m = re.match(r'(.*)\?start=(\d+)&num=24', response.url)
                if m is None:
                    start_number = 24
                else:
                    start_number = int(m.group(2)) + 24
                yield Request(base_path + '?start=' + str(start_number) + '&num=24', callback=self.parse_category)

        return

    def parse_search(self, response):
        m = re.match(r'(.*)&start=(\d+)&num=24', response.url)
        if m is None:
            base_path = response.url
            start_number = 24
        else:
            start_number = int(m.group(2)) + 24
            base_path = m.group(1)

        sel = Selector(response)
        apps = sel.xpath('//a[contains(@href,"/store/apps/details")]')
        has_app = False

        for app in apps:
            has_app = True
            app_url = app.xpath('@href').extract()
            yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)

        if has_app:
            yield Request(base_path + '&start=' + str(start_number) + '&num=24', callback=self.parse_search)

        return
**** Error ****
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 12, in <module>
class GooglePlaySpider(CrawlSpider):
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 18, in GooglePlaySpider
Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
NameError: name 'SgmlLinkExtractor' is not defined

Posted on 2015-10-02 13:47:19
The problem is that SgmlLinkExtractor has trouble with comments. The error message tells you the parser hit something that starts like a comment: <!.
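To see what the parser objects to, here is a minimal Python 2 repro sketch (added here for illustration, not part of the original post): the input is the exact fragment quoted in the traceback, and SGMLParser and SGMLParseError come from the standard-library sgmllib module that SgmlLinkExtractor is built on.

from sgmllib import SGMLParser, SGMLParseError

# "<!" opens an SGML declaration; since no valid name token follows it,
# the parser's error() hook fires and raises SGMLParseError, exactly as
# in the traceback above.
parser = SGMLParser()
try:
    parser.feed('<!\\\\])/g,"\\\\$1").rep')
except SGMLParseError as e:
    print(e)  # expected name token at '<!\\\\])/g,"\\\\$1").rep'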
The solution, therefore, is to change the link extractor your spider uses and replace SgmlLinkExtractor with
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor

or

from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor

Of course, these are only the import statements; you also have to change the Rule definitions that use the link extractor so that they use one of these extractors.
Without your code I cannot give you more concrete advice on which parts to change.
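The code is in fact posted above, and the second traceback pinpoints the spot: the import was switched to HtmlParserLinkExtractor, but the rules tuple still names SgmlLinkExtractor, which is exactly why Python raises the NameError. Below is a minimal sketch of the rule change, reusing the allow patterns from the spider above. One caveat, to the best of my knowledge: LxmlParserLinkExtractor is the low-level helper and does not take allow= patterns; the Rule-compatible class in the same module is LxmlLinkExtractor (available in Scrapy 0.24 and later), so the sketch uses that.

from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

# Replaces both the import and the rules tuple in the spider above;
# the extractor class has to change in both places.
rules = (
    Rule(LxmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
    Rule(LxmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
    Rule(LxmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
)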
https://stackoverflow.com/questions/32896037