I am working on a scraping project that collects video game product information and reviews from Metacritic. The data I want lives on different pages, and I want to scrape the product information into one CSV and the reviews into a different CSV. Because of this, my code is more complex than "scrape the data, yield the item": I need to yield one kind of item (product info), then yield a request to the game's review page, and yield a second kind of item (product reviews).
My current code runs, but the scraped data gets printed to the Anaconda prompt terminal window while the CSV files stay empty. All of the data is being scraped correctly, since I can see it in my terminal; the problem seems to be how the items are yielded and then handled in pipelines.py.
Below is the code for items.py, game_spider.py, and pipelines.py. The spider code has been trimmed to only the relevant parts, since it is quite long and complex.
items.py:
import scrapy

class GameItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    platform = scrapy.Field()
    genres = scrapy.Field()
    release_date = scrapy.Field()
    ESRB_rating = scrapy.Field()
    summary = scrapy.Field()
    average_user_score = scrapy.Field()
    metascore = scrapy.Field()
    developer = scrapy.Field()
    publisher = scrapy.Field()

class ReviewItem(scrapy.Item):
    title = scrapy.Field()
    platform = scrapy.Field()
    username = scrapy.Field()
    score = scrapy.Field()
    date = scrapy.Field()
    review_text = scrapy.Field()
    critic_flag = scrapy.Field()
game_spider.py:
from scrapy import Spider, Request
from games.items import GameItem, ReviewItem

class GameSpider(Spider):
    name = 'game_spider'
    allowed_urls = ['https://www.metacritic.com']
    start_urls = ['https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0']

    def parse(self, response):
        page_urls = ...  # scrape all result pages
        for url in page_urls:
            yield Request(url=url, callback=self.parse_game_urls, dont_filter=True)

    def parse_game_urls(self, response):
        game_urls = ...  # scrape each game url from each result page
        for url in game_urls:
            yield Request(url=url, callback=self.parse_game_page, dont_filter=True)

    def parse_game_page(self, response):
        # scrape game info
        item = GameItem()
        item['url'] = url
        item['title'] = title
        item['platform'] = platform
        item['genres'] = genres
        item['release_date'] = release_date
        item['ESRB_rating'] = ESRB_rating
        item['summary'] = summary
        item['average_user_score'] = average_user_score
        item['metascore'] = metascore
        item['developer'] = developer
        item['publisher'] = publisher
        yield item
        user_review_page = ...  # scrape url to review page
        yield Request(url=user_review_page, callback=self.parse_user_reviews, dont_filter=True)

    def parse_user_reviews(self, response):
        reviews = ...  # scrape all reviews
        for review in reviews:
            # scrape review info
            item = ReviewItem()
            item['title'] = title
            item['platform'] = platform
            item['username'] = username
            item['score'] = int(score)
            item['date'] = date
            item['review_text'] = review_text
            item['critic_flag'] = 0
            yield item
pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

class GamesPipeline(object):
    def __init__(self):
        self.fileNamesCsv = ['GameItem', 'ReviewItem']
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(name + '.csv', 'wb')) for name in self.fileNamesCsv])
        for name in self.fileNamesCsv:
            self.exporters[name] = CsvItemExporter(self.files[name])
            if name == 'GameItem':
                self.exporters[name].fields_to_export = ['url', 'title', 'platform', 'genres', 'release_date',
                                                         'ESRB_rating', 'summary', 'average_user_score',
                                                         'metascore', 'developer', 'publisher']
                self.exporters[name].start_exporting()
            if name == 'ReviewItem':
                self.exporters[name].fields_to_export = ['title', 'platform', 'username', 'score', 'date',
                                                         'review_text', 'critic_flag']
                self.exporters[name].start_exporting()

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        typesItem = type(item)
        if typesItem in set(self.fileNamesCsv):
            self.exporters[typesItem].export_item(item)
        return item

In case it helps, this is what the terminal output looks like:
(base) C:\Users\bdbot\Desktop\games>scrapy crawl game_spider
2020-07-07 17:26:03 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: games)
2020-07-07 17:26:03 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.7, Platform Windows-10-10.0.18362-SP0
2020-07-07 17:26:03 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'games', 'DOWNLOAD_DELAY': 2, 'NEWSPIDER_MODULE': 'games.spiders', 'SPIDER_MODULES': ['games.spiders'], 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
2020-07-07 17:26:03 [scrapy.extensions.telnet] INFO: Telnet Password: 51cb3c8116353545
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled item pipelines:
['games.pipelines.GamesPipeline']
2020-07-07 17:26:03 [scrapy.core.engine] INFO: Spider opened
2020-07-07 17:26:03 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-07 17:26:03 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-07 17:26:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0> (referer: None)
2020-07-07 17:26:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=129> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:18 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=126> (failed 1 times): 504 Gateway Time-out
2020-07-07 17:26:19 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=125> (failed 1 times): 504 Gateway Time-out
2020-07-07 17:26:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=128> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:25 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=127> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=124> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=123> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=122> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=121> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=117> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=120> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=119> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/game/xbox/burnout-3-takedown> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.metacritic.com/game/xbox/burnout-3-takedown>
{'ESRB_rating': 'T',
'average_user_score': 7.6,
'developer': 'Criterion Games',
'genres': 'Driving, Racing, Arcade',
'metascore': 94.0,
'platform': 'Xbox',
'publisher': 'EA Games',
'release_date': 'Sep 7, 2004',
'summary': 'Burnout 3 challenges you to crash into (and through) busy '
'intersections, while creating as much damage as possible. You can '
'battle your way to the front of the pack by taking down rivals '
'and causing spectacular crashes. For those who thirst for '
'crashes, the game includes a crash mode that rewards you for '
'creating massive pileups. With multiplayer gameplay, more than '
'100 events, and 40 tracks, Burnout 3 provides intense speed and '
'action.',
'title': 'Burnout 3: Takedown',
'url': 'https://www.metacritic.com/game/xbox/burnout-3-takedown'}
Finished Scraping Burnout 3: Takedown
2020-07-07 17:26:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/game/playstation-4/assassins-creed-chronicles-india> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=129)
2020-07-07 17:26:50 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.metacritic.com/game/playstation-4/assassins-creed-chronicles-india>

And so on, for every game item and every review item. They all print to the terminal window.
Posted on 2020-07-08 07:14:28
Try printing the absolute path of the newly created CSV files to double-check where they are being created. Here is some pseudocode:
# pipelines.py file
import os
...

def spider_opened(self, spider):
    self.files = dict([(name, open(name + '.csv', 'wb')) for name in self.fileNamesCsv])
    for name in self.fileNamesCsv:
        print(os.path.realpath(self.files[name].name))  # new
        self.exporters[name] = CsvItemExporter(self.files[name])
...
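One thing worth keeping in mind: because the files are opened with bare relative filenames, they are created in whatever working directory the crawl was launched from, so the printed paths will show whether you have been checking the right files.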
Posted on 2020-07-09 20:43:46

Rewriting my pipelines.py into two separate classes solved my problem. (The original process_item compared type(item), which is the item class itself, against the strings in fileNamesCsv, so the check never matched and export_item was never called.)
class GamesPipeline(object):
    def __init__(self):
        self.filename = 'games.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        if isinstance(item, GameItem):
            self.exporter.export_item(item)
        return item

class ReviewsPipeline(object):
    def __init__(self):
        self.filename = 'game_reviews.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        if isinstance(item, ReviewItem):
            self.exporter.export_item(item)
        return item
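For both CSVs to be written, the two pipeline classes also have to be registered in the project's settings.py; the crawl log above shows only games.pipelines.GamesPipeline enabled. A minimal sketch, assuming the same games package shown in the log and arbitrary priority values:

# settings.py
ITEM_PIPELINES = {
    'games.pipelines.GamesPipeline': 300,
    'games.pipelines.ReviewsPipeline': 400,
}

Every item then flows through both pipelines in priority order, which is why each process_item returns the item even when its isinstance check does not match.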
https://stackoverflow.com/questions/62785856