我正在努力刮一家英国知名零售商的网站,但我的CrawlSpider面临一个问题-我收到了以下错误信息:
AttributeError: NlCrawlerSpider对象没有属性“_rules”
我使用示例这里将我的常规蜘蛛转换为爬行蜘蛛;我还按照建议的这里对规则进行了语法处理,但最终得到了相同的错误消息。所有您的帮助将是非常感谢-谢谢您提前!
# Scrapy
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Other Packages
import time
from datetime import date
from selenium import webdriver
class NlCrawlerSpider(CrawlSpider):
name = 'nl_crawler'
allowed_domains = ['newlook.com']
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%7Cmn%7Cwomens%7Cclothing#/?q=:relevance&page=1&sort=relevance&content=false']
rules = (
Rule(LinkExtractor(allow=r'\?q=:relevance&page=[1-130]&sort=relevance&content=false', ), callback='parse_item', follow=True),
)
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
time.sleep(2)
def parse_item(self, response):
driver = self.driver
driver.get(response.url)
time.sleep(2)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# Price Symbol removal and integer conversion
try:
priceString = item.find_element_by_class_name('price ng-binding').text
except:
priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
priceInt = priceString.split('£')[1]
originalPrice = float(priceInt)
# discountedPrice Logic
try:
discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
discountedPriceInt = discountedPriceString.split('£')[1]
discountedPrice = float(discountedPriceInt)
except:
discountedPrice = 'N/A'
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['identifier'] = identifier
item['href'] = href
item['description'] = desc
item['originalPrice'] = originalPrice
item['discountedPrice'] = discountedPrice
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item添加:因此,我试图忽略使用crawlSpider的想法,并遵循@jabargas的想法--参见下面的内容:
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
def start_requests(self):
n = 5
urls= []
for pageNumber in range(1,n):
url = 'http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % pageNumber
urls.append(url)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
driver = self.driver
driver.get(response.url)
time.sleep(2)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# Price Symbol removal and integer conversion
try:
priceString = item.find_element_by_class_name('price ng-binding').text
except:
priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
priceInt = priceString.split('£')[1]
originalPrice = float(priceInt)
# discountedPrice Logic
try:
discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
discountedPriceInt = discountedPriceString.split('£')[1]
discountedPrice = float(discountedPriceInt)
except:
discountedPrice = 'N/A'
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['identifier'] = identifier
item['href'] = href
item['description'] = desc
item['originalPrice'] = originalPrice
item['discountedPrice'] = discountedPrice
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item不幸的是,没有运气:它提取了48个项目的细节。
发布于 2017-08-30 18:48:44
你可以这样做来刮到第n页:
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % page_number' for page_number in range(1,n)]其中n是最后一页+1
或者,您可以使用刮伤分页-获取到下一页的链接,并跟随它,因为你可以找到这里。
发布于 2020-02-16 11:08:24
另一个可能的问题是您还没有在init方法中添加超级构造函数。
为它添加“self).init(*a,(MySpider,**kw)”。
我得到了同样的问题,并通过这个解决了问题。
因此,init应该如下所示
def __init__(self, *a, **kw):
super(MySpider, self).__init__(*a, **kw)
//your initializationshttps://stackoverflow.com/questions/45967157
复制相似问题