出于学习目的,我正在练习网络抓取,目前还处于入门水平。我遇到的问题是:运行爬虫时,只有少数条目抓到了数据,其余条目都是空的(null)。
这是代码:
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class MercadolibreItem(scrapy.Item):
    """Container for the data scraped from one page of the listing."""

    # Title text; the spider fills it from the listing's <h2> title element.
    descripcion = scrapy.Field()


# File: mercadolibreperu.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mercadolibre.items import MercadolibreItem
class MercadolibreperuSpider(CrawlSpider):
    """Crawl a MercadoLibre Peru search listing and emit one item per page.

    Starting from ``start_urls``, the links found inside the
    ``results-section`` element are followed (``follow=True``) and each
    response is handed to :meth:`parse_item`.
    """

    name = 'mercadolibreperu'
    allowed_domains = ['mercadolibre.com.pe']
    start_urls = ['https://listado.mercadolibre.com.pe/lima/mascarilla-n95_ITEM*CONDITION_2230284']

    rules = (
        Rule(
            # Only extract links located inside the results section.
            LinkExtractor(
                restrict_xpaths=(
                    '//section[@id="results-section"]',
                ),
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Return a single item holding the first listing title of the page.

        NOTE: ``.get()`` returns only the first match, and ``None`` when the
        XPath matches nothing, so every crawled page that lacks such a title
        still produces an item whose ``descripcion`` is null.
        """
        item = MercadolibreItem()
        item['descripcion'] = response.xpath(
            '//h2[@class="item__title list-view-item-title"]/a/span/text()'
        ).get()
        return item


# Result: mercadolibre.json
[
{"descripcion": null},
{"descripcion": " Mascarillas N95: 3m - 8210 Oferta!!! "},
{"descripcion": " Mascarillas N95 "},
{"descripcion": " Agotado Mascarillas N95 Sin Filtro "},
{"descripcion": " Mascarilla Steelpro M920v - N95 ( Caja De 10 Uni) Oferta "},
{"descripcion": " Mascarillas 3m 8511 N95 "},
{"descripcion": " Mascarillas N95 "},
{"descripcion": " Mascarillaa 3m N95 Por Unidad "},
{"descripcion": null},
{"descripcion": " Mascarilla N95 - 3m "},
{"descripcion": " Respirador N95 Normado Kimberly Clark (mascarilla) "},
{"descripcion": null},
{"descripcion": " Mascarillan95 "},
{"descripcion": " Mascarillas N95 Certificadas "},
{"descripcion": " Mascarilla N95 3m "},
{"descripcion": null},
{"descripcion": " Mascarilla N95 Segre Ffp2 "},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": " Mascarillas 3m N95 Modelo 8210 Certificada "},
{"descripcion": null},
{"descripcion": " Mascarilla N95 "},
{"descripcion": " Mascarillas N95 3m 1860 X Unidades A 43 Soles "},
{"descripcion": null},
{"descripcion": " Venta De Mascarilla 3m N95. "},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": " Mascarilla Respirador N95 X Unidad Gerson Equivale A 3m 8210 "},
{"descripcion": null},
{"descripcion": " Mascarilla 3m N95 1860 Originales S/.70 Unid.y 1100 Caja "},
{"descripcion": " Mascarilla Tipo N 95 C\u00f3nica Caja 50 Unidades "},
{"descripcion": " Mascarilla K N95 Selladas Y Certificada Ce, Fda. "},
{"descripcion": " Mascarilla D95 Selladas Original Tip N95 (entrega Inmediata) "},
{"descripcion": null},
{"descripcion": null},
{"descripcion": null},
{"descripcion": " Mascarilla 3m Respirador 8210 N95 Caja X 20 Unid Orginal "}
]
请帮我看看这个问题。谢谢!
发布于 2020-04-23 00:40:42
结果页面上有很多元素,但 .get() 只会返回页面上的第一个匹配元素。你需要使用 .getall() 并配合 for 循环,在循环中为每个元素分别 yield 一个 MercadolibreItem()。
在测试中,我使用普通字典而不是MercadolibreItem()。
def parse_item(self, response):
    """Yield one item per listing title found in ``response``.

    Unlike ``.get()``, ``.getall()`` returns every match, so each title on
    the page becomes its own item and a page without titles yields nothing.

    Args:
        response: Scrapy response (anything exposing ``xpath()``).

    Yields:
        dict: ``{'descripcion': <title text>}`` for each matched title.
    """
    titles = response.xpath(
        '//h2[@class="item__title list-view-item-title"]/a/span/text()'
    ).getall()
    for element in titles:
        # A plain dict is used for testing; inside a Scrapy project a
        # MercadolibreItem() can be filled in the same way.
        yield {'descripcion': element}


# However, to collect several values per listing it is better to select the
# container elements themselves (without get() or getall()) and then use
# relative XPaths (starting with a dot) to look up the name and the
# description inside each container separately.
如果先搜索所有名称、再搜索所有描述,得到的两个列表长度可能不同(某些对象可能缺少名称或描述),这样就很难把名称和描述正确地一一配对。
def parse_item(self, response):
    """Yield one item per listing container, queried with relative XPaths.

    Each value is looked up inside its own container, so a missing value
    shows up as ``None`` in that item instead of shifting the other values.

    Args:
        response: Scrapy response (anything exposing ``xpath()``).

    Yields:
        dict: keys ``title``, ``price_symbol`` and ``price_fraction``.
    """
    # NOTE(review): @class="..." is an exact match on the whole attribute, so
    # only containers with precisely this class string are selected; a
    # contains(@class, "results-item") test may be what is wanted - verify
    # against the live page before changing it.
    listings = response.xpath(
        '//li[@class="results-item highlighted article stack item-without-installmets"]'
    )
    for element in listings:
        # A plain dict is used for testing; inside a Scrapy project a
        # MercadolibreItem() with matching fields can be used instead.
        yield {
            'title': element.xpath('.//span[@class="main-title"]//text()').get(),
            'price_symbol': element.xpath('.//span[@class="price__symbol"]//text()').get(),
            'price_fraction': element.xpath('.//span[@class="price__fraction"]//text()').get(),
        }


# Minimal working code: it can be put in a single file and run without
# creating a Scrapy project.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
#from mercadolibre.items import MercadolibreItem
class MercadolibreperuSpider(CrawlSpider):
    """Single-file spider for a MercadoLibre Peru search listing.

    Starting from ``start_urls``, the links found inside the
    ``results-section`` element are followed (``follow=True``) and each
    response is handed to :meth:`parse_item`. Items are plain dicts, so no
    Scrapy project (and no ``items.py``) is needed.
    """

    name = 'mercadolibreperu'
    allowed_domains = ['mercadolibre.com.pe']
    start_urls = ['https://listado.mercadolibre.com.pe/lima/mascarilla-n95_ITEM*CONDITION_2230284']

    rules = (
        Rule(
            # Only extract links located inside the results section.
            LinkExtractor(
                restrict_xpaths=(
                    '//section[@id="results-section"]',
                ),
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item_old(self, response):
        """Yield one ``{'descripcion': ...}`` item per listing title.

        Kept as the earlier, title-only variant; it is not referenced by
        ``rules``. It yields plain dicts because ``MercadolibreItem`` is not
        imported in this single-file version.
        """
        titles = response.xpath(
            '//h2[@class="item__title list-view-item-title"]/a/span/text()'
        ).getall()
        for element in titles:
            yield {'descripcion': element}

    def parse_item(self, response):
        """Yield one item per listing container, queried with relative XPaths.

        Each value is looked up inside its own container, so a missing value
        shows up as ``None`` in that item instead of shifting the others.
        """
        listings = response.xpath(
            '//li[@class="results-item highlighted article stack item-without-installmets"]'
        )
        for element in listings:
            yield {
                'title': element.xpath('.//span[@class="main-title"]//text()').get(),
                'price_symbol': element.xpath('.//span[@class="price__symbol"]//text()').get(),
                'price_fraction': element.xpath('.//span[@class="price__fraction"]//text()').get(),
            }
from scrapy.crawler import CrawlerProcess

# Run the spider straight from this script instead of via the scrapy CLI.
c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # Save the scraped items to a file; the format may be csv, json or xml.
    # NOTE(review): Scrapy 2.1+ replaces FEED_FORMAT / FEED_URI with the
    # FEEDS setting - confirm against the installed Scrapy version.
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
})
c.crawl(MercadolibreperuSpider)
c.start()

# Source: https://stackoverflow.com/questions/61376200
复制相似问题