我正在用 Scrapy 爬取 https://www.rwaq.org/courses 这个网站,但是当我把所有数据写入 CSV 文件时,有些数据没有出现在正确的位置。
这个网站是阿拉伯语的,是关于在线课程的
我想先抓取列表首页的信息,然后进入讲师简介(bio)页面继续抓取,同时也想抓取课程详情页的内容。
我试过下面这段代码,但有些信息错位不一致。我感觉自己快要找到修复方法了,却始终差一点——你懂那种感觉吧。
# -*- coding: utf-8 -*-
import scrapy
from ..items import RwaqItem
class XrwaqSpider(scrapy.Spider):
    """Scrape course listings from rwaq.org.

    ``parse`` handles the listing page and yields one item per course
    card; lecturer-bio pages are followed separately via ``parse_bio``.
    """
    name = 'xrwaq'
    start_urls = ['https://www.rwaq.org/courses']

    def parse(self, response):
        """Parse the course-list page.

        Yields a populated RwaqItem per course card, plus follow-up
        requests for each lecturer bio link found on the card.
        """
        container = response.css('#courses .row-fluid')
        base = 'https://www.rwaq.org'
        for t in container:
            # BUG FIX: a fresh item per course card. The original created
            # ONE RwaqItem before the loop and mutated it every iteration,
            # which is what scrambled the exported CSV rows.
            items = RwaqItem()
            course_name = t.css('h3 a::text').extract()
            # NOTE(review): the fixed slice offsets ([12:], [3:17], [28:])
            # assume a stable Arabic label prefix in the page text — confirm
            # against the live markup before relying on them.
            course_time = [i.strip()[12:] for i in t.css('.course-info span::text').extract()]
            course_date_from = [c.strip()[3:17] for c in t.css('.subject-date::text').extract()]
            course_date_to = [c.strip()[28:] for c in t.css('.subject-date::text').extract()]
            course_lecturer_name = t.css('.lecturer-data span::text').extract()
            course_lecturer_bio_link = t.css("div.lecturer-data a::attr(href)").extract()
            course_category = t.css('.course-list-cat::text').extract()
            course_date = [r.strip() for r in t.css('p span::text').extract()]
            course_price = t.css('.course-price-tag-inner::text').extract()
            course_price_tag = t.css('.course-price-tag-inner span::text').extract()
            price = ''.join(course_price + course_price_tag)
            # BUG FIX: url1 was assigned only inside a loop over the link
            # list, so it was unbound (NameError) when a card had no link
            # and otherwise carried the last value across iterations.
            course_link = t.css('h3 a::attr(href)').extract_first()
            url1 = base + course_link if course_link else ''
            # yield response.follow(url1, callback=self.parse_course)
            for item in course_lecturer_bio_link:
                # Bio pages are yielded as separate items by parse_bio.
                yield response.follow(base + item, callback=self.parse_bio)
            items['course_name'] = course_name
            items['course_time'] = course_time
            items['course_date'] = course_date
            items['course_date_from'] = course_date_from
            items['course_date_to'] = course_date_to
            items['course_lecturer_name'] = course_lecturer_name
            items['course_category'] = course_category
            items['course_link'] = url1
            items['price'] = price
            yield items

    def parse_bio(self, response):
        """Parse a lecturer bio page into its own item."""
        items = RwaqItem()
        bio_title = [we.strip() for we in response.css('div.page-title p::text').extract()]
        items['bio_title'] = bio_title
        yield items

    def parse_course(self, response):
        """Parse a course-detail page (currently unused: nothing is yielded)."""
        items = RwaqItem()
        course_promo_link = response.css('iframe::attr(src)').extract()
        course_desc = response.css('#summary_truncated p::text').extract()
        course_material = response.css('#organization > div.course-content > div:nth-child(4) ul li::text').extract()
        course_require = response.css('#organization > div.course-content > div:nth-child(5) ul li::text').extract()
        course_out = response.css('#organization > div.course-content > div:nth-child(6) ul li::text').extract()
        course_company = response.css('div.subject-organization p a::text').extract()
        # items['course_promo_link'] = course_promo_link
        # items['course_desc'] = course_desc
        # items['course_material'] = course_material
        # items['course_require'] = course_require
        # items['course_out'] = course_out
        # items['course_company'] = course_company
        # yield items
我认为主要的问题是:您在不同页面各自 yield 了独立的 item,而没有把不同页面的结果串联起来。一种可行的做法是把 item 放进请求的 meta 中传递,只有在收集到完整的 item 信息后才 yield 它。下面的代码可以作为起点——我还去掉了(至少在我看来)不必要的循环,并在合适的地方用 extract_first() 代替了 extract()。
# -*- coding: utf-8 -*-
import scrapy
from ..items import RwaqItem
class XrwaqSpider(scrapy.Spider):
    """Scrape rwaq.org course listings.

    Chains listing -> course-detail -> lecturer-bio pages, carrying the
    partially-filled item through ``request.meta`` so each yielded item
    is complete (the fix for misaligned CSV rows).
    """
    name = 'xrwaq'
    start_urls = ['https://www.rwaq.org/courses']

    def parse(self, response):
        """Parse the listing page; start one request chain per course card."""
        base = 'https://www.rwaq.org'
        for t in response.css('#courses .row-fluid'):
            items = RwaqItem()
            items['course_name'] = t.css('h3 a::text').extract_first()
            # extract_first(default='') prevents AttributeError on .strip()
            # when a selector matches nothing on a card.
            # NOTE(review): the slice offsets assume a fixed label prefix in
            # the Arabic page text — confirm against the live markup.
            items['course_time'] = t.css('.course-info span::text').extract_first(default='').strip()[12:]
            items['course_date_from'] = t.css('.subject-date::text').extract_first(default='').strip()[3:17]
            items['course_date_to'] = t.css('.subject-date::text').extract_first(default='').strip()[28:]
            items['course_lecturer_name'] = t.css('.lecturer-data span::text').extract_first()
            items['course_category'] = t.css('.course-list-cat::text').extract_first()
            items['course_date'] = t.css('p span::text').extract_first(default='').strip()
            course_price = t.css('.course-price-tag-inner::text').extract_first()
            course_price_tag = t.css('.course-price-tag-inner span::text').extract_first()
            # Join price parts only when both exist (free courses lack one).
            if course_price and course_price_tag:
                items['price'] = course_price + course_price_tag
            else:
                items['price'] = ''
            course_link = t.css('h3 a::attr(href)').extract_first()
            if not course_link:
                # No detail link: nothing to chain, skip this card.
                continue
            url1 = base + course_link
            items['course_link'] = url1
            meta = {'items': items,
                    'bio_url': t.css("div.lecturer-data a::attr(href)").extract_first()}
            yield response.follow(url1,
                                  meta=meta,
                                  callback=self.parse_course)

    def parse_course(self, response):
        """Fill course-detail fields, then follow the lecturer bio page."""
        items = response.meta['items']
        items['course_promo_link'] = response.css('iframe::attr(src)').extract_first()
        items['course_desc'] = response.css('#summary_truncated p::text').extract_first()
        items['course_material'] = response.css('#organization > div.course-content > div:nth-child(4) ul li::text').extract_first()
        items['course_require'] = response.css('#organization > div.course-content > div:nth-child(5) ul li::text').extract_first()
        items['course_out'] = response.css('#organization > div.course-content > div:nth-child(6) ul li::text').extract_first()
        items['course_company'] = response.css('div.subject-organization p a::text').extract_first()
        bio_url = response.meta['bio_url']
        if bio_url:
            yield response.follow(bio_url,
                                  meta={'items': items},
                                  callback=self.parse_bio)
        else:
            # No bio link: yield the item now instead of dropping it
            # (response.follow(None) would raise and lose the whole row).
            yield items

    def parse_bio(self, response):
        """Attach the lecturer bio title and yield the completed item."""
        items = response.meta['items']
        bio_title = response.css('div.page-title p::text').extract_first()
        if bio_title:
            items['bio_title'] = bio_title.strip()
        yield items
# source: https://stackoverflow.com/questions/57736857
复制相似问题