#Scrapy News Crawler
#Importing Scrapy library
import scrapy
#Defining spider's url,headers
# Defining the spider's URLs and headers: where scraping starts.
class DawnSpider(scrapy.Spider):
    """Crawl dawn.com archive pages: extract headlines from each archive
    page, follow every headline to its story page, and yield one item per
    intro paragraph with the headline and publication timestamp.
    """

    name = 'dawn'
    allowed_domains = ['www.dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    # Browser-like User-Agent shared by every request; dawn.com may block
    # or alter responses for the default Scrapy UA.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'
    }

    def start_requests(self):
        """Kick off the crawl at a fixed archive date."""
        yield scrapy.Request(
            url='https://www.dawn.com/archive/2022-03-21',
            callback=self.parse,
            headers=self._HEADERS,
        )

    # Getting news headlines and their links
    def parse(self, response):
        """Extract headline anchors from an archive page, follow each
        story link, then continue to the previous archive page."""
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")
        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating news headline links
            yield response.follow(
                url=headline_link,
                callback=self.parse_headline,
                meta={'heading': headline},
                headers=self._HEADERS,
            )
        # Code for going to previous pages
        prev_page = response.xpath("//li[1]/a/@href").get()
        # Guard: on the last archive page the XPath yields None; without
        # this check we would request the bogus URL 'https://www.dawn.comNone'.
        if prev_page:
            prev = 'https://www.dawn.com' + prev_page
            yield scrapy.Request(url=prev, callback=self.parse, headers=self._HEADERS)

    # Iterate the headline links and get the story details and date/time.
    def parse_headline(self, response):
        """Yield one item per first paragraph of the story page."""
        headline = response.request.meta['heading']
        # logging.info(response.url)
        # BUGFIX: the class name must be a quoted XPath string literal.
        # Unquoted, `story__content` is a child-element node test that
        # stringifies to "", making contains() true for EVERY div.
        full_detail = response.xpath("//div[contains(@class, 'story__content')]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data,
            }
# Runner script (separate file): launch the 'dawn' spider via the Scrapy
# CLI and export the scraped items to data.csv.
from scrapy import cmdline

if __name__ == "__main__":
    # Equivalent to running `scrapy crawl dawn -o data.csv` in a shell.
    cmdline.execute("scrapy crawl dawn -o data.csv".split(" "))
与cmdline.execute运行蜘蛛的方式不同，您可以使用CrawlerProcess运行它；请阅读官方文档中关于常见做法（common practices）的内容。您可以将main.py作为一个示例。下面是一个可运行的示例（我用 'CLOSESPIDER_ITEMCOUNT': 10 检查过它，所以在运行它时请给它一些时间）。
spider.py:
#Importing Scrapy library
import scrapy
#Defining spider's url,headers
# Defining the spider's URLs and headers.
class DawnSpider(scrapy.Spider):
    """Improved dawn.com archive spider: per-spider settings (throttling
    and CSV feed export), a full set of browser-like headers, cb_kwargs
    instead of request meta, and a guard on the previous-page link.
    """

    name = 'dawn'
    allowed_domains = ['dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    # Per-spider settings: throttle requests and write items to data.csv
    # via the FEEDS exporter (replaces the `-o data.csv` CLI flag).
    custom_settings = {
        'DOWNLOAD_DELAY': 0.8,
        'FEEDS': {'data.csv': {'format': 'csv'}},
    }

    # Full browser header set so dawn.com treats us like a regular client.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "scribe=true",
        "DNT": "1",
        "Host": "www.dawn.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Sec-GPC": "1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }

    def start_requests(self):
        """Start at a fixed archive date; the default callback is parse()."""
        yield scrapy.Request(url='https://www.dawn.com/archive/2022-03-21', headers=self.headers)

    # Getting news headlines and their links
    def parse(self, response):
        """Extract headline anchors from an archive page, follow each
        story link (passing the headline via cb_kwargs), then continue
        to the previous archive page."""
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")
        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating news headline links
            yield response.follow(
                url=headline_link,
                callback=self.parse_headline,
                cb_kwargs={'headline': headline},
                headers=self.headers,
            )
        # Code for going to previous pages
        prev_page = response.xpath("//li[1]/a/@href").get()
        if prev_page:  # the last archive page has no previous-page link
            prev = 'https://www.dawn.com' + prev_page
            yield scrapy.Request(url=prev, callback=self.parse, headers=self.headers)

    def parse_headline(self, response, headline):
        """Yield one item per first paragraph of the story page.

        `headline` arrives via cb_kwargs from parse().
        """
        # logging.info(response.url)
        # BUGFIX: the class name must be a quoted XPath string literal.
        # Unquoted, `story__content` is a child-element node test that
        # stringifies to "", making contains() true for EVERY div.
        full_detail = response.xpath("//div[contains(@class, 'story__content')]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data,
            }
# main.py — entry point that runs the spider in-process, without the
# scrapy CLI (see Scrapy "common practices" docs).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    # Load settings from the enclosing Scrapy project (scrapy.cfg /
    # settings.py); SPIDER_MODULES must point at the module defining
    # DawnSpider for the name lookup below to succeed.
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('dawn')  # crawl by spider name
    process.start()  # blocks until the crawl finishes

# Source: https://stackoverflow.com/questions/72701850
复制相似问题