我正在编写一个网络爬虫,希望能够根据给定的 URL 调用对应的类来抓取网页元素。
目前我的代码如下:
import sys
import tldextract
import requests
class Scraper:
    """Base class that keeps a registry of scraper subclasses."""

    # Registry mapping each subclass's `url` class attribute to the subclass.
    scrapers = {}

    def __init_subclass__(scraper_class):
        # Runs whenever a subclass is defined. Python treats __init_subclass__
        # as an implicit classmethod, so `scraper_class` is the new subclass.
        Scraper.scrapers[scraper_class.url] = scraper_class

    @classmethod
    def for_url(cls, url):
        """Instantiate the scraper registered for the domain of `url`.

        NOTE: the registry is keyed by each subclass's `url` attribute
        (e.g. 'bbc.co.uk'), but the lookup below uses `k.domain`, which is
        'bbc' for 'https://www.bbc.co.uk/'. As written, the lookup therefore
        raises KeyError: 'bbc'.
        """
        k = tldextract.extract(url)
        # return Scraper.scrapers[k.domain]()
        # or
        return cls.scrapers[k.domain]()
class BBCScraper(Scraper):
    """Scraper registered under the key 'bbc.co.uk'."""

    # Registry key read by Scraper.__init_subclass__ when this class is defined.
    url = 'bbc.co.uk'

    def scrape(s):
        # NOTE(review): `s` is the only parameter, so it is bound to the
        # instance. A call such as scraper.scrape(response) passes a second
        # positional argument that this signature does not accept.
        print(s)
        # FIXME Scrape the correct values for BBC
        return "Scraped BBC News"
url = 'https://www.bbc.co.uk/'
scraper = Scraper.for_url(url)
scraper.scrape(requests.get(url))
我想实现的是:如果域名是 BBC,就应该进入 class BBCScraper(Scraper);由于我们调用的是 scraper.scrape(requests.get(url)),它应当按照 BBCScraper -> scrape -> 返回网页元素 的流程抓取网页元素。
然而,当我运行这个脚本时遇到了问题,它输出:
Outprint >>> return cls.scrapers[k.domain]() KeyError: 'bbc'
我想知道如何根据传给 for_url 类方法的域名调用正确的类。
发布于 2021-05-24 09:02:37
问题在于 k.domain 返回的是 bbc,而您写的是 url = 'bbc.co.uk',因此可以采用以下任一解决方案:
将 url = 'bbc.co.uk' 与 k.registered_domain 搭配使用
将 url = 'bbc' 与 k.domain 搭配使用
并在 scrape 方法中添加一个参数以接收响应
from abc import ABC, abstractmethod

import requests
import tldextract
class Scraper(ABC):
    """Abstract base class and registry for domain-specific scrapers.

    Subclasses declare a ``url`` class attribute holding the registered
    domain they handle (e.g. ``'bbc.co.uk'``) and implement ``scrape``.
    Deriving from ``ABC`` is what makes ``@abstractmethod`` enforced: a
    subclass that does not override ``scrape`` cannot be instantiated.
    """

    # Maps a registered domain (the subclass's `url` attribute) to the subclass.
    scrapers = {}

    def __init_subclass__(cls, **kwargs):
        """Register each newly defined subclass that declares ``url``.

        Extra class keyword arguments are forwarded up the MRO so the hook
        cooperates with other base classes. Subclasses without a ``url``
        attribute (e.g. shared intermediate bases) are simply not registered.
        """
        super().__init_subclass__(**kwargs)
        key = getattr(cls, "url", None)
        if key is not None:
            Scraper.scrapers[key] = cls

    @classmethod
    def for_url(cls, url):
        """Return a new instance of the scraper registered for ``url``.

        The lookup key is ``tldextract.extract(url).registered_domain``,
        e.g. ``'bbc.co.uk'`` for ``'https://www.bbc.co.uk/'``.

        Raises:
            KeyError: if no scraper is registered for that domain.
        """
        k = tldextract.extract(url)
        domain = k.registered_domain
        try:
            scraper_class = cls.scrapers[domain]
        except KeyError:
            # Same exception type as a plain dict miss, but with a message
            # that says what was being looked up.
            raise KeyError(f"no scraper registered for domain {domain!r}") from None
        return scraper_class()

    @abstractmethod
    def scrape(self, content: "requests.Response"):
        """Extract data from an already-fetched response; subclasses implement this."""
class BBCScraper(Scraper):
    """Scraper registered for the ``bbc.co.uk`` domain."""

    url = "bbc.co.uk"  # key under which Scraper registers this class

    def scrape(self, content: requests.Response):
        """Return the placeholder result; ``content`` is not inspected yet."""
        result = "Scraped BBC News"
        return result
if __name__ == "__main__":
url = 'https://www.bbc.co.uk/'
scraper = Scraper.for_url(url)
r = scraper.scrape(requests.get(url))
print(r) # Scraped BBC News

改进
我建议将 url 存储在实例属性中,并把 requests.get 放进 scrape 方法里,这样主代码会更简洁。
class Scraper(ABC):
    """Abstract base class and registry for domain-specific scrapers.

    Subclasses declare a ``domain`` class attribute holding the registered
    domain they handle (e.g. ``'bbc.co.uk'``), accept the page URL in their
    constructor, and implement ``scrape``. Deriving from ``ABC`` is what makes
    ``@abstractmethod`` enforced: a subclass that does not override
    ``scrape`` cannot be instantiated.
    """

    # Maps a registered domain (the subclass's `domain` attribute) to the subclass.
    scrapers = {}

    def __init_subclass__(cls, **kwargs):
        """Register each newly defined subclass that declares ``domain``.

        Extra class keyword arguments are forwarded up the MRO so the hook
        cooperates with other base classes. Subclasses without a ``domain``
        attribute (e.g. shared intermediate bases) are simply not registered.
        """
        super().__init_subclass__(**kwargs)
        key = getattr(cls, "domain", None)
        if key is not None:
            Scraper.scrapers[key] = cls

    @classmethod
    def for_url(cls, url):
        """Return a scraper for ``url``, constructed with that URL.

        The lookup key is ``tldextract.extract(url).registered_domain``,
        e.g. ``'bbc.co.uk'`` for ``'https://www.bbc.co.uk/'``.

        Raises:
            KeyError: if no scraper is registered for that domain.
        """
        k = tldextract.extract(url)
        domain = k.registered_domain
        try:
            scraper_class = cls.scrapers[domain]
        except KeyError:
            # Same exception type as a plain dict miss, but with a message
            # that says what was being looked up.
            raise KeyError(f"no scraper registered for domain {domain!r}") from None
        return scraper_class(url)

    @abstractmethod
    def scrape(self):
        """Fetch and extract data for this scraper's URL; subclasses implement this."""
class BBCScraper(Scraper):
    """Scraper for pages under the ``bbc.co.uk`` registered domain."""

    # Registry key read by Scraper.__init_subclass__ when this class is defined.
    domain = 'bbc.co.uk'

    def __init__(self, url):
        """Remember the page URL that ``scrape`` will fetch."""
        self.url = url

    def scrape(self):
        """Fetch ``self.url`` and return a marker plus the first 20 characters of the HTML."""
        # Annotate the local instead of chain-assigning: writing
        # `rep = requests.Response = ...` would rebind the `requests.Response`
        # class itself to a response instance for the whole process.
        rep: requests.Response = requests.get(self.url)
        content = rep.text  # full HTML body as text
        return "Scraped BBC News" + content[:20]
if __name__ == "__main__":
url = 'https://www.bbc.co.uk/'
scraper = Scraper.for_url(url)
r = scraper.scrape()
print(r) # Scraped BBC News<!DOCTYPE html><html
https://stackoverflow.com/questions/67669212
复制相似问题