在成功登录一个受保护的网站之后,我想抓取同一网页中动态加载的部分内容。下面的代码块可以正确处理身份验证,但当我尝试访问类名为 lang-py 的 pre 标签元素时,得到的返回结果却是 None。
import sys
from PyQt5.QtCore import QByteArray, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class Render(QWebEnginePage):
    """Load a URL with an HTTP Basic ``Authorization`` header and keep its HTML.

    Constructing an instance blocks: it starts the Qt event loop and only
    returns once the page's HTML has been handed to ``handle_to_html``, which
    stores it and quits the application. The markup is then available through
    the ``html`` property.
    """

    def __init__(self, url):
        """Request *url* and block until its HTML has been captured.

        Args:
            url: Address of the page to load; parsed with
                ``QUrl.fromUserInput``.
        """
        # Qt permits only one application object per process, so reuse the
        # caller's instance when there is one instead of constructing another.
        app = QApplication.instance() or QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self._html = ""

        username = "username"
        password = "password"
        credentials = QByteArray(
            ("%s:%s" % (username, password)).encode()
        ).toBase64()

        request = QWebEngineHttpRequest(QUrl.fromUserInput(url))
        # The Basic scheme is "Basic <base64(user:pass)>": a single space and
        # no colon after the scheme name.
        request.setHeader(b"Authorization", b"Basic " + credentials.data())
        self.load(request)
        app.exec_()

    @property
    def html(self):
        """Return the HTML captured for the page ("" until it is delivered)."""
        return self._html

    def _loadFinished(self):
        """Ask the page for its HTML once loading has finished."""
        # toHtml() delivers its result through the callback, not as a return
        # value.
        self.toHtml(self.handle_to_html)

    def handle_to_html(self, html):
        """Store the delivered HTML and quit the application event loop."""
        self._html = html
        QApplication.quit()
def main():
    """Render the hard-coded Stack Overflow page and print its HTML."""
    target = "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    page = Render(target)
    print(page.html)
if __name__ == "__main__":
    main()

如何在 <pre> 中加载内容?
发布于 2020-09-25 09:14:18
带有标签 "pre" 和类 "lang-py" 的元素已经存在于 html 中,因此您可以使用 BeautifulSoup 来获取数据:
# ...
from bs4 import BeautifulSoup
# ...
def main():
    """Render the page and print the text of every ``<pre class="lang-py">``.

    Each match is preceded by a separator line of 50 ``=`` characters.
    """
    target = "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    page = Render(target)
    document = BeautifulSoup(page.html, "html.parser")
    separator = "=" * 50
    code_blocks = document.find_all("pre", {"class": "lang-py"})
    for block in code_blocks:
        print(separator)
        print(block.text)
# Source: https://stackoverflow.com/questions/64056344
复制相似问题