我正在学习使用 Selenium 进行网页抓取,并用 BeautifulSoup(bs4)的 "html.parser" 解析 page_source。我已经拿到了所有带指定 class 名称的 h2 标签,但是提取标签之间的文本似乎不起作用。
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as soup
# Build the Chrome options: the browser binary path comes from the
# GOOGLE_CHROME_BIN environment variable (None when it is not set).
opts = webdriver.ChromeOptions()
opts.binary_location = os.environ.get('GOOGLE_CHROME_BIN', None)
opts.add_argument("--headless")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--no-sandbox")
browser = webdriver.Chrome(executable_path="chromedriver", options=opts)
# Load the page in the browser and parse its page source with html.parser.
url1='https://www.animechrono.com/date-a-live-series-watch-order'
browser.get(url1)
req = browser.page_source
sou = soup(req, "html.parser")
# h: every <h2 class="heading-5"> tag; p: every <div class="text-block-5"> tag.
h = sou.find_all('h2', class_='heading-5')
p = sou.find_all('div', class_='text-block-5')
for i in range(len(h)):
# NOTE: `==` is a comparison whose result is discarded, not an assignment,
# so h still holds the Tag objects when it is printed below.
h[i] == h[i].getText()
for j in range(len(p)):
# Here `=` is used, so each element of p is replaced by its text.
p[j] = p[j].getText()
print(h)
print(p)
browser.quit()
我的输出:
[<h2 class="heading-5">Season 1</h2>, <h2 class="heading-5">Date to Date OVA</h2>, <h2 class="heading-5">Season 2</h2>, <h2 class="heading-5">Kurumi Star Festival OVA</h2>, <h2 class="heading-5">Date A Live Movie: Mayuri Judgement</h2>, <h2 class="heading-5">Season 3</h2>, <h2 class="heading-5">Date A Bullet: Dead or Bullet Movie</h2>, <h2 class="heading-5">Date A Bullet: Nightmare or Queen Movie</h2>]
['Episodes 1-12', 'Date to Date OVA', 'Episodes 1-10', 'Kurumi Star Festival OVA', 'Date A Live Movie: Mayuri Judgement', 'Episodes 1-12', 'Date A Bullet: Dead or Bullet Movie', 'Date A Bullet: Nightmare or Queen Movie']
发布于 2020-11-04 16:05:17
在 browser.quit() 之前添加下面这一行:
# Rebind h to a list holding the .text of each element it contained.
h = [elem.text for elem in h]
print(h)
完整代码:
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as soup
# Build the Chrome options: the browser binary path comes from the
# GOOGLE_CHROME_BIN environment variable (None when it is not set).
opts = webdriver.ChromeOptions()
opts.binary_location = os.environ.get('GOOGLE_CHROME_BIN', None)
opts.add_argument("--headless")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--no-sandbox")
browser = webdriver.Chrome(executable_path="chromedriver", options=opts)

# Load the page in the browser and parse its page source with html.parser.
url1 = 'https://www.animechrono.com/date-a-live-series-watch-order'
browser.get(url1)
req = browser.page_source
sou = soup(req, "html.parser")

# h: every <h2 class="heading-5"> tag; p: every <div class="text-block-5"> tag.
h = sou.find_all('h2', class_='heading-5')
p = sou.find_all('div', class_='text-block-5')

# Replace the Tag objects with their text. Both names are rebound with `=`;
# a bare `==` comparison would discard its result and leave the tags in place.
p = [elem.getText() for elem in p]
h = [elem.text for elem in h]
print(h)
browser.quit()
输出:
['Season 1', 'Date to Date OVA', 'Season 2', 'Kurumi Star Festival OVA', 'Date A Live Movie: Mayuri Judgement', 'Season 3', 'Date A Bullet: Dead or Bullet Movie', 'Date A Bullet: Nightmare or Queen Movie']
https://stackoverflow.com/questions/64683540
复制相似问题