我正在尝试用 BeautifulSoup 和 Selenium 抓取 shopee.co.id。一个搜索结果页面中有 60 个产品。在代码末尾,我用 len() 检查了提取到的数据,结果显示只提取到了其中的 42 个。如何修改代码才能获取全部搜索结果?
下面是我一直在尝试的代码:
import imp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter
import threading
import time
import pandas as pd
import numpy as np
from numpy import nan
import re
import concurrent.futures
import csv
# Link product search result
from turtle import delay
# Question script: open a Shopee search-results page in headless Chrome,
# scroll it so lazily rendered items appear, then parse the resulting HTML
# with BeautifulSoup and collect product names, prices and sold counts.

# Search URL for the keyword "obat kanker" and the local chromedriver binary.
url = 'https://shopee.co.id/search?keyword=obat%20kanker'
path = '/Applications/chromedriver'
# Build the Chrome option set used when launching the browser.
chrome_options = Options()
# Window / sandbox / headless / notification flags passed to Chrome.
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument('--disable-infobars')
# Launch Chrome through the chromedriver binary at `path`.
# NOTE(review): `executable_path` is the older Selenium call style; newer
# Selenium versions expect a `Service` object instead - confirm against the
# installed version.
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
driver.get(url)
# Load the same search again with an explicit page index (&page=0); this is
# a second navigation on top of the `driver.get(url)` above.
main_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
driver.get(main_link)
# Block for up to 5 seconds until elements with the result-item class are
# present in the DOM.
WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
# Scroll down in 10 smooth steps, 500 ms apart. The step size is one tenth of
# document.body.scrollHeight measured once, when the script starts.
# execute_script returns after the first step; the remaining steps are
# scheduled in the page via setTimeout.
# NOTE(review): if the page grows while items load, 10 steps of the initial
# height may stop short of the real bottom - presumably why only 42 of the
# 60 items end up in the HTML; confirm in a non-headless run.
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
# Fixed pause so the timed scroll steps above can run before the DOM is read.
# NOTE(review): 9 further steps x 500 ms is about 4.5 s plus smooth-scroll
# animation time, so 5 s leaves little margin - verify.
sleep(5)
# Snapshot the current DOM (innerHTML of the <html> element) and parse it.
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
# Collect product name, price and sold-count elements by their CSS classes.
# The bare `[0].get_text()` expressions only display a value in an
# interactive session (REPL / notebook); they raise IndexError when the
# corresponding list is empty.
product_name = soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh")
product_name[0].get_text()
product_price = soup.find_all('span', {'class': 'ZEgDH9'})
product_price[0].get_text()
product_sold = soup.find_all('div', {'class':"r6HknA uEPGHT"})
product_sold[0].get_text()
len(product_name)

发布于 2022-08-23 18:10:34
下面是获取这些产品详细信息的一种方法(Selenium 的运行环境是 Chrome/Linux,您可以根据自己的环境调整代码,只需关注导入部分以及定义浏览器之后的代码):
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import json
# Answer script: instead of scraping the rendered product cards, read the
# structured-data <script data-rh="true"> tags on the search page. Each one
# holds a JSON object; those whose '@type' is 'Product' carry the product
# name and an 'offers' dict (price, currency, availability).
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
# Path to where you saved the chromedriver binary.
webdriver_service = Service("chromedriver/chromedriver")
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
browser.get(url)
# Wait up to 20 seconds for the matching <script> tags to be present; raises
# selenium's TimeoutException if none appear in time.
items = WebDriverWait(browser, 20).until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'script[data-rh="true"]')
    )
)
print(len(items))
for i in items:
    json_obj = json.loads(i.get_attribute('innerHTML'))
    # Not every matched script is guaranteed to be a dict with an '@type'
    # key, so look it up defensively rather than risking a KeyError/TypeError.
    if isinstance(json_obj, dict) and json_obj.get('@type') == 'Product':
        print(json_obj['name'], json_obj['offers'])
    print('_____________')

这将在终端打印出来:
61
OBAT KANKER TUMOR MIOM KISTA KELENJAR POLIP LIPOM BENJOLAN SEMBUH TOTAL TANPA OPERASI {'@type': 'Offer', 'price': '184000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
GRAVIDA BHARATA OBAT KANKER PAYUDARA AMPUH |KANKER GANAS HERBAL TERDAFTAR DBPOM MUI WARYANTO076 {'@type': 'Offer', 'price': '275000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
Walatra Zedoril 7 Asli Obat Herbal Kanker Tumor Dan Segala Jenis Benjolan Aman Tanpa Efek Samping {'@type': 'Offer', 'price': '255000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
PROMO PAKET SEMBUH OBAT TUMOR KANKER KISTA MIOM & KELENJAR TERLARIS, TERPERCAYA TERBUKTI &GARANSI {'@type': 'Offer', 'price': '349600.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
Obat Herbal Kanker Payudara, Serviks, Hati, Usus, Prostat, Leukimia dan Paru Paru ORIGINAL 100% ASLI {'@type': 'Offer', 'price': '525000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
[...]

您可以进一步解析这些 JSON 对象,以提取所需的数据。
https://stackoverflow.com/questions/73462047
复制相似问题