首页
学习
活动
专区
圈层
工具
发布
社区首页 > 专栏 > 爬取淘宝数据

爬取淘宝数据

原创
作者头像
屿.
修改2024-10-31 15:53:48
修改2024-10-31 15:53:48
8360
举报
代码语言:Python
复制
import getpass
import os
import time
from urllib.parse import quote

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser configuration: build the Edge options, then start the driver.
options = webdriver.EdgeOptions()

# Remove the Blink "AutomationControlled" feature and the
# "enable-automation" switch from the launched browser.
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Browser preferences: turn the credential service and the password
# manager off.
prefs = dict([
    ('credentials_enable_service', False),
    ('profile.password_manager_enabled', False),
])
options.add_experimental_option('prefs', prefs)

driver = webdriver.Edge(options=options)

# Search keyword and the number of result pages to crawl.
# NOTE(review): the original note ends with "49" — presumably an upper bound
# on the page count; confirm before raising MAX_PAGE.
KEYWORD = "ipad"
MAX_PAGE = 10

# Simulated login.
#
# Credentials are never stored in the source: they are read from the
# TAOBAO_USERNAME / TAOBAO_PASSWORD environment variables, falling back to an
# interactive prompt when a variable is unset or empty.
username = os.environ.get("TAOBAO_USERNAME") or input("请输入淘宝账号: ")
password = os.environ.get("TAOBAO_PASSWORD") or getpass.getpass("请输入淘宝密码: ")

# The search URL travels inside the redirectURL query parameter, so it is
# percent-encoded as a whole; the keyword is additionally encoded inside it so
# that keywords containing spaces or non-ASCII characters survive the redirect.
search_url = f"https://s.taobao.com/search?q={quote(KEYWORD)}"
login_url = (
    "https://login.taobao.com/member/login.jhtml"
    "?spm=a21n57.1.754894437.1.281d523cnqsuAo&f=top"
    f"&redirectURL={quote(search_url, safe='')}"
)
driver.get(url=login_url)

# Shared explicit wait (30 s timeout) used by the login below and by the
# scraping functions further down.
wait = WebDriverWait(driver, 30)

# Wait for each form control instead of assuming it exists as soon as
# driver.get() returns.
wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "#fm-login-id"))
).send_keys(username)
wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "#fm-login-password"))
).send_keys(password)
wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "#login-form > div.fm-btn > button"))
).click()

# Only the form submission is known at this point; nothing here verifies that
# the login actually succeeded, so the message no longer claims it did.
print("登录表单已提交,等待主页面加载...")

# Data scraping
def _parse_card(card):
    """Build the product dict for a single result card element.

    Every lookup is scoped to *card* so that each product gets its own
    image, price, title, shop and location.

    Raises:
        NoSuchElementException: a mandatory child element is missing.
    """
    image_src = card.find_element(
        By.CSS_SELECTOR, ".mainPicWrapper--FyazGV69 .mainPic--CuSfUC4j"
    ).get_attribute('src')

    # The price is rendered as two pieces (unit + integer part).
    price_unit = card.find_element(By.CSS_SELECTOR, ".unit--I_fIluR5").text
    price_int = card.find_element(By.CSS_SELECTOR, ".priceInt--j47mhkXk").text

    # The sales count is optional; fall back to a placeholder when absent.
    try:
        deal = card.find_element(By.CSS_SELECTOR, ".realSales--nOat6VGM").text
    except NoSuchElementException:
        deal = "未提供"

    return {
        'image': image_src,
        'price': price_unit + price_int,
        'deal': deal,
        'title': card.find_element(By.CSS_SELECTOR, ".title--F6pvp_RZ").text,
        'shop': card.find_element(By.CSS_SELECTOR, ".shopNameText--APRH8pWb").text,
        'location': card.find_element(By.CSS_SELECTOR, ".procity--QyzqB59i").text,
    }


def get_data():
    """Scrape every product card on the currently displayed results page.

    Waits (using the module-level ``wait``) until the product cards are
    visible, parses each one, prints the resulting dict and collects it.
    Cards that cannot be parsed are reported and skipped.

    Returns:
        list[dict]: one dict per successfully parsed card, with the keys
        ``image``, ``price``, ``deal``, ``title``, ``shop`` and ``location``.
        Empty when no card became visible before the wait timed out.
    """
    products = []
    try:
        cards = wait.until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".doubleCard--U4iHXoyX"))
        )
    except TimeoutException:
        print("超时:未找到任何匹配的元素。")
        return products

    print(f"找到的元素数量: {len(cards)}")

    for card in cards:
        try:
            product = _parse_card(card)
        except NoSuchElementException as e:
            print(f"处理 div 时发生错误:{e}")
            continue
        except StaleElementReferenceException:
            # The card was detached from the DOM (e.g. the list re-rendered)
            # while it was being read; skip it instead of aborting the page.
            print("元素已失效,跳过该商品")
            continue

        # Print each product to the console as it is scraped.
        print(product)
        products.append(product)

    return products

# Pagination
def index_page(page):
    """Jump to results page *page* via the pagination box and scrape it.

    Args:
        page: 1-based page number. Non-positive values are only announced
            on the console; no navigation or scraping happens for them.

    Returns:
        Whatever ``get_data()`` returns for the page, or ``None`` when
        *page* is not positive.

    Raises:
        TimeoutException: the jump input or the "go" button did not become
            available within the module-level wait timeout.
    """
    print('正在爬取第 ', page, ' 页')
    if page <= 0:
        return None

    # Both pagination controls live under the same container.
    pager = ('#search-content-leftWrap > div.leftContent--BdYLMbH8'
             ' > div.pgWrap--RTFKoWa6 > div > div')
    jump_box = wait.until(EC.presence_of_element_located((
        By.CSS_SELECTOR,
        pager + ' > span.next-input.next-medium.next-pagination-jump-input > input',
    )))
    go_button = wait.until(EC.element_to_be_clickable((
        By.CSS_SELECTOR,
        pager + ' > button.next-btn.next-medium.next-btn-normal.next-pagination-jump-go',
    )))

    jump_box.clear()
    # send_keys is given text explicitly rather than a bare int.
    jump_box.send_keys(str(page))
    go_button.click()

    # NOTE(review): nothing here waits for the result list to refresh after
    # the click, so the scrape may start on the previous page's cards —
    # confirm whether an extra wait is needed.
    return get_data()

# Entry point: drives the page-by-page crawl.
def main():
    """Crawl pages 1 through MAX_PAGE, pausing 10 seconds after each page."""
    page_no = 1
    while page_no <= MAX_PAGE:
        index_page(page_no)
        time.sleep(10)
        page_no += 1

# Run the crawl, then keep the browser open until the user presses Enter.
# The finally clause guarantees the browser and driver process are shut down
# even when the crawl raises (e.g. an uncaught timeout) or is interrupted.
try:
    main()

    # End of program
    input("按回车键退出...")
finally:
    driver.quit()

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档 | 专栏文章 | 快讯文章归档 | 关键词归档 | 开发者手册归档 | 开发者手册 Section 归档