首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >在无头模式下运行Selenium时没有输出,在无头状态下工作

在无头模式下运行Selenium时没有输出,在无头状态下工作
EN

Stack Overflow用户
提问于 2022-07-13 13:49:08
回答 1查看 179关注 0票数 2

寻求调试帮助的问题(“为什么这段代码不能工作?”)必须包括所需的行为、特定的问题或错误以及在问题本身中再现它所需的最短代码。

想要的行为是按照这个工作代码(在非头模式下)创建一个被刮过的页面的输出文件,下面是在问题本身中再现它所需的最短代码。

代码语言:javascript
复制
# script_concurrent.py

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
import datetime
import os

from scrapers.scraper import connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0

    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0

    return wrapped

@counted
@sleepy
def run_process(filename="Hitachi.csv"):

    # init browser
    os.environ["WDM_LOG_LEVEL"] = "0"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print("Error connecting to AVS")

    # exit
    browser.quit()


if __name__ == "__main__":

    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    futures = []

    with ThreadPoolExecutor() as executor:
        futures.extend(executor.submit(run_process) for _ in range(2, 12))

    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
代码语言:javascript
复制
# scraper.py

import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            # wait for table element with id = 'content' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object to get wikipedia article url, title, and last modified date
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]

    if not part_number_1:
        pass
    else:
        part_number = part_number_1

    if not part_number_2:
        pass
    else:
        part_number = part_number_2

    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]

    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]

    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]

    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]

    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]

    title = [item.text for item in soup.findAll("span", {"class": "trans"})]

    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]

    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }

    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename="Hitachi.csv"):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

输出

代码语言:javascript
复制
run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,409 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,415 INFO ====== WebDriver manager ======
run_process called 5 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,417 INFO ====== WebDriver manager ======
run_process called 6 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,418 INFO ====== WebDriver manager ======
run_process called 7 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,420 INFO ====== WebDriver manager ======
run_process called 8 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,426 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,695 INFO Current google-chrome version is 103.0.5060
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,697 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,700 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,699 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,701 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,699 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,710 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,710 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,713 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,713 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,717 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,717 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,472 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,476 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,479 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,480 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,650 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,650 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,660 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,660 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,546 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,550 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,555 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,695 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,708 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,708 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,724 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,725 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,734 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,752 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,753 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
run_process called 5 times
Sleeping...
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,844 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,942 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,773 INFO ====== WebDriver manager ======
run_process called 6 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,778 INFO ====== WebDriver manager ======
run_process called 7 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,783 INFO ====== WebDriver manager ======
run_process called 8 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,793 INFO ====== WebDriver manager ======
run_process called 9 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,802 INFO ====== WebDriver manager ======
run_process called 10 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,947 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,948 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,964 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,964 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,967 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,967 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,971 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,973 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,989 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,994 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,065 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,108 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,129 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,181 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,189 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/

[WDM] - ====== WebDriver manager ======
2022-07-13 15:39:49,816 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:39:50,147 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:39:50,148 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:39:50,368 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
Elapsed run time: 2.27 minutes.
Calls to run_process: 10

数据

代码语言:javascript
复制
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/

这里是我实现无头模式的尝试。

代码语言:javascript
复制
def get_driver(headless):
    options = webdriver.Options()
    if headless:
        options.add_argument("--headless")

    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver
代码语言:javascript
复制
# script_concurrent.py

from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from termcolor import colored
from random import randint
import threading
import datetime
import sys

from scrapers.scraper import get_driver, connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0

    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0

    return wrapped


@counted
@sleepy
def run_process(filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        # exit
        browser.quit()
    else:
        print("Error connecting to AVS")
        browser.quit()


if __name__ == "__main__":

    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"Hitachi_{output_timestamp}.csv"

    futures = []

    with ThreadPoolExecutor() as executor:
        futures.extend(
            executor.submit(run_process, output_filename, headless)
            for _ in range(2, 202)
        )

    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
代码语言:javascript
复制
# script.py

import csv
import requests
import itertools
import pandas as pd
from pathlib import Path
from selenium import webdriver
from termcolor import colored
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/martinhewing/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def get_driver(headless):
    options = webdriver.Options()
    if headless:
        options.add_argument("--headless")

    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            print(colored(browser.current_url, "green"))
            # wait for table element with id = 'content' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object to get wikipedia article url, title, and last modified date
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]

    if not part_number_1:
        pass
    else:
        part_number = part_number_1

    if not part_number_2:
        pass
    else:
        part_number = part_number_2

    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]

    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]

    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]

    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]

    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]

    title = [item.text for item in soup.findAll("span", {"class": "trans"})]

    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]

    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }

    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

输出

代码语言:javascript
复制
Running in headless mode
run_process called 1 times
run_process called 2 times
run_process called 3 times
run_process called 4 times
run_process called 5 times
run_process called 6 times
run_process called 7 times
run_process called 8 times
run_process called 9 times
run_process called 10 times
run_process called 11 times
run_process called 12 times
run_process called 13 times
run_process called 14 times
run_process called 15 times
run_process called 16 times
run_process called 17 times
run_process called 18 times
run_process called 19 times
run_process called 20 times
Sleeping...

数据

代码语言:javascript
复制
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/

当我在无头模式下运行时,没有错误,但是也没有输出,我已经回顾了类似的问题,但是,我不明白是什么导致了这种情况的发生。(请帮助:)

EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2022-07-13 14:21:48

网站封锁:

网站可能会发现你刮到了。有几种不同的解决方案你可以尝试。

更改您的用户代理:

代码语言:javascript
复制
chrome_options.add_argument("USER AGENT")

使用以下链接中显示的内容更改“用户代理”字符串:我的用户代理是什么?

硒隐身:

代码语言:javascript
复制
stealth(driver,
        user_agent: 'USER AGENT',
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

硒隐身是一个python包,它与Selenium一起使用,用于防止检测。它操作Selenium浏览器的关键元素,以绕过bot检测软件

票数 3
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/72967454

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档