以下是网站的链接:
这是我的剧本:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
PATH = "driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
driver = webdriver.Chrome(options=options, executable_path=PATH)
wait = WebDriverWait(driver, 20)
driver.get('https://fr.hotels.com/search.do?destination-id=10398359&q-check-in=2021-06-26&q-check-out=2021-06-27&q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER')
driver.maximize_window()
time.sleep(2)
webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
time.sleep(2)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'button[class="uolsaJ"]'))).click()
links = []
def is_element_visible(xpath):
wait1 = WebDriverWait(driver, 2)
try:
wait1.until(EC.visibility_of_element_located((By.XPATH, xpath)))
return True
except Exception:
return False
while not is_element_visible("//div[@id='20']"):
my_elems = driver.find_elements_by_xpath('//a[@class="_61P-R0"]')
links = [my_elem.get_attribute("href") for my_elem in my_elems]
driver.execute_script("window.scrollBy(0, 1000)")
time.sleep(5)
print(links)这是输出:
['https://fr.hotels.com/ho716157152/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho397103/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho1098309152/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho449686/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho315896/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho1574324896/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho288352/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho748227104/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho225263/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho225250/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho405210/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho547798/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho252584/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho351562/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho714011808/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho424335/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho442661/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho437481/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3']这些都是酒店的网址,我想知道如何有一个具体的部分。
我希望在每个URL中都有这些I:
'https://fr.hotels.com/ho437481/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3'-> 437481
类似于重新创建列表,但只使用这些数字,而不是URL。
就像这样:
['716157152', '397103', '1098309152' ... , '437481']发布于 2021-07-01 08:54:02
您可以使用正则表达式,但是如果结构总是https://fr.hotels.com/ho[your_id]/[...],split就足够了:
hotel_ids = [link.split('/')[3][2:] for link in links]split将字符串转换为类似于['https:', '', 'fr.hotels.com', 'ho[your_id]']的列表,因此id将始终位于第四个位置(index = 3),而[2:]将去掉前面的'ho‘。
发布于 2021-07-01 08:54:05
你可以在你拿到你的links后再这么做
links = [s.split('/')[3][2:] for s in links]# Output
['716157152', '397103', '1098309152', '449686', '315896', '1574324896', '288352', '748227104', '225263', '225250', '405210', '547798', '252584', '351562', '714011808', '424335', '442661', '437481']发布于 2021-07-01 08:57:31
我更喜欢其他的答案,但regex也是一个可行的选择。
import re
in_arr = ['https://fr.hotels.com/ho716157152/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho397103/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho1098309152/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho449686/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho315896/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho1574324896/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho288352/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho748227104/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho225263/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho225250/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho405210/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho547798/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho252584/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho351562/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho714011808/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho424335/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho442661/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3', 'https://fr.hotels.com/ho437481/?q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER&ZSX=0&SYE=3']
regex = "(?<=\.com\/ho)[\w]+"
out = map(lambda x: re.findall(regex, x)[0], in_arr)
print(list(out))输出:
['716157152', '397103', '1098309152', '449686', '315896', '1574324896', '288352', '748227104', '225263', '225250', '405210', '547798', '252584', '351562', '714011808', '424335', '442661', '437481']https://stackoverflow.com/questions/68207132
复制相似问题