文章/答案/技术大牛

发布

社区首页 >问答首页 >用网站和WHED网站对所有高校进行网络扫描

问：对WHED网站上的所有高校及其网站进行网页抓取
EN

Stack Overflow用户

提问于 2022-06-29 10:45:13

回答 2查看 136关注 0票数 0

有人能帮我从 https://www.whed.net/home.php 抓取数据吗？我正在使用的代码返回的是空的 DataFrame（df）。我希望得到各所大学及其网站，最好还有研究领域。我的网页抓取技能比较弱，如果有人能指导我完成这件事就太好了，谢谢各位。

# Scrape university detail pages from whed.net with Selenium and save them to WHED1.csv.
# NOTE(review): `time`, `wb` (presumably `selenium.webdriver`), `Select`,
# `NoSuchElementException` and `pd` (presumably pandas) are not imported in this
# snippet and must already be in scope - confirm against the full script.
begin=time.time()
# Seed entries; every option text read from the country dropdown is appended below.
# NOTE(review): select_by_visible_text() needs an exact option text - 'Emirates'
# may not match any option, and the second seed may be appended again from the
# dropdown - confirm against the live page.
countries=['Emirates','United States of America (all)']
result = [] # One dict per university; turned into the DataFrame rows at the end
univ_links=[] # Detail-page links for all universities, across all countries
# Labels whose content is copied into the record (all end with ':')
fields = ['Street:','City:','Province:','Post Code:','WWW:','Fields of study:','Job title:'] 

webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe') # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)

#all_countries=[]
# Options that are direct children of the country dropdown (id "Chp1")
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup')
# Only the first option of each optgroup is collected
grps=webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el:countries.append(c.text)
for g in grps: countries.append(g.text)
    
# Phase 1: for every country, run the search and collect the detail link of each result.
for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))#select country dropdown
    select.select_by_visible_text(cntry)#choosing country

    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()

    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))#select results per page drop down
    select_rpp.select_by_visible_text('100')#choosing 100 results per page option

    university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results') 
    # NOTE(review): this XPath starts with '//', so it appears to be evaluated
    # against the whole document rather than relative to university_form - confirm.
    university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li') # list of university elements
    for univ in range(len(university_list)):
            href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
            univ_links.append(href)
    # Follow the link whose text contains 'Next' page after page; the loop ends
    # when a lookup inside the try block raises NoSuchElementException.
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
            university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
            for univ in range(len(university_list)):
                href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
                univ_links.append(href)
        except NoSuchElementException: break    
# Phase 2: open every collected detail page and build one record (dict) per university.
for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title=webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name=webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    # Elements with class 'dt' / 'dd' are paired by index below (label / value)
    t1=webD.find_elements_by_class_name('dt') 
    t2=webD.find_elements_by_class_name('dd')        
    # Elements with class 'libelle' / 'contenu' are likewise paired by index (label / content)
    labels=webD.find_elements_by_class_name('libelle') 
    content=webD.find_elements_by_class_name('contenu') 
    temp={}
    fos=''   # accumulates every 'Fields of study:' entry, comma-separated
    fos1=''  # accumulates every 'Job title:' entry, comma-separated
    temp.update({'Title': title,'Detailed Title':title_detailed,'Country':cntry_name})
    for i in range(len(t1)):
        # Empty labels and the 'Address' label are skipped
        if t1[i].text == '' or t1[i].text == 'Address':
            continue 
        else:
            value=t2[i].text
            # Multi-line values are flattened to a single comma-separated string
            temp.update({t1[i].text:value.replace('\n',',')})
    for j in range(len(content)):
        if labels[j].text in fields:
            if labels[j].text == 'Fields of study:':
                info=content[j].text
                fos=fos+','+info
            elif labels[j].text == 'Job title:':
                info1=content[j].text
                fos1=fos1+','+info1
            else:
                key=labels[j].text
                # key[:-1] drops the last character of the label (the trailing ':')
                temp.update({key[:-1]: content[j].text})
        
    # lstrip(',') removes the leading comma left by the accumulation above
    temp.update({'Fields of study': fos.lstrip(','),'Job titles':fos1.lstrip(',')})
    result.append(temp)
data=pd.DataFrame(result)
# Bare expression: presumably left over from a notebook cell to display the frame
data
# The elapsed time is measured before the CSV file is written
end=time.time()
print("Time taken : "+ str(end-begin) +"s")
data.to_csv("WHED1.csv",index=False)

这段代码是我从一个 GitHub 项目中找到的。如果我能重新生成这些数据并保存下来就太好了：我希望在 Web 应用程序中把这些数据用作下拉列表，以确保用户填写就读大学时不会出错。

python

回答 2

Stack Overflow用户

发布于 2022-11-30 06:29:10

更新1/12/22 -异步

我使用 aiohttp 找到了一个更好的解决方案：它可以在大约 30 秒内跑完整个国家列表，而不是 3 个小时。

import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def main():
    """Collect the country values from WHED, scrape them all, write output.json.

    A headless Chrome session is used only to read the country dropdown of
    the search page; the per-country result pages are then fetched
    concurrently through ``fetch_all``. The printed total time covers the
    scraping and the file write, not the browser start-up.
    """
    print("Init")
    driver = init_driver()

    try:
        print("Opening Homepage")
        url = "https://www.whed.net/results_institutions.php"
        driver.get(url)
        time.sleep(1)

        print("Gathering Countries")
        countries = get_countries(driver)
    finally:
        # Always shut the browser down, even when loading the page or reading
        # the dropdown fails, so no browser/driver process is left running.
        driver.quit()

    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))

    print("Writing out")

    # The context manager closes the file even if serialisation raises.
    with open('output.json', 'w') as f:
        f.write(json.dumps(institution_list))
    end = time.time()
    print(f"Total time: {end - start}s")

def init_driver():
    """Create and return a headless Chrome WebDriver.

    The driver binary is taken from the relative path ``chromedriver.exe``
    and the service log path is set to ``NUL``.
    """
    options = Options()
    options.add_argument("--headless")
    service = Service(executable_path='chromedriver.exe', log_path='NUL')
    return webdriver.Chrome(service=service, options=options)


def get_countries(driver):
    """Return the ``value`` attribute of each option in the ``Chp1`` dropdown.

    The first option is dropped from the result.

    Args:
        driver: WebDriver whose current page contains the element with
            id ``Chp1``.
    """
    dropdown = Select(driver.find_element(By.ID, "Chp1"))
    values = [option.get_attribute('value') for option in dropdown.options]
    # Discard the first option (presumably the placeholder entry - TODO confirm).
    values.pop(0)
    return values


def extract_institutions(html, country):
    """Parse one WHED results page into a per-country record.

    Args:
        html: Markup of the results page, as accepted by BeautifulSoup.
        country: Country value the page was requested for; copied verbatim
            into the output.

    Returns:
        ``[]`` when the summary paragraph starts with ``No`` (no results).
        Otherwise a dict with the keys ``country``, ``count`` (first token of
        the summary paragraph, kept as a string) and ``records`` (a list of
        ``{'name', 'url', 'country'}`` dicts, one per institution link).

    Raises:
        ValueError: If the page has no non-empty ``<p class="infos">``
            summary paragraph, i.e. it is not a results page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    summary = soup.find('p', {'class': 'infos'})
    # Fail with a clear message instead of an AttributeError/IndexError when
    # the page is not the expected results page.
    if summary is None or not str(summary.text).split():
        raise ValueError(f"No result summary found on the page for {country}")

    page = summary.text
    print(str(page))
    number_of_institutions = str(page).split()[0]
    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []

    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    results = [
        {
            'name': str(i.text).strip(),
            # hrefs are joined onto the site root to form the full URL
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        }
        for i in raw
    ]

    return {
        'country': country,
        'count': number_of_institutions,
        'records': results
    }


async def get_institutions(country, session):
    """POST the WHED search form for one country and parse the response.

    Args:
        country: Value submitted as the ``Chp1`` form field.
        session: Object providing ``post(url=..., data=...)`` as an async
            context manager (an ``aiohttp.ClientSession`` in ``fetch_all``).

    Returns:
        Whatever ``extract_institutions`` returns for the page, or ``None``
        when the request or the parsing fails. Failures are printed rather
        than raised, so one failing country does not propagate an exception
        to the caller.
    """
    try:
        async with session.post(
            url='https://www.whed.net/results_institutions.php',
            # nbr_ref_pge is the results-per-page field; the large value is
            # presumably meant to return every result on one page - TODO confirm.
            data={"Chp1": country, "nbr_ref_pge": 10000}
        ) as response:
            # Treat 4xx/5xx responses as failures instead of reporting
            # success and parsing an error page as if it were results.
            response.raise_for_status()
            html = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(html, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")
        return None


async def fetch_all(countries):
    """Fetch every country concurrently over one shared aiohttp session.

    Returns the list produced by ``asyncio.gather``: one
    ``get_institutions`` result per entry of ``countries``.
    """
    async with aiohttp.ClientSession() as session:
        pending = [get_institutions(country, session) for country in countries]
        gathered = await asyncio.gather(*pending)
        return gathered


# Main call - guarded so that importing this module does not start a scrape
if __name__ == "__main__":
    main()

使用同步算法的旧答案

这是对 @Mithun 答案的改进：原答案实际上无法正常工作，因为它会一直卡在同一个页面上。

还增加了对名称和 URL 的直接提取，方便你在需要时直接使用它们。

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Synchronous scraper: list every institution (name, URL, country) that the
# WHED search returns for a single country, paging through the results.
print("Init")

chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)

# Everything that drives the browser runs inside try/finally so the browser
# is shut down even when a lookup or a click fails half-way through.
try:
    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)

    print("Selecting country")
    select = Select(driver.find_element(By.ID, "Chp1"))
    country = "Albania"
    select.select_by_visible_text(country)
    time.sleep(.5)

    print("Searching")
    driver.find_element(By.XPATH, "//input[@value='Go']").click()
    time.sleep(1)

    print("Parsing")
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    page = soup.find('p', {'class': 'infos'}).text

    # NOTE: despite its name this is the first token of the summary line,
    # which is used below as the total number of results, not of pages.
    number_of_pages = str(page).split()[0]

    results = []
    if number_of_pages == 'No':
        # The summary starts with "No" when the search found nothing;
        # int('No') would otherwise raise ValueError below.
        print(f"No results for {country}")
    else:
        # Number of results covered so far, assuming 10 results per page
        # (presumably the site's default page size - TODO confirm).
        counter = 10
        while True:
            raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
            for i in raw:
                results.append({
                    'name': str(i.text).strip(),
                    'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
                    'country': country
                })
                print(f'{len(results)}/{number_of_pages}')

            if counter >= int(number_of_pages):
                break
            counter += 10

            driver.find_element(By.LINK_TEXT, "Next page").click()
            time.sleep(0.5)
            # Re-parse after navigating; the previous soup still holds the old page.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()
print(results)

票数 0

Stack Overflow用户

发布于 2022-06-30 07:56:14

您可以使用 Selenium 来抓取数据。下面的代码可以帮助您抓取 “United States of America (all)” 的大学名称。同样，您也可以通过循环或手动输入国家名称来抓取其他国家的数据。如果您需要每所大学的研究领域，可以用 bs4 抓取该大学详情页的 href，再从详情页中提取研究领域。

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

# Print the details text of every institution that the WHED search lists
# for "United States of America (all)", paging through the results.
# The driver path is passed through a Service object.
driver = webdriver.Chrome(service=Service(executable_path=r"chromedriver.exe"))
try:
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)
    select = Select(driver.find_element(By.ID, "Chp1"))
    select.select_by_visible_text("United States of America (all)")
    time.sleep(1)
    driver.find_element(By.XPATH, "//input[@value='Go']").click()
    time.sleep(1)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    # NOTE: despite its name this is the first token of the summary line,
    # which is used below as the total number of results, not of pages.
    number_of_pages = str(page).split()[0]
    # Number of results covered so far, assuming 10 results per page
    # (presumably the site's default page size - TODO confirm).
    counter = 10
    while True:
        raw = soup.find_all('div', {'class': 'details'})
        for i in raw:
            # Drop leading whitespace and every newline/carriage return/tab.
            i = (str(i.text).lstrip())
            i = i.replace("\n","")
            i = i.replace("\r", "")
            i = i.replace("\t", "")
            print(i)
        # Checked after printing so that the last page is processed as well.
        if counter >= int(number_of_pages):
            break
        counter += 10
        driver.find_element(By.LINK_TEXT, "Next page").click()
        time.sleep(1)
        # Re-parse after navigating; without this the loop would keep
        # printing the first page over and over.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser even when a lookup or click fails.
    driver.quit()

票数 -1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/72800062

复制

相似问题

问用网站和WHED网站对所有高校进行网络扫描
EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问用网站和WHED网站对所有高校进行网络扫描EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问用网站和WHED网站对所有高校进行网络扫描
EN