我想用英超网站上的统计数据来做一个班级项目。这是一个网站:https://www.premierleague.com/stats/top/players/goals
页面上有筛选器,可以按赛季和其他条件进行筛选;页面底部还有一个按钮,用于查看表中接下来的20个条目。
我的代码如下:
library(tidyverse)
library(rvest)
# Download the stats page and print its first <table> as a data frame.
# The `se=79` query parameter is taken verbatim from the question
# (presumably a season id -- not confirmed here).
url <- "https://www.premierleague.com/stats/top/players/goals?se=79"
page <- read_html(url)
first_table <- html_nodes(page, "table")[[1]]
html_table(first_table)
其中产出:
Rank Player Club Nationality Stat
1 1 Alan Shearer - England 260
2 2 Wayne Rooney Everton England 208
3 3 Andrew Cole - England 187
4 4 Frank Lampard - England 177
5 5 Thierry Henry - France 175
6 6 Robbie Fowler - England 163
7 7 Jermain Defoe AFC Bournemouth England 162
8 8 Michael Owen - England 150
9 9 Les Ferdinand - England 149
10 10 Teddy Sheringham - England 146
11 11 Robin van Persie - Netherlands 144
12 12 Sergio Agüero Manchester City Argentina 143
13 13 Jimmy Floyd Hasselbaink - Netherlands 127
14 14 Robbie Keane - Ireland 126
15 15 Nicolas Anelka - France 125
16 16 Dwight Yorke - Trinidad And Tobago 123
17 17 Steven Gerrard - England 120
18 18 Ian Wright - England 113
19 19 Dion Dublin - England 111
20 20 Emile Heskey - England 110
但是,当更改网站上的筛选器(例如,在我的用例中,将表格限制为当前赛季),并使用箭头查看表中接下来的20个条目时,URL不会改变。
我已经找到了相关领域的源代码。它们是:
<div data-script="pl_stats" data-widget="stats-table" data-current-size="20"
data-stat="" data-type="player" data-page-size="20" data-page="0"
data-comps="1" data-num-entries="2162">
<div class="dropDown noLabel topStatsFilterDropdown" data-listener="true">
<div data-metric="mins_played" class="current currentStatContainer"
aria-expanded="false">Minutes played</div>
<ul class="dropdownList" role="listbox">
我希望能够修改 data-metric 和 data-page 这两个字段。
发布于 2018-05-17 02:33:19
此解决方案要求您可以访问selenium服务器。
library(RSelenium) # not on cran (install with devtools::install_github("ropensci/RSelenium"))
library(rvest)
# helper functions ---------------------------
# click_el(): click a web element by running a JavaScript `click()` on it
# through the driver's executeScript(). This works around the
# "element is not clickable at point" error discussed here:
# https://stackoverflow.com/questions/11908249/debugging-element-is-not-clickable-at-point-error
#   rem_dr: driver object exposing executeScript()
#   el:     the element to click (passed to the script as arguments[0])
# Returns whatever executeScript() returns.
click_el <- function(rem_dr, el) {
  js_click <- "arguments[0].click();"
  rem_dr$executeScript(js_click, args = list(el))
}
# find_el(): thin wrapper around the driver's findElement() that always
# looks the element up by XPath.
#   rem_dr: driver object exposing findElement()
#   xpath:  XPath expression identifying the element
# Returns whatever findElement() returns.
find_el <- function(rem_dr, xpath) {
  rem_dr$findElement(using = "xpath", value = xpath)
}
# el_exists(): check whether at least one node matching `xpath` is present
# in the page source currently reported by the driver.
#   rem_dr: driver object exposing getPageSource()
#   xpath:  XPath expression to look for
# Returns TRUE when one or more nodes match, FALSE otherwise.
#
# Uses rvest's html_nodes() rather than xml2::xml_find_first(): since
# rvest 1.0.0 `library(rvest)` no longer attaches xml2, so a bare
# xml_find_first() call fails with "could not find function".
el_exists <- function(rem_dr, xpath) {
  page <- read_html(rem_dr$getPageSource()[[1]])
  length(html_nodes(page, xpath = xpath)) > 0
}
# click_if_exists(): best-effort click on the element matching `xpath`.
# Does nothing when the element is not found in the page source; when it is
# found, any error or message raised while locating/clicking it is silenced.
#   rem_dr: driver object
#   xpath:  XPath expression identifying the element
click_if_exists <- function(rem_dr, xpath) {
  if (!el_exists(rem_dr, xpath)) {
    return(invisible(NULL))
  }
  suppressMessages(
    try(
      find_el(rem_dr, xpath)$clickElement(),
      silent = TRUE
    )
  )
}
# maybe_close_ads(): dismiss the advert overlay, if one is showing, so it
# does not get in the way of clicking other elements.
#   rem_dr: driver object
maybe_close_ads <- function(rem_dr) {
  close_btn_xpath <- '//a[@id="advertClose" and @class="closeBtn"]'
  click_if_exists(rem_dr, close_btn_xpath)
}
# maybe_accept_cookies(): click the "accept cookies" button if the cookie
# notice is currently present on the page.
#   rem_dr: driver object
maybe_accept_cookies <- function(rem_dr) {
  accept_btn_xpath <- "//div[@class='btn-primary cookies-notice-accept']"
  click_if_exists(rem_dr, accept_btn_xpath)
}
# get_tbl(): parse the first <table> found in the driver's current page
# source and return it via html_table().
#   rem_dr: driver object exposing getPageSource()
get_tbl <- function(rem_dr) {
  page_src <- rem_dr$getPageSource()[[1]]
  tables <- html_nodes(read_html(page_src), "table")
  html_table(tables[[1]])
}
# actual execution ---------------------------
# First start a Selenium server. Here the server runs inside a Docker
# container and listens on port 4445 of the local machine (see
# http://rpubs.com/johndharrison/RSelenium-Basics for more details).
# Run the following in a shell, NOT in R (left uncommented it is parsed as
# a backtick-quoted symbol and fails with "object not found"):
#   docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.1
# connect to the selenium server from within R
rem_dr <- remoteDriver(
  remoteServerAddr = "localhost", port = 4445L, browserName = "firefox"
)
rem_dr$open()
# Open the goals leaderboard, dismiss any advert overlay, then pause for
# three seconds before continuing.
stats_url <- "https://www.premierleague.com/stats/top/players/goals"
rem_dr$navigate(stats_url)
maybe_close_ads(rem_dr)
Sys.sleep(3)
# Starting years of the seasons to scrape (use e.g. `start <- 1992:1995`
# for a quick test run).
start <- 1992:2017
# Season labels as shown in the site's dropdown, e.g. "1992/93", "1999/00".
seasons <- sprintf("%d/%02d", start, (start + 1L) %% 100L)
# One (initially NULL) slot per season, named by season label.
out_list <- setNames(vector("list", length(seasons)), seasons)
# Scrape every page of the stats table for each season in `seasons` and
# store the combined table in `out_list[[season]]`.

# Loop-invariant setup: maximise the window once instead of on every page.
rem_dr$maxWindowSize()
# XPaths of the "next page" button in its active and inactive states.
nxt_page_act <- '//div[@class="paginationBtn paginationNextContainer"]'
nxt_page_inact <- '//div[@class="paginationBtn paginationNextContainer inactive"]'

for (season in seasons) {
  maybe_close_ads(rem_dr)
  # To filter the data by season, first click the "filter by season"
  # dropdown so that the entries for the individual seasons become active
  # (otherwise they cannot be clicked).
  cur_season <- find_el(
    rem_dr, '//div[@class="current" and @data-dropdown-current="FOOTBALL_COMPSEASON" and @role="button"]'
  )
  click_el(rem_dr, cur_season)
  Sys.sleep(3)
  # Now select the season of interest.
  xpath <- sprintf(
    '//ul[@data-dropdown-list="FOOTBALL_COMPSEASON"]/li[@data-option-name="%s"]',
    season
  )
  season_lnk <- find_el(rem_dr, xpath)
  click_el(rem_dr, season_lnk)
  Sys.sleep(3)
  # Collect each page's table in a list and bind once at the end, rather
  # than growing a data frame with rbind() on every iteration.
  pages <- list(get_tbl(rem_dr))
  # Keep clicking "next" until the button is rendered in its inactive state.
  while (!el_exists(rem_dr, nxt_page_inact)) {
    maybe_close_ads(rem_dr)
    maybe_accept_cookies(rem_dr)
    btn <- find_el(rem_dr, nxt_page_act)
    click_el(rem_dr, btn) # click "next" button
    maybe_accept_cookies(rem_dr)
    # Wait BEFORE parsing so the table has time to refresh; parsing right
    # after the click risks capturing the previous page a second time.
    Sys.sleep(2)
    pages[[length(pages) + 1L]] <- get_tbl(rem_dr)
    cat(".")
  }
  tbl <- do.call(rbind, pages)
  # Put this season's data into the output list.
  out_list[[season]] <- tbl
  print(season)
}
# Release the browser session now that scraping is finished.
rem_dr$close()
这需要一些时间才能运行。当我运行它时,我总共得到了6,731行数据(涵盖所有赛季)。
https://stackoverflow.com/questions/50310595
复制相似问题