我正在运行下面这段简单的代码,以便从《财富》500强的页面中抓取员工人数。我使用 Chrome 扩展 SelectorGadget 找到了与目标数字匹配的 CSS 选择器:“.info__row--7f9lE:nth-child(13) .info__value--2AHH7”。
library(rvest)
library(dplyr)
# Download the Google Chrome extension SelectorGadget to find the CSS selector.
link <- "https://fortune.com/company/walmart/"
page <- read_html(link)
# Select the value cell inside the 13th info row (class names as reported by
# SelectorGadget), then pull out its text content.
employee_nodes <- html_nodes(
  page,
  ".info__row--7f9lE:nth-child(13) .info__value--2AHH7"
)
Employees <- html_text(employee_nodes)
Employees
但是,它返回了 `character(0)`。有人知道原因是什么吗?我觉得这肯定是个简单的错误。提前感谢!
更新
下面是我根据Jon的评论修改的代码。
# Company pages to scrape, one URL per entry.
a <- c(
  "https://fortune.com/company/walmart/",
  "https://fortune.com/company/amazon-com/",
  "https://fortune.com/company/apple/",
  "https://fortune.com/company/cvs-health/",
  "https://fortune.com/company/unitedhealth-group/",
  "https://fortune.com/company/berkshire-hathaway/",
  "https://fortune.com/company/mckesson/",
  "https://fortune.com/company/amerisourcebergen/",
  "https://fortune.com/company/alphabet/",
  "https://fortune.com/company/exxon-mobil/",
  "https://fortune.com/company/att/",
  "https://fortune.com/company/costco/",
  "https://fortune.com/company/cigna/",
  "https://fortune.com/company/cardinal-health/",
  "https://fortune.com/company/microsoft/",
  "https://fortune.com/company/walgreens-boots-alliance/",
  "https://fortune.com/company/kroger/",
  "https://fortune.com/company/home-depot/",
  "https://fortune.com/company/jpmorgan-chase/",
  "https://fortune.com/company/verizon/",
  "https://fortune.com/company/ford-motor/",
  "https://fortune.com/company/general-motors/",
  "https://fortune.com/company/anthem/",
  "https://fortune.com/company/centene/",
  "https://fortune.com/company/fannie-mae/",
  "https://fortune.com/company/comcast/",
  "https://fortune.com/company/chevron/",
  "https://fortune.com/company/dell-technologies/",
  "https://fortune.com/company/bank-of-america-corp/",
  "https://fortune.com/company/target/"
)
#' Find the first element of a list whose `name` field equals `name`.
#'
#' @param list_data A list of lists; entries may carry a `name` field.
#' @param name Single string matched exactly against each entry's `name`.
#' @param elem Optional name (or index) of a field to extract from the match.
#'   When `NULL`, the whole matching entry is returned.
#' @return The first matching entry, or its `elem` field when `elem` is given.
#' @details Signals an error when nothing matches, including when `list_data`
#'   is `NULL` or empty; the message lists the names that were found.
find_by_name <- function(list_data, name, elem = NULL) {
  # vapply() keeps the result a character vector even when some entries have
  # no usable `name`; `[[` avoids the partial matching `$` performs on lists.
  entry_names <- vapply(
    list_data,
    \(x) {
      nm <- if (is.list(x)) x[["name"]] else NULL
      if (is.character(nm) && length(nm) == 1L) nm else NA_character_
    },
    character(1),
    USE.NAMES = FALSE
  )
  idx <- which(entry_names == name)
  if (length(idx) == 0L) {
    if (length(list_data) == 0L) {
      detail <- "input is NULL or empty"
    } else {
      found <- entry_names[!is.na(entry_names)]
      detail <- paste0("available names: ", paste(found, collapse = ", "))
    }
    stop(
      "find_by_name(): no element named \"", name, "\" (", detail, ")",
      call. = FALSE
    )
  }
  # When several entries share the name, the first one wins.
  dat <- list_data[[idx[1L]]]
  if (is.null(elem)) dat else dat[[elem]]
}
# Scrape the employee count for every company URL in `a`.
# Results are fed into `numEmp`: one numeric value per URL, NA where the page
# could not be fetched or did not have the expected structure.
numEmp <- rep(NA_real_, length(a))
for (i in seq_along(a)) {
  numEmp[i] <- tryCatch(
    {
      # The data sits in <script id="preload"> as
      # `window.__PRELOADED_STATE__ = {...};` -- strip the wrapper, parse JSON.
      json_data <- read_html(a[i]) |>
        html_element("script#preload") |>
        html_text() |>
        sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |>
        sub(";\\s*$", "", x = _, perl = TRUE) |>
        jsonlite::fromJSON(simplifyVector = FALSE)
      # Page entries are looked up by URL path, e.g. "/company/walmart/".
      page_key <- sub("^.*https://fortune\\.com", "", a[i])
      pages <- json_data$components$page
      page_data <- pages[[page_key]]
      if (is.null(page_data)) {
        stop(
          "no page entry \"", page_key, "\"; available keys: ",
          paste(names(pages), collapse = ", "),
          call. = FALSE
        )
      }
      info_data <- page_data |>
        find_by_name("body", "children") |>
        find_by_name("company-about-wrapper", "children") |>
        find_by_name("company-information", "config")
      employees <- info_data$employees
      if (length(employees) != 1L) {
        stop("`employees` field is missing", call. = FALSE)
      }
      # The JSON stores the count as a string; convert so numEmp stays numeric.
      as.numeric(employees)
    },
    error = function(e) {
      # Report the failing URL but keep going with the remaining companies.
      warning(
        "Could not read employees from ", a[i], ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_real_
    }
  )
}
numEmp
运行后出现如下错误:
Error in find_by_name(page_data, "body", "children") : length(idx) > 0 is not TRUE
我是否应该修改代码 stopifnot(length(idx) > 0)?
发布于 2022-07-11 04:34:40
当我执行 document.querySelectorAll(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") 时,可以看到你想抓取的是员工人数。Maurits 说得对,数据看起来是先以(内联)JSON 的形式下载,然后才渲染到页面上的。您可以使用 Selenium 保存渲染后的页面,再对其应用 CSS 选择器;或者直接提取内联 JSON,从中抓取数据。
在做了一些手工分析之后,可以采用第二种方案,如下所示(代码按 R 4.1.x 编写,R 4.2.x 的写法保留在注释中):
library(rvest)
library(jsonlite)
# R 4.1.x
#' Pipe-friendly `sub()`: the subject string comes first.
#'
#' @param x Character vector to search in.
#' @param pattern Perl-compatible regular expression.
#' @param replacement Replacement for the first match in each element.
#' @return `x` with the first match of `pattern` replaced.
sub2 <- function(x, pattern, replacement) {
  sub(pattern = pattern, replacement = replacement, x = x, perl = TRUE)
}
url <- "https://fortune.com/company/walmart/"
# The page data sits in <script id="preload"> as
# `window.__PRELOADED_STATE__ = {...};` -- strip that wrapper, then parse JSON.
json_data <- read_html(url) |>
  html_element("script#preload") |>
  html_text() |>
  ## sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |> # R 4.2.x
  sub2("\\s*window\\.__PRELOADED_STATE__ = ", "") |> # R 4.1.x
  ## sub(";\\s*$", "", x = _, perl = TRUE) |> # R 4.2.x
  sub2(";\\s*$", "") |> # R 4.1.x
  fromJSON(simplifyVector = FALSE)
# Derive the lookup key from `url` (scheme and host removed, e.g.
# "/company/walmart/") so the two cannot drift apart when `url` changes.
page_key <- sub("^https?://[^/]+", "", url)
page_data <- json_data$components$page[[page_key]]
if (is.null(page_data)) {
  # Fail here with the available keys instead of a cryptic error further down.
  stop(
    "no page entry \"", page_key, "\"; available keys: ",
    paste(names(json_data$components$page), collapse = ", "),
    call. = FALSE
  )
}
#' Find the first element of a list whose `name` field equals `name`.
#'
#' @param list_data A list of lists; entries may carry a `name` field.
#' @param name Single string matched exactly against each entry's `name`.
#' @param elem Optional name (or index) of a field to extract from the match.
#'   When `NULL`, the whole matching entry is returned.
#' @return The first matching entry, or its `elem` field when `elem` is given.
#' @details Signals an error when nothing matches, including when `list_data`
#'   is `NULL` or empty; the message lists the names that were found.
find_by_name <- function(list_data, name, elem = NULL) {
  # vapply() keeps the result a character vector even when some entries have
  # no usable `name`; `[[` avoids the partial matching `$` performs on lists.
  entry_names <- vapply(
    list_data,
    \(x) {
      nm <- if (is.list(x)) x[["name"]] else NULL
      if (is.character(nm) && length(nm) == 1L) nm else NA_character_
    },
    character(1),
    USE.NAMES = FALSE
  )
  idx <- which(entry_names == name)
  if (length(idx) == 0L) {
    if (length(list_data) == 0L) {
      detail <- "input is NULL or empty"
    } else {
      found <- entry_names[!is.na(entry_names)]
      detail <- paste0("available names: ", paste(found, collapse = ", "))
    }
    stop(
      "find_by_name(): no element named \"", name, "\" (", detail, ")",
      call. = FALSE
    )
  }
  # When several entries share the name, the first one wins.
  dat <- list_data[[idx[1L]]]
  if (is.null(elem)) dat else dat[[elem]]
}
# Walk the component tree one level at a time:
# body -> company-about-wrapper -> company-information, keeping the last
# component's `config`.
body_children <- find_by_name(page_data, "body", "children")
about_children <- find_by_name(
  body_children, "company-about-wrapper", "children"
)
info_data <- find_by_name(about_children, "company-information", "config")
info_data$employees
#> [1] "2300000"
# Extra code to scrape company-data-table segments
library(purrr)
# Children of body -> company-about-wrapper -> company-table-wrapper.
data_tables <- find_by_name(
  find_by_name(
    find_by_name(page_data, "body", "children"),
    "company-about-wrapper", "children"
  ),
  "company-table-wrapper", "children"
)
# Pool the `data` and `change` entries of every table into one flat list,
# then drop entries that have no `key`.
rows <- lapply(data_tables, function(tbl) c(tbl$config$data, tbl$config$change))
rows <- purrr::flatten(rows)
rows <- discard(rows, function(row) is.null(row$key))
# One data-frame row per remaining entry.
df <- data.frame(
  key = map_chr(rows, function(row) row$key),
  title = map_chr(rows, function(row) row$fieldMeta$title),
  type = map_chr(rows, function(row) row$fieldMeta$type),
  value = map_chr(rows, function(row) row$value)
)
# Source: https://stackoverflow.com/questions/72932716
复制相似问题