首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >用SelectorGadget实现r中的web抓取

用SelectorGadget实现r中的web抓取
EN

Stack Overflow用户
提问于 2022-07-11 00:20:26
回答 1查看 81关注 0票数 1

我正在运行下面这段简单的代码,想从《财富》500强的公司页面中抓取员工人数。我使用 Chrome 扩展 SelectorGadget 确定了我想要的数字对应的选择器是 “.info__row--7f9lE:nth-child(13) .info__value--2AHH7”。

代码语言:r
复制
library(rvest)
library(dplyr)
# Selector found with the SelectorGadget Chrome extension.
# NOTE(review): this selector matches nothing in the static HTML (the result is
# character(0)); the value appears to be rendered client-side from inline JSON.
link <- "https://fortune.com/company/walmart/"
page <- read_html(link)
employee_nodes <- html_nodes(
  page,
  ".info__row--7f9lE:nth-child(13) .info__value--2AHH7"
)
Employees <- html_text(employee_nodes)
Employees

但是,它返回了 character(0)。有人知道原因是什么吗?我觉得这肯定是个简单的错误。提前感谢!

更新

下面是我根据Jon的评论修改的代码。

代码语言:r
复制
# Fortune company pages to scrape, one URL per entry.
a <- c(
  "https://fortune.com/company/walmart/",
  "https://fortune.com/company/amazon-com/",
  "https://fortune.com/company/apple/",
  "https://fortune.com/company/cvs-health/",
  "https://fortune.com/company/unitedhealth-group/",
  "https://fortune.com/company/berkshire-hathaway/",
  "https://fortune.com/company/mckesson/",
  "https://fortune.com/company/amerisourcebergen/",
  "https://fortune.com/company/alphabet/",
  "https://fortune.com/company/exxon-mobil/",
  "https://fortune.com/company/att/",
  "https://fortune.com/company/costco/",
  "https://fortune.com/company/cigna/",
  "https://fortune.com/company/cardinal-health/",
  "https://fortune.com/company/microsoft/",
  "https://fortune.com/company/walgreens-boots-alliance/",
  "https://fortune.com/company/kroger/",
  "https://fortune.com/company/home-depot/",
  "https://fortune.com/company/jpmorgan-chase/",
  "https://fortune.com/company/verizon/",
  "https://fortune.com/company/ford-motor/",
  "https://fortune.com/company/general-motors/",
  "https://fortune.com/company/anthem/",
  "https://fortune.com/company/centene/",
  "https://fortune.com/company/fannie-mae/",
  "https://fortune.com/company/comcast/",
  "https://fortune.com/company/chevron/",
  "https://fortune.com/company/dell-technologies/",
  "https://fortune.com/company/bank-of-america-corp/",
  "https://fortune.com/company/target/"
)


# Find the first element of `list_data` whose `name` field equals `name`.
#
# Arguments:
#   list_data  A list of nodes (each normally a list carrying a `name` field).
#   name       The node name to look for.
#   elem       Optional field to extract from the matched node; when NULL the
#              whole node is returned.
#
# Returns the matched node, or its `elem` field when `elem` is given.
# Raises an error when no node matches (including NULL / empty `list_data`),
# listing the names that were actually present to ease debugging.
find_by_name <- function(list_data, name, elem = NULL) {
  # vapply() keeps the result a character vector even when some nodes have no
  # usable `name`; such nodes become NA and can never match.
  node_names <- vapply(
    list_data,
    function(node) {
      nm <- if (is.list(node)) node[["name"]] else NULL
      if (is.atomic(nm) && length(nm) == 1L) as.character(nm) else NA_character_
    },
    character(1),
    USE.NAMES = FALSE
  )

  idx <- which(!is.na(node_names) & node_names == name)
  if (length(idx) == 0L) {
    present <- unique(node_names[!is.na(node_names)])
    stop(
      sprintf(
        "no element named '%s' found (available names: %s)",
        name,
        if (length(present) > 0L) paste(present, collapse = ", ") else "<none>"
      ),
      call. = FALSE
    )
  }

  # Several nodes may share a name; the first one wins.
  dat <- list_data[[idx[1L]]]
  if (is.null(elem)) dat else dat[[elem]]
}

# Employee count per URL in `a`; entries stay NA when a page cannot be scraped.
# Requires rvest (read_html, html_element, html_text) and jsonlite (fromJSON),
# and R >= 4.2 for the `_` pipe placeholder.
numEmp <- rep(NA_real_, length(a))

for (i in seq_along(a)) {
  numEmp[i] <- tryCatch(
    {
      # The page state is embedded as `window.__PRELOADED_STATE__ = {...};`
      # inside <script id="preload">; strip the JS wrapper and parse the JSON.
      json_data <- read_html(a[i]) |>
        html_element("script#preload") |>
        html_text() |>
        sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |>
        sub(";\\s*$", "", x = _, perl = TRUE) |>
        fromJSON(simplifyVector = FALSE)

      # Pages are keyed by URL path, e.g. "/company/walmart/".
      temp <- sub("^https://fortune\\.com", "", a[i])
      page_data <- json_data$components$page[[temp]]

      info_data <- page_data |>
        find_by_name("body", "children") |>
        find_by_name("company-about-wrapper", "children") |>
        find_by_name("company-information", "config")

      employees <- info_data$employees
      if (is.null(employees)) {
        stop("no 'employees' field in company-information")
      }
      # The JSON stores the count as a string; convert so numEmp stays numeric.
      as.numeric(employees)
    },
    error = function(e) {
      # One bad page must not abort the whole run: report it and keep NA.
      warning(
        "Could not extract employees for ", a[i], ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_real_
    }
  )
}
numEmp

运行时出现如下错误:

Error in find_by_name(page_data, "body", "children") : length(idx) > 0 is not TRUE

我是否应该修改 stopifnot(length(idx) > 0) 这行代码?

EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2022-07-11 04:34:40

当我执行 document.querySelectorAll(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") 时,我看到你想抓取的是员工人数。Maurits 说得对,看起来数据是以(内联)JSON 的形式下载,然后再渲染出来的。您可以使用 Selenium 保存渲染后的页面,然后在其上应用 CSS 选择器;或者您可以提取内联 JSON,直接从中抓取数据。

在做了一些手工探查之后,您可以采用第二种方案,如下所示(R 4.2.x 的写法以注释形式给出,R 4.1.x 使用 sub2 辅助函数):

代码语言:r
复制
library(rvest)
library(jsonlite)

# R 4.1.x: pipe-friendly wrapper around sub() that takes the input string
# first, so it can be used with |> without the `_` placeholder (R >= 4.2).
# Always matches with perl = TRUE.
sub2 <- function(x, pattern, replacement) {
  sub(pattern = pattern, replacement = replacement, x = x, perl = TRUE)
}

url <- "https://fortune.com/company/walmart/"

# The page state is embedded as `window.__PRELOADED_STATE__ = {...};` inside
# <script id="preload">; grab that script's text first.
preload_js <- read_html(url) |>
  html_element("script#preload") |>
  html_text()

# Strip the JS assignment prefix and the trailing semicolon.
# sub2() is the R 4.1.x form; on R 4.2.x the equivalent step is
# sub(pattern, "", x = _, perl = TRUE).
json_text <- preload_js |>
  sub2("\\s*window\\.__PRELOADED_STATE__ = ", "") |>
  sub2(";\\s*$", "")

# Keep the nested list structure as-is (no simplification to vectors).
json_data <- fromJSON(json_text, simplifyVector = FALSE)

page_data <- json_data$components$page[["/company/walmart/"]]

# Find the first element of `list_data` whose `name` field equals `name`.
#
# Arguments:
#   list_data  A list of nodes (each normally a list carrying a `name` field).
#   name       The node name to look for.
#   elem       Optional field to extract from the matched node; when NULL the
#              whole node is returned.
#
# Returns the matched node, or its `elem` field when `elem` is given.
# Raises an error when no node matches (including NULL / empty `list_data`),
# listing the names that were actually present to ease debugging.
find_by_name <- function(list_data, name, elem = NULL) {
  # vapply() keeps the result a character vector even when some nodes have no
  # usable `name`; such nodes become NA and can never match.
  node_names <- vapply(
    list_data,
    function(node) {
      nm <- if (is.list(node)) node[["name"]] else NULL
      if (is.atomic(nm) && length(nm) == 1L) as.character(nm) else NA_character_
    },
    character(1),
    USE.NAMES = FALSE
  )

  idx <- which(!is.na(node_names) & node_names == name)
  if (length(idx) == 0L) {
    present <- unique(node_names[!is.na(node_names)])
    stop(
      sprintf(
        "no element named '%s' found (available names: %s)",
        name,
        if (length(present) > 0L) paste(present, collapse = ", ") else "<none>"
      ),
      call. = FALSE
    )
  }

  # Several nodes may share a name; the first one wins.
  dat <- list_data[[idx[1L]]]
  if (is.null(elem)) dat else dat[[elem]]
}

# Walk down the component tree one level at a time:
# body -> company-about-wrapper -> company-information (its config).
body_children <- find_by_name(page_data, "body", "children")
about_children <- find_by_name(
  body_children, "company-about-wrapper", "children"
)
info_data <- find_by_name(about_children, "company-information", "config")

info_data$employees
#> [1] "2300000"

# Extra code to scrape company-data-table segments
library(purrr)

# Descend to the children of the company-table-wrapper component.
body_nodes <- find_by_name(page_data, "body", "children")
about_nodes <- find_by_name(body_nodes, "company-about-wrapper", "children")
data_tables <- find_by_name(about_nodes, "company-table-wrapper", "children")

# Each table contributes its `data` and `change` entries; flatten them into
# one list of rows and drop the entries that carry no key.
rows <- lapply(data_tables, function(tbl) c(tbl$config$data, tbl$config$change))
rows <- purrr::flatten(rows)
rows <- discard(rows, function(row) is.null(row$key))

# One data-frame row per remaining entry.
df <- data.frame(
  key = map_chr(rows, function(row) row$key),
  title = map_chr(rows, function(row) row$fieldMeta$title),
  type = map_chr(rows, function(row) row$fieldMeta$type),
  value = map_chr(rows, function(row) row$value)
)
票数 3
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/72932716

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档