我创建了一个函数,利用每次theGradCafe提交到1月1日之间的天数中位数,给出从研究生申请中得到答复的估计日期。我把意见书分为三类--一般,面试,接受,拒绝。
这是我的第一个R项目,我很想得到一些关于如何用它来清理事情的批评;代码确实很笨重,但似乎可以完成这项工作。
下面是:
grad=function(x,y){
#just one link?
if(missing(y)) {
#inserting the link
pat="U.*$"
require(rvest)
require(stringr)
require(dplyr)
#converting link to text
h=read_html(x)
nodes=h %>%
html_nodes(".tcol3")
#cleaning up text
gadmin=sapply(nodes, html_text)[2:length(nodes)]
gadmin=sub('.*on', '', gadmin)
a=sub('.*on', '', gadmin)
gadmin=sub(pat, '', gadmin) %>%
as.Date("%d %B %Y")
#finishing
gadmint_c=format(gadmin, "%m-%d") %>%
as.Date("%m-%d")
origin_date <- as.Date("2021-01-01")
#date estimates
lengthp=as.numeric(julian(gadmint_c, origin_date))
lengthp=ifelse(lengthp >=334, lengthp-365, lengthp)
lengthp=ifelse(lengthp > 5.5*sd(lengthp, na.rm=T) | lengthp < -5.5*sd(lengthp, na.rm=T), NA, lengthp)
estd=as.Date("2021-01-01")+median(lengthp, na.rm=T)
estd=format(estd, format="%m-%d")
print(paste("You should hear SOMETHING by", estd))
invisible(estd)
#interview estimates
nodes_i=nodes[which(str_extract(as.character(nodes), "Interview") =="Interview")]
gadmin_i=sapply(nodes_i, html_text)
gadmin_i=sub('.*on', '', gadmin_i)
gadmin_i= sub(pat, '', gadmin_i) %>%
as.Date("%d %B %Y")
#finishing interviews
gadmint_ci=format(gadmin_i, "%m-%d") %>%
as.Date("%m-%d")
origin_date <- as.Date("2021-01-01")
#interview estimates
lengthpi=as.numeric(julian(gadmint_ci, origin_date))
lengthpi=ifelse(lengthpi >=334, lengthpi-365, lengthpi)
lengthpi=ifelse(lengthpi > 5.5*sd(lengthpi, na.rm=T) | lengthpi < -5.5*sd(lengthpi, na.rm=T), NA, lengthpi)
estdi=as.Date("2021-01-01")+median(lengthpi, na.rm=T)
estdi=format(estdi, format="%m-%d")
print(paste("You should hear about Interviews by ", estdi))
invisible(estdi)
#acceptance/rejectance estimates
nodes_r=nodes[c(which(str_extract(as.character(nodes), "Rejected") =="Rejected"))]
nodes_a=nodes[c(which(str_extract(as.character(nodes), "Accepted") =="Accepted"))]
gadmin_r=sapply(nodes_r, html_text)
gadmin_a=sapply(nodes_a, html_text)
gadmin_r=sub('.*on', '', gadmin_r)
gadmin_a=sub('.*on', '', gadmin_a)
gadmin_r= sub(pat, '', gadmin_r) %>%
as.Date("%d %B %Y")
gadmin_a= sub(pat, '', gadmin_a) %>%
as.Date("%d %B %Y")
#finishing acceptance/rejectance
gadmint_car_r=format(gadmin_r, format="%m-%d") %>%
as.Date("%m-%d")
gadmint_car_a=format(gadmin_a, format="%m-%d") %>%
as.Date("%m-%d")
origin_date <- as.Date("2021-01-01")
#acceptance/rejections estimates
lengthpar_r=as.numeric(julian(gadmint_car_r, origin_date))
lengthpar_a=as.numeric(julian(gadmint_car_a, origin_date))
lengthpar_r=ifelse(lengthpar_r >=334, lengthpar_r-365, lengthpar_r)
lengthpar_r=ifelse(lengthpar_r > 5.5*sd(lengthpar_r, na.rm=T) | lengthpar_r < -5.5*sd(lengthpar_r, na.rm=T), NA, lengthpar_r)
lengthpar_a=ifelse(lengthpar_a >=334, lengthpar_a-365, lengthpar_a)
lengthpar_a=ifelse(lengthpar_a > 5.5*sd(lengthpar_a, na.rm=T) | lengthpar_a < -5.5*sd(lengthpar_a, na.rm=T), NA, lengthpar_a)
estdar_r=as.Date("2021-01-01")+median(lengthpar_r, na.rm=T)
estdar_a=as.Date("2021-01-01")+median(lengthpar_a, na.rm=T)
estdar_r=format(estdar_r, format="%m-%d")
estdar_a=format(estdar_a, format="%m-%d")
print(paste("You should hear about Rejections by", estdar_r))
print(paste("You should hear about Acceptances by", estdar_a))
invisible(estdar_r)
invisible(estdar_a)
} else {
#just two links?
#inserting the link
pat="U.*$"
require(rvest)
require(dplyr)
#converting link to text
hx=read_html(x)
hy=read_html(y)
nodesx=hx %>%
html_nodes(".tcol3")
nodesy=hy %>%
html_nodes(".tcol3")
#cleaning up text
gadminx=sapply(nodesx, html_text)[2:length(nodesx)]
gadminy=sapply(nodesy, html_text)[2:length(nodesy)]
gadminx=sub('.*on', '', gadminx)
ax=sub('.*on', '', gadminx)
gadminx=sub(pat, '', gadminx) %>%
as.Date("%d %B %Y")
gadminy=sub('.*on', '', gadminy)
ay=sub('.*on', '', gadminy)
gadminy=sub(pat, '', gadminy) %>%
as.Date("%d %B %Y")
gadmin=c(gadminx, gadminy)
#finishing
gadmint_c=format(gadmin, format="%m-%d")%>%
as.Date("%m-%d")
origin_date <- as.Date("2021-01-01")
#date estimates
lengthp=as.numeric(julian(gadmint_c, origin_date))
lengthp=ifelse(lengthp >=334, lengthp-365, lengthp)
lengthp=ifelse(lengthp > 5.5*sd(lengthp, na.rm=T) | lengthp < -5.5*sd(lengthp, na.rm=T), NA, lengthp)
estd=as.Date("2021-01-01")+median(lengthp, na.rm=T)
estd=format(estd, format="%m-%d")
print(paste("You should hear SOMETHING by", estd))
invisible(estd)
#interview estimates
nodes_ix=nodesx[which(str_extract(as.character(nodesx), "Interview") =="Interview")]
nodes_iy=nodesy[which(str_extract(as.character(nodesy), "Interview") =="Interview")]
gadmin_ix=sapply(nodes_ix, html_text)
gadmin_iy=sapply(nodes_iy, html_text)
gadmin_ix=sub('.*on', '', gadmin_ix)
gadmin_ix= sub(pat, '', gadmin_ix) %>%
as.Date("%d %B %Y")
gadmin_iy=sub('.*on', '', gadmin_iy)
gadmin_iy= sub(pat, '', gadmin_iy) %>%
as.Date("%d %B %Y")
#finishing interviews
gadmint_ci=c(gadmin_ix, gadmin_iy)
gadmint_ci=format(gadmint_ci, "%m-%d")%>%
as.Date("%m-%d")
origin_date <- as.Date("2021-01-01")
#interview estimates
lengthpi=as.numeric(julian(gadmint_ci, origin_date))
lengthpi=ifelse(lengthpi >=334, lengthpi-365, lengthpi)
lengthpi=ifelse(lengthpi > 5.5*sd(lengthpi, na.rm=T) | lengthpi < -5.5*sd(lengthpi, na.rm=T), NA, lengthpi)
estdi=as.Date("2021-01-01")+median(lengthpi, na.rm=T)
estdi=format(estdi, format="%m-%d")
print(paste("You should hear about Interviews by ", estdi))
invisible(estdi)
#acceptance/rejectance estimates
nodes_rx=nodesx[c(which(str_extract(as.character(nodesx), "Rejected") =="Rejected"))]
nodes_ax=nodesx[which(str_extract(as.character(nodesx), "Accepted") =="Accepted")]
nodes_ry=nodesy[c(which(str_extract(as.character(nodesy), "Rejected") =="Rejected"))]
nodes_ay=nodesy[which(str_extract(as.character(nodesy), "Accepted") =="Accepted")]
gadmin_rx=sapply(nodes_rx, html_text)
gadmin_ax=sapply(nodes_ax, html_text)
gadmin_ry=sapply(nodes_ry, html_text)
gadmin_ay=sapply(nodes_ay, html_text)
gadmin_rx=sub('.*on', '', gadmin_rx)
gadmin_ax=sub('.*on', '', gadmin_ax)
gadmin_ry=sub('.*on', '', gadmin_ry)
gadmin_ay=sub('.*on', '', gadmin_ay)
gadmin_rx= sub(pat, '', gadmin_rx) %>%
as.Date("%d %B %Y")
gadmin_ax=sub(pat, '', gadmin_ax) %>%
as.Date("%d %B %Y")
gadmin_ry= sub(pat, '', gadmin_ry) %>%
as.Date("%d %B %Y")
gadmin_ay=sub(pat, '', gadmin_ay) %>%
as.Date("%d %B %Y")
gadmin_r=c(gadmin_rx, gadmin_ry)
gadmin_a=c(gadmin_ax, gadmin_ay)
#finishing acceptance/rejectance
gadmint_c_r=format(gadmin_r, format="%m-%d")%>%
as.Date("%m-%d")
gadmint_c_a=format(gadmin_a, format="%m-%d") %>%
as.Date("%m-%d")
origin_date <- as.Date("2021-01-01")
#acceptance/rejections estimates
lengthpar_r=as.numeric(julian(gadmint_c_r, origin_date))
lengthpar_a=as.numeric(julian(gadmint_c_a, origin_date))
lengthpar_r=ifelse(lengthpar_r >=334, lengthpar_r-365, lengthpar_r)
lengthpar_r=ifelse(lengthpar_r > 5.5*sd(lengthpar_r, na.rm=T) | lengthpar_r < -5.5*sd(lengthpar_r,na.rm=T), NA, lengthpar_r)
lengthpar_a=ifelse(lengthpar_a >=334, lengthpar_a-365, lengthpar_a)
lengthpar_a=ifelse(lengthpar_a > 5.5*sd(lengthpar_a, na.rm=T) | lengthpar_a < -5.5*sd(lengthpar_a, na.rm=T), NA, lengthpar_a)
estd_a=as.Date("2021-01-01")+median(lengthpar_a, na.rm=T)
estd_r=as.Date("2021-01-01")+median(lengthpar_r, na.rm=T)
estd_r=format(estd_r, format="%m-%d")
estd_a=format(estd_a, format="%m-%d")
print(paste("You should hear about Acceptance by ", estd_a))
print(paste("You should hear about Rejection by ", estd_r))
invisible(estd_a)
invisible(estd_r)
}
}例如,如果我想查看斯坦福大学提交的报告,我会输入grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250")并返回:
grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250")
[1] "You should hear SOMETHING by 03-21"
[1] "You should hear about Interviews by 12-18"
[1] "You should hear about Rejections by 02-12"
[1] "You should hear about Acceptances by 03-20"或者,如果我申请斯坦福和哈佛:
grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250", "https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250")
[1] "You should hear SOMETHING by 03-17"
[1] "You should hear about Interviews by 12-18"
[1] "You should hear about Acceptance by 03-18"
[1] "You should hear about Rejection by 03-05"有些东西有时落后于其他事物的原因是因为它是一般的,把所有的东西集中在一起,而其他的则更具体。
发布于 2021-01-06 19:27:58
考虑将重复操作分解为不同的功能。特别是对于grad函数,使用R的省略特征,允许您传递多个具有相同调用的URL。这两个步骤都会使您的代码保持干燥(D在Repeat Y我们自己上)。最后,注意R对所有存在的对象的强调,并通过lapply迭代将结果存储在数据帧列表中(每个URL一个),而不是在没有持久数据的情况下一次性打印出来。
<-运算符进行对象分配invisible的多个return%>%为简单的一个巢,如as.Datenode_subset
node_subset <- function(nodes, search_word=NA) {
if(is.na(search_word)) {
sub_nodes <- nodes
} else {
sub_nodes <- nodes[which(str_extract(as.character(nodes), search_word)==search_word)]
}
pat <- "U.*$"
gadmin <- sapply(sub_nodes, html_text)
gadmin <- sub('.*on', '', gadmin)
gadmin <- as.Date(sub(pat, '', gadmin), format="%d %B %Y")
gadmin_dt <- as.Date(format(gadmin, "%m-%d"), format="%m-%d")
return(gadmin_dt)
}date_calc
date_calc <- function(param_dt, origin_date=as.Date("2021-01-01")) {
lengthp <- as.numeric(julian(param_dt, origin_date))
lengthp <- ifelse(lengthp >= 334, lengthp-365, lengthp)
lengthp <- ifelse(lengthp > 5.5 * sd(lengthp, na.rm=TRUE) |
lengthp < -5.5 * sd(lengthp, na.rm=TRUE), NA, lengthp)
estd <- as.Date("2021-01-01") + median(lengthp, na.rm=TRUE)
estd <- format(estd, format="%m-%d")
return(estd)
}grad
grad <- function(...) {
lapply(list(...), function(x) {
#inserting the link
require(rvest)
require(stringr)
require(dplyr)
#converting link to text
h <- read_html(x)
nodes <- html_nodes(h, ".tcol3")
#cleaning up text
gadmint_c <- node_subset(nodes)
gadmint_ci <- node_subset(nodes, "Interview")
gadmint_car_r <- node_subset(nodes, "Rejected")
gadmint_car_a <- node_subset(nodes, "Accepted")
#date estimates
estd <- date_calc(gadmint_c)
print(paste("You should hear SOMETHING by", estd))
#interview estimates
estdi <- date_calc(gadmint_ci)
print(paste("You should hear about Interviews by ", estdi))
#acceptance/rejections estimates
estdar_r <- date_calc(gadmint_car_r)
print(paste("You should hear about Rejections by", estdar_r))
estdar_a <- date_calc(gadmint_car_a)
print(paste("You should hear about Acceptances by", estdar_a))
return(data.frame(url = x,
estd = estd,
estdi = estdi,
estdar_r = estdar_r,
estdar_a = estdar_a,
stringsAsFactors = FALSE) # ARG UNNEEDED FOR R 4.0+
)
})
}df_list1 <- grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250")
# [1] "You should hear SOMETHING by 03-21"
# [1] "You should hear about Interviews by 12-18"
# [1] "You should hear about Rejections by 03-21"
# [1] "You should hear about Acceptances by 03-20"
df_list2 <- grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250",
"https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250")
# [1] "You should hear SOMETHING by 03-21"
# [1] "You should hear about Interviews by 12-18"
# [1] "You should hear about Rejections by 03-21"
# [1] "You should hear about Acceptances by 03-20"
# [1] "You should hear SOMETHING by 03-05"
# [1] "You should hear about Interviews by 12-18"
# [1] "You should hear about Rejections by 03-04"
# [1] "You should hear about Acceptances by 03-07"str(df_list1)
# List of 1
# $ :'data.frame': 1 obs. of 5 variables:
# ..$ url : chr "https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250"
# ..$ estd : chr "03-21"
# ..$ estdi : chr "12-18"
# ..$ estdar_r: chr "03-21"
# ..$ estdar_a: chr "03-20"
str(df_list2)
# List of 2
# $ :'data.frame': 1 obs. of 5 variables:
# ..$ url : chr "https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250"
# ..$ estd : chr "03-21"
# ..$ estdi : chr "12-18"
# ..$ estdar_r: chr "03-21"
# ..$ estdar_a: chr "03-20"
# $ :'data.frame': 1 obs. of 5 variables:
# ..$ url : chr "https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250"
# ..$ estd : chr "03-05"
# ..$ estdi : chr "12-18"
# ..$ estdar_r: chr "03-04"
# ..$ estdar_a: chr "03-07"甚至将多个数据帧的列表组合成一个数据帧:
final_df <- do.call(rbind, df_list2)
final_df
# url estd estdi estdar_r estdar_a
# 1 https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250 03-21 12-18 03-21 03-20
# 2 https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250 03-05 12-18 03-04 03-07https://codereview.stackexchange.com/questions/254193
复制相似问题