首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >如何加强这一职能?我想要它给我一个从研究生申请中得到回复的估计日期。

如何加强这一职能?我想要它给我一个从研究生申请中得到回复的估计日期。
EN

Code Review用户
提问于 2021-01-01 22:11:46
回答 1查看 48关注 0票数 2

我创建了一个函数,利用每次theGradCafe提交到1月1日之间的天数中位数,给出从研究生申请中得到答复的估计日期。我把意见书分为三类--一般,面试,接受,拒绝。

这是我的第一个R项目,我很想得到一些关于如何用它来清理事情的批评;代码确实很笨重,但似乎可以完成这项工作。

下面是:

代码语言:javascript
复制
grad=function(x,y){
  #just one link?
  if(missing(y)) {
    #inserting the link
    pat="U.*$"
    require(rvest)
    require(stringr)
    require(dplyr)
    #converting link to text
    h=read_html(x)
    nodes=h %>%
      html_nodes(".tcol3")
    #cleaning up text
    gadmin=sapply(nodes, html_text)[2:length(nodes)]
    gadmin=sub('.*on', '', gadmin)
    a=sub('.*on', '', gadmin)
    gadmin=sub(pat, '', gadmin) %>%
      as.Date("%d %B %Y")
    #finishing
    gadmint_c=format(gadmin, "%m-%d") %>%
      as.Date("%m-%d")
    origin_date <- as.Date("2021-01-01")
    #date estimates
    lengthp=as.numeric(julian(gadmint_c, origin_date))
    lengthp=ifelse(lengthp >=334, lengthp-365, lengthp)
    lengthp=ifelse(lengthp > 5.5*sd(lengthp, na.rm=T) | lengthp < -5.5*sd(lengthp, na.rm=T), NA, lengthp)
    estd=as.Date("2021-01-01")+median(lengthp, na.rm=T)
    estd=format(estd, format="%m-%d")
    print(paste("You should hear SOMETHING  by", estd))
    invisible(estd)
    #interview estimates
    nodes_i=nodes[which(str_extract(as.character(nodes), "Interview") =="Interview")]
    gadmin_i=sapply(nodes_i, html_text)
    gadmin_i=sub('.*on', '', gadmin_i)
    gadmin_i= sub(pat, '', gadmin_i) %>%
      as.Date("%d %B %Y")
    #finishing interviews
    gadmint_ci=format(gadmin_i, "%m-%d") %>%
      as.Date("%m-%d")
    origin_date <- as.Date("2021-01-01")
    #interview estimates
    lengthpi=as.numeric(julian(gadmint_ci, origin_date))
    lengthpi=ifelse(lengthpi >=334, lengthpi-365, lengthpi)
    lengthpi=ifelse(lengthpi > 5.5*sd(lengthpi, na.rm=T) | lengthpi < -5.5*sd(lengthpi, na.rm=T), NA, lengthpi)
    estdi=as.Date("2021-01-01")+median(lengthpi, na.rm=T)
    estdi=format(estdi, format="%m-%d")
    print(paste("You should hear about Interviews by ", estdi))
    invisible(estdi)
    #acceptance/rejectance estimates
    nodes_r=nodes[c(which(str_extract(as.character(nodes), "Rejected") =="Rejected"))]
    nodes_a=nodes[c(which(str_extract(as.character(nodes), "Accepted") =="Accepted"))]
    gadmin_r=sapply(nodes_r, html_text)
    gadmin_a=sapply(nodes_a, html_text)
    gadmin_r=sub('.*on', '', gadmin_r)
    gadmin_a=sub('.*on', '', gadmin_a)
    gadmin_r= sub(pat, '', gadmin_r) %>%
      as.Date("%d %B %Y")
    gadmin_a= sub(pat, '', gadmin_a) %>%
      as.Date("%d %B %Y")
    #finishing acceptance/rejectance
    gadmint_car_r=format(gadmin_r, format="%m-%d") %>%
      as.Date("%m-%d")
   gadmint_car_a=format(gadmin_a, format="%m-%d") %>%
     as.Date("%m-%d")
    origin_date <- as.Date("2021-01-01")
    #acceptance/rejections estimates
    lengthpar_r=as.numeric(julian(gadmint_car_r, origin_date))
    lengthpar_a=as.numeric(julian(gadmint_car_a, origin_date))
    lengthpar_r=ifelse(lengthpar_r >=334, lengthpar_r-365, lengthpar_r)
    lengthpar_r=ifelse(lengthpar_r > 5.5*sd(lengthpar_r, na.rm=T) | lengthpar_r < -5.5*sd(lengthpar_r, na.rm=T), NA, lengthpar_r)
    lengthpar_a=ifelse(lengthpar_a >=334, lengthpar_a-365, lengthpar_a)
    lengthpar_a=ifelse(lengthpar_a > 5.5*sd(lengthpar_a, na.rm=T) | lengthpar_a < -5.5*sd(lengthpar_a, na.rm=T), NA, lengthpar_a)
    estdar_r=as.Date("2021-01-01")+median(lengthpar_r, na.rm=T)
    estdar_a=as.Date("2021-01-01")+median(lengthpar_a, na.rm=T)
    estdar_r=format(estdar_r, format="%m-%d")
    estdar_a=format(estdar_a, format="%m-%d")
    print(paste("You should hear about  Rejections by", estdar_r))
    print(paste("You should hear about  Acceptances by", estdar_a))
    invisible(estdar_r)
    invisible(estdar_a)
  } else {
    #just two links?
    #inserting the link
    pat="U.*$"
    require(rvest)
    require(dplyr)
    #converting link to text
    hx=read_html(x)
    hy=read_html(y)
    nodesx=hx %>%
      html_nodes(".tcol3")
    nodesy=hy %>%
      html_nodes(".tcol3")
    #cleaning up text
    gadminx=sapply(nodesx, html_text)[2:length(nodesx)]
    gadminy=sapply(nodesy, html_text)[2:length(nodesy)]
    gadminx=sub('.*on', '', gadminx)
    ax=sub('.*on', '', gadminx)
    gadminx=sub(pat, '', gadminx) %>%
      as.Date("%d %B %Y")
    gadminy=sub('.*on', '', gadminy)
    ay=sub('.*on', '', gadminy)
    gadminy=sub(pat, '', gadminy) %>%
      as.Date("%d %B %Y")
    gadmin=c(gadminx, gadminy)
    #finishing
    gadmint_c=format(gadmin, format="%m-%d")%>%
      as.Date("%m-%d")
    origin_date <- as.Date("2021-01-01")
    #date estimates
    lengthp=as.numeric(julian(gadmint_c, origin_date))
    lengthp=ifelse(lengthp >=334, lengthp-365, lengthp)
    lengthp=ifelse(lengthp > 5.5*sd(lengthp, na.rm=T) | lengthp < -5.5*sd(lengthp, na.rm=T), NA, lengthp)
    estd=as.Date("2021-01-01")+median(lengthp, na.rm=T)
    estd=format(estd, format="%m-%d")
    print(paste("You should hear SOMETHING  by", estd))
    invisible(estd)
    #interview estimates
    nodes_ix=nodesx[which(str_extract(as.character(nodesx), "Interview") =="Interview")]
    nodes_iy=nodesy[which(str_extract(as.character(nodesy), "Interview") =="Interview")]
    gadmin_ix=sapply(nodes_ix, html_text)
    gadmin_iy=sapply(nodes_iy, html_text)
    gadmin_ix=sub('.*on', '', gadmin_ix)
    gadmin_ix= sub(pat, '', gadmin_ix) %>%
      as.Date("%d %B %Y")
    gadmin_iy=sub('.*on', '', gadmin_iy)
    gadmin_iy= sub(pat, '', gadmin_iy) %>%
      as.Date("%d %B %Y")
    #finishing interviews
    gadmint_ci=c(gadmin_ix, gadmin_iy)
    gadmint_ci=format(gadmint_ci, "%m-%d")%>%
      as.Date("%m-%d")
    origin_date <- as.Date("2021-01-01")
    #interview estimates
    lengthpi=as.numeric(julian(gadmint_ci, origin_date))
    lengthpi=ifelse(lengthpi >=334, lengthpi-365, lengthpi)
    lengthpi=ifelse(lengthpi > 5.5*sd(lengthpi, na.rm=T) | lengthpi < -5.5*sd(lengthpi, na.rm=T), NA, lengthpi)
    estdi=as.Date("2021-01-01")+median(lengthpi, na.rm=T)
    estdi=format(estdi, format="%m-%d")
    print(paste("You should hear about Interviews by ", estdi))
    invisible(estdi)
    #acceptance/rejectance estimates
    nodes_rx=nodesx[c(which(str_extract(as.character(nodesx), "Rejected") =="Rejected"))]
    nodes_ax=nodesx[which(str_extract(as.character(nodesx), "Accepted") =="Accepted")]
    nodes_ry=nodesy[c(which(str_extract(as.character(nodesy), "Rejected") =="Rejected"))]
    nodes_ay=nodesy[which(str_extract(as.character(nodesy), "Accepted") =="Accepted")]
    gadmin_rx=sapply(nodes_rx, html_text)
    gadmin_ax=sapply(nodes_ax, html_text)
    gadmin_ry=sapply(nodes_ry, html_text)
    gadmin_ay=sapply(nodes_ay, html_text)
    gadmin_rx=sub('.*on', '', gadmin_rx)
    gadmin_ax=sub('.*on', '', gadmin_ax)
    gadmin_ry=sub('.*on', '', gadmin_ry)
    gadmin_ay=sub('.*on', '', gadmin_ay)
    gadmin_rx= sub(pat, '', gadmin_rx) %>%
      as.Date("%d %B %Y")
    gadmin_ax=sub(pat, '', gadmin_ax) %>%
      as.Date("%d %B %Y")
    gadmin_ry= sub(pat, '', gadmin_ry) %>%
      as.Date("%d %B %Y")
    gadmin_ay=sub(pat, '', gadmin_ay) %>%
      as.Date("%d %B %Y")
    gadmin_r=c(gadmin_rx, gadmin_ry)
    gadmin_a=c(gadmin_ax, gadmin_ay)
    #finishing acceptance/rejectance
    gadmint_c_r=format(gadmin_r, format="%m-%d")%>%
      as.Date("%m-%d")
    gadmint_c_a=format(gadmin_a, format="%m-%d") %>%
      as.Date("%m-%d")
    origin_date <- as.Date("2021-01-01")
    #acceptance/rejections estimates
    lengthpar_r=as.numeric(julian(gadmint_c_r, origin_date))
    lengthpar_a=as.numeric(julian(gadmint_c_a, origin_date))
    lengthpar_r=ifelse(lengthpar_r >=334, lengthpar_r-365, lengthpar_r)
    lengthpar_r=ifelse(lengthpar_r > 5.5*sd(lengthpar_r, na.rm=T) | lengthpar_r < -5.5*sd(lengthpar_r,na.rm=T), NA, lengthpar_r)
    lengthpar_a=ifelse(lengthpar_a >=334, lengthpar_a-365, lengthpar_a)
    lengthpar_a=ifelse(lengthpar_a > 5.5*sd(lengthpar_a, na.rm=T) | lengthpar_a < -5.5*sd(lengthpar_a, na.rm=T), NA, lengthpar_a)
    estd_a=as.Date("2021-01-01")+median(lengthpar_a, na.rm=T)
    estd_r=as.Date("2021-01-01")+median(lengthpar_r, na.rm=T)
    estd_r=format(estd_r, format="%m-%d")
    estd_a=format(estd_a, format="%m-%d")
    print(paste("You should hear about Acceptance by ", estd_a))
    print(paste("You should hear about Rejection by ", estd_r))
    invisible(estd_a)
    invisible(estd_r)
  }
}

例如,如果我想查看斯坦福大学提交的报告,我会输入grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250")并返回:

代码语言:javascript
复制
 grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250")
[1] "You should hear SOMETHING  by 03-21"
[1] "You should hear about Interviews by  12-18"
[1] "You should hear about  Rejections by 02-12"
[1] "You should hear about  Acceptances by 03-20"

或者,如果我申请斯坦福和哈佛:

代码语言:javascript
复制
 grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250", "https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250")
[1] "You should hear SOMETHING  by 03-17"
[1] "You should hear about Interviews by  12-18"
[1] "You should hear about Acceptance by  03-18"
[1] "You should hear about Rejection by  03-05"

有些东西有时落后于其他事物的原因是因为它是一般的,把所有的东西集中在一起,而其他的则更具体。

EN

回答 1

Code Review用户

回答已采纳

发布于 2021-01-06 19:27:58

考虑将重复操作分解为不同的功能。特别是对于grad函数,使用R的省略特征,允许您传递多个具有相同调用的URL。这两个步骤都会使您的代码保持干燥(DRepeat Y我们自己上)。最后,注意R对所有存在的对象的强调,并通过lapply迭代将结果存储在数据帧列表中(每个URL一个),而不是在没有持久数据的情况下一次性打印出来。

函数

  • 划出你的部门和操作符。
  • 线路中断长呼
  • 使用<-运算符进行对象分配
  • 删除单个invisible的多个return
  • 避免管道%>%为简单的一个巢,如as.Date

node_subset

代码语言:javascript
复制
node_subset <- function(nodes, search_word=NA) {  
  if(is.na(search_word)) {
    sub_nodes <- nodes
  } else {
    sub_nodes <- nodes[which(str_extract(as.character(nodes), search_word)==search_word)]
  }
  
  pat <- "U.*$"
  gadmin <- sapply(sub_nodes, html_text)
  gadmin <- sub('.*on', '', gadmin)
  gadmin <-  as.Date(sub(pat, '', gadmin), format="%d %B %Y")
  gadmin_dt <- as.Date(format(gadmin, "%m-%d"), format="%m-%d")
  
  return(gadmin_dt)
}

date_calc

代码语言:javascript
复制
date_calc <- function(param_dt, origin_date=as.Date("2021-01-01")) {  
  lengthp <- as.numeric(julian(param_dt, origin_date))
  lengthp <- ifelse(lengthp >= 334, lengthp-365, lengthp)
  lengthp <- ifelse(lengthp > 5.5 * sd(lengthp, na.rm=TRUE) |
                      lengthp < -5.5 * sd(lengthp, na.rm=TRUE), NA, lengthp)
  estd <- as.Date("2021-01-01") + median(lengthp, na.rm=TRUE)
  estd <- format(estd, format="%m-%d")
  
  return(estd)
}

grad

代码语言:javascript
复制
grad <- function(...) {  
  lapply(list(...), function(x) {
  
    #inserting the link
    require(rvest)
    require(stringr)
    require(dplyr)
    
    #converting link to text
    h <- read_html(x)
    nodes <- html_nodes(h, ".tcol3")
    
    #cleaning up text
    gadmint_c <- node_subset(nodes)
    gadmint_ci <- node_subset(nodes, "Interview")
    gadmint_car_r <- node_subset(nodes, "Rejected")
    gadmint_car_a <- node_subset(nodes, "Accepted")
    
    #date estimates
    estd <- date_calc(gadmint_c)
    print(paste("You should hear SOMETHING  by", estd))
  
    #interview estimates
    estdi <- date_calc(gadmint_ci)
    print(paste("You should hear about Interviews by ", estdi))
    
    #acceptance/rejections estimates
    estdar_r <- date_calc(gadmint_car_r)
    print(paste("You should hear about  Rejections by", estdar_r))
    
    estdar_a <- date_calc(gadmint_car_a)
    print(paste("You should hear about  Acceptances by", estdar_a))

    return(data.frame(url = x, 
                      estd = estd, 
                      estdi = estdi, 
                      estdar_r = estdar_r, 
                      estdar_a = estdar_a,
                      stringsAsFactors = FALSE)    # ARG UNNEEDED FOR R 4.0+
           )
  })
}

调用

代码语言:javascript
复制
df_list1 <- grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250")
# [1] "You should hear SOMETHING  by 03-21"
# [1] "You should hear about Interviews by  12-18"
# [1] "You should hear about  Rejections by 03-21"
# [1] "You should hear about  Acceptances by 03-20"

df_list2 <- grad("https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250", 
                 "https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250")
# [1] "You should hear SOMETHING  by 03-21"
# [1] "You should hear about Interviews by  12-18"
# [1] "You should hear about  Rejections by 03-21"
# [1] "You should hear about  Acceptances by 03-20"
# [1] "You should hear SOMETHING  by 03-05"
# [1] "You should hear about Interviews by  12-18"
# [1] "You should hear about  Rejections by 03-04"
# [1] "You should hear about  Acceptances by 03-07"

数据

代码语言:javascript
复制
str(df_list1)
# List of 1
#  $ :'data.frame': 1 obs. of  5 variables:
#   ..$ url     : chr "https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250"
#   ..$ estd    : chr "03-21"
#   ..$ estdi   : chr "12-18"
#   ..$ estdar_r: chr "03-21"
#   ..$ estdar_a: chr "03-20"

str(df_list2)
# List of 2
#  $ :'data.frame': 1 obs. of  5 variables:
#   ..$ url     : chr "https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250"
#   ..$ estd    : chr "03-21"
#   ..$ estdi   : chr "12-18"
#   ..$ estdar_r: chr "03-21"
#   ..$ estdar_a: chr "03-20"
#  $ :'data.frame': 1 obs. of  5 variables:
#   ..$ url     : chr "https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250"
#   ..$ estd    : chr "03-05"
#   ..$ estdi   : chr "12-18"
#   ..$ estdar_r: chr "03-04"
#   ..$ estdar_a: chr "03-07"

甚至将多个数据帧的列表组合成一个数据帧:

代码语言:javascript
复制
final_df <- do.call(rbind, df_list2)
final_df

#                                                                                url  estd estdi estdar_r estdar_a
# 1 https://www.thegradcafe.com/survey/index.php?q=Stanford+University&t=a&o=&pp=250 03-21 12-18    03-21    03-20
# 2             https://www.thegradcafe.com/survey/index.php?q=harvard&t=a&o=&pp=250 03-05 12-18    03-04    03-07
票数 3
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/254193

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档