首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >如何将嵌套的for -循环转换为适用于R中的代码优化

如何将嵌套的for -循环转换为适用于R中的代码优化
EN

Stack Overflow用户
提问于 2021-12-23 09:21:59
回答 3查看 145关注 0票数 1

我正在尝试将for循环转换为任何应用程序系列,以便进行代码优化。

以下是示例数据

代码语言:javascript
复制
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

实际函数(使用for循环)--这个函数给出了预期的输出

代码语言:javascript
复制
expand_collapse_compliance <- function(right_table){
  
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    for(i in Sub_Sector_list){
      df2 = right_table[right_table$Sub_Sector==i,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
    }
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}

DT::datatable(expand_collapse_compliance(mydata1), 
              rownames = F,escape = FALSE,
              selection=list(mode="single",target="row"),
              options = list(pageLength = 50,scrollX = TRUE,
                             dom = 'tp',ordering=F,
                             columnDefs = list(list(visible=FALSE, targets=0),
              list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')

我尝试先将内部循环转换为lapply,同时在输出表中没有显示sub_sector值,请让我知道如何修复任何想法都会很感激。

代码语言:javascript
复制
expand_collapse_compliance <- function(right_table){
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    #for(i in Sub_Sector_list){
      lapply(Sub_Sector_list, function(x){
      df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
      })
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}
EN

回答 3

Stack Overflow用户

回答已采纳

发布于 2021-12-23 09:59:48

使用dplyrtidyr可以做到:

注意:我放弃了DT部分。

代码语言:javascript
复制
library(dplyr)
library(tidyr)

expand_collapse_compliance1 <- function(x) {
  x <- x %>% 
    count(Sector, Sub_Sector, name = "Actual") %>% 
    group_split(Sector) %>% 
    lapply(function(x) {
      main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual)) 
      bind_rows(main, x)
    }) %>%
    bind_rows() %>% 
    mutate(Sector1 = Sector) %>%
    select(Sector1, Sector, Sub_Sector, Actual)
  
  total <- x %>%
    filter(is.na(Sub_Sector)) %>% 
    group_by(Sector = "Total") %>% 
    summarise(Actual = sum(Actual))
  
  bind_rows(x, total) %>% 
    mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>% 
    replace_na(list(Sub_Sector = "", Sector1 = ""))
}

expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#>   Sector1 Sector  Sub_Sector Actual
#>   <chr>   <chr>   <chr>       <int>
#> 1 "AAA"   "AAA"   ""              4
#> 2 "AAA"   ""      "AAA1"          4
#> 3 "BBB"   "BBB"   ""              2
#> 4 "BBB"   ""      "BBB1"          1
#> 5 "BBB"   ""      "BBB2"          1
#> 6 "CCC"   "CCC"   ""              2
#> 7 "CCC"   ""      "CCC1"          1
#> 8 "CCC"   ""      "CCC2"          1
#> 9 ""      "Total" ""              8

expand_collapse_compliance(my_data)
#>    Sector1 Sector Sub_Sector Actual
#> 2      AAA    AAA                 4
#> 3      AAA              AAA1      4
#> 4      BBB    BBB                 2
#> 5      BBB              BBB1      1
#> 6      BBB              BBB2      1
#> 7      CCC    CCC                 2
#> 8      CCC              CCC1      1
#> 9      CCC              CCC2      1
#> 91          Total                 8
票数 2
EN

Stack Overflow用户

发布于 2021-12-23 11:09:13

*apply家族的适当功能可以是使用拆分应用-组合方法的tapply。由于只有在有多个tapply的情况下才需要Sub_Sector,所以为了速度起见,我们实现了一个案例处理。

代码语言:javascript
复制
expand_collapse_complianceA <- \(data) {
  r <- do.call(rbind, c(by(data, data$Sector, \(x) {
    if (length(unique(x$Sub_Sector)) != 1L) {
      tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
      tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
    } else {
      tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
                              Actual=sum(x$count))))
    }
    rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
  }), make.row.names=FALSE))
  rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
    setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}

注: R version 4.1.2 (2021-11-01).

给出

代码语言:javascript
复制
expand_collapse_compliance(my_data)
#   Sector1 Sector Sub_Sector Actual
# 1     AAA    AAA                 4
# 2     AAA              AAA1      4
# 3     BBB    BBB                 2
# 4     BBB              BBB1      1
# 5     BBB              BBB2      1
# 6     CCC    CCC                 2
# 7     CCC              CCC1      1
# 8     CCC              CCC2      1
# 9          Total                 8


expand_collapse_complianceA(my_data) |> 
  (\(x) DT::datatable(
    x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"), 
    options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F, 
                 columnDefs=list(list(visible=FALSE, targets=0),
                                 list(className='dt-left', targets='_all'))), 
    class='hover cell-border stripe'))()

expand_collapse_complianceA现在只需要1/10的时间作为原始的for循环。这里是一个基准测试(在1080行上测试)。

代码语言:javascript
复制
# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval cld
#    ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351    10   c
# ecc_tapply  29.768177  29.851975  31.083977  30.611982  32.058980  34.50901    10 a  
#   ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652    10  b 
#     ecc_DT   3.267969   3.611711   4.610916   3.664493   3.707528  13.48797    10 a  

当然,data.table更快。然而,当数据即将进入exhaust the RAM时,我想看到性能。

基准守则:

代码语言:javascript
复制
microbenchmark::microbenchmark(
  ecc_for=expand_collapse_compliance(dat),
  ecc_tapply=expand_collapse_complianceA(dat),
  ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
  ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
  times=10L)

注意到,认为“整洁”版本到目前为止有一些缺陷(至少在新的数据中是这样)。

代码语言:javascript
复制
res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}


all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"             
# [2] "Component “Actual”: target is character, current is numeric"

数据

代码语言:javascript
复制
dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ", 
                     "AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU", 
                     "AV", "AW", "AX", "AY", "AZ", "BA"),
            Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]
票数 1
EN

Stack Overflow用户

发布于 2021-12-23 14:04:51

不需要任何循环,也不需要应用,我们需要的是三个不同的组计数和一些格式。假设--从样本数据中可以看出--不需要分割。

代码语言:javascript
复制
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

expand_collapse_compliance <- function(x) {
  x <- rbindlist(list(
    x[, .(Sector1 = Sector, Actual = .N), by = Sector], 
    setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
    x[, .(Sector = "Total", Actual = .N)]
  ), fill = T)
  setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
  setorder(x, Sector1, Sector, na.last = T)
  x
}

expand_collapse_compliance(my_data)

#    Sector1 Sector Sub_Sector Actual
# 1:     AAA    AAA       <NA>      4
# 2:     AAA   <NA>       AAA1      4
# 3:     BBB    BBB       <NA>      2
# 4:     BBB   <NA>       BBB1      1
# 5:     BBB   <NA>       BBB2      1
# 6:     CCC    CCC       <NA>      2
# 7:     CCC   <NA>       CCC1      1
# 8:     CCC   <NA>       CCC2      1
# 9:    <NA>  Total       <NA>      8

sidenote --没有必要将NA转换为"",因为在闪亮的DT中,会显示它的亮度。

测速

正如我所提到的,for在非常小的数据集上通常更快,因为任何库解决方案都使用一些函数,这些函数需要一些时间来加载一次.

代码语言:javascript
复制
my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

   test replications elapsed relative
2  eccB          150    0.32     1.00
1 eccDT          150    0.72     2.25

# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))

   test replications elapsed relative
2  eccB           50   79.30    5.146
1 eccDT           50   15.41    1.000
票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/70460012

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档