文章/答案/技术大牛

发布

社区首页 >问答首页 >如何根据一定条件优化粘贴单/多个列名及其值

问如何根据一定条件优化粘贴单/多个列名及其值
EN

Stack Overflow用户

提问于 2021-02-05 18:47:02

回答 1查看 164关注 0票数 1

我想贴上列名和它们的值。它必须基于某些条件(if语句)，并且可以基于一个变量或多个变量。

下面是一个显示数据外观的小示例。我希望加快这个过程，并获得与fun2、fun3和fun4相同的结果。

为了使这尽可能简单，如果列a、b、c和d的值大于零，则只有一条规则可设置为缺失。但是，我保留了规则的名称，因为它可以是不同的，比如“规则1”>0和“规则2”(如果不缺少)。

library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")

dat <- data.table(id = 1:10,
                  t1 = rnorm(10),
                  t2 = rnorm(10),
                  a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
                  b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
                  c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
                  d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
                  re = "")

这就是数据的样子：

id         t1         t2  a  b  c  d re
 1  0.6883367 -0.3454049  0  0  0  0 '' 
 2 -1.0653127 -1.3035077 NA NA NA NA '' 
 3  0.5210550  0.8489376  0 NA  0  1 '' 
 4  0.3697369 -0.1135827  1  0 NA  1 '' 
 5  1.3195759 -1.5431305  0  1  0  0 '' 
 6 -0.2106836 -0.3421900 NA  0  1  1 '' 
 7 -0.2258871 -2.1644697  1  1 NA  0 '' 
 8 -0.7132686  1.7673775  1 NA  1  1 '' 
 9  0.9467068  1.8188665  0  1  1 NA '' 
10 -0.3900479  1.7306935  1  1  1  1 ''

波纹管是期望的输出。这样做的目的是保留一个描述列，并说明某些值被设置为缺失的原因。在本例中，只有前两个个体拥有t1和t2的记录。个人1、2和3有t1的记录，而个人1、2、5、7和9有t2的记录。

id       t1     t2     a     b     c     d    re                                      
 1  -0.182   1.43      0     0     0     0   ""                                      
 2  -1.31    0.733    NA    NA    NA    NA   ""                                      
 3  -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1);"                       
 4  NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1);"        
 5  NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
 6  NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1);"        
 7  NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
 8  NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"   
 9  NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
10  NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"

第一次尝试(fun1)而不是预期的结果，因为它在变异中寻找单个空白。所有其他函数(fun2、fun3和fun4)都打印正确的结果。

fun1 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        := rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id    t1     t2     a     b     c     d    re                                      
<int> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1    NA   1.43     0     0     0     0   "Rule1:t1(  ); "                        
   2    NA   0.733   NA    NA    NA    NA   "Rule1:t1(  ); "                        
   3    NA  NA        0    NA     0     1   "Rule2:t2(d=1); Rule1:t1(  ); "         
   4    NA  NA        1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
   5    NA   1.78     0     1     0     0   "Rule1:t1( b=1 ); "                     
   6    NA  NA       NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(  c=1); "      
   7    NA  -0.345    1     1    NA     0   "Rule1:t1(a=1 b=1 ); "                  
   8    NA  NA        1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9    NA  -1.22     0     1     1    NA   "Rule1:t1( b=1 c=1); "                  
  10    NA  NA        1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数2 (fun2)使用“裁剪”。

fun2 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        := rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1 -0.182   1.43      0     0     0     0   ""                                      
   2 -1.31    0.733    NA    NA    NA    NA   ""                                      
   3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
   4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
   5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
   6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
   7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
   8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
  10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数3 (fun3)使用带有正则表达式的"gsub“。

fun3 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := gsub("\\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        := rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
  1 -0.182   1.43      0     0     0     0   ""                                      
  2 -1.31    0.733    NA    NA    NA    NA   ""                                      
  3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
  4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
  5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
  6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
  7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1b=1); "                   
  8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1c=1); "   
  9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1c=1); "                   
 10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "

函数4 (fun4)使用stri_detect内部的正则表达式进行变异。

fun4 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        := rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1 -0.182   1.43      0     0     0     0 ""                                      
   2 -1.31    0.733    NA    NA    NA    NA ""                                      
   3 -0.0613 NA         0    NA     0     1 "Rule2:t2(d=1); "                       
   4 NA      NA         1     0    NA     1 "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
   5 NA       1.78      0     1     0     0 "Rule1:t1( b=1 ); "                     
   6 NA      NA        NA     0     1     1 "Rule2:t2(d=1); Rule1:t1(  c=1); "      
   7 NA      -0.345     1     1    NA     0 "Rule1:t1(a=1 b=1 ); "                  
   8 NA      NA         1    NA     1     1 "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9 NA      -1.22      0     1     1    NA "Rule1:t1( b=1 c=1); "                  
  10 NA      NA         1     1     1     1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

有更多数据的基准测试

n <- 200000
dat <- data.table(id = 1:n,
                  t1 = rnorm(n),
                  t2 = rnorm(n),
                  a  = sample(c(0, NA, 1), n, replace = TRUE),
                  b  = sample(c(0, NA, 1), n, replace = TRUE),
                  c  = sample(c(0, NA, 1), n, replace = TRUE),
                  d  = sample(c(0, NA, 1), n, replace = TRUE),
                  re = "")

benchmark(fun1(dat),
          fun2(dat),
          fun3(dat),
          fun4(dat))

Benchmark summary:
  Time units : milliseconds 
     expr n.eval min lw.qu median mean up.qu  max total relative
fun1(dat)    100 642   653    660  668   666  774 66800     1.00
fun2(dat)    100 742   756    763  773   768  874 77300     1.16
fun3(dat)    100 765   779    785  794   791  903 79400     1.19
fun4(dat)    100 743   756    763  777   770 1010 77700     1.16

有没有人知道如何加快这一进程？

谢谢。

dplyr

data.table

tidyverse

tidytable

回答 1

Stack Overflow用户

回答已采纳

发布于 2021-02-06 19:56:12

首先，我承认我没有能够超过基准(谢谢你的挑战)。也许有一些方法可以加快速度，但是让我推荐一种方法，它可以做同样的事情(对于更小的数据更快，对于大数据更快)，但是支持每个规则函数。这不是您直接要求的，但是您暗示了每个规则的不同功能。

(我已经更新了的代码，这要感谢@Cole找到了我早期探索的遗迹。)

RULES <- list(
  Rule1 = list(
    rule = "Rule1",
    lhs = "t1",
    rhs = c("a", "b", "c"),
    fun = function(z) !is.na(z) & z > 0
  ),
  Rule2 = list(
    rule = "Rule2",
    lhs = "t2",
    rhs = "d",
    fun = is.na
    )
)

fun9 <- function(dat, RULES = list()) {
  nr <- nrow(dat)
  # RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
  RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
  for (r in seq_along(RULES)) {
    fun <- RULES[[r]]$fun
    lhs <- RULES[[r]]$lhs
    for (rhs in RULES[[r]]$rhs) {
      lgl <- do.call(fun, list(dat[[rhs]]))
      set(dat, which(lgl), lhs, NA)
      RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
    }
    ind <- nzchar(RE[[r]])
    RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
  }
  set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}

RULES和使用fun9的前提应该是不言而喻的.

用小数据制定基准似乎很有希望：

set.seed(2021)
dat <- data.table(id = 1:10,
                  t1 = rnorm(10),
                  t2 = rnorm(10),
                  a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
                  b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
                  c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
                  d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
                  re = "")
fun9(dat, RULES)[]
#        id         t1         t2     a     b     c     d                                re
#     <int>      <num>      <num> <num> <num> <num> <num>                            <char>
#  1:     1 -0.1224600 -1.0822049     0     0     0     0                                 ;
#  2:     2  0.5524566         NA    NA    NA    NA    NA                   ;Rule2:t2( d=1)
#  3:     3  0.3486495  0.1819954     0    NA     0     1                                 ;
#  4:     4         NA  1.5085418     1     0    NA     1                   Rule1:t1( a=1);
#  5:     5         NA  1.6044701     0     1     0     0                   Rule1:t1( b=1);
#  6:     6         NA -1.8414756    NA     0     1     1                   Rule1:t1( c=1);
#  7:     7         NA  1.6233102     1     1    NA     0               Rule1:t1( a=1 b=1);
#  8:     8         NA  0.1313890     1    NA     1     1               Rule1:t1( a=1 c=1);
#  9:     9         NA         NA     0     1     1    NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10:    10         NA  1.5133183     1     1     1     1           Rule1:t1( a=1 b=1 c=1);

bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time             gc                  
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>           <list>              
# 1 fun4(dat)          9.52ms   11.1ms      88.5     316KB     2.06    43     1      486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]>    <tibble [44 x 3]>   
# 2 fun9(dat, RULES)   97.5us  113.5us    7760.       416B     6.24  3731     3      481ms <NULL> <Rprofmem[,3] [2 x 3]>  <bch:tm [3,734]> <tibble [3,734 x 3]>

从`itr/sec`来看，这个fun9看起来要快一点。

有了更大的数据：

set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
                  t1 = rnorm(n),
                  t2 = rnorm(n),
                  a  = sample(c(0, NA, 1), n, replace = TRUE),
                  b  = sample(c(0, NA, 1), n, replace = TRUE),
                  c  = sample(c(0, NA, 1), n, replace = TRUE),
                  d  = sample(c(0, NA, 1), n, replace = TRUE),
                  re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                   time         gc              
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                   <list>       <list>          
# 1 fun4(dat)           1.24s    1.24s     0.806    62.9MB     1.61     1     2      1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms  315.4ms     3.17     53.8MB     4.76     2     3    630.8ms <NULL> <Rprofmem[,3] [70 x 3]>  <bch:tm [2]> <tibble [2 x 3]>

虽然此解决方案不使用tidytable或其流，但速度更快。清除re是另一个步骤，很可能会使这个速度降到致命的水平:-)。

附带注意:我试图使用lapply、mget和其他技巧在data.table数据环境中进行操作，但最终，使用data.table::set (https://stackoverflow.com/a/16846530/3358272)和简单向量似乎是最快的。

票数 3

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/66068903

复制

相似问题

问如何根据一定条件优化粘贴单/多个列名及其值
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何根据一定条件优化粘贴单/多个列名及其值EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何根据一定条件优化粘贴单/多个列名及其值
EN