文章/答案/技术大牛

发布

社区首页 >问答首页 >Dplyr创建一个唯一的情侣ID

问Dplyr创建一个唯一的情侣ID
EN

Stack Overflow用户

提问于 2021-05-11 09:25:14

回答 3查看 147关注 0票数 1

我正试图为两列创建唯一的I。dataframe有两个列，Sp1和Sp2，它们是字符串。Sp1和Sp2可以是相同的字符串。我希望有一个唯一的(Sp1;Sp2)耦合标识符，它不考虑(Sp1;Sp2)的顺序。例如，(A;B)应该具有与(B;A)相同的标识符。

我尝试使用dplyr，但是它没有工作，因为sort()函数为每一行呈现相同的标识符。

所以我使用了一个经典的循环，但是我有一个很大的数据集，它花费了太多的时间：

for (k in 1:nrow(data)){
 data$Couple[k] <- paste0(
  sort(c(as.character(data$Sp_1[k]), as.character(data$Sp_2[k])))[1],
  "_",
  sort(c(as.character(data$Sp_1[k]), as.character(data$Sp_2[k])))[2])
}

我也希望有数字，而不是字符串，因为它将占用较少的内存。

dplyr

回答 3

Stack Overflow用户

回答已采纳

发布于 2021-05-11 12:32:14

使用pmin和pmax

data %>% 
  mutate(id1 = paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)),
         id2 = as.integer(as.factor(id1)))

# # A tibble: 10 x 4
#    Sp_1  Sp_2  id1     id2
#    <chr> <chr> <chr> <int>
#  1 a     e     ae       3
#  2 d     e     de       7
#  3 a     b     ab       1
#  4 b     b     bb       4
#  5 e     a     ae       3
#  6 c     e     ce       6
#  7 b     e     be       5
#  8 c     a     ac       2
#  9 c     a     ac       2
# 10 a     e     ae       3

编辑：如果我们追求的是效率，坚持与基：：转换，见基准。示例数据比示例数据快5倍，更大的数据快1.5倍：

# bigger data
set.seed(1); data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)

microbenchmark::microbenchmark(
  x1 = {
    data %>% 
      mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  x2 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  }, unit = "relative")

# Unit: relative
# expr      min       lq     mean   median       uq      max neval
#   x1 1.476691 1.457313 1.414833 1.429563 1.303684 2.209446   100
#   x2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100

票数 3

Stack Overflow用户

发布于 2021-05-11 10:06:33

这里有一个解决方案，它首先使用rowwise()来获取顺序不重要的Sp_1 + Sp_2的表示，然后使用purrr::map_int()将其转换为唯一的id：

# Make this reproducible
set.seed(1)

# Load packages
library(dplyr)
library(purrr)

# Define and inspect a test dataset
data <- tibble(
  Sp_1 = sample(letters[1:5], 10, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10, replace = TRUE)
)

data
#> # A tibble: 10 x 2
#>    Sp_1  Sp_2 
#>    <chr> <chr>
#>  1 a     e    
#>  2 d     e    
#>  3 a     b    
#>  4 b     b    
#>  5 e     a    
#>  6 c     e    
#>  7 b     e    
#>  8 c     a    
#>  9 c     a    
#> 10 a     e

data %>% 
  
  # Add a unique representation of `Sp_1` and `Sp_2` where order doesn't matter
  rowwise() %>% 
  mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>% 
  ungroup() %>% 
  
  # Use `map_int()` to get an integer `id` representation of `string`
  mutate(id = map_int(string, ~which(unique(string) == .)))
#> # A tibble: 10 x 4
#>    Sp_1  Sp_2  string    id
#>    <chr> <chr> <chr>  <int>
#>  1 a     e     ae         1
#>  2 d     e     de         2
#>  3 a     b     ab         3
#>  4 b     b     bb         4
#>  5 e     a     ae         1
#>  6 c     e     ce         5
#>  7 b     e     be         6
#>  8 c     a     ac         7
#>  9 c     a     ac         7
#> 10 a     e     ae         1

票数 1

Stack Overflow用户

发布于 2021-05-12 07:38:32

基准测试

@BluVoxe给出了一个可复制的例子。我们比较了@BluVoxe和@ We 8754的解决方案的输出和运行时间：

library(dplyr)
library(purrr)
library(microbenchmark)

# Make this reproducible
set.seed(1)

# Define and inspect a test dataset
data <- tibble(
  Sp_1 = sample(letters[1:5], 10, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10, replace = TRUE)
)

data

# # A tibble: 10 x 2
#    Sp_1  Sp_2 
#    <chr> <chr>
#  1 a     e    
#  2 d     e    
#  3 a     b    
#  4 b     b    
#  5 e     a    
#  6 c     e    
#  7 b     e    
#  8 c     a    
#  9 c     a    
# 10 a     e

#First solution

data1 <- data%>% 
  
  # Add a unique representation of `Sp_1` and `Sp_2` where order doesn't matter
  dplyr::rowwise() %>% 
  dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>% 
  dplyr::ungroup() %>% 
  
  # Use `map_int()` to get an integer `id` representation of `string`
  dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))

data1

# # A tibble: 10 x 4
#   Sp_1  Sp_2  string id
#   <chr> <chr> <chr>  <int>
# 1 a     e     ae     1
# 2 d     e     de     2
# 3 a     b     ab     3
# 4 b     b     bb     4
# 5 e     a     ae     1
# 6 c     e     ce     5
# 7 b     e     be     6
# 8 c     a     ac     7
# 9 c     a     ac     7
# 10 a    e     ae     1


# Second solution

data2 <- data %>% 
  dplyr::mutate(id1 = paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)),
         id2 = as.integer(as.factor(id1)))

data2

# # A tibble: 10 x 4
#   Sp_1  Sp_2  id1   id2
#   <chr> <chr> <chr> <int>
# 1 a     e     ae    3
# 2 d     e     de    7
# 3 a     b     ab    1
# 4 b     b     bb    4
# 5 e     a     ae    3
# 6 c     e     ce    6
# 7 b     e     be    5
# 8 c     a     ac    2
# 9 c     a     ac    2
# 10 a    e     ae    3

# Third solution
data3 <- transform(
  data,
  id = as.integer(
    as.factor(
      paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2))
      )
    )
  )

data3

#       Sp_1 Sp_2 id
# 1     a    e    3
# 2     d    e    7
# 3     a    b    1
# 4     b    b    4
# 5     e    a    3
# 6     c    e    6
# 7     b    e    5
# 8     c    a    2
# 9     c    a    2
# 10    a    e    3

 #Compare efficiency :

microbenchmark::microbenchmark(
  x1 = {
    data%>% 
      dplyr::rowwise() %>% 
      dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>% 
      dplyr::ungroup() %>%
      dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))
  },
  x2 = {
    data %>% 
      dplyr::mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  x3 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  },
  unit = "relative")

#Unit: relative
 #expr       min        lq      mean    median        uq      max neval
  # x1 23.329340 24.151001 23.951911 23.710270 22.996736 28.23673   100
   #x2  8.064332  7.785381  8.214726  7.796895  7.741803 19.18936   100
   #x3  1.000000  1.000000  1.000000  1.000000  1.000000  1.00000   100

#With bigger data :

set.seed(1)
data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)

microbenchmark::microbenchmark(
  x1 = {
    data%>% 
      dplyr::rowwise() %>% 
      dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>% 
      dplyr::ungroup() %>% 
      dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))
  },
  x2 = {
    data %>% 
      dplyr::mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  x3 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  }, unit = "relative")

#Unit: relative
 #expr        min         lq       mean     median         uq        max neval
   #x1 524.626924 512.590748 506.051098 515.687843 521.642359 418.635195   100
   #x2   1.503782   1.514021   1.577941   1.559449   1.620967   1.648478   100
   #x3   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000   100

最后一种解决方案是最有效的。

但是，有一些不同:第一个解决方案和lat解决方案呈现ID的顺序与它们在dataset中显示的顺序相同，而第二个解决方案以一对夫妇的字母顺序呈现ID。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/67483983

复制

相似问题

问Dplyr创建一个唯一的情侣ID
EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问Dplyr创建一个唯一的情侣IDEN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问Dplyr创建一个唯一的情侣ID
EN