首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >在R中合并两个带有2列(逗号分隔的内容)的数据

在R中合并两个带有2列(逗号分隔的内容)的数据
EN

Stack Overflow用户
提问于 2022-01-17 15:50:04
回答 2查看 62关注 0票数 2

我有一个df,例如:

df1

代码语言:javascript
复制
Nb Groups_subgroups 
A  1_Cluster17972,1_Cluster2653,1_Cluster3900,2_Cluster12159,1_Cluster1798,1_Cluster17493
B  7_Cluster13022,1_Cluster4976
C  2_Cluster27505
D  7_Cluster25561,23_Cluster25561
E  1_Cluster23500

还有另一个df2,如:

代码语言:javascript
复制
Groups       Subgroups 
Cluster17972 1
Cluster17972 2 
Cluster2653  1
Cluster3900  1
Cluster12159 2
Cluster12159 3
Cluster1798  1
Cluster17493 1
Cluster00001 1
Cluster00001 2
Cluster13022 7
Cluster4976  1
Cluster27505 2
Cluster25561 7
Cluster25561 23
Cluster23500 1

我希望将df1$Groups_subgroupsdf2$Groupsdf2$Subgroups列合并,并得到如下结果:

代码语言:javascript
复制
Groups       Subgroups Nb
Cluster17972 1         A
Cluster17972 2         NA
Cluster2653  1         A
Cluster3900  1         A
Cluster12159 2         A
Cluster12159 3         NA
Cluster1798  1         A
Cluster17493 1         A
Cluster00001 1         NA
Cluster00001 2         NA
Cluster13022 7         B
Cluster4976  1         B
Cluster27505 2         C
Cluster25561 7         D
Cluster25561 23        D
Cluster23500 1         E

有人有主意吗?下面是两个dput格式的df:

df1

代码语言:javascript
复制
structure(list(Nb = c("A", "B", "C", "D", "E"), Groups_subgroups = c("1_Cluster17972,1_Cluster2653,1_Cluster3900,2_Cluster12159,1_Cluster1798,1_Cluster17493", 
"7_Cluster13022,1_Cluster4976", "2_Cluster27505", "7_Cluster25561,23_Cluster25561", 
"1_Cluster23500")), class = "data.frame", row.names = c(NA, -5L
))

df2

代码语言:javascript
复制
structure(list(Groups = c("Cluster17972", "Cluster17972", "Cluster2653", 
"Cluster3900", "Cluster12159", "Cluster12159", "Cluster1798", 
"Cluster17493", "Cluster00001", "Cluster00001", "Cluster13022", 
"Cluster4976", "Cluster27505", "Cluster25561", "Cluster25561", 
"Cluster23500"), Subgroups = c(1, 2, 1, 1, 2, 3, 1, 1, 1, 2, 
7, 1, 2, 7, 23, 1)), class = "data.frame", row.names = c(NA, 
-16L))
EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2022-01-17 16:02:28

德普利

代码语言:javascript
复制
library(dplyr)
library(tidyr) # unnest, separate
df1 %>%
  mutate(Groups_subgroups = strsplit(Groups_subgroups, "[ ,]+")) %>%
  unnest(Groups_subgroups) %>%
  separate(Groups_subgroups, into = c("Subgroups", "Groups")) %>%
  mutate(Subgroups = as.integer(Subgroups)) %>%
  left_join(df2, ., by = c("Groups", "Subgroups"))
#          Groups Subgroups   Nb
# 1  Cluster17972         1    A
# 2  Cluster17972         2 <NA>
# 3   Cluster2653         1    A
# 4   Cluster3900         1    A
# 5  Cluster12159         2    A
# 6  Cluster12159         3 <NA>
# 7   Cluster1798         1    A
# 8  Cluster17493         1    A
# 9  Cluster00001         1 <NA>
# 10 Cluster00001         2 <NA>
# 11 Cluster13022         7    B
# 12  Cluster4976         1    B
# 13 Cluster27505         2    C
# 14 Cluster25561         7    D
# 15 Cluster25561        23    D
# 16 Cluster23500         1    E

基R

代码语言:javascript
复制
subgr <- strsplit(df1$Groups_subgroups, "[ ,]+")
subgr
# [[1]]
# [1] "1_Cluster17972" "1_Cluster2653"  "1_Cluster3900"  "2_Cluster12159" "1_Cluster1798"  "1_Cluster17493"
# [[2]]
# [1] "7_Cluster13022" "1_Cluster4976" 
# [[3]]
# [1] "2_Cluster27505"
# [[4]]
# [1] "7_Cluster25561"  "23_Cluster25561"
# [[5]]
# [1] "1_Cluster23500"
groups <- data.frame(Nb = rep(df1$Nb, lengths(subgr)), GSG = unlist(subgr))
groups <- cbind(groups, strcapture("^([^_]+)_(.*)", groups$GSG, list(Subgroups = 0L, Groups = "")))
groups <- groups[,-2]
groups
#    Nb Subgroups       Groups
# 1   A         1 Cluster17972
# 2   A         1  Cluster2653
# 3   A         1  Cluster3900
# 4   A         2 Cluster12159
# 5   A         1  Cluster1798
# 6   A         1 Cluster17493
# 7   B         7 Cluster13022
# 8   B         1  Cluster4976
# 9   C         2 Cluster27505
# 10  D         7 Cluster25561
# 11  D        23 Cluster25561
# 12  E         1 Cluster23500

有了这个,我们只需将两个对象merge/join放在一起:

代码语言:javascript
复制
merge(df2, groups, by = c("Groups", "Subgroups"), all.x = TRUE)
#          Groups Subgroups   Nb
# 1  Cluster00001         1 <NA>
# 2  Cluster00001         2 <NA>
# 3  Cluster12159         2    A
# 4  Cluster12159         3 <NA>
# 5  Cluster13022         7    B
# 6  Cluster17493         1    A
# 7  Cluster17972         1    A
# 8  Cluster17972         2 <NA>
# 9   Cluster1798         1    A
# 10 Cluster23500         1    E
# 11 Cluster25561         7    D
# 12 Cluster25561        23    D
# 13  Cluster2653         1    A
# 14 Cluster27505         2    C
# 15  Cluster3900         1    A
# 16  Cluster4976         1    B
票数 3
EN

Stack Overflow用户

发布于 2022-01-17 16:08:55

首先我们可以用separate_rowsseparate来制备separate_rowsseparate,然后与GroupsSubgroups结合。

代码语言:javascript
复制
library(dplyr)
Library(tidyr)

df1 <- df1 %>% 
  separate_rows(Groups_subgroups, sep = ',') %>% 
  separate(Groups_subgroups, c("Subgroups", "Groups")) %>% 
  type.convert(as.is = TRUE) 

  
df2 %>% 
  left_join(df1, by=c("Groups", "Subgroups"))
代码语言:javascript
复制
         Groups Subgroups   Nb
1  Cluster17972         1    A
2  Cluster17972         2 <NA>
3   Cluster2653         1    A
4   Cluster3900         1    A
5  Cluster12159         2    A
6  Cluster12159         3 <NA>
7   Cluster1798         1    A
8  Cluster17493         1    A
9  Cluster00001         1 <NA>
10 Cluster00001         2 <NA>
11 Cluster13022         7    B
12  Cluster4976         1    B
13 Cluster27505         2    C
14 Cluster25561         7    D
15 Cluster25561        23    D
16 Cluster23500         1    E
票数 3
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/70744052

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档