首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >用表中的值字符串汇总多个列

用表中的值字符串汇总多个列
EN

Stack Overflow用户
提问于 2021-03-17 15:32:29
回答 2查看 194关注 0票数 1

我有这样的数据格式,其中大多数列包含值字符串;列A_aoiB_aoiC_aoi中的值表示凝视方向(ABC指向扬声器,* nowhere/A);列A_aoi_durB_aoi_durC_aoi_dur中的值表示这些凝视的持续时间:

代码语言:javascript
复制
df
# A tibble: 5 x 7
  speaker A_aoi     A_aoi_dur                            B_aoi B_aoi_dur    C_aoi C_aoi_dur           
  <chr>   <chr>     <chr>                                <chr> <chr>        <chr> <chr>               
1 ID01.B  B*B*B     494,251,416,217,35                   A*    153,1260     A     1413                
2 ID01.A  *B*C*C    445,412,116,533,600,153              A     2259         A*A*A 379,123,1300,144,313
3 ID01.A  B*B*B     1098,249,168,184,526                 A*A   1090,313,822 A*A   817,626,782         
4 ID01.C  C*C*B*    1794,1561,158,208,125,63             C*    2735,1174    *A    152,3757            
5 ID01.B  B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 *     5810         *B*B* 835,173,3827,661,314

对于每个speaker (可以通过列speaker中的后缀ABC来识别),我想要计算它们的凝视方向的总和时间和比例。我想要的表格是:

预期结果

代码语言:javascript
复制
  A_aoi Total     Prop B_aoi Total     Prop C_aoi Total      Prop
1     *  5431 34.77843     *  8557 54.79636     *  6021 38.556609
2     B  5533 35.43161     A  4324 27.68955     A  8761 56.102715
3     C  4652 29.78996     C  2735 17.51409     B   834  5.340676

我的感觉是,最好将数据格式转换成长格式。因此,使用separate_rows并为每个speaker的凝视、方向和凝视持续时间构造许多中间数据格式,我最终得到了这个复杂的代码--它完成了它应该做的事情。但是,,我很确定还有一种更经济、更优雅的方法--

那会是什么?帮助是非常感谢的!

代码语言:javascript
复制
library(dplyr)
library(tidyr)

### A:
a_dur <- df %>% 
  separate_rows(A_aoi_dur, sep = ",") %>%
  select(A_aoi_dur)
a_aoi <- df %>% 
  separate_rows(A_aoi, sep = "") %>%
  select(A_aoi) %>%
  filter(!A_aoi == "")
A <- cbind(a_dur, a_aoi)

# get grouped total durations and proportions: 
A_stat <- A %>%
  group_by(A_aoi) %>%
  summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
  mutate(Prop = Total/sum(Total)*100)

### B:
b_dur <- df %>% 
  separate_rows(B_aoi_dur, sep = ",") %>%
  select(B_aoi_dur)
b_aoi <- df %>% 
  separate_rows(B_aoi, sep = "") %>%
  select(B_aoi) %>%
  filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)

# get grouped total durations and proportions: 
B_stat <- B %>%
  group_by(B_aoi) %>%
  summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
  mutate(Prop = Total/sum(Total)*100)


### C:
c_dur <- df %>% 
  separate_rows(C_aoi_dur, sep = ",") %>%
  select(C_aoi_dur)
c_aoi <- df %>% 
  separate_rows(C_aoi, sep = "") %>%
  select(C_aoi) %>%
  filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)

# get grouped total durations and proportions: 
C_stat <- C %>%
  group_by(C_aoi) %>%
  summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
  mutate(Prop = Total/sum(Total)*100)

# get final table:
cbind(A_stat, B_stat, C_stat)

可复制数据

代码语言:javascript
复制
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C", 
                                     "ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
                                     ), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153", 
                                                      "1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
                                     ), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260", 
                                                                                              "2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A", 
                                                                                                                                                      "A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313", 
                                                                                                                                                                                                    "817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA, 
                                                                                                                                                                                                                                                                       -5L), class = c("tbl_df", "tbl", "data.frame"))
EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2021-03-17 16:29:30

一种方法(但避免重复列名):

代码语言:javascript
复制
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
map_columns <- function(aoi, dur){
  tibble(
    speaker = aoi,
    duration = as.integer(dur)
  )
}

df %>% 
select(-1) %>% #This column seems irrelevant
mutate(
    A_aoi = str_split(A_aoi, ''),
    B_aoi = str_split(B_aoi, ''),
    C_aoi = str_split(C_aoi, ''),
    A_aoi_dur = str_split(A_aoi_dur, ','),
    B_aoi_dur = str_split(B_aoi_dur, ','),
    C_aoi_dur = str_split(C_aoi_dur, ','),
    A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
    B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
    C_aoi = map2(C_aoi, C_aoi_dur, map_columns),
) %>% 
select(1, 3, 5) %>% 
gather() %>% 
unnest(cols = value) %>% 
group_by(key, speaker) %>% 
summarise(
    total = sum(duration)
) %>% 
mutate(
    prop = total/sum(total)*100
) %>% 
ungroup() %>% 
nest(data = -key) %>% 
spread(key, data) %>% 
unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))

输出:

代码语言:javascript
复制
# A tibble: 3 x 9
  speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
  <chr>       <int>  <dbl> <chr>       <int>  <dbl> <chr>       <int>  <dbl>
1 *            5431   34.8 *            8557   54.8 *            6021  38.6 
2 B            5533   35.4 A            4324   27.7 A            8761  56.1 
3 C            4652   29.8 C            2735   17.5 B             834   5.34
票数 2
EN

Stack Overflow用户

发布于 2021-03-17 16:26:59

这里还有一点需要对列进行排序,但我认为它与您的代码相比是一个整洁的版本,尽管输出有一些不同,因为它将所有的aoi都放在一列中,而不是有3列与您的不同。

代码语言:javascript
复制
library(dplyr)
library(tidyr)
library(purrr)
# Using group_split to separate duration & attention group
split_df <- df %>%
  pivot_longer(cols = contains("aoi"), names_to = "aoi",
    values_to = "aoi_values") %>%
  mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
  group_split(aoi_names)
# For each group apply the same logics you do then combined them together
tidy_df <- bind_cols(split_df[[1]] %>%
    separate_rows(aoi_values, sep = "") %>%
    filter(aoi_values != "") %>%
    select(speaker, aoi, aoi_values),
  split_df[[2]] %>%
    separate_rows(aoi_values, sep = ",") %>%
    mutate(aoi = gsub("_dur", "", aoi)) %>%
    select(duration = aoi_values))
# Finally calculate and pivot wider to have your desire output
tidy_df %>%
  group_by(aoi, aoi_values) %>%
  summarize(total_duration = sum(as.numeric(duration)),
    .groups = "drop") %>%
  group_by(aoi) %>%
  mutate(prop = total_duration / sum(total_duration) * 100) %>%
  pivot_wider(id_cols = aoi_values, names_from = aoi,
    names_glue = "{aoi}_{.value}",
    values_fill = 0,
    values_from = c(total_duration, prop)) %>%
  select(aoi_values, sort(names(.)))

输出

代码语言:javascript
复制
# A tibble: 4 x 7
  aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
  <chr>           <dbl>                <dbl>      <dbl>                <dbl>      <dbl>                <dbl>
1 *                34.8                 5431       54.8                 8557      38.6                  6021
2 B                35.4                 5533        0                      0       5.34                  834
3 C                29.8                 4652       17.5                 2735       0                       0
4 A                 0                      0       27.7                 4324      56.1                  8761
票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/66676259

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档