我有这样的数据格式,其中大多数列包含值字符串;列A_aoi、B_aoi和C_aoi中的值表示凝视方向(A、B和C指向扬声器,* nowhere/A);列A_aoi_dur、B_aoi_dur和C_aoi_dur中的值表示这些凝视的持续时间:
df
# A tibble: 5 x 7
speaker A_aoi A_aoi_dur B_aoi B_aoi_dur C_aoi C_aoi_dur
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID01.B B*B*B 494,251,416,217,35 A* 153,1260 A 1413
2 ID01.A *B*C*C 445,412,116,533,600,153 A 2259 A*A*A 379,123,1300,144,313
3 ID01.A B*B*B 1098,249,168,184,526 A*A 1090,313,822 A*A 817,626,782
4 ID01.C C*C*B* 1794,1561,158,208,125,63 C* 2735,1174 *A 152,3757
5 ID01.B B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 * 5810 *B*B* 835,173,3827,661,314对于每个speaker (可以通过列speaker中的后缀A、B和C来识别),我想要计算它们的凝视方向的总和时间和比例。我想要的表格是:
预期结果
A_aoi Total Prop B_aoi Total Prop C_aoi Total Prop
1 * 5431 34.77843 * 8557 54.79636 * 6021 38.556609
2 B 5533 35.43161 A 4324 27.68955 A 8761 56.102715
3 C 4652 29.78996 C 2735 17.51409 B 834 5.340676我的感觉是,最好将数据格式转换成长格式。因此,使用separate_rows并为每个speaker的凝视、方向和凝视持续时间构造许多中间数据格式,我最终得到了这个复杂的代码--它完成了它应该做的事情。但是,,我很确定还有一种更经济、更优雅的方法--!
那会是什么?帮助是非常感谢的!
library(dplyr)
library(tidyr)
### A:
a_dur <- df %>%
separate_rows(A_aoi_dur, sep = ",") %>%
select(A_aoi_dur)
a_aoi <- df %>%
separate_rows(A_aoi, sep = "") %>%
select(A_aoi) %>%
filter(!A_aoi == "")
A <- cbind(a_dur, a_aoi)
# get grouped total durations and proportions:
A_stat <- A %>%
group_by(A_aoi) %>%
summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### B:
b_dur <- df %>%
separate_rows(B_aoi_dur, sep = ",") %>%
select(B_aoi_dur)
b_aoi <- df %>%
separate_rows(B_aoi, sep = "") %>%
select(B_aoi) %>%
filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)
# get grouped total durations and proportions:
B_stat <- B %>%
group_by(B_aoi) %>%
summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### C:
c_dur <- df %>%
separate_rows(C_aoi_dur, sep = ",") %>%
select(C_aoi_dur)
c_aoi <- df %>%
separate_rows(C_aoi, sep = "") %>%
select(C_aoi) %>%
filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)
# get grouped total durations and proportions:
C_stat <- C %>%
group_by(C_aoi) %>%
summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
# get final table:
cbind(A_stat, B_stat, C_stat)可复制数据
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C",
"ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153",
"1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260",
"2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A",
"A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313",
"817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))发布于 2021-03-17 16:29:30
一种方法(但避免重复列名):
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
map_columns <- function(aoi, dur){
tibble(
speaker = aoi,
duration = as.integer(dur)
)
}
df %>%
select(-1) %>% #This column seems irrelevant
mutate(
A_aoi = str_split(A_aoi, ''),
B_aoi = str_split(B_aoi, ''),
C_aoi = str_split(C_aoi, ''),
A_aoi_dur = str_split(A_aoi_dur, ','),
B_aoi_dur = str_split(B_aoi_dur, ','),
C_aoi_dur = str_split(C_aoi_dur, ','),
A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
C_aoi = map2(C_aoi, C_aoi_dur, map_columns),
) %>%
select(1, 3, 5) %>%
gather() %>%
unnest(cols = value) %>%
group_by(key, speaker) %>%
summarise(
total = sum(duration)
) %>%
mutate(
prop = total/sum(total)*100
) %>%
ungroup() %>%
nest(data = -key) %>%
spread(key, data) %>%
unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))输出:
# A tibble: 3 x 9
speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
<chr> <int> <dbl> <chr> <int> <dbl> <chr> <int> <dbl>
1 * 5431 34.8 * 8557 54.8 * 6021 38.6
2 B 5533 35.4 A 4324 27.7 A 8761 56.1
3 C 4652 29.8 C 2735 17.5 B 834 5.34发布于 2021-03-17 16:26:59
这里还有一点需要对列进行排序,但我认为它与您的代码相比是一个整洁的版本,尽管输出有一些不同,因为它将所有的aoi都放在一列中,而不是有3列与您的不同。
library(dplyr)
library(tidyr)
library(purrr)
# Using group_split to separate duration & attention group
split_df <- df %>%
pivot_longer(cols = contains("aoi"), names_to = "aoi",
values_to = "aoi_values") %>%
mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
group_split(aoi_names)
# For each group apply the same logics you do then combined them together
tidy_df <- bind_cols(split_df[[1]] %>%
separate_rows(aoi_values, sep = "") %>%
filter(aoi_values != "") %>%
select(speaker, aoi, aoi_values),
split_df[[2]] %>%
separate_rows(aoi_values, sep = ",") %>%
mutate(aoi = gsub("_dur", "", aoi)) %>%
select(duration = aoi_values))
# Finally calculate and pivot wider to have your desire output
tidy_df %>%
group_by(aoi, aoi_values) %>%
summarize(total_duration = sum(as.numeric(duration)),
.groups = "drop") %>%
group_by(aoi) %>%
mutate(prop = total_duration / sum(total_duration) * 100) %>%
pivot_wider(id_cols = aoi_values, names_from = aoi,
names_glue = "{aoi}_{.value}",
values_fill = 0,
values_from = c(total_duration, prop)) %>%
select(aoi_values, sort(names(.)))输出
# A tibble: 4 x 7
aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 * 34.8 5431 54.8 8557 38.6 6021
2 B 35.4 5533 0 0 5.34 834
3 C 29.8 4652 17.5 2735 0 0
4 A 0 0 27.7 4324 56.1 8761https://stackoverflow.com/questions/66676259
复制相似问题