我使用以下dplyr代码创建了这个表:
数据
demo <- structure(list(`Performance-1` = c(4, 5, 3, 3, 5, 4, 4, 4, 4,
4, 5, 4, 5, 5, 3, 5, 2, 3, 3, 4, 4, 5, 4, 3, 4), `Performance-2` = c(4,
5, 3, 3, 5, 4, 4, 3, 3, 4, 5, 5, 5, 4, 3, 5, 2, 3, 3, 4, 4, 5,
4, 3, 3), Gender = c("Male", "Female", "Male", "Male", "Male",
"Female", "Male", "Female", "Male", "Male", NA, "Male", "Male",
"Male", "Male", "Male", NA, "Female", NA, "Female", "Male", "Male",
"Male", "Male", NA)), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame"))这只是我无法访问的主要数据的一个示例。下面的结果可能不一样
analysis_vars <- c("Performance-1", "Performance-2")
demo %>%
pivot_longer(cols = analysis_vars,names_to = "Performance") %>%
select(Performance, value, Gender) %>%
filter(!is.na(Gender), Performance %in% c("Performance-1", "Performance-2")) %>%
group_by(Gender, Performance) %>%
summarise(mean = round(mean(value, na.rm=T),2), sd = round(sd(value, na.rm=T),2), N = sum(!is.na(value))) %>%
pivot_wider(names_from = Gender, values_from = c(mean, sd, N))
Performance mean_Female mean_Male sd_Female sd_Male N_Female N_Male
Performance-1 4.14 4.10 0.79 0.79 428 896
Performance-2 4.00 3.91 0.87 0.86 427 897我想得到一个巨大的平均值和巨大的std偏差,作为las行,但我不知道这一点。
当我尝试以下代码时:
demo %>%
pivot_longer(cols = analysis_vars,names_to = "Performance") %>%
select(Performance, value, Gender) %>%
filter(!is.na(Gender), Performance %in% c("Performance-1", "Performance-2")) %>%
group_by(Gender, Performance) %>%
summarise(mean = round(mean(value, na.rm=T),2), sd = round(sd(value, na.rm=T),2), N = sum(!is.na(value))) %>% ungroup() %>%
add_row(mean = sum(.$mean*.$N)/sum(.$N), sd = sum(.$N-1)*.$sd/sum(.$N)) %>%
pivot_wider(names_from = Gender, values_from = c(mean, sd, N)) 我得到的结果是:
Performance mean_Female mean_Male sd_Female sd_Male N_Female N_Male
Performance-1 <dbl [1]> <dbl [1]> <NULL> <dbl [1]> <dbl [1]> <NULL>
Performance-2 <dbl [1]> <dbl [1]> <NULL> <dbl [1]> <dbl [1]> <NULL> 当我移除pivot_wider (最后一行)以查看正在发生的事情时,这就是我所看到的。似乎它为两种性别都增加了行。
Gender Performance mean sd N
Female Performance-1 4.140000 0.7900000 428
Female Performance-2 4.000000 0.8700000 427
Male Performance-1 4.100000 0.7900000 896
Male Performance-2 3.910000 0.8600000 897
NA NA 4.025978 0.7888066 NA
NA NA 4.025978 0.8686858 NA
NA NA 4.025978 0.7888066 NA
NA NA 4.025978 0.8587009 NA所以,我想也许我应该在旋转之后这样做:
demo %>%
pivot_longer(cols = analysis_vars,names_to = "Performance") %>%
select(Performance, value, Gender) %>%
filter(!is.na(Gender), Performance %in% c("Performance-1", "Performance-2")) %>%
group_by(Gender, Performance) %>%
summarise(mean = round(mean(value, na.rm=T),2), sd = round(sd(value, na.rm=T),2), N = sum(!is.na(value))) %>% ungroup() %>%
pivot_wider(names_from = Gender, values_from = c(mean, sd, N)) %>% ungroup() %>%
add_row(mean_Male = sum(.$mean_Male*.$N_Male)/sum(.$N_Male),
mean_Female = sum(.$mean_Female*.$N_Female)/sum(.$N_Female),
sd_Male = sum(.$N_Male-1)*.$sd_Male/sum(.$N_Male),
sd_Female = sum(.$N_Female-1)*.$sd_Female/sum(.$N_Female)) 但我明白
Error in vec_rbind(old, new) : Internal error in `vec_assign()`: `value` should have been recycled to fit `x`.我不太清楚这意味着什么。有更简单的方法来计算大平均值和sd吗?
更新
我发现了上面的错误。我应该用.$N_male和.$N_female。这解决了错误,但仍然没有产生我想要的结果。我已经把上面的密码修好了。
更新- 2
如上表所示:
Performance mean_Female mean_Male sd_Female sd_Male N_Female N_Male
Performance-1 4.14 4.10 0.79 0.79 428 896
Performance-2 4.00 3.91 0.87 0.86 427 897让我们计算一下grand mean:
Female: ((4.14*428)+(4.00*427))/(428+427)
Male: ((4.10*896)+(3.91*897))/(896+897)那么对于sd: sqrt(((N1-1)*S1^2+(N2-1)*S2^2+(N3-1)*S3^2)/(N1+N2+N3-3))
sd_Female: ((428-1)*0.79+(427-1)*0.87)/(428+427-2)
sd_Female: ((896-1)*0.79+(897-1)*0.86)/(896+897-2)
Performance mean_Female mean_Male sd_Female sd_Male N_Female N_Male
Performance-1 4.14 4.10 0.79 0.79 428 896
Performance-2 4.00 3.91 0.87 0.86 427 897
Grand Mean 4.07 4.00 0.83 0.83 我还不知道如何处理N_male和女性,所以我不介意任何一种方式- null或一些calc。
发布于 2020-05-17 01:45:35
正如我在评论中提到的,在我们获得宽格式的数据之前,必须先进行计算。在这里,我建议两种方法,你可以选择一个适合你。
library(dplyr)
library(tidyr)
demo %>%
pivot_longer(cols = starts_with('Performance'),names_to = "Performance") %>%
select(Performance, value, Gender) %>%
filter(!is.na(Gender), Performance %in% c("Performance-1", "Performance-2")) %>%
group_by(Gender, Performance) %>%
summarise(mean = round(mean(value, na.rm=T),2),
sd = round(sd(value, na.rm=T),2),
N = sum(!is.na(value))) %>%
mutate(gm = sum(mean * N)/sum(N),
gsd = sum((N - 1) * sd)/sum(N - n())) %>%
pivot_wider(names_from = Gender, values_from = c(mean, sd, N, gm, gsd))
# A tibble: 2 x 11
# Performance mean_Female mean_Male sd_Female sd_Male N_Female N_Male gm_Female gm_Male gsd_Female gsd_Male
# <chr> <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
#1 Performance-1 4 4.06 0.71 0.77 5 16 3.9 4.03 1.03 0.852
#2 Performance-2 3.8 4 0.84 0.82 5 16 3.9 4.03 1.03 0.852在这里,我们可以看到gm和gsd的值在各自的列中,并且是重复的。
第二种更接近预期输出的方法是分两步进行。
demo %>%
pivot_longer(cols = starts_with('Performance'),names_to = "Performance") %>%
select(Performance, value, Gender) %>%
filter(!is.na(Gender), Performance %in% c("Performance-1", "Performance-2")) %>%
group_by(Gender, Performance) %>%
summarise(mean = round(mean(value, na.rm=T),2),
sd = round(sd(value, na.rm=T),2),
N = sum(!is.na(value))) -> p
p %>%
pivot_wider(names_from = Gender, values_from = c(mean, sd, N)) %>%
bind_rows(p %>%
summarise(mean = sum(mean * N)/sum(N),
sd = sum((N - 1) * sd)/sum(N - n()),
Performance = 'Total') %>%
pivot_wider(names_from = Gender, values_from = c(mean, sd)))
# Performance mean_Female mean_Male sd_Female sd_Male N_Female N_Male
# <chr> <dbl> <dbl> <dbl> <dbl> <int> <int>
#1 Performance-1 4 4.06 0.71 0.77 5 16
#2 Performance-2 3.8 4 0.84 0.82 5 16
#3 Total 3.9 4.03 1.03 0.852 NA NA发布于 2020-05-16 15:16:30
这也许是一种方法,尽管使用expss进行计算,然后将输出转换为data.frame,我认为这实现了您正在寻找的目标。
library (expss)
library (dplyr)
demo %>%
tidyr::gather(key,value,-Gender) %>% #get long
tab_cells(value) %>% #variable used for calculations
tab_rows(key,total(label = "Grand mean") %>% #total gets grand total
tab_cols(Gender) %>% #variable for cols
tab_stat_fun(Mean =mean,SD = sd,N = w_n, method =list) %>% #calculations
tab_pivot()%>% #makes a table
data.frame() %>% # convert to df
select(c(1,2,5,3,6,4,7)) -> out #order cols
#tidy up names
colnames(out) <-gsub("Gender[.]","",colnames(out))
colnames(out)[1] <- "Performance"
out发布于 2020-05-16 22:59:00
经过多次尝试-错误和想法,我找到了一个解决方案,似乎是有效的。我仍然欢迎一种优雅的解决方案:
p2 <- demo %>% pivot_longer(cols = analysis_vars, names_to = "Performance") %>%
select(Performance, value, !!var) %>%
filter(!is.na(!!var), Performance %in% c("Performance-1", "Performance-2")) %>%
group_by(!!var, Performance) %>%
summarise(mean = round(mean(value, na.rm=T),2), sd = round(sd(value, na.rm=T),2), N = sum(!is.na(value))) %>%
mutate(gm = round(sum(mean*N)/sum(N),2),
gsd = round(sqrt(sum((N-1)*sd^2)/sum(N-5)),2)) %>%
pivot_wider(names_from = !!var, values_from = c(mean, sd, N, gm, gsd))
g <- p2 %>% select(matches("gm_|gsd_"))
n <- g %>% rename_all(funs(str_replace(., "gm_", "mean_"))) %>%
rename_all(funs(str_replace(., "gsd_", "sd_"))) %>%
summarise_all(mean, na.rm=T) %>%
add_column(Item = "Grand Mean/SD", .before = 1)
p2 <- p2 %>%
bind_rows(n) %>%
select(-starts_with("gm_"), -starts_with("gsd_"))到目前为止,这是我唯一能想到的办法。
我需要用这种方式将excel电子表格作为一个表,并提供相应的方法。
https://stackoverflow.com/questions/61829831
复制相似问题