文章/答案/技术大牛

发布

社区首页 >问答首页 >在dplyr中分组的众多变量之间的相关性

问在dplyr中分组的众多变量之间的相关性
EN

Stack Overflow用户

提问于 2018-12-13 15:13:13

回答 3查看 2.7K关注 0票数 5

假设我有一个数据框架，就像这样：

# Set RNG seed
set.seed(33550336)

# Create dummy data frame
df <- data.frame(PC1 = runif(20),
                 PC2 = runif(20),
                 PC3 = runif(20),
                 A = runif(20),
                 B = runif(20),
                 loc = sample(LETTERS[1:2], 20, replace = TRUE),
                 seas = sample(c("W", "S"), 20, replace = TRUE))

# > head(df)
#         PC1        PC2       PC3         A         B loc seas
# 1 0.8636470 0.02220823 0.7553348 0.4679607 0.0787467   A    S
# 2 0.3522257 0.42733152 0.2412971 0.6691419 0.1194121   A    W
# 3 0.5257408 0.44293320 0.3225228 0.0934192 0.2966507   B    S
# 4 0.0667227 0.90273594 0.6297959 0.1962124 0.4894373   A    W
# 5 0.3751383 0.50477920 0.6567203 0.4510632 0.4742191   B    S
# 6 0.9197086 0.32024904 0.8382138 0.9907894 0.9335657   A    S

我感兴趣的是计算PC1、PC2和PC3之间的相关性，以及按loc和seas分组的每个变量A和B。因此，例如，基于this answer，我可以执行以下操作：

# Correlation of variable A and PC1 per loc & seas combination
df %>% 
  group_by(loc, seas) %>% 
  summarise(cor = cor(PC1, A)) %>% 
  ungroup

# # A tibble: 4 x 3
#   loc   seas      cor
#   <fct> <fct>   <dbl>
# 1 A     S      0.458 
# 2 A     W      0.748 
# 3 B     S     -0.0178
# 4 B     W     -0.450

这给了我我想要的东西：PC1和A之间对于loc和seas的每一个组合的相关性。太棒了。

我所挣扎的是推断这一点，为PC*变量和其他变量的每一个组合执行计算(在本例中是A和B )。我的预期结果是上面的tibble，但是PC*和其他变量的每个组合都有一个列。我可以做这么长的手..。cor(PC2, A)、cor(PC3, A)、cor(PC1, B)等，但想必有一种简洁的计算编码方法。我怀疑它涉及到do，但我不能完全理解它.有人能启发我吗？

解决方案

我使用了G.Grotandieck的solution below，但这需要进行一些重组才能使其达到所需的格式。我已经张贴了我在这里使用的代码，以防它对其他人有用。

# Perform calculation
res <- by(df[1:5], df[-(1:5)], cor)

# Combinations of loc & seas 
comb <- expand.grid(dimnames(res))

#   loc seas
# 1   A    S
# 2   B    S
# 3   A    W
# 4   B    W

# A matrix corresponding to a loc & seas
# Plus the loc & seas themselves
restructure <- function(m, n){
  # Convert to data frame
  # Add rownames as column
  # Retains PCs as rows, but not columns
  # Gather variables to long format
  # Unite PC & variable names
  # Spread to a single row
  # Add combination of loc & seas
  m %>% 
    data.frame %>% 
    rownames_to_column() %>% 
    filter(grepl("PC", rownames(m))) %>% 
    select(-contains("PC")) %>% 
    gather(variable, value, -rowname) %>% 
    unite(comb, rowname, variable) %>% 
    spread(comb, value) %>% 
    bind_cols(n)
}

# Restructure each list element & combine into data frame
do.call(rbind, lapply(1:length(res), function(x)restructure(res[[x]], comb[x, ])))

这给了，

#         PC1_A       PC1_B      PC2_A       PC2_B      PC3_A     PC3_B loc seas
# 1  0.45763159 -0.00925106  0.3522161  0.20916667 -0.2003091 0.3741403   A    S
# 2 -0.01779813 -0.74328144 -0.3501188  0.46324158  0.8034240 0.4580262   B    S
# 3  0.74835455  0.49639477 -0.3994917 -0.05233889 -0.5902400 0.3606690   A    W
# 4 -0.45025181 -0.66721038 -0.9899521 -0.80989058  0.7606430 0.3738706   B    W

dplyr

correlation

回答 3

Stack Overflow用户

回答已采纳

发布于 2018-12-13 15:19:36

像这样使用by：

By <- by(df[1:5], df[-(1:5)], cor)

给予：

> By
loc: A
seas: S
            PC1        PC2        PC3          A           B
PC1  1.00000000 -0.3941583  0.1872622  0.4576316 -0.00925106
PC2 -0.39415826  1.0000000 -0.6797708  0.3522161  0.20916667
PC3  0.18726218 -0.6797708  1.0000000 -0.2003091  0.37414025
A    0.45763159  0.3522161 -0.2003091  1.0000000  0.57292305
B   -0.00925106  0.2091667  0.3741403  0.5729230  1.00000000
----------------------------------------------------------------------------------------------------------------------------- 
loc: B
seas: S
            PC1         PC2         PC3           A          B
PC1  1.00000000 -0.52651449  0.07120701 -0.01779813 -0.7432814
PC2 -0.52651449  1.00000000 -0.05448583 -0.35011878  0.4632416
PC3  0.07120701 -0.05448583  1.00000000  0.80342399  0.4580262
A   -0.01779813 -0.35011878  0.80342399  1.00000000  0.5558740
B   -0.74328144  0.46324158  0.45802622  0.55587404  1.0000000
----------------------------------------------------------------------------------------------------------------------------- 
loc: A
seas: W
           PC1         PC2        PC3          A           B
PC1  1.0000000 -0.79784422  0.0932317  0.7483545  0.49639477
PC2 -0.7978442  1.00000000 -0.3526315 -0.3994917 -0.05233889
PC3  0.0932317 -0.35263151  1.0000000 -0.5902400  0.36066898
A    0.7483545 -0.39949171 -0.5902400  1.0000000  0.18081316
B    0.4963948 -0.05233889  0.3606690  0.1808132  1.00000000
----------------------------------------------------------------------------------------------------------------------------- 
loc: B
seas: W
           PC1        PC2        PC3          A          B
PC1  1.0000000  0.3441459  0.1135686 -0.4502518 -0.6672104
PC2  0.3441459  1.0000000 -0.8447551 -0.9899521 -0.8098906
PC3  0.1135686 -0.8447551  1.0000000  0.7606430  0.3738706
A   -0.4502518 -0.9899521  0.7606430  1.0000000  0.8832408
B   -0.6672104 -0.8098906  0.3738706  0.8832408  1.0000000

已添加

根据对所需内容的进一步讨论，定义接受相关矩阵或数据帧(在后一种情况下)的onerow函数(在后一种情况下，它将前5列转换为相关矩阵)，产生一行输出。if语句在onerow中是不需要的，但不会对adply代码行造成伤害，但是我们已经包含了它，以便在下面的后续示例中onerow也能以简单的方式工作。

library(plyr)

onerow <- function(x) {
  if (is.data.frame(x)) x <- cor(x[1:5])
  dtab <- as.data.frame.table(x[4:5, 1:3])
  with(dtab, setNames(Freq, paste(Var2, Var1, sep = "_")))
}

adply(By, 1:2, onerow)

给予：

  loc seas       PC1_A       PC1_B      PC2_A       PC2_B      PC3_A     PC3_B
1   A    S  0.45763159 -0.00925106  0.3522161  0.20916667 -0.2003091 0.3741403
2   B    S -0.01779813 -0.74328144 -0.3501188  0.46324158  0.8034240 0.4580262
3   A    W  0.74835455  0.49639477 -0.3994917 -0.05233889 -0.5902400 0.3606690
4   B    W -0.45025181 -0.66721038 -0.9899521 -0.80989058  0.7606430 0.3738706

或者完全放弃by并使用它提供相同的输出：

library(plyr)
ddply(df, -(1:5), onerow)

或使用dplyr：

library(dplyr)
df %>%
  group_by_at(-(1:5)) %>%
  do( onerow(.) %>% t %>% as.data.frame ) %>%
  ungroup

票数 7

Stack Overflow用户

发布于 2018-12-13 15:30:21

这里有一个通过tidyverse的解决方案，在这里我们使用summarise_at来指定所有PC[0-9]并将它们与A关联。B的相同过程，然后简单地合并，即

library(tidyverse)

df %>% 
 group_by(loc, seas) %>% 
 summarise_at(vars(starts_with('PC')), funs(cor(., A))) %>% 
 left_join(., df %>% 
                 group_by(loc, seas) %>% 
                 summarise_at(vars(starts_with('PC')), funs(cor(., B))), 
          by = c('loc', 'seas'), suffix = c('.A', '.B'))

这给了，

A tibble: 4×8#组: loc loc海PC1.A PC2.A PC3.B PC2.B 1 A 0.458 0.352 -0.200 -0.00925 0.374 2 A W 0.748 -0.399 0.590 0.496 -0.0523 0.361 3 B 0.0178 -0.350 0。803 -0.743 0.463 0.458 4 B 0.450 -0.990 0.761 -0.667 -0.810 0.374

票数 3

Stack Overflow用户

发布于 2018-12-13 15:36:04

我们可以在split和cor中使用base R

lapply(split(df[1:5], df[-(1:5)]), cor)
#$A.S
#            PC1        PC2        PC3          A           B
#PC1  1.00000000 -0.3941583  0.1872622  0.4576316 -0.00925106
#PC2 -0.39415826  1.0000000 -0.6797708  0.3522161  0.20916667
#PC3  0.18726218 -0.6797708  1.0000000 -0.2003091  0.37414025
#A    0.45763159  0.3522161 -0.2003091  1.0000000  0.57292305
#B   -0.00925106  0.2091667  0.3741403  0.5729230  1.00000000

#$B.S
#            PC1         PC2         PC3           A          B
#PC1  1.00000000 -0.52651449  0.07120701 -0.01779813 -0.7432814
#PC2 -0.52651449  1.00000000 -0.05448583 -0.35011878  0.4632416
#PC3  0.07120701 -0.05448583  1.00000000  0.80342399  0.4580262
#A   -0.01779813 -0.35011878  0.80342399  1.00000000  0.5558740
#B   -0.74328144  0.46324158  0.45802622  0.55587404  1.0000000

#$A.W
#           PC1         PC2        PC3          A           B
#PC1  1.0000000 -0.79784422  0.0932317  0.7483545  0.49639477
#PC2 -0.7978442  1.00000000 -0.3526315 -0.3994917 -0.05233889
#PC3  0.0932317 -0.35263151  1.0000000 -0.5902400  0.36066898
#A    0.7483545 -0.39949171 -0.5902400  1.0000000  0.18081316
#B    0.4963948 -0.05233889  0.3606690  0.1808132  1.00000000

#$B.W
#           PC1        PC2        PC3          A          B
#PC1  1.0000000  0.3441459  0.1135686 -0.4502518 -0.6672104
#PC2  0.3441459  1.0000000 -0.8447551 -0.9899521 -0.8098906
#PC3  0.1135686 -0.8447551  1.0000000  0.7606430  0.3738706
#A   -0.4502518 -0.9899521  0.7606430  1.0000000  0.8832408
#B   -0.6672104 -0.8098906  0.3738706  0.8832408  1.0000000

或者使用tidyverse

library(tidyverse)
df %>% 
    group_by_at(6:7) %>% 
    nest %>% 
    mutate(data = map(data, cor))

票数 3

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/53764842

复制

相似问题

问在dplyr中分组的众多变量之间的相关性
EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问在dplyr中分组的众多变量之间的相关性EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问在dplyr中分组的众多变量之间的相关性
EN