我已经创建了一个空的数据框,如下面的代码所示。我正在尝试从各个样本中提取表达量数据,并用它来填充这个数据框。
此外,如果某个样本不包含数据框中特定 mRNA 的表达量,我该如何在对应的单元格中填入 0 而不是 NA?
任何帮助都将不胜感激!
示例代码:
# Three example samples. Each row is one measured molecule (protein or mRNA).
# Note that `expression` is stored as character strings, not as numbers.
sample_8765 <- data.frame(
  type = c("protein", "mRNA", "mRNA", "mRNA", "protein"),
  name = c("DIABLO", "X1345", "X1234", "F1234", "p53"),
  expression = c("1.23", "4.265", "3.44", "6.43", "8.00")
)

sample_8901 <- data.frame(
  type = c("protein", "mRNA", "protein", "mRNA", "protein"),
  name = c("DIABLO", "X1345", "SMAC", "F4657", "MDM2"),
  expression = c("3.24", "5.33", "4.35", "6.78", "9.11")
)

sample_9084 <- data.frame(
  type = c("protein", "protein", "mRNA", "mRNA", "mRNA"),
  name = c("SMAC", "DIABLO", "X1345", "F4657", "X1234"),
  expression = c("2.56", "8.11", "4.35", "6.78", "9.11")
)
library(dplyr) # bind_rows() and %>%

sample_ID <- c("sample_8765", "sample_8901", "sample_9084") # unique samples

# Separate by type: stack the samples (tagging each row with its sample name
# in `Sample_ID`), then split into one data frame per molecule type.
samples <- bind_rows(mget(sample_ID), .id = "Sample_ID") %>%
  split(.$type)
mRNA_samples <- samples$mRNA
mRNAs <- unique(mRNA_samples$name) # unique mRNAs

# Create an empty (all-NA) matrix with one row per mRNA and one column per
# sample; the dimensions follow the data instead of being hard-coded.
mRNA_df <- matrix(nrow = length(mRNAs), ncol = length(sample_ID))
rownames(mRNA_df) <- mRNAs
colnames(mRNA_df) = sample_ID
我希望看到的输出是这样的:
# sample_8765 sample_8901 sample_9084
#X1345 4.265 5.33 4.35
#X1234 3.44 0 9.11
#F1234 6.43 0 0
#F4657 0 6.78 6.78
发布于 2021-05-05 03:43:16
请注意,您的“数字”实际上是字符串。像 max 这样的许多函数仍然可以运行,但它们是按字典序(而不是按数值大小)比较字符串的。例如,20 > 3 为 TRUE,而 "20" > "3" 为 FALSE。
tidyverse
如果您的数据确实是数值类型(而不是字符串),请根据其为双精度数还是整数,将下面的填充值改为 values_fill = list(expression = 0) 或 values_fill = list(expression = 0L)。
library(dplyr)
library(tidyr) # pivot_wider

# Stack the three samples, storing each row's sample name in `id`
# (lst() names the list elements after the variables passed to it).
bind_rows(lst(sample_8765, sample_8901, sample_9084), .id = "id") %>%
  select(name, id, expression) %>%
  # One row per `name`, one column per sample. `id_cols` is given by name
  # because passing it positionally is deprecated in current tidyr.
  # The fill value is the string "0" because `expression` is character.
  pivot_wider(
    id_cols = name,
    names_from = "id",
    values_from = "expression",
    values_fill = list(expression = "0")
  )
# # A tibble: 8 x 4
# name sample_8765 sample_8901 sample_9084
# <chr> <chr> <chr> <chr>
# 1 DIABLO 1.23 3.24 8.11
# 2 X1345 4.265 5.33 4.35
# 3 X1234 3.44 0 9.11
# 4 F1234 6.43 0 0
# 5 p53 8.00 0 0
# 6 SMAC 0 4.35 2.56
# 7 F4657 0 6.78 6.78
# 8 MDM2 0 9.11 0
data.table
library(data.table) # rbindlist, dcast

# Stack the samples. mget() returns a list named after the variables, and
# idcol = "id" stores those names in a new `id` column.
sample_ids <- c("sample_8765", "sample_8901", "sample_9084")
dat <- rbindlist(mget(sample_ids), idcol = "id")

# Reshape to one row per name and one column per sample. `expression` is
# character, so max() compares strings lexicographically and the fill value
# is given as the string "0" to match the column type.
dcast(dat, name ~ id, fun.aggregate = max, value.var = "expression", fill = "0")
# name sample_8765 sample_8901 sample_9084
# <char> <char> <char> <char>
# 1: DIABLO 1.23 3.24 8.11
# 2: F1234 6.43 0 0
# 3: F4657 0 6.78 6.78
# 4: MDM2 0 9.11 0
# 5: SMAC 0 4.35 2.56
# 6: X1234 3.44 0 9.11
# 7: X1345 4.265 5.33 4.35
# 8: p53 8.00 0 0
发布于 2021-05-05 03:46:46
这里有一个非常tidyverse的方法。
library(tidyverse)

tibble(sample_id = c("sample_8765", "sample_8901", "sample_9084")) %>%
  # Look up each sample's data frame by name and keep it in a list-column
  mutate(data = map(sample_id, get)) %>%
  unnest(data) %>%
  filter(type == "mRNA") %>%
  select(-type) %>%
  # `expression` is stored as text; convert it before reshaping
  mutate(expression = as.numeric(expression)) %>%
  pivot_wider(names_from = "sample_id", values_from = "expression") %>%
  # mRNAs absent from a sample come out of pivot_wider() as NA; set them to 0.
  # A lambda is used because passing extra arguments through across()'s `...`
  # is deprecated in dplyr >= 1.1.0.
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))
# A tibble: 4 x 4
name sample_8765 sample_8901 sample_9084
<chr> <dbl> <dbl> <dbl>
1 X1345 4.26 5.33 4.35
2 X1234 3.44 0 9.11
3 F1234 6.43 0 0
4 F4657 0 6.78 6.78
发布于 2021-05-05 04:12:43
我们可以使用 base R 中的 xtabs 来实现:
xtabs(expression ~ name + id, data = transform(do.call(rbind,
Map(cbind, id = sample_ID, mget(sample_ID)))[c('id', 'name', 'expression')],
expression = as.numeric(expression)))
或者,只针对观测值的一个子集:
xtabs(expression ~ name + id, data = transform(subset(do.call(rbind,
Map(cbind, id = sample_ID, mget(sample_ID))),
type == 'mRNA')[c('id', 'name', 'expression')],
expression = as.numeric(expression)))
输出:
# id
#name sample_8765 sample_8901 sample_9084
# F1234 6.430 0.000 0.000
# F4657 0.000 6.780 6.780
# X1234 3.440 0.000 9.110
# X1345 4.265 5.330 4.350
https://stackoverflow.com/questions/67391008
复制相似问题