问如何从重叠的基因列表中提取基因名称？
EN

Stack Overflow用户

提问于 2019-02-03 20:36:10

回答 1查看 174关注 0票数 0

我目前有一个脚本，用于分析不同基因列表之间的基因重叠。代码返回一个 8×8 的矩阵，其中每个元素是对应两个列表之间重叠基因的数量（即两个列表共有的基因数）。有没有办法查看这些具体的重叠基因，并得到它们的基因符号？

#-------------------------------------------------------------------------------
# Set the working directory and load the data files
#-------------------------------------------------------------------------------
# NOTE: the remainder of the script reads its input files relative to this
# directory, so the working directory is set once here.
setwd("~/Desktop/R_Project/Gene_overlap")
getwd()

# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end of the name so only files ending in ".txt" are listed.
files <- list.files(pattern = "\\.txt$", full.names = TRUE)
files

# Read every file into a character vector of gene symbols
# (scan() splits on whitespace by default).
data.list <- lapply(files, function(fil) {
  scan(file = fil, what = character())
})

# Label each gene list with its file name minus the ".txt" extension.
# Base sub() is used so that no pipe operator needs to be attached.
names(data.list) <- sub("\\.txt$", "", basename(files))

str(data.list)
# List of 8
# $ GSE108363_BCGdown_D: chr [1:350] "IL1B" "IL6" "IL1A" "CCL20" ...
# $ GSE108363_BCGdown_V: chr [1:267] "IL6" "CCL20" "IL1A" "CXCL5" ...
# $ GSE108363_BCGup_D  : chr [1:250] "FABP4" "CMTM2" "FUCA1" "CD36" ...
# $ GSE108363_BCGup_V  : chr [1:429] "FCN1" "FCGR3B" "MNDA" "CPVL" ...
# $ GSE108363_MTBdown_D: chr [1:86] "CCL20" "IL1B" "IL1A" "IL6" ...
# $ GSE108363_MTBdown_V: chr [1:244] "IL1B" "IL1A" "CCL20" "IL6" ...
# $ GSE108363_MTBup_D  : chr [1:128] "FUCA1" "FGL2" "TGFBI" "CPVL" ...
# $ GSE108363_MTBup_V  : chr [1:286] "FABP4" "RNASE1" "MNDA" "CPVL" ...

# Number of genes shared by the first two lists.
length(intersect(data.list$GSE108363_BCGdown_D, data.list$GSE108363_BCGdown_V))

# Number of genes in each list.
lengths(data.list)



#-------------------------------------------------------------------------------
# Gene-list files to compare (paths are relative to the working directory)
#-------------------------------------------------------------------------------

data.file1 <- "GSE108363_BCGdown_D.txt"
data.file2 <- "GSE108363_BCGdown_V.txt"
data.file3 <- "GSE108363_BCGup_D.txt"
data.file4 <- "GSE108363_BCGup_V.txt"
data.file5 <- "GSE108363_MTBdown_D.txt"
data.file6 <- "GSE108363_MTBdown_V.txt"
data.file7 <- "GSE108363_MTBup_D.txt"
data.file8 <- "GSE108363_MTBup_V.txt"

filelist <- list(data.file1, data.file2, data.file3, data.file4,
                 data.file5, data.file6, data.file7, data.file8)

# Check that every file is present BEFORE reading anything, so that a missing
# file is reported by name. (Checking after the scan() calls, as before, could
# never report a problem because scan() would already have failed.)
file.found <- vapply(filelist, file.exists, logical(1))
if (!all(file.found)) {
  stop(
    "Missing gene-list file(s): ",
    paste(unlist(filelist[!file.found]), collapse = ", "),
    call. = FALSE
  )
}

# Read each file as one entry per line (sep = "\n").
genevect1 <- scan(data.file1, what = character(), sep = "\n")
genevect2 <- scan(data.file2, what = character(), sep = "\n")
genevect3 <- scan(data.file3, what = character(), sep = "\n")
genevect4 <- scan(data.file4, what = character(), sep = "\n")
genevect5 <- scan(data.file5, what = character(), sep = "\n")
genevect6 <- scan(data.file6, what = character(), sep = "\n")
genevect7 <- scan(data.file7, what = character(), sep = "\n")
genevect8 <- scan(data.file8, what = character(), sep = "\n")

#-------------------------------------------------------------------------------
# Load every gene list into memory
#-------------------------------------------------------------------------------

# One character vector per entry of `filelist`, in the same order; each path
# is handed to scan() as its `file` argument.
gene.lists <- lapply(X = filelist, FUN = scan, what = character())


#-------------------------------------------------------------------------------
# set up empty matrix for overlaps
#-------------------------------------------------------------------------------
# Row/column labels: the file names without their ".txt" extension. The dot is
# escaped so that only a literal ".txt" suffix is stripped, and base sub() is
# used so that no pipe operator needs to be attached.
list.labels <- sub("\\.txt$", "", vapply(filelist, basename, character(1)))

# Square matrix with one row and one column per gene list, filled with NA
# until the overlap counts are computed.
mx <- matrix(
  NA,
  nrow = length(gene.lists),
  ncol = length(gene.lists),
  dimnames = list(list.labels, list.labels)
)
mx

# Working copy that will receive the overlap counts.
mx.overlap.count <- mx

#-------------------------------------------------------------------------------
# Pairwise overlap counts
#-------------------------------------------------------------------------------
# Fill the matrix one column at a time: column k holds, for every gene list,
# the number of genes it shares with gene list k.
for (col.idx in seq_along(gene.lists)) {
  reference.genes <- gene.lists[[col.idx]]
  mx.overlap.count[, col.idx] <- vapply(
    gene.lists,
    function(other.genes) length(intersect(reference.genes, other.genes)),
    integer(1)
  )
}

# Show the counts: as a matrix, as a flat numeric vector, and in the viewer.
mx.overlap.count
round(as.numeric(mx.overlap.count), digits = 1)
View(mx.overlap.count)

目前，这段代码只返回重叠基因的数量。不过，我希望为每一对被比较的基因列表生成一个列表（或类似的结构），这样就能确切地看到哪些基因是两者共有的。

回答 1

Stack Overflow用户

回答已采纳

发布于 2019-02-03 23:50:30

我会通过遍历所有两两组合（对比矩阵 contrasts 的每一列）来解决这个问题：像您一样把交集的长度存储在矩阵中，同时把共有的基因名称存储在列表 genes.overlap 中。如下所示：

# Load files. `pattern` is a regular expression, so escape the dot and anchor
# the match at the end of the name to list only files ending in ".txt".
file_names <- list.files(pattern = "\\.txt$")

# Extract gene lists: one character vector of gene symbols per file.
gene.lists <- lapply(file_names, function(f) {
  scan(file = f, what = character())
})

# Name the entries in the list.
names(gene.lists) <- file_names
names(gene.lists)

# Compute every pairwise overlap between gene lists.
#
# Args:
#   gene.lists: list of character vectors, one per gene list. Names are used
#     as labels; an unnamed list is labelled "1", "2", ...
#
# Returns a list with two elements:
#   genes:  list with one entry per pair of gene lists, named
#           "<label1>_<label2>", holding the genes common to that pair.
#   counts: square integer matrix with one row/column per gene list;
#           cell [a, b] is the number of genes shared by lists a and b, and
#           the diagonal holds the number of distinct genes in each list.
compute_gene_overlaps <- function(gene.lists) {
  nlists <- length(gene.lists)
  labels <- names(gene.lists)
  if (is.null(labels)) {
    labels <- as.character(seq_len(nlists))
  }

  counts <- matrix(
    NA_integer_,
    nrow = nlists,
    ncol = nlists,
    dimnames = list(labels, labels)
  )
  for (k in seq_len(nlists)) {
    counts[k, k] <- length(unique(gene.lists[[k]]))
  }

  # combn() needs at least two lists; with fewer there is nothing to compare.
  if (nlists < 2) {
    return(list(genes = list(), counts = counts))
  }

  # Each column of `contrasts` is one pair of list indices to compare.
  contrasts <- combn(nlists, 2)
  genes <- vector("list", ncol(contrasts))
  comparison_names <- character(ncol(contrasts))

  for (i in seq_len(ncol(contrasts))) {
    list1 <- contrasts[1, i]
    list2 <- contrasts[2, i]
    shared <- intersect(gene.lists[[list1]], gene.lists[[list2]])
    genes[[i]] <- shared
    comparison_names[i] <- paste(labels[list1], labels[list2], sep = "_")
    # The overlap is symmetric, so fill both triangles of the matrix.
    counts[list1, list2] <- length(shared)
    counts[list2, list1] <- length(shared)
  }
  names(genes) <- comparison_names

  list(genes = genes, counts = counts)
}

# Run the comparison and expose the results under the usual names.
nfiles <- length(gene.lists)
overlap <- compute_gene_overlaps(gene.lists)
genes.overlap <- overlap$genes
mx.overlap.count <- overlap$counts

# All available comparisons:
names(genes.overlap)

# You can index into a list like a df with the $ operator.
# "List.txt_List1.txt" is a placeholder: substitute one of the names printed
# above (a name that does not exist simply yields NULL).
genes.overlap$List.txt_List1.txt

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/54502894

复制

相似问题

问如何从重叠的基因列表中提取基因名称？
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何从重叠的基因列表中提取基因名称？EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何从重叠的基因列表中提取基因名称？
EN