首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧

R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧
EN

Stack Overflow用户
提问于 2022-02-18 06:47:26
回答 1查看 70关注 0票数 1

我在chr16_bam文件夹中有一个bam文件列表,还有一个注释文件sgseq_sam.txt

代码语言:javascript
复制
library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(org.Hs.eg.db)
library(tidyverse)
library(AnnotationDbi)
library(dplyr)
library(stringr)


# Rename the "file_bam" column values to the full path where the BAMs are stored
setwd("C:/Users/User/Downloads/")
bamPath = "C:/Users/User/Downloads/chr16_bam"
samFile <- read.delim("C:/Users/User/Downloads/sgseq_sam.txt", header=T) 
samFile <- samFile %>% 
  mutate(file_bam = paste0(bamPath, file_bam))

# Save the TxDb.Hsapiens.UCSC.hg19.knownGene object into a variable.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- keepSeqlevels(txdb, "chr16")
seqlevelsStyle(txdb) <- "NCBI"

# Read in the gene list from Supplementary Table 1
gene.list <- read.table("Table_1_Differential Expression Analysis Revealing CLCA1 to Be a Prognostic and Diagnostic Biomarker for Colorectal Cancer.xls", header=T)

# Convert the gene symbols to Entrez IDs and remove the genes that don't map to an Entrez ID.
entrez = mapIds(org.Hs.eg.db, keys=gene.list$Name, column = "ENTREZID", keytype="SYMBOL")

gene.list <- gene.list %>% 
  mutate(entrez) %>% 
  filter(!is.na(entrez))

# Convert the TxDb.Hsapiens.UCSC.hg19.knownGene object
txf_ucsc <- convertToTxFeatures(txdb)

# Cast this object to a dataframe and save as another variable name
txf_df <- as.data.frame(txf_ucsc)

# Subset by the Entrez IDs
txf_df <- txf_df %>% filter(geneName %in% gene.list$entrez)

# Find the number of common transcripts
unique <- unique(txf_df$geneName)
gene.list <- gene.list[gene.list$entrez %in% unique,]
nrow(gene.list)

# Cast the seqnames column to factor
txf_df$seqnames <- as.factor(as.character(txf_df$seqnames))

# Remove the "chr" prefix from the seqnames column 
txf_df %>% 
  rename_all(~stringr::str_replace(.,"^chr",""))

# Recast this dataframe back to a GRanges object
txf_grange <- makeGRangesFromDataFrame(txf_df, keep.extra.columns=T)

现在,我想为每个基因创建一个循环,在迭代时,子集Grange对象在txf_grange中仅由该基因,使用reduce函数将基因的范围折叠成一个单一的向量,运行analyzeFeatures,然后运行annotate函数,最后是plotFeatures

代码语言:javascript
复制
for (i in txf_grange$geneName) {
  if (i=="343") {
    next
  }
  else{
    # For each of the 15 genes, subset the Granges objects by only the gene
    grange.subset <- txf_grange[i == toString(i)]
    # Collapse the ranges of the genes into a single vector
    grange.subset <- unlist(IRanges::reduce((split(grange.subset, grange.subset$geneName))))
    # Run analyzeFeatures
    for (j in 1:dim(samFile)[[1]]) {
      si <- samFile
      for (k in si) {
        for (l in list.files(path="C:/Users/User/Downloads/chr16_bam", pattern=".bam$", all.files=F, full.names=F)) {
          sgfc_pred <- analyzeFeatures(k, which=grange.subset, features=txf_ucsc, predict=T)
          # Annotate predicted features
          sgfc_pred <- annotate(sgfc_pred, txf_ucsc)
          # Plot features
          pdf(paste("plot", l, ".pdf", sep=""))
          plotFeatures(sgfc_pred, geneID=1)
          dev.off()
        }
      }
    }    
  }
}

数据

代码语言:javascript
复制
> dput(head(txf_grange))
new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = "16", class = "factor"), 
    lengths = 6L, elementMetadata = NULL, metadata = list()), 
    ranges = new("IRanges", start = c(12058964L, 12059311L, 12059311L, 
    12060052L, 12060198L, 12060198L), width = c(348L, 742L, 2117L, 
    147L, 680L, 1230L), NAMES = NULL, elementType = "ANY", elementMetadata = NULL, 
        metadata = list()), strand = new("Rle", values = structure(1L, .Label = c("+", 
    "-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = "16", 
        seqlengths = NA_integer_, is_circular = NA, genome = NA_character_), 
    elementMetadata = new("DFrame", rownames = NULL, nrows = 6L, 
        listData = list(type = structure(c(3L, 1L, 1L, 2L, 1L, 
        1L), .Label = c("J", "I", "F", "L", "U"), class = "factor"), 
            txName = structure(list(c("uc002dbv.3", "uc010buy.3", 
            "uc010buz.3"), c("uc002dbv.3", "uc010buy.3"), "uc010buz.3", 
                c("uc002dbv.3", "uc010buy.3"), "uc010buy.3", 
                "uc002dbv.3"), class = "AsIs"), geneName = structure(list(
                "608", "608", "608", "608", "608", "608"), class = "AsIs")), 
        elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    elementType = "ANY", metadata = list())
EN

回答 1

Stack Overflow用户

发布于 2022-02-18 21:22:14

变更线:

代码语言:javascript
复制
analyzeFeatures(samFile, grange.subset)

此外,您也不需要那么多循环来运行这个问题。这个问题需要14幅图,你可以用你的循环数来绘制更多的图。

票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/71169383

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档