文章/答案/技术大牛

发布

社区首页 >问答首页 >R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧

问R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧
EN

Stack Overflow用户

提问于 2022-02-18 06:47:26

回答 1查看 70关注 0票数 1

我在chr16_bam文件夹中有一个bam文件列表，还有一个注释文件sgseq_sam.txt。

library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(org.Hs.eg.db)
library(tidyverse)
library(AnnotationDbi)
library(dplyr)
library(stringr)


# Rename the "file_bam" column values to the full path where the BAMs are stored
setwd("C:/Users/User/Downloads/")
bamPath = "C:/Users/User/Downloads/chr16_bam"
samFile <- read.delim("C:/Users/User/Downloads/sgseq_sam.txt", header=T) 
samFile <- samFile %>% 
  mutate(file_bam = paste0(bamPath, file_bam))

# Save the TxDb.Hsapiens.UCSC.hg19.knownGene object into a variable.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- keepSeqlevels(txdb, "chr16")
seqlevelsStyle(txdb) <- "NCBI"

# Read in the gene list from Supplementary Table 1
gene.list <- read.table("Table_1_Differential Expression Analysis Revealing CLCA1 to Be a Prognostic and Diagnostic Biomarker for Colorectal Cancer.xls", header=T)

# Convert the gene symbols to Entrez IDs and remove the genes that don't map to an Entrez ID.
entrez = mapIds(org.Hs.eg.db, keys=gene.list$Name, column = "ENTREZID", keytype="SYMBOL")

gene.list <- gene.list %>% 
  mutate(entrez) %>% 
  filter(!is.na(entrez))

# Convert the TxDb.Hsapiens.UCSC.hg19.knownGene object
txf_ucsc <- convertToTxFeatures(txdb)

# Cast this object to a dataframe and save as another variable name
txf_df <- as.data.frame(txf_ucsc)

# Subset by the Entrez IDs
txf_df <- txf_df %>% filter(geneName %in% gene.list$entrez)

# Find the number of common transcripts
unique <- unique(txf_df$geneName)
gene.list <- gene.list[gene.list$entrez %in% unique,]
nrow(gene.list)

# Cast the seqnames column to factor
txf_df$seqnames <- as.factor(as.character(txf_df$seqnames))

# Remove the "chr" prefix from the seqnames column 
txf_df %>% 
  rename_all(~stringr::str_replace(.,"^chr",""))

# Recast this dataframe back to a GRanges object
txf_grange <- makeGRangesFromDataFrame(txf_df, keep.extra.columns=T)

现在，我想为每个基因创建一个循环，在迭代时，子集Grange对象在txf_grange中仅由该基因，使用reduce函数将基因的范围折叠成一个单一的向量，运行analyzeFeatures，然后运行annotate函数，最后是plotFeatures。

for (i in txf_grange$geneName) {
  if (i=="343") {
    next
  }
  else{
    # For each of the 15 genes, subset the Granges objects by only the gene
    grange.subset <- txf_grange[i == toString(i)]
    # Collapse the ranges of the genes into a single vector
    grange.subset <- unlist(IRanges::reduce((split(grange.subset, grange.subset$geneName))))
    # Run analyzeFeatures
    for (j in 1:dim(samFile)[[1]]) {
      si <- samFile
      for (k in si) {
        for (l in list.files(path="C:/Users/User/Downloads/chr16_bam", pattern=".bam$", all.files=F, full.names=F)) {
          sgfc_pred <- analyzeFeatures(k, which=grange.subset, features=txf_ucsc, predict=T)
          # Annotate predicted features
          sgfc_pred <- annotate(sgfc_pred, txf_ucsc)
          # Plot features
          pdf(paste("plot", l, ".pdf", sep=""))
          plotFeatures(sgfc_pred, geneID=1)
          dev.off()
        }
      }
    }    
  }
}

数据

> dput(head(txf_grange))
new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = "16", class = "factor"), 
    lengths = 6L, elementMetadata = NULL, metadata = list()), 
    ranges = new("IRanges", start = c(12058964L, 12059311L, 12059311L, 
    12060052L, 12060198L, 12060198L), width = c(348L, 742L, 2117L, 
    147L, 680L, 1230L), NAMES = NULL, elementType = "ANY", elementMetadata = NULL, 
        metadata = list()), strand = new("Rle", values = structure(1L, .Label = c("+", 
    "-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = "16", 
        seqlengths = NA_integer_, is_circular = NA, genome = NA_character_), 
    elementMetadata = new("DFrame", rownames = NULL, nrows = 6L, 
        listData = list(type = structure(c(3L, 1L, 1L, 2L, 1L, 
        1L), .Label = c("J", "I", "F", "L", "U"), class = "factor"), 
            txName = structure(list(c("uc002dbv.3", "uc010buy.3", 
            "uc010buz.3"), c("uc002dbv.3", "uc010buy.3"), "uc010buz.3", 
                c("uc002dbv.3", "uc010buy.3"), "uc010buy.3", 
                "uc002dbv.3"), class = "AsIs"), geneName = structure(list(
                "608", "608", "608", "608", "608", "608"), class = "AsIs")), 
        elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    elementType = "ANY", metadata = list())

bioinformatics

回答 1

Stack Overflow用户

发布于 2022-02-18 21:22:14

变更线：

analyzeFeatures(samFile, grange.subset)

此外，您也不需要那么多循环来运行这个问题。这个问题需要14幅图，你可以用你的循环数来绘制更多的图。

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/71169383

复制

相似问题

问R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问R:错误: sample_info必须是列sample_name、file_bam、paired_end、read_length、frag_length、lib_size的数据帧
EN