我试图使用这个R脚本从NCBI获取一些信息:
require(rentrez)
require(magrittr)
# Example rs identifiers: a single SNP and a vector of twelve SNPs.
rs <- "rs16891982"
rss <- c(
  "rs16891982", "rs12203592", "rs1408799", "rs10756819",
  "rs35264875", "rs1393350", "rs12821256", "rs17128291",
  "rs1800407", "rs12913832", "rs1805008", "rs4911414"
)
#' Look up dbSNP summary information for one rs number.
#'
#' Searches the NCBI "snp" database for `rs`, requests the summary record for
#' the returned ids and condenses it into a data frame.
#'
#' @param rs An rs identifier used as the search term, e.g. "rs16891982".
#' @return A data.frame built from `snp` (the query), `chrpos`, `EA`, `fEA`
#'   and `genes`; or `invisible(NULL)`, with a warning, when nothing is found.
annotateGeneName <- function(rs) {
  ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]

  # Stop before asking for summaries: an empty id vector means the search
  # matched nothing, so there is no record to summarise.
  if (length(ids) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }

  anno <- rentrez::entrez_summary(db = "snp", id = ids)
  if (length(anno) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }

  # There might be multiple entries: if "snp_id" is not among the names, the
  # search returned several SNPs, so just take the first hit.
  if (!"snp_id" %in% names(anno)) {
    anno <- anno[[1]]
  }

  chrpos <- anno[["chrpos"]]
  # Keep only the part of `allele_origin` before the first "(".
  EA <- gsub("\\(.*", "", anno$allele_origin)
  # From `global_maf`, drop everything from the first "/" onwards, then
  # everything up to and including the last remaining "=".
  fEA <- gsub("^.*=", "", gsub("/.*", "", anno$global_maf))
  genes <- dplyr::first(anno$genes, default = NA)

  data.frame(snp = rs, chrpos = chrpos, EA = EA, fEA = fEA, genes = genes)
}
#' Annotate several rs numbers at once.
#'
#' Calls `annotateGeneName()` on every element of `rss` and row-binds the
#' individual results.
#'
#' @param rss A vector of rs identifiers.
#' @return The result of `rbind()`-ing all per-SNP results together.
annotateGeneNames <- function(rss) {
  per_snp <- lapply(rss, annotateGeneName)
  do.call(rbind, per_snp)
}
# Fetch the XML record of the first dbSNP hit for `rs` and convert it into a
# nested R list for inspection.
ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]
x <- rentrez::entrez_fetch(db = "snp", id = ids[1], rettype = "xml")
# xmlParse() and xmlToList() come from the XML package, which this script
# never attaches, so they are called through their namespace.
snp1xml <- XML::xmlParse(x)
snp1list <- XML::xmlToList(snp1xml)
print(snp1list)
打印结果时,可以看到以下内容:
...
$Rs$Sequence$.attrs
exemplarSs ancestralAllele
"285153617" "C,C,C,C,C,C"
$Rs$Ss$.attrs
ssId handle batchId locSnpId subSnpClass orient
"23456916" "PERLEGEN" "12309" "afd3693051" "snp" "forward"
strand molType buildId methodClass validated
"bottom" "genomic" "123" "hybridize" "by-cluster"
$Rs$Ss$.attrs
ssId handle
"28510204" "MGC_GENOME_DIFF"
batchId locSnpId
"12314" "BC064405x37550355-C16403799G"
subSnpClass orient
"snp" "forward"
strand molType
"bottom" "cDNA"
buildId methodClass
"126" "computed"
$Rs$Ss
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "TTCCCTTTCATTTTCCAGAGAAACTTGATCAGGAACCCACTGATTCCAAGAGCAAAGTAATCAGTGAGGAAATGACACCTAGAATTCATGATGAAAAAAGGATGCTTTATATGGTCCTTTTTAAGGTGATAGTTTTTCCTGACGTCCATAGATTTATTAAGAATCTGGTATTTTAAACAGTAGGAAATACACATAGAAATATCAAATCCAAGTTGTGCTAGACCAGAAACTTTTAGAAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Ss$.attrs
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "AAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc mrnaVer protAcc protVer
"51151" "SLC45A2" "NM_016180" "4" "NP_057264" "3"
fxnClass readingFrame allele residue aaPosition
"reference" "3" "C" "F" "373"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc
"51151" "SLC45A2" "NM_016180"
mrnaVer protAcc protVer
"4" "NP_057264" "3"
fxnClass readingFrame allele
"missense" "3" "G"
residue aaPosition soTerm
"L" "373" "non_synonymous_codon"
这个列表中有很多.attrs条目,它们经常是重复的。还有其他重复的条目,例如:
$Rs$Ss$Sequence$Seq5
$Rs$Assembly$Component$MapLoc$FxnSet等。
.attrs是什么意思,我如何理解这些数据?我不知道你怎么能在一个列表中有两个同名条目。
发布于 2015-04-08 15:47:08
在R中,attributes和attr是用于设置或提取属性的函数,但据我所知,“.attrs”只是一个列表元素的名称。它的含义本质上取决于作者的设定,这里的作者指的是编写那段解析XML并将其转换为R列表的代码的人。它不是R语言定义的一部分,所以请阅读相应包的文档。
我现在明白了,您是被具有相同名称的列表项所困扰。这在R中是可能的:"["和"[["按名称索引时,只会检索树中与该名称匹配的第一项。要访问其余同名项,需要使用数字索引,或者借助lapply或sapply遍历树的上层节点,以避免歧义。
> mylist=vector("list", length=2)
> mylist
[[1]]
NULL
[[2]]
NULL
> names(mylist) <- c("a","a")
> mylist
$a
NULL
$a
NULL
> mylist[['a']]
NULL
> mylist['a']
$a
NULL
> lapply( mylist , "[[", "a")
$a
NULL
$a
NULL
(我也没有看到这两个函数定义在提取和处理数据的过程中被用到。)
https://stackoverflow.com/questions/29518849
复制相似问题