我加载一个json格式文件。示例数据sampleData(并非所有变量都包含在示例中):
[[1]]
[[1]]$id
[1] "000018ac-04ef-4270-81e6-9e3cb8274d31"
[[1]]$currentCompany
[1] ""
[[1]]$skills
list()
[[2]]
[[2]]$id
[1] "00000259-7c1c-4db6-9a2a-6d450626fbac"
[[2]]$currentCompany
[1] "Super Market 2"
[[2]]$skills
[[2]]$skills[[1]]
[[2]]$skills[[1]]$name
[1] "OpenGL"
[2]]$skills
[[2]]$skills[[1]]
[[2]]$skills[[1]]$type
[1] "link"
[2]]$skills
[[2]]$skills[[2]]
[[2]]$skills[[2]]$name
[1] "C"
[2]]$skills
[[2]]$skills[[2]]
[[2]]$skills[[2]]$type
[1] "link"因此,我想从这个文件中提取出列表中的技能。我使用以下代码:
skill <- list()
for (i in 1:length(sampleData){
skill[i][1] <- 'empty'
for (j in 1:length(sampleData[[i]]$skills)){
if (length(sampleData[[i]]$skills)==0){
skill[[i]][j] <- NA
}else{
skill[[i]][j] <- json[[i]]$skills[[j]]$name
}
}
}结果:
[[1]]
[1] NA
[[2]]
[1] "OpenGL" "C" 结果没问题。我的问题是有什么办法使它更有效率或更优雅吗?例如,不使用循环?此外,我使用了这行代码skill[i][1] <- 'empty'原因,而没有在错误内部初始化弹出的ups Error intmp[[i]] : subscript out of bounds。有什么可以避免这种转储初始化的吗?
发布于 2015-11-30 13:48:51
这个怎么样:
sampleData <-
list(
list(
id = "000018ac-04ef-4270-81e6-9e3cb8274d31",
curruentCompany = "",
skills = list()
),
list(
id = "00000259-7c1c-4db6-9a2a-6d450626fbac",
curruentCompany = "",
skills = list(
list( name = "OpenGL" ),
list( name = "C" )
)
)
)
A <- unlist(sampleData,recursive=FALSE)
n <- which(names(A)=="skills")
skill_L <- A[n]
skill <- lapply(skill_L,unlist)在skill_L中,技能仍然是列表,在skill中它们是向量:
> skill_L
$skills
list()
$skills
$skills[[1]]
$skills[[1]]$name
[1] "OpenGL"
$skills[[2]]
$skills[[2]]$name
[1] "C"
> skill
$skills
NULL
$skills
name name
"OpenGL" "C"
> “没有技能”的例子是由NULL而不是NA代表的。如果这不合适,则需要来自apply-family的另一个函数应用程序:
> skill[sapply(skill,is.null)] <- NA
> skill
$skills
[1] NA
$skills
name name
"OpenGL" "C"
> 解决方案中出现subscript out of bounds错误的原因可能是R通过for-loop运行
for (i in 1:0){...}不是0次而是两次,一次是i=1,一次是i=0。例如,看看我和我们的R巨人中的一个关于这个话题的heated debate。
基准测试:
#--------------------------------------------------------------
mra68 <- function(data)
{
A <- unlist(data,recursive=FALSE)
skill_vec <- lapply(A[which(names(A)=="skills")],unlist)
skill_vec[sapply(skill_vec,is.null)] <- NA
skill_vec
}
#--------------------------------------------------------------
thothal <- function(sampleData) {
lapply(sampleData, function(l) {
if (length(l$skills) == 0) {
NA
} else {
unlist(l$skills)
}
})
}
#--------------------------------------------------------------
createData <- function(length.out) {
ret <- vector("list", length.out)
for (i in seq_len(length.out)) {
skills.n <- sample(0:10, 1)
skills <- vector("list", skills.n)
for (j in seq_len(skills.n)) {
skills[[j]] <- list(name = paste0("skill = ", j, ", id = ", i))
}
ret[[i]] <- list(id = paste("id", i),
currentCompany = paste("Company", i),
skills = skills)
}
ret
}
#--------------------------------------------------------------
library(microbenchmark)
data <- createData(1e+4)
microbenchmark(mra68(data),
thothal(data), unit = "s")-
> microbenchmark(mra68(data),
+ thothal(data), unit = "s")
Unit: seconds
expr min lq mean median uq max neval
mra68(data) 0.1531227 0.2078011 0.2779383 0.2946479 0.3299953 0.5017640 100
thothal(data) 0.1664253 0.2260693 0.3168359 0.3277690 0.3726730 0.7728963 100
> 如果NULL而不是NA没有问题的话:
mra68_NULL <- function(data)
{
A <- unlist(data,recursive=FALSE)
skill_vec <- lapply(A[which(names(A)=="skills")],unlist)
skill_vec
}。
> microbenchmark(mra68_NULL(data),
+ mra68(data),
+ thothal(data), unit = "s" )
Unit: seconds
expr min lq mean median uq max neval
mra68_NULL(data) 0.1019852 0.1439472 0.2177345 0.2501026 0.2554921 0.3492776 100
mra68(data) 0.1141832 0.1871851 0.2647803 0.2884579 0.3091937 0.7359363 100
thothal(data) 0.1216594 0.2031117 0.2885294 0.3148583 0.3344948 0.8078336 100
> 有几个字段:
sampleData <-
list(
list(
id = "000018ac-04ef-4270-81e6-9e3cb8274d31",
curruentCompany = "",
skills = list()
),
list(
id = "00000259-7c1c-4db6-9a2a-6d450626fbac",
curruentCompany = "",
skills = list(
list( type = "link",
name = "OpenGL" ),
list( name = "C" )
)
),
list(
id = "00000259-7c1c-4db6-9a2a-6d450626fbac",
curruentCompany = "",
skills = list(
list( aaa = "X" ),
list( type = "Link",
name = "abc",
bbb = "xyz" ),
list( name = "E",
aaa = "123" )
)
)
)
#--------------------------------------------------------------
A <- unlist(sampleData,recursive=FALSE)
n <- which(names(A)=="skills")
skill_L <- A[n]
f <- function(x){
if (length(x)>0){x[sapply(x,is.null)] <- NA} else {x<-NA}
return(x)
}
skill <- list(
name = lapply( lapply(sapply(skill_L,sapply,"[","name"),f), unlist ),
type = lapply( lapply(sapply(skill_L,sapply,"[","type"),f), unlist ),
aaa = lapply( lapply(sapply(skill_L,sapply,"[","aaa" ),f), unlist ),
bbb = lapply( lapply(sapply(skill_L,sapply,"[","bbb" ),f), unlist )
)。
> skill$name
$skills
[1] NA
$skills
name name
"OpenGL" "C"
$skills
<NA> name name
NA "abc" "E" 。
> skill$type
$skills
[1] NA
$skills
type <NA>
"link" NA
$skills
<NA> type <NA>
NA "Link" NA 。
> skill$aaa
$skills
[1] NA
$skills
<NA> <NA>
NA NA
$skills
aaa <NA> aaa
"X" NA "123" 。
> skill$bbb
$skills
[1] NA
$skills
<NA> <NA>
NA NA
$skills
<NA> bbb <NA>
NA "xyz" NA 发布于 2015-11-30 08:38:25
您可以使用lapply (不过,这也是一个循环),但是代码变得更短了:
sampleData <- list(list(id = "abc",
currentCompany = "",
skills = list()),
list(id = "abc2",
currentCompany = "xyz",
skills = list(list(name = "OpenGL"),
list(name = "C"))))
lapply(sampleData, function(l) {
if (length(l$skills) == 0) {
NA
} else {
unlist(l$skills)
}
})
# [[1]]
# [1] NA
# [[2]]
# name name
# "OpenGL" "C"更新
快速(和肮脏)基准测试结果表明,lapply大约快50%。注:createData也可以做得更聪明。
library(microbenchmark)
microbenchmark(lps(createData(1e4)), lply(createData(1e4)), unit = "s")
# Unit: seconds
# expr min lq mean median uq
# lps(createData(10000)) 1.1829743 1.2541602 1.3069261 1.2873486 1.3478340
# lply(createData(10000)) 0.5331418 0.5613532 0.6009065 0.5926779 0.6294598
# max neval cld
# 1.4960136 100 b
# 0.8075482 100 a 码
createData <- function(length.out) {
ret <- vector("list", length.out)
for (i in seq_len(length.out)) {
skills.n <- sample(0:10, 1)
skills <- vector("list", skills.n)
for (j in seq_len(skills.n)) {
skills[[j]] <- list(name = paste0("skill = ", j, ", id = ", i))
}
ret[[i]] <- list(id = paste("id", i),
currentCompany = paste("Company", i),
skills = skills)
}
ret
}
lply <- function(sampleData) {
lapply(sampleData, function(l) {
if (length(l$skills) == 0) {
NA
} else {
unlist(l$skills)
}
})
}
lps <- function(sampleData) {
skill <- list()
for (i in 1:length(sampleData)){
skill[i][1] <- 'empty'
for (j in 1:length(sampleData[[i]]$skills)){
if (length(sampleData[[i]]$skills) == 0){
skill[[i]][j] <- NA
}else{
skill[[i]][j] <- sampleData[[i]]$skills[[j]]$name
}
}
}
skill
}https://stackoverflow.com/questions/33994066
复制相似问题