我正在尝试让foreach包在R中进行并行处理,我遇到了一些问题:
使foreach工作所需的doMC包在CRAN for Windows上不存在。一些博客建议doSNOW应该做同样的事情。但是,当我使用doSNOW运行foreach命令时,%dopar%似乎并不比%do%快。事实上,它的速度要慢得多。我的中央处理器是英特尔i7 860 @2.80 GHz,内存为8 GB。下面是我的代码:
## Sequential baseline: run the whole bootstrap on a single core.
require(foreach)
require(doSNOW)

# Keep the two non-setosa species; column 1 = Sepal.Length, column 5 = Species.
x <- iris[which(iris[, 5] != "setosa"), c(1, 5)]
trials <- 10000

# %do% evaluates every iteration in the current session; each iteration
# draws a bootstrap sample, fits a logistic regression, and returns its
# coefficients, which .combine = cbind stacks into a 2 x trials matrix.
system.time({
  r <- foreach(icount(trials), .combine = cbind) %do% {
    ind <- sample(100, 100, replace = TRUE)
    fit <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))
    coefficients(fit)
  }
})[3]
# elapsed
# 37.28
# Same example in 2 cores
# FIX: keep the cluster handle. The original passed makeCluster() straight
# into registerDoSNOW(), so the handle was lost and the two worker
# processes could never be shut down (resource leak).
cl <- makeCluster(2, type = "SOCK")
registerDoSNOW(cl)
getDoParWorkers()
trials = 10000
# Same bootstrap as above, but dispatched to the registered workers.
system.time({
r= foreach(icount(trials), .combine=cbind) %dopar% {
ind=sample(100,100,replace=TRUE)
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
coefficients(results1)
}
})[3]
# Release the worker processes once the timing is done.
stopCluster(cl)
# elapsed
# 108.14
我重新安装了所有需要的软件包,但仍然存在同样的问题。下面是输出:
sessionInfo()
#R version 2.15.1 (2012-06-22)
#Platform: i386-pc-mingw32/i386 (32-bit)
#locale:
#[1] LC_COLLATE=English_United States.1252
#[2] LC_CTYPE=English_United States.1252
#[3] LC_MONETARY=English_United States.1252
#[4] LC_NUMERIC=C
#[5] LC_TIME=English_United States.1252
#attached base packages:
#[1] parallel stats graphics grDevices datasets utils methods
#[8] base
#other attached packages:
#[1] doParallel_1.0.1 codetools_0.2-8 doSNOW_1.0.6 snow_0.3-10
#[5] iterators_1.0.6 foreach_1.4.0 rcom_2.2-5 rscproxy_2.0-5
#loaded via a namespace (and not attached):
#[1] compiler_2.15.1 tools_2.15.1
发布于 2012-07-24 01:50:40
你最好在Windows中使用doParallel()
# doParallel works on Windows (doMC is not on CRAN for Windows);
# register a cluster before using %dopar%.
require(foreach)
require(doParallel)
# Leave some cores free for the OS and other processes.
cl <- makeCluster(6) # use 6 cores, i.e. for an 8-core machine
registerDoParallel(cl)
然后运行你的foreach() %dopar% {}
编辑: OP提到仍然看到这个问题,所以包括我的确切代码。运行在4核Windows7 VM上,R 2.15.1 32位,仅允许doParallel使用我的3个核心:
# Reproduction on a 4-core Windows 7 VM, R 2.15.1 32-bit, 3 workers.
require(foreach)
require(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)
# NOTE(review): cl is never passed to stopCluster() in this snippet.
# Non-setosa rows only; columns: Sepal.Length and Species.
x= iris[which(iris[,5] != "setosa"),c(1,5)]
trials = 1000
# Sequential baseline. The glm() call is repeated four times ON PURPOSE,
# to make each iteration heavy enough that per-task scheduling and
# .combine overhead do not dominate the measurement.
system.time(
foreach(icount(trials), .combine=cbind) %do%
{
ind=sample(100,100,replace=TRUE)
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
coefficients(results1)
})[3]
# Identical workload, dispatched to the 3 registered workers.
system.time(
foreach(icount(trials), .combine=cbind) %dopar%
{
ind=sample(100,100,replace=TRUE)
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
coefficients(results1)
})[3] 在我的例子中,%do%为17.6秒,%dopar%为14.8秒。观察任务的执行,似乎大部分执行时间都是cbind,这是并行运行时的一个常见问题。在我自己的模拟中,我已经完成了自定义工作,将详细结果保存为并行任务的一部分,而不是通过foreach返回它们,以消除该部分开销。YMMV.
发布于 2012-12-22 04:43:24
我知道这是一个老问题,但我在搜索其他东西时遇到了它,并认为我应该添加我的解决方案。我发现将试验的总数分成单独的试验组(组的数量等于处理器核心的数量)会更有效,而不是试图同时并行所有的试验并处理所有的开销。下面是使用OP的示例进行的比较:
require(doParallel)

# Two non-setosa species only: column 1 (Sepal.Length) and 5 (Species).
x <- iris[which(iris[, 5] != "setosa"), c(1, 5)]
trials <- 10000

# --- Baseline: every trial sequentially, single core ------------------
t1 <- system.time({
  r1 <- foreach(icount(trials), .combine = cbind) %do% {
    ind <- sample(100, 100, replace = TRUE)
    fit <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))
    coefficients(fit)
  }
})[3]

# --- Naive parallelism: one foreach task per model fit ----------------
nCores <- 4
cl <- makeCluster(nCores)
registerDoParallel(cl)
t2 <- system.time({
  r2 <- foreach(icount(trials), .combine = cbind) %dopar% {
    ind <- sample(100, 100, replace = TRUE)
    fit <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))
    coefficients(fit)
  }
})[3]
# --- Chunked parallelism: one foreach task per core -------------------
# Each worker runs a whole batch of trials, so scheduling/combining
# overhead is paid nCores times instead of once per trial.
trialsPerCore <- as.integer(ceiling(trials / nCores)) # trials per worker

# Fit one bootstrap logistic regression and return its two coefficients.
# (`ind` is referenced inside the formula, so its name shows up in the
# coefficient names - do not rename it.)
model <- function(x) {
  ind <- sample(100, 100, replace = TRUE)
  fit <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))
  coefficients(fit)
}

# Run `trials` independent fits; replicate() column-binds the length-2
# coefficient vectors into a 2 x trials matrix.
modelRun <- function(trials, x) {
  replicate(trials, model(x))
}
# Launch one batched task per worker and time the whole run.
# Note: nCores * trialsPerCore may exceed `trials` because of ceiling(),
# so r3 can contain a few extra columns.
t3 <- system.time({
  r3 <- foreach(icount(nCores), .combine = cbind) %dopar% {
    modelRun(trialsPerCore, x)
  }
})[3]
stopCluster(cl)
在运行Ubuntu12.04的3.4 GHz四核i7上的执行时间:
> t1
elapsed
34.5
> t2
elapsed
26.5
> t3
elapsed
8.295
发布于 2012-12-22 04:43:24
这种类型的并行性并不是不常见的,并且可能取决于操作系统。我也有类似的结果,但是当我在代码中做了一个愚蠢的更改时
# Compare %do% vs %dopar% under doSNOW, with each iteration made
# artificially heavy: the glm() call is repeated four times on purpose
# so that real work outweighs the per-task dispatch/combine overhead.
require(foreach)
require(doSNOW)
x= iris[which(iris[,5] != "setosa"),c(1,5)]
trials = 1000
# Sequential run of the padded workload.
system.time(
foreach(icount(trials), .combine=cbind) %do%
{
ind=sample(100,100,replace=TRUE)
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
coefficients(results1)
})[3]
# NOTE(review): the cluster handle is not saved, so stopCluster() can
# never be called on it - the two worker processes are leaked.
registerDoSNOW( makeCluster(2,type="SOCK"))
getDoParWorkers()
trials = 1000
# Same padded workload dispatched to the two SOCK workers.
system.time(
foreach(icount(trials), .combine=cbind) %dopar%
{
ind=sample(100,100,replace=TRUE)
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
results1 = glm(x[ind,2]~x[ind,1],family=binomial(logit))
coefficients(results1)
})[3]为了模拟foreach中的繁重工作,我得到了两者的收支平衡。这是间接费用的价格。我最近遇到了一个类似的情况,并直接使用MPI处理它,它的开销要低得多,但使用起来要复杂得多(我想Dirk不会同意)。(将其更改为“不那么优雅”。
https://stackoverflow.com/questions/11617506
复制相似问题