文章/答案/技术大牛

发布

社区首页 >问答首页 >获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)

问获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)
EN

Stack Overflow用户

提问于 2022-08-08 10:01:40

回答 1查看 96关注 0票数 1

使用sort函数，您可以添加参数partial=10来返回一个向量，其中最小的10项放在向量的开头(但它们可能仍然不整齐)：

> sort(10:1,partial=2)
[1]  1  2  8  7  6  5  4  3  9 10
> v=sample(1:1e6)
> head(sort(v,partial=10),10)
 [1]  8  6  3  2  7  5  4  9  1 10
> sort(head(sort(v,partial=10),10))
 [1]  1  2  3  4  5  6  7  8  9 10
> system.time(sort(head(sort(v,partial=10),10)))
   user  system elapsed
  0.009   0.000   0.010
> system.time(head(sort(v),10))
   user  system elapsed
  0.027   0.001   0.028

sort有一个index.return=T参数，但不支持部分排序：

> sort(10:1,partial=2,index.return=T)
Error in sort.int(x, na.last = na.last, decreasing = decreasing, ...) :
  unsupported options for partial sorting

sort的帮助页面还指出，“名称被丢弃用于部分排序”，因此您不能这样做：v=sample(1:1e6);max=10;as.integer(names(sort(head(sort(setNames(v,1:length(v)),partial=max),max))))。

有没有一种有效的方法只获取最小的10个项目的索引？在下面的基准测试中，head(order(v),10)在长度1e4和1e5时速度最快，但在1e6和1e7长度时，Rcpp方法更快，在长度1e7时，第三个方法也稍微快一些：

orderfirst=function(v,nhead){
  len=length(v)
  biggestfound=Inf;
  foundind=integer(nhead)
  foundval=integer(nhead)
  for(n in 1:len){
    val=v[n]
    if(n<=nhead||val<biggestfound){
      insertat=nhead
      for(i in 1:(nhead-1))if(val<foundval[i]||i==n){insertat=i;break}
      if(insertat!=nhead)for(i in nhead:(insertat+1)){foundind[i]=foundind[i-1];foundval[i]=foundval[i-1]}
      foundind[insertat]=n
      foundval[insertat]=val
      biggestfound=foundval[nhead]
    }
  }
  foundind
}

library(Rcpp)
cppFunction('NumericVector orderfirstcpp(NumericVector v,int nhead){
  NumericVector foundind(nhead),foundval(nhead);
  double biggestfound;
  for(int n=0;n<v.length();n++){
    double val=v(n);
    if(val<biggestfound||n<nhead){
      int insertat=nhead-1;
      for(int i=0;i<nhead-1;i++)if(val<foundval(i)||i==n){insertat=i;break;}
      for(int i=nhead-1;i>insertat;i--){foundind(i)=foundind(i-1);foundval(i)=foundval(i-1);}
      foundind(insertat)=n+1;
      foundval(insertat)=val;
      biggestfound=foundval(nhead-1);
    }
  }
  return foundind;
}')

s=function(x,...,i=F,p=F,f=F,b=F){a=match.call(expand.dots=F)$...;l=length(a);for(i in seq(1,l,2))x=gsub(a[[i]],if(i==l)""else a[[i+1]],x,ignore.case=i,perl=p,fixed=f,useBytes=b);x}
unfo=function(x)s(paste(x,collapse="\n"),"\\{\\n","\\{","\n *\\}","\\}",",\\n",",","\\n",";"," *([[:punct:]]+) *","\\1")

bench=function(times,...){
  arg=match.call(expand.dots=F)$...
  l=length(arg);out=double(times*l);rand=sample(rep(1:l,times))
  n=1;for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n+1}
  setNames(out,sapply(arg[rand],function(x)unfo(deparse(x))))
}

len=10^(4:7)
r=sapply(len,function(l){
  nhead=10
  set.seed(0)
  v=sample(1:l)
  # v=rnorm(l) # test with doubles
  # v=c(v,v) # test with duplicated values

  b=bench(100,
    # this was the fastest method I found in base R at 1e6 and smaller lengths
    head(order(v),nhead),

    # this is an R implementation of a method that was fast in C++ and JavaScript but slow in R
    orderfirst(v,nhead),

    # this is a C++ implementation of the method above
    orderfirstcpp(v,nhead),

    # this produces the wrong result when the found items include duplicates
    # this is slow at large values of `nhead`
    {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)},

    # this is slightly slower than the option above, but this also works when the found items include duplicates
    {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds},

    # this produces the wrong result when the found items include duplicates
    match(sort(head(sort(v,partial=nhead),nhead)),v),

    # this is a variant of the option above that works when the found items include duplicates
    # this was slightly faster than `head(order(v),nhead)` at length 1e7
    {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))},

    # this produces the wrong result when the found items include duplicates
    # the integers have to be converted to strings in order to do indexing by name and not by integer position
    unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,partial=nhead),nhead)))])
  )

  a=aggregate(b,list(names(b)),median)
  setNames(a[,2],a[,1])
})

r2=r[order(r[,ncol(r)]),]
r2=apply(r2,2,function(x)formatC(x,max(0,2-ceiling(log10(min(x)))),,"f"))
r3=apply(rbind(paste0("1e",log10(len)),r2),2,function(x)formatC(x,max(nchar(x)),,"s"))
writeLines(apply(cbind(r3,c("",names(r2[,1]))),1,paste,collapse=" "))

这显示了以秒为单位的100次跑步的中位时间：

    1e4    1e5   1e6  1e7
0.00014 0.0011 0.011 0.11 orderfirstcpp(v,nhead)
0.00049 0.0031 0.026 0.19 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.00011 0.0010 0.022 0.26 head(order(v),nhead)
0.00065 0.0054 0.054 0.54 orderfirst(v,nhead)
0.00067 0.0189 0.043 0.59 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.00154 0.0280 0.140 1.92 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.00158 0.0141 0.144 1.93 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.00190 0.0214 0.306 4.65 unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,;partial=nhead),nhead)))])

回答 1

Stack Overflow用户

回答已采纳

发布于 2022-08-08 11:29:23

现在，我发现了一个新的方法，它比长度1e6的order(v)[1:10]快1.6倍，在1e7的时候大约快2.6倍，尽管长度1e5和1e4的速度仍然慢一些(这里需要[1:10]，以防第11项与第10项相同)：

> set.seed(0);v=sample(1e6)
> w=which(v<=sort(v,partial=10)[10]);w[order(v[w])][1:10]
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

当您只需要找到几个最小的项时，下面的方法也可以相当快，例如本例中的10项：

> v2=v;sapply(1:10,\(x){i=which.min(v2);v2[i]<<-NA;i})
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

编辑：Rfast::nth也和我的orderfirstcpp函数一样快：

> w=which(v<=Rfast::nth(v,10+1))[1:10];w[order(v[w])]
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

编辑2:我尝试在R中实现快速选择算法，但是它非常慢：

quickselect=\(v,k){
  vold=v;l=1;r=length(v)
  repeat{
    if(l==r){kth=v[l];break}
    pivot=sample(l:r,1)
    val=v[pivot]
    v[pivot]=v[r];v[r]=val
    pivot=l
    for(i in l:(r-1))if(v[i]<val){temp=v[pivot];v[pivot]=v[i];v[i]=temp;pivot=pivot+1}
    temp=v[r];v[r]=v[pivot];v[pivot]=temp
    if(k<pivot)r=pivot-1 else if(k>pivot)l=pivot+1 else{kth=v[k];break}
  }
  w=head(which(vold<=kth),k);w[order(vold[w])]
}

编辑3:到目前为止，我找到的最快的方法是kit::topn，如果添加hasna=F，它会变得更快

> kit::topn(v,10,decreasing=F,hasna=F)
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

编辑4:下面的Rcpp函数比我早期的orderfirstcpp函数要快得多，甚至在K=10上也是如此，但是在高K时，它变得更快了，因为我的早期函数使用了具有二次时间复杂度的插入排序，尽管在K值较低的情况下，它可以比快速排序更快。A data.table中并行排序的表示说“< 30项最快的是插入排序”，JDK中的DualPivotQuicksort.java对少于47项的数组使用插入排序而不是快速排序。

Rcpp::sourceCpp(,"#include<Rcpp.h>
#include<algorithm>
#include<vector>
#include<queue>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
NumericVector cppmaxheap(NumericVector v,int k){
  vector<double>v2=as<vector<double>>(v);
  priority_queue<double,vector<double>>pq(v2.begin(),v2.begin()+k);
  for(int i=k;i<v2.size();i++)if(v2[i]<pq.top()){pq.pop();pq.push(v2[i]);}
  double top=pq.top();
  vector<pair<int,double> >found;
  int l=v.length();
  for(int i=0;i<l;i++)if(v2[i]<=top)found.push_back(make_pair(v2[i],i));
  sort(found.begin(),found.end());
  vector<double>out;
  for(int i=0;i<k&&i<found.size();i++)out.push_back(found[i].second);
  return NumericVector(out.begin(),out.end());
}")

下面是一个新的基准(它再次显示了以秒为单位的100次运行的中位时间)：

     1e4     1e5     1e6    1e7
0.000026 0.00011 0.00092 0.0090 kit::topn(v,nhead,decreasing=F,hasna=F)
0.000030 0.00013 0.00120 0.0119 kit::topn(v,nhead,decreasing=F)
0.000114 0.00093 0.00922 0.0742 cppmaxheap(v,nhead)
0.000132 0.00107 0.01044 0.0831 orderfirstcpp(v,nhead)
0.000160 0.00106 0.01025 0.0957 {v2=v;sapply(1:nhead,\(i){i=which.min(v2);v2[i]<<-NA;i})}
0.000264 0.00161 0.01285 0.1013 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.000150 0.00099 0.01073 0.1729 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.000491 0.00289 0.02466 0.2230 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.000112 0.00099 0.02120 0.2586 head(order(v),nhead)
0.000654 0.00531 0.05167 0.5147 orderfirst(v,nhead)
0.000654 0.01854 0.04098 0.6371 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.001427 0.01278 0.19337 1.5632 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.001418 0.02661 0.12866 1.6121 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.001933 0.01487 0.16365 1.7394 quickselect(v,nhead)
0.004135 0.04311 0.32953 6.1154 as.integer(unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,;partial=nhead),nhead)))]))

当K为1e4时，我的cppmaxheap函数在N=1e7和N=1e6上实际上比kit::topn快：

    1e4     1e5   1e6  1e7 
0.00098 0.00376 0.013 0.10 cppmaxheap(v,nhead)
0.00041 0.00235 0.015 0.16 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.00036 0.00128 0.011 0.20 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.00013 0.00091 0.021 0.26 kit::topn(v,nhead,decreasing=F)
0.00013 0.00099 0.022 0.27 kit::topn(v,nhead,decreasing=F,hasna=F)
0.14801 1.15847 8.704 1.48 {v2=v;sapply(1:nhead,function(x){i=which.min(v2);v2[i]<<-NA;i})}
1.28641 3.59370 5.911 8.37 orderfirstcpp(v,nhead)

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/73276102

复制

相似问题

问获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)
EN