首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)

获取向量中10个最小项的索引的有效方法(如`排序(partial=10)`但用于``order`‘)
EN

Stack Overflow用户
提问于 2022-08-08 10:01:40
回答 1查看 96关注 0票数 1

使用sort函数,您可以添加参数partial=10来返回一个向量,其中最小的10项放在向量的开头(但它们可能仍然不整齐):

代码语言:javascript
复制
> sort(10:1,partial=2)
[1]  1  2  8  7  6  5  4  3  9 10
> v=sample(1:1e6)
> head(sort(v,partial=10),10)
 [1]  8  6  3  2  7  5  4  9  1 10
> sort(head(sort(v,partial=10),10))
 [1]  1  2  3  4  5  6  7  8  9 10
> system.time(sort(head(sort(v,partial=10),10)))
   user  system elapsed
  0.009   0.000   0.010
> system.time(head(sort(v),10))
   user  system elapsed
  0.027   0.001   0.028

sort有一个index.return=T参数,但不支持部分排序:

代码语言:javascript
复制
> sort(10:1,partial=2,index.return=T)
Error in sort.int(x, na.last = na.last, decreasing = decreasing, ...) :
  unsupported options for partial sorting

sort的帮助页面还指出,“名称被丢弃用于部分排序”,因此您不能这样做:v=sample(1:1e6);max=10;as.integer(names(sort(head(sort(setNames(v,1:length(v)),partial=max),max))))

有没有一种有效的方法只获取最小的10个项目的索引?在下面的基准测试中,head(order(v),10)在长度1e4和1e5时速度最快,但在1e6和1e7长度时,Rcpp方法更快,在长度1e7时,第三个方法也稍微快一些:

代码语言:javascript
复制
orderfirst=function(v,nhead){
  len=length(v)
  biggestfound=Inf;
  foundind=integer(nhead)
  foundval=integer(nhead)
  for(n in 1:len){
    val=v[n]
    if(n<=nhead||val<biggestfound){
      insertat=nhead
      for(i in 1:(nhead-1))if(val<foundval[i]||i==n){insertat=i;break}
      if(insertat!=nhead)for(i in nhead:(insertat+1)){foundind[i]=foundind[i-1];foundval[i]=foundval[i-1]}
      foundind[insertat]=n
      foundval[insertat]=val
      biggestfound=foundval[nhead]
    }
  }
  foundind
}

library(Rcpp)
cppFunction('NumericVector orderfirstcpp(NumericVector v,int nhead){
  NumericVector foundind(nhead),foundval(nhead);
  double biggestfound;
  for(int n=0;n<v.length();n++){
    double val=v(n);
    if(val<biggestfound||n<nhead){
      int insertat=nhead-1;
      for(int i=0;i<nhead-1;i++)if(val<foundval(i)||i==n){insertat=i;break;}
      for(int i=nhead-1;i>insertat;i--){foundind(i)=foundind(i-1);foundval(i)=foundval(i-1);}
      foundind(insertat)=n+1;
      foundval(insertat)=val;
      biggestfound=foundval(nhead-1);
    }
  }
  return foundind;
}')

s=function(x,...,i=F,p=F,f=F,b=F){a=match.call(expand.dots=F)$...;l=length(a);for(i in seq(1,l,2))x=gsub(a[[i]],if(i==l)""else a[[i+1]],x,ignore.case=i,perl=p,fixed=f,useBytes=b);x}
unfo=function(x)s(paste(x,collapse="\n"),"\\{\\n","\\{","\n *\\}","\\}",",\\n",",","\\n",";"," *([[:punct:]]+) *","\\1")

bench=function(times,...){
  arg=match.call(expand.dots=F)$...
  l=length(arg);out=double(times*l);rand=sample(rep(1:l,times))
  n=1;for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n+1}
  setNames(out,sapply(arg[rand],function(x)unfo(deparse(x))))
}

len=10^(4:7)
r=sapply(len,function(l){
  nhead=10
  set.seed(0)
  v=sample(1:l)
  # v=rnorm(l) # test with doubles
  # v=c(v,v) # test with duplicated values

  b=bench(100,
    # this was the fastest method I found in base R at 1e6 and smaller lengths
    head(order(v),nhead),

    # this is an R implementation of a method that was fast in C++ and JavaScript but slow in R
    orderfirst(v,nhead),

    # this is a C++ implementation of the method above
    orderfirstcpp(v,nhead),

    # this produces the wrong result when the found items include duplicates
    # this is slow at large values of `nhead`
    {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)},

    # this is slightly slower than the option above, but this also works when the found items include duplicates
    {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds},

    # this produces the wrong result when the found items include duplicates
    match(sort(head(sort(v,partial=nhead),nhead)),v),

    # this is a variant of the option above that works when the found items include duplicates
    # this was slightly faster than `head(order(v),nhead)` at length 1e7
    {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))},

    # this produces the wrong result when the found items include duplicates
    # the integers have to be converted to strings in order to do indexing by name and not by integer position
    unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,partial=nhead),nhead)))])
  )

  a=aggregate(b,list(names(b)),median)
  setNames(a[,2],a[,1])
})

r2=r[order(r[,ncol(r)]),]
r2=apply(r2,2,function(x)formatC(x,max(0,2-ceiling(log10(min(x)))),,"f"))
r3=apply(rbind(paste0("1e",log10(len)),r2),2,function(x)formatC(x,max(nchar(x)),,"s"))
writeLines(apply(cbind(r3,c("",names(r2[,1]))),1,paste,collapse=" "))

这显示了以秒为单位的100次跑步的中位时间:

代码语言:javascript
复制
    1e4    1e5   1e6  1e7
0.00014 0.0011 0.011 0.11 orderfirstcpp(v,nhead)
0.00049 0.0031 0.026 0.19 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.00011 0.0010 0.022 0.26 head(order(v),nhead)
0.00065 0.0054 0.054 0.54 orderfirst(v,nhead)
0.00067 0.0189 0.043 0.59 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.00154 0.0280 0.140 1.92 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.00158 0.0141 0.144 1.93 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.00190 0.0214 0.306 4.65 unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,;partial=nhead),nhead)))])
EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2022-08-08 11:29:23

现在,我发现了一个新的方法,它比长度1e6的order(v)[1:10]快1.6倍,在1e7的时候大约快2.6倍,尽管长度1e5和1e4的速度仍然慢一些(这里需要[1:10],以防第11项与第10项相同):

代码语言:javascript
复制
> set.seed(0);v=sample(1e6)
> w=which(v<=sort(v,partial=10)[10]);w[order(v[w])][1:10]
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

当您只需要找到几个最小的项时,下面的方法也可以相当快,例如本例中的10项:

代码语言:javascript
复制
> v2=v;sapply(1:10,\(x){i=which.min(v2);v2[i]<<-NA;i})
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

编辑:Rfast::nth也和我的orderfirstcpp函数一样快:

代码语言:javascript
复制
> w=which(v<=Rfast::nth(v,10+1))[1:10];w[order(v[w])]
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

编辑2:我尝试在R中实现快速选择算法,但是它非常慢:

代码语言:javascript
复制
quickselect=\(v,k){
  vold=v;l=1;r=length(v)
  repeat{
    if(l==r){kth=v[l];break}
    pivot=sample(l:r,1)
    val=v[pivot]
    v[pivot]=v[r];v[r]=val
    pivot=l
    for(i in l:(r-1))if(v[i]<val){temp=v[pivot];v[pivot]=v[i];v[i]=temp;pivot=pivot+1}
    temp=v[r];v[r]=v[pivot];v[pivot]=temp
    if(k<pivot)r=pivot-1 else if(k>pivot)l=pivot+1 else{kth=v[k];break}
  }
  w=head(which(vold<=kth),k);w[order(vold[w])]
}

编辑3:到目前为止,我找到的最快的方法是kit::topn,如果添加hasna=F,它会变得更快

代码语言:javascript
复制
> kit::topn(v,10,decreasing=F,hasna=F)
 [1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812

编辑4:下面的Rcpp函数比我早期的orderfirstcpp函数要快得多,甚至在K=10上也是如此,但是在高K时,它变得更快了,因为我的早期函数使用了具有二次时间复杂度的插入排序,尽管在K值较低的情况下,它可以比快速排序更快。A data.table中并行排序的表示说“< 30项最快的是插入排序”,JDK中的DualPivotQuicksort.java对少于47项的数组使用插入排序而不是快速排序。

代码语言:javascript
复制
Rcpp::sourceCpp(,"#include<Rcpp.h>
#include<algorithm>
#include<vector>
#include<queue>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
NumericVector cppmaxheap(NumericVector v,int k){
  vector<double>v2=as<vector<double>>(v);
  priority_queue<double,vector<double>>pq(v2.begin(),v2.begin()+k);
  for(int i=k;i<v2.size();i++)if(v2[i]<pq.top()){pq.pop();pq.push(v2[i]);}
  double top=pq.top();
  vector<pair<int,double> >found;
  int l=v.length();
  for(int i=0;i<l;i++)if(v2[i]<=top)found.push_back(make_pair(v2[i],i));
  sort(found.begin(),found.end());
  vector<double>out;
  for(int i=0;i<k&&i<found.size();i++)out.push_back(found[i].second);
  return NumericVector(out.begin(),out.end());
}")

下面是一个新的基准(它再次显示了以秒为单位的100次运行的中位时间):

代码语言:javascript
复制
     1e4     1e5     1e6    1e7
0.000026 0.00011 0.00092 0.0090 kit::topn(v,nhead,decreasing=F,hasna=F)
0.000030 0.00013 0.00120 0.0119 kit::topn(v,nhead,decreasing=F)
0.000114 0.00093 0.00922 0.0742 cppmaxheap(v,nhead)
0.000132 0.00107 0.01044 0.0831 orderfirstcpp(v,nhead)
0.000160 0.00106 0.01025 0.0957 {v2=v;sapply(1:nhead,\(i){i=which.min(v2);v2[i]<<-NA;i})}
0.000264 0.00161 0.01285 0.1013 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.000150 0.00099 0.01073 0.1729 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.000491 0.00289 0.02466 0.2230 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.000112 0.00099 0.02120 0.2586 head(order(v),nhead)
0.000654 0.00531 0.05167 0.5147 orderfirst(v,nhead)
0.000654 0.01854 0.04098 0.6371 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.001427 0.01278 0.19337 1.5632 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.001418 0.02661 0.12866 1.6121 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.001933 0.01487 0.16365 1.7394 quickselect(v,nhead)
0.004135 0.04311 0.32953 6.1154 as.integer(unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,;partial=nhead),nhead)))]))

当K为1e4时,我的cppmaxheap函数在N=1e7和N=1e6上实际上比kit::topn快:

代码语言:javascript
复制
    1e4     1e5   1e6  1e7 
0.00098 0.00376 0.013 0.10 cppmaxheap(v,nhead)
0.00041 0.00235 0.015 0.16 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.00036 0.00128 0.011 0.20 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.00013 0.00091 0.021 0.26 kit::topn(v,nhead,decreasing=F)
0.00013 0.00099 0.022 0.27 kit::topn(v,nhead,decreasing=F,hasna=F)
0.14801 1.15847 8.704 1.48 {v2=v;sapply(1:nhead,function(x){i=which.min(v2);v2[i]<<-NA;i})}
1.28641 3.59370 5.911 8.37 orderfirstcpp(v,nhead)
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/73276102

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档