文章/答案/技术大牛

发布

社区首页 >问答首页 >C优化技术

问C优化技术
EN

Stack Overflow用户

提问于 2013-10-22 04:00:15

回答 3查看 303关注 0票数 6

我正在寻找最快的优化从下面的函数w/o进入装配，因为它似乎是我的应用程序的瓶颈。请记住，以下函数已在内联中声明。

定义:P= 10和N= 240

void autocorrelation( int32_t *data , float *r){
    for ( int m=0 ; m < P+1 ; m++)
    {
        register float temp = 0;
        for ( int n=0 ; n<N-m ; n++)
        {
            temp += (float)(data[n])*(float)(data[n+m]);
        }
        r[m] = temp;
    }
}

任何帮助都是非常感谢的。

谢谢!

编辑：

大会：

temp += (float)(data[n])*(float)(data[n+m]);
800063A8  lddsp R8, SP[0x0]      
800063AA  add R1, R2, R8<<0      
800063AE  ld.w R12, R1[R7<<0]        
800063B2  mcall 0x80006f58       
800063B6  mov R4, R12        
800063B8  ld.w R12, R2[R7<<0]        
800063BC  mcall 0x80006f58       
800063C0  mov R11, R12       
800063C2  mov R12, R4        
800063C4  mcall 0x80006f5c       
800063C8  mov R11, R12       
800063CA  mov R12, R5        
800063CC  mcall 0x80006f60       
800063D0  mov R5, R12        
for ( int n=0 ; n<N-m ; n++)
800063D2  sub R6, -1         
800063D4  sub R7, -4         
800063D6  cp.w R6, R3        
800063D8  brne 0x800063ae        
r[m] = temp;
800063DA  ld.w R10, PC[2954]         
800063DE  lddsp R9, SP[0x0]      
800063E0  st.w R10[R9<<0], R5        
800063E4  sub R0, 1      
800063E6  sub R9, -4         
800063E8  stdsp SP[0x0], R9      
for ( int m=0 ; m < P+1 ; m++)
800063EA  cp.w R0, 229       
800063EE  breq 0x800063fc        
800063F0  mov R3, R0         
for ( int n=0 ; n<N-m ; n++)
800063F2  cp.w R0, 0         
800063F4  brgt 0x800063a2        
800063F8  mov R5, 0      
800063FA  rjmp 0x800063da

////////////////////////////////////////////////////////////////////////////////

所以我把代码改为：

void autocorrelation( float *data , float *r){
    for ( int m=0 ; m < P+1 ; m++)
    {
        register float temp = 0;
        for ( int n=0 ; n<N-m ; n++)
        {
            temp += data[n]*data[n+m];
        }
        r[m] = temp;
    }
}

把时间缩短三分之一(每个滴答是1/16000赫兹)-原来-现在是108

新装配：

temp += data[n]*data[n+m];
800063C2  add R2, R3, R0<<0      
800063C6  ld.w R11, R3[R7<<0]        
800063CA  ld.w R12, R2[R7<<0]        
800063CE  mcall 0x80006f68       
800063D2  mov R11, R12       
800063D4  mov R12, R5        
800063D6  mcall 0x80006f6c       
800063DA  mov R5, R12        
for ( int n=0 ; n<N-m ; n++)
800063DC  sub R6, -1         
800063DE  sub R7, -4         
800063E0  cp.w R6, R4        
800063E2  brne 0x800063c6        
r[m] = temp;
800063E4  ld.w R9, PC[2960]      
800063E8  st.w R9[R0<<0], R5         
800063EC  sub R1, 1      
800063EE  sub R0, -4         
for ( int m=0 ; m < P+1 ; m++)
800063F0  cp.w R1, 229       
800063F4  breq 0x80006402        
800063F6  mov R4, R1         
for ( int n=0 ; n<N-m ; n++)
800063F8  cp.w R1, 0         
800063FA  brgt 0x800063bc        
800063FE  mov R5, 0      
80006400  rjmp 0x800063e4

//////////////////////////////////////////////////////最终解决方案：(再次更改)

我将标记的解决方案组合在一起，打开循环，编写了一个应用程序，并将其保存在64位直到结束，性能从最后的60位提高到了20位。通过同样的调整，我能够从一开始就把最优的代码从250条到50条，在这里，我的乒乓缓冲器需要在160条蜱上完成所有的工作，所以我有一些头部空间：

void fastAutocorrelation( int64_t *data , float *r){

int64_t *temp;
int64_t *datan = data;
int64_t *datanm = data;

*temp = (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*r++ = (float)(*temp)/int64ToFloat;

datan = data;
datanm = data + 1;
*temp = (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
...

performance

function

optimization

回答 3

Stack Overflow用户

回答已采纳

发布于 2013-10-22 04:42:02

我看你已经剪掉一些石膏了。如果你想玩指针戏法，这里有一个可以试试。根据编译器的不同，您的里程将有所不同：

void autocorrelation( float *restrict data, float *restrict r)
{
    float *data_end = data + N;

    for ( int m=0 ; m < P+1 ; m++)
    {
        float temp = 0;

        for( float *data_n = data, *data_nm = data + m;
             data_nm != data_end;
             data_n++, data_nm++ )
        {
            temp += *data_n * *data_nm;
        }

        r[m] = temp;
    }
}

我添加了restrict关键字，这样编译器就知道data和r不重叠或指向相同的东西。

票数 1

Stack Overflow用户

发布于 2013-10-22 05:18:09

如果你想做一些冒险的实验，你可以试试英特尔的ICC编译器(那是x86汇编程序吗？)使用正确的命令行开关，通过为每个循环迭代使用单独的线程，可以自动将for循环并行化。然而，为了使线程的开销变得值得，循环确实需要相当丰富的内容。

另一种方法是对SSE和AVX进行处理。更好的是，我认为最新的英特尔x86s有一个乘法/加法指令，这是关联、FFTs等所需的。这样，您就可以将代码向量化，并且每个时钟周期将发生多个操作(超出普通CPU管道所能实现的)。实际上，还有一些附加的“函数”直接映射到SSE/AVX操作码上，使得它们可以很容易地在C代码中使用。编译器必须知道这些(英特尔的编译器当然知道)，否则你就会把你自己的在线汇编程序放进去。您还会遇到在运行时处理不同版本CPU的问题；并不是每台PC都有最新的Intel芯片。

或者你可以像我一样懒惰，使用一个预先优化的例程库，比如英特尔的IPP/MKL。它，就像ICC一样，需要钱，但是如果速度=在你的项目中花大钱的话，它是很值得的。

票数 1

Stack Overflow用户

发布于 2013-10-22 05:58:36

你试过：

1)前进指针而不是索引r数组？(就像data_n和data_m做的那样)。例如*(r++) = temp而不是r[m] = temp

2)环行展开。您的编译器显然不会这样做，例如，对于内部循环来说，这将是一个很大的速度增益。具体来说，请看一下http://www.quickiwiki.com/en/Duff的s_device for neat循环展开。

3)将循环展开到极致！(我知道，可能不是很漂亮，但还是很酷)：您知道N和P的值:您可以完全展开循环，编写所有没有分支的代码(尽管很长)。根据您的体系结构(足够的预取管道，再加上哑分支预测)，这很可能会提高速度。您甚至可以编写一个小实用程序，为您生成完整的展开循环，并将生成的代码包含在您的.c文件中。或者只使用宏和元编程。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/19508712

复制

相似问题

问C优化技术
EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问C优化技术EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问C优化技术
EN