文章/答案/技术大牛

发布

社区首页 >问答首页 >Intel Intrinsics代码优化

问Intel Intrinsics代码优化
EN

Stack Overflow用户

提问于 2018-02-23 19:04:23

回答 1查看 179关注 0票数 0

所以我尝试将一个常量与短整数a101与英特尔内部函数相乘。我已经用加法做过了，但我似乎不明白为什么它不能用乘法。同样，在之前我们使用32位整数，现在我们使用16位短整型，所以我们可以在内部函数中有两倍的值来填充128位，据我所知？

我尝试去做的天真的例子：

int main(int argc, char **argv){
    short int a[101];
    int len = sizeof(a)/sizeof(short);

    /*Populating array a with values 1 to 101*/

    mult(len, a);

    return 0;
}

int mult(int len, short int *a){
    int result = 0;
    for(int i=0; i<len; i++){
        result += a[i]*20;  
    }
    return result;
}

我的代码试图在内部函数中做同样的事情

/*Same main as before with a short int a[101] containing values 1 to 101*/

int SIMD(int len, short int *a){
    int res;
    int val[4];

    /*Setting constant value to mulitply with*/
    __m128i sum = _mm_set1_epi16(20);
    __m128i s = _mm_setzero_si128( );

    for(int i=0; i<len/4*4; i += 4){
        __m128i vec = _mm_loadu_si128((__m128i *)(a+i));
        s += _mm_mul_epu32(vec,sum);
    }

    _mm_storeu_si128((__m128i*) val, s);
    res += val[0] + val[1] + val[2] + val[3];

    /*Haldeling tail*/
    for(int i=len/4*4; i<len; i++){
        res += a[i];
    }
    return res;
}

所以我确实得到了一个数字作为结果，但这个数字与朴素的方法不匹配，我尝试了其他内部函数，并更改了数字，看看是否有任何明显的差异，但没有接近我预期的输出。计算时间也与目前的naive几乎相同。

optimization

intel

intrinsics

回答 1

Stack Overflow用户

回答已采纳

发布于 2018-02-23 20:17:19

一个__m128i中有8个short。所以：

for(int i=0; i<len/4*4; i += 4)

应该是

for(int i=0; i<len/8*8; i += 8)`

和：

res += val[0] + val[1] + val[2] + val[3];

应该是：

res += val[0] + val[1] + val[2] + val[3] + val[4] + val[5] + val[6] + val[7];

和：

for(int i=len/4*4; i<len; i++)

应该是：

for(int i=len/8*8; i<len; i++)

在：

s += _mm_mul_epu32(vec,sum);

_mm_mul_epu32对32位元素进行操作。它应该是：

s += _mm_mullo_epi16(vec, sum);

对象res未初始化；它应该是：

int res = 0;

以下是工作代码：

#include <stdio.h>
#include <stdlib.h>

#include <immintrin.h>

//  Number of elements in an array.
#define NumberOf(x) (sizeof (x) / sizeof *(x))


//  Compute the result with scalar arithmetic.
static int mult(int len, short int *a)
{
    int result = 0;
    for (size_t i=0; i<len; i++)
    {
        result += a[i]*20;  
    }
    return result;
}


//  Compute the result with SIMD arithmetic.
static int SIMD(int len, short int *a)
{
    //  Initialize the multiplier and the sum.
    __m128i multiplier = _mm_set1_epi16(20);
    __m128i s = _mm_setzero_si128( );

    //  Process blocks of 8 short.
    for (int i=0; i<len/8*8; i += 8)
    {
        __m128i vec = _mm_loadu_si128((__m128i *)(a+i));

        //  Multtiply by multiplier and add to sum.
        s = _mm_add_epi16(s, _mm_mullo_epi16(vec, multiplier));
    }

    //  Store the sum so far so its individual elements can be manipulated.
    short val[8];
    _mm_storeu_si128((__m128i*) val, s);

    //  Add the individual elements.
    int res = 0;
    for (size_t i = 0; i < 8; ++i)
        res += val[i];

    //  Add the elements in the tail.
    for (size_t i = len/8*8; i < len; ++i)
    {
        res += a[i];
    }

    return res;
}



int main(int argc, char **argv)
{
    short int a[96];
    int len = NumberOf(a);

    //  Initiailize a.
    for (size_t i = 0; i < len; ++i)
        a[i] = i+1;

    printf("sum by scalar arithmetic is %d.\n", mult(len, a));
    printf("sum by SIMD arithmetic is %d.\n", SIMD(len, a));

    return 0;
}

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/48946384

复制

相似问题

问Intel Intrinsics代码优化
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问Intel Intrinsics代码优化EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问Intel Intrinsics代码优化
EN