文章/答案/技术大牛

发布

社区首页 > 问答首页 > OpenACC：“Complex loop carried dependence of a->,c->,b-> prevents parallelization”（a->、c->、b-> 的复杂循环携带依赖阻止了并行化）

问OpenACC：“Complex loop carried dependence of a->,c->,b-> prevents parallelization”（a->、c->、b-> 的复杂循环携带依赖阻止了并行化）
EN

Stack Overflow用户

提问于 2020-07-25 04:07:58

回答 1查看 298关注 0票数 0

我正在用 OpenACC 比较矩阵乘法在并行化与未并行化两种情况下的执行时间，使用的编译器是 PGI 19.10（Windows 平台）。我使用的代码是：

#include <time.h>
#include <stdlib.h>

/*
 * Benchmark from the question: multiplies two SIZE_XY x SIZE_XY int matrices
 * and prints the processor time measured with clock().
 *
 * Each matrix is an array of SIZE_XY row pointers and every row is a separate
 * malloc() allocation. Both the initialisation loop nest and the
 * multiplication loop nest are wrapped in "#pragma acc kernels" regions.
 *
 * There is no explicit return statement; control falls off the end of main().
 */
int main()
{
    // seed the random number generator
    srand(42);

    // Pick some arbitrary constraints to make the problem harder
    const int SIZE_XY = 1000;    // matrices are SIZE_XY x SIZE_XY
    const int MIN_VAL = 5000;    // offset added to every random element
    const int MAX_VAL = 7000000; // modulus applied to rand() before the offset

    int i, j, k; // iterators

    double time_spent = 0.0;
    // Start mark is taken before allocation, so the reported time also covers
    // allocation, initialisation and the progress printf calls.
    clock_t begin = clock();

    // Generate two 2D arrays to be filled with random numbers
    // and an array, c, with all 0s
    // NOTE(review): SIZE_XY is a const int rather than an integer constant
    // expression, so in standard C these are variable-length arrays.
    int *a[SIZE_XY];
    int *b[SIZE_XY];
    int *c[SIZE_XY];
    // One allocation per row; the results are not checked for NULL and the
    // rows are never freed in this function.
    for (i = 0; i < SIZE_XY; i++)
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    // Initialisation region: a and b get (rand() % MAX_VAL) + MIN_VAL, c gets 0.
    // NOTE(review): rand() is called inside the kernels region; confirm that
    // is acceptable if this loop nest is ever parallelised.
    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Dot product the two arrays together into c
    // NOTE(review): a, b and c are plain int* rows, so nothing in this code
    // tells the compiler that they do not alias each other.
    #pragma acc kernels // HERE: the loop nest the question asks about
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                // Every k iteration reads and writes the same c[i][j].
                // NOTE(review): with the element range above, the int
                // products and sums may overflow; confirm intended range.
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    // clock() ticks converted to seconds
    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}

当我用 PGI 编译器执行命令 pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c 进行编译时，得到如下输出信息：

59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
    Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization

能否帮我解释一下为什么会出现这种情况，以及怎样才能让矩阵乘法的这些循环并行执行？

谢谢

parallel-processing

openacc

pgi

回答 1

Stack Overflow用户

回答已采纳

发布于 2020-07-25 05:26:06

编译器无法确定您的 3 个指针变量（a、b、c）彼此之间不存在别名（即不会指向重叠的内存）。如果它们以某种方式互为别名，就无法确定各个 c[i][j] 的计算是否相互独立，编译器也就无法正确地并行化其中任何一层循环。

解决这一问题的一种可能做法是告知编译器：您作为程序员保证第一层循环的各次迭代彼此独立。为此，可以在第一个 for 循环语句之前加上 #pragma acc loop independent。对于您在这里选择的矩阵大小（以及 multicore 目标），这样暴露出来的并行度已经相当充足。（编译器仍然会针对其余循环输出“未并行化”的 Minfo 消息，不过这多半无妨：对 multicore 目标来说，有 1000 个并行工作项应该足以获得良好的性能。）

请注意，按照您选择的初始化取值范围，计算结果很容易超出 int 的表示范围（发生溢出），得到的将是毫无意义的结果。

下面的代码给出了解决上述问题的一种可能做法：

$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
int main() // benchmark: int matrix multiply inside OpenACC kernels regions, timed with clock()
{
    // seed the random number generator
    srand(42); // no live rand() call remains in this version

    // Pick some arbitrary constraints to make the problem harder
    const int SIZE_XY = 1000;    // matrices are SIZE_XY x SIZE_XY
    const int MIN_VAL = 5000;    // only appears in the disabled rand() fill below
    const int MAX_VAL = 7000000; // only appears in the disabled rand() fill below

    int i, j, k; // iterators

    double time_spent = 0.0;
    clock_t begin = clock(); // start mark, taken before allocation and initialisation

    // a and b are the operands, c receives the product; each is an array of
    // SIZE_XY row pointers, restrict-qualified to promise the rows do not alias
    int * restrict a[SIZE_XY];
    int * restrict b[SIZE_XY];
    int * restrict c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++) // one malloc per row; results are not checked or freed
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    #pragma acc kernels
    { // initialise: a and b to all ones, c to all zeros
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; // constant fill; was (rand() % MAX_VAL) + MIN_VAL
                b[i][j] = 1; // constant fill; was (rand() % MAX_VAL) + MIN_VAL
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Matrix product: accumulate a * b into c
    #pragma acc kernels //HERE
    {
        #pragma acc loop independent
        for (i = 0; i < SIZE_XY; i++) // marked independent by the pragma on the previous line
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++) // every k step reads and writes the same c[i][j]
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC; // clock() ticks converted to seconds

    printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict a[SIZE_XY];
                       ^

"t1.c", line 22: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict b[SIZE_XY];
                       ^

"t1.c", line 23: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict c[SIZE_XY];
                       ^

"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
      const int MIN_VAL = 5000;
                ^

"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
      const int MAX_VAL = 7000000;
                ^

main:
     33, Loop is parallelizable
         Generating Multicore code
         33, #pragma acc loop gang
     35, Loop is parallelizable
     52, Loop is parallelizable
         Generating Multicore code
         52, #pragma acc loop gang
     54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
     56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real    0m9.079s
user    0m9.019s
sys     0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real    0m0.563s
user    0m20.053s
sys     0m0.132s
$

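在我的机器上，用 gcc 编译的代码大约需要 9 秒，而用 PGI OpenACC 编译器编译的代码大约只需 0.5 秒（以 time 输出的 real 时间为准）。注意，程序自己打印的 “Time elpased” 是用 clock() 统计的 CPU 时间，多核并行运行时会把各线程的处理器时间累加起来，因此显示为约 20 秒（与 user 时间一致），而不是实际经过的时间。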

顺便说一句，我个人通常会避免使用您所采用的数组分配方式，因为多次 malloc 调用并不保证得到彼此相邻（连续）的内存。不过，对于 multicore 目标，这段代码可以正常工作。

为了解决这个问题，我建议对您的代码进行如下修改：

$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>

typedef int mt; // matrix element type
#define SIZE_XY 1000 // matrix dimension; the loops in main use it for both rows and columns
typedef mt mat[SIZE_XY]; // one matrix row: an array of SIZE_XY elements of type mt

int main() // benchmark: int matrix multiply on contiguous storage inside OpenACC kernels regions
{
    // seed the random number generator
    srand(42); // no live rand() call remains in this version

    // Matrix size and element type come from SIZE_XY and mt at file scope

    int i, j, k; // iterators

    double time_spent = 0.0;
    clock_t begin = clock(); // start mark, taken before allocation and initialisation

    // a, b and c each point to the first row (mat) of one malloc'ed block of
    // SIZE_XY*SIZE_XY elements; restrict promises the three blocks do not alias
    mat * restrict a;
    mat * restrict b;
    mat * restrict c;
    a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt)); // results are not checked or freed
    b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
    c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));

    #pragma acc kernels
    { // initialise: a and b to all ones, c to all zeros
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; // constant fill; was (rand() % MAX_VAL) + MIN_VAL
                b[i][j] = 1; // constant fill; was (rand() % MAX_VAL) + MIN_VAL
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Matrix product: accumulate a * b into c
    #pragma acc kernels
    { // no "loop independent" pragma is used in this version
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++) // every k step reads and writes the same c[i][j]
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC; // clock() ticks converted to seconds

    printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
     32, Loop is parallelizable
         Generating Multicore code
         32, #pragma acc loop gang
     34, Loop is parallelizable
     51, Loop is parallelizable
         Generating Multicore code
         51, #pragma acc loop gang
     53, Loop is parallelizable
     55, Complex loop carried dependence of c-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real    0m0.708s
user    0m0.663s
sys     0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real    0m0.499s
user    0m17.466s
sys     0m0.093s
$

（gcc 4.8.5，pgcc 20.5-0，Xeon E5-2690 v2，共 40 个核心）

这样做有几个优点：

- 我们可以用 C99 的 restrict 关键字向编译器表明意图，而无需借助额外的 pragma（编译指示）。
- a、b、c 现在各自是一块连续分配的内存；如果您以后决定从 multicore 切换到加速器目标，行为会更简单。
- 现在，OpenACC 编译器无需额外帮助就能并行化循环嵌套的前两层。
- GNU 编译器同样受益于这种程度的信息传达：在我的机器上，“普通”的 GNU 编译器（gcc）生成的代码几乎与 OpenACC 代码一样快（约 0.7 秒对约 0.5 秒）。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/63084212

复制

相似问题

问OpenACC -复循环依赖于a->，c->，b->防止并行化。
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问OpenACC -复循环依赖于a->，c->，b->防止并行化。EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问OpenACC -复循环依赖于a->，c->，b->防止并行化。
EN