首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >OpenACC:a->,c->,b-> 的复杂循环携带依赖阻止了并行化

OpenACC:a->,c->,b-> 的复杂循环携带依赖(Complex loop carried dependence)阻止了并行化
EN

Stack Overflow用户
提问于 2020-07-25 04:07:58
回答 1查看 298关注 0票数 0

我正在用OpenACC比较并行化与非并行化矩阵乘法的执行时间,使用的是PGI编译器19.10(Windows版)。我使用的代码是:

代码语言:c
复制
#include <time.h>
#include <stdlib.h>

/*
 * Benchmark program: allocates three SIZE_XY x SIZE_XY int matrices row by
 * row, fills a and b with pseudo-random values and c with zeros, computes
 * c = a * b with a triple loop inside OpenACC `kernels` regions, and prints
 * the processor time measured with clock().
 *
 * NOTE(review): printf is used below, but <stdio.h> is not among the
 * includes visible in this listing.
 */
int main()
{
    // seed the random number generator
    srand(42);

    // Pick some arbitrary constraints to make the problem harder
    const int SIZE_XY = 1000;     // matrix dimension (rows and columns)
    const int MIN_VAL = 5000;   
    const int MAX_VAL = 7000000;  // generated values are (rand() % MAX_VAL) + MIN_VAL

    int i, j, k; // loop indices: row, column, inner-product position

    double time_spent = 0.0;
    // The timed span starts here, so it covers allocation, initialisation,
    // the multiplication and the progress messages.
    clock_t begin = clock();

    // Generate two 2D arrays to be filled with random numbers
    // and an array, c, with all 0s
    // SIZE_XY is a const int rather than an integer constant expression, so
    // in C these are variable-length arrays of row pointers.
    int *a[SIZE_XY];
    int *b[SIZE_XY];
    int *c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++)
    {
        // Each row is a separate heap allocation; the results are not checked
        // for NULL and the rows are never freed.
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    // Initialisation nest.
    // NOTE(review): rand() keeps hidden state, so calling it from a region
    // the compiler may parallelize looks unsafe -- confirm.
    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Dot product the two arrays together into c
    // a, b and c are plain (non-restrict) pointers to separately allocated
    // rows, and nothing in this region tells the compiler they do not alias.
    #pragma acc kernels //HERE
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    // Every product is at least MIN_VAL * MIN_VAL = 25,000,000,
                    // so the sum of SIZE_XY of them cannot fit in an int
                    // (assuming int is 32 bits): the result overflows.
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    // clock() counts in units of 1/CLOCKS_PER_SEC; convert to seconds.
    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}

当我用PGI编译器运行命令 pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c 时,得到以下输出:

代码语言:javascript
复制
59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
    Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization 

能否请大家帮我解释为什么会出现这种情况,以及如何让矩阵乘法的循环并行化?

谢谢

EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2020-07-25 05:26:06

编译器无法确定您的3个指针变量(a、b、c)是否互为别名(即是否可能指向同一块内存)。如果它们之间存在别名,就无法保证任何一个c[i][j]的计算相互独立,编译器也就无法正确地并行化(其中任何一层)循环。

解决这一问题的一种可能方法是告知编译器:您作为程序员保证矩阵乘法最外层的for循环(i循环)的各次迭代彼此独立。为此,可以在该for循环语句之前加上#pragma acc loop independent。对于您在这里选择的矩阵大小(以及multicore目标),这样就能暴露出足够多的并行性。(编译器仍然会针对其余循环输出未能并行化的Minfo消息,但这通常没有问题。对于multicore目标来说,有1000个并行工作项应该足以获得良好的性能。)

请注意,按照您选择的初始化取值范围,计算结果很容易超出int的存储范围而溢出,您会得到毫无意义的结果。

下面的代码给出了解决上述问题的一种可能做法:

代码语言:javascript
复制
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
int main() // Times a SIZE_XY x SIZE_XY int matrix multiply run inside OpenACC kernels regions.
{
    // seed the random number generator (the rand() calls below are commented out)
    srand(42);

    // Problem size; MIN_VAL and MAX_VAL are left over and no longer referenced
    const int SIZE_XY = 1000; // matrix dimension (rows and columns)
    const int MIN_VAL = 5000;
    const int MAX_VAL = 7000000;

    int i, j, k; // loop indices: row, column, inner-product position

    double time_spent = 0.0;
    clock_t begin = clock(); // timed span starts here, so it includes allocation and printing

    // Arrays of SIZE_XY row pointers; each row is allocated separately below.
    // The element pointers are restrict-qualified to tell the compiler the rows do not alias.
    int * restrict a[SIZE_XY];
    int * restrict b[SIZE_XY];
    int * restrict c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++)
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int)); // one heap block per row; result is not checked
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n"); // message kept from the original; a now holds 1s
    printf("Array B allocated and filled with random numbers ...\n"); // likewise, b now holds 1s
    printf("Array C initialized ...\n");

    // Matrix product: c[i][j] accumulates a[i][k] * b[k][j] over k
    #pragma acc kernels //HERE
    {
        #pragma acc loop independent // programmer asserts that iterations of the i loop are independent
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j]; // with all-ones inputs each entry ends at SIZE_XY
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC; // clock ticks converted to seconds

    printf("Time elpased is %f seconds", time_spent); // processor time from clock(), not wall-clock time
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict a[SIZE_XY];
                       ^

"t1.c", line 22: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict b[SIZE_XY];
                       ^

"t1.c", line 23: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict c[SIZE_XY];
                       ^

"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
      const int MIN_VAL = 5000;
                ^

"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
      const int MAX_VAL = 7000000;
                ^

main:
     33, Loop is parallelizable
         Generating Multicore code
         33, #pragma acc loop gang
     35, Loop is parallelizable
     52, Loop is parallelizable
         Generating Multicore code
         52, #pragma acc loop gang
     54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
     56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real    0m9.079s
user    0m9.019s
sys     0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real    0m0.563s
user    0m20.053s
sys     0m0.132s
$

在我的机器上,用gcc编译的代码大约需要9秒,而用PGI OpenACC编译器编译的代码大约需要0.5秒(以time输出的real为准)。程序自己打印的约20秒来自clock(),它统计的是所有线程累计消耗的CPU时间,而不是实际经过的时间,因此在多核并行运行时会远大于real。

顺便说一句,我个人通常会避免使用您选择的这种数组分配方式,因为多次单独的malloc调用并不保证各行在内存中相邻/连续。不过,对于multicore目标,这样的代码可以正常工作。

为了解决这个问题,我建议对您的代码进行如下修改:

代码语言:javascript
复制
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>

typedef int mt; // element type of the matrices
#define SIZE_XY 1000 // matrix dimension; a macro so that it can size the array type below
typedef mt mat[SIZE_XY]; // one matrix row: SIZE_XY elements of type mt

int main() // Times a SIZE_XY x SIZE_XY matrix multiply on contiguously allocated matrices.
{
    // seed the random number generator (the rand() calls below are commented out)
    srand(42);

    // (SIZE_XY is a file-scope macro in this version; no value-range constants are declared)

    int i, j, k; // loop indices: row, column, inner-product position

    double time_spent = 0.0;
    clock_t begin = clock(); // timed span starts here, so it includes allocation and printing

    // a, b and c each point to one malloc'ed block of SIZE_XY rows of SIZE_XY elements;
    // the restrict qualifiers tell the compiler the three blocks do not alias.
    mat * restrict a;
    mat * restrict b;
    mat * restrict c;
    a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt)); // whole matrix in a single allocation; result is not checked
    b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
    c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n"); // message kept from the original; a now holds 1s
    printf("Array B allocated and filled with random numbers ...\n"); // likewise, b now holds 1s
    printf("Array C initialized ...\n");

    // Matrix product: c[i][j] accumulates a[i][k] * b[k][j] over k (no loop pragma in this version)
    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j]; // with all-ones inputs each entry ends at SIZE_XY
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC; // clock ticks converted to seconds

    printf("Time elpased is %f seconds", time_spent); // processor time from clock(), not wall-clock time
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
     32, Loop is parallelizable
         Generating Multicore code
         32, #pragma acc loop gang
     34, Loop is parallelizable
     51, Loop is parallelizable
         Generating Multicore code
         51, #pragma acc loop gang
     53, Loop is parallelizable
     55, Complex loop carried dependence of c-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real    0m0.708s
user    0m0.663s
sys     0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real    0m0.499s
user    0m17.466s
sys     0m0.093s
$

(gcc 4.8.5,pgcc 20.5-0,Xeon E5-2690 v2,共40个核心)

有几个优点:

  1. 我们可以使用c99的restrict关键字向编译器表明我们的意图,而无需借助额外的pragma。
  2. 这样a、b、c各自都是一块连续分配的内存,如果您决定从multicore切换到加速器目标,行为会更简单。
  3. 现在,OpenACC编译器无需额外帮助就可以处理循环嵌套中的前两层循环。
  4. GNU编译器同样受益于这种程度的信息传达。在我的机器上,“普通”GNU编译器(gcc)生成的代码几乎与OpenACC代码一样快。(~0.7s vs. ~0.5s)
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/63084212

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档