我正在使用OpenACC比较并行化与非并行化矩阵乘法的执行时间,使用的编译器是PGI 19.10(在Windows上)。我使用的代码是:
#include <time.h>
#include <stdlib.h>
// Allocates three SIZE_XY x SIZE_XY int matrices, fills a and b with
// pseudo-random values, computes c = a x b with a naive triple loop inside
// OpenACC kernels regions, and prints the first three results plus the time
// measured with clock().
// NOTE(review): printf is used below but <stdio.h> is not among the includes
// shown with this listing.
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
// Timing starts before allocation, so it covers setup as well as the multiply.
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
// Each matrix is an array of SIZE_XY row pointers; the rows are allocated
// one by one below.
int *a[SIZE_XY];
int *b[SIZE_XY];
int *c[SIZE_XY];
for (i = 0; i < SIZE_XY; i++)
{
// NOTE(review): the malloc results are not checked and never freed.
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
// Initialisation is placed inside an OpenACC kernels region as well.
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
// Values are (rand() % MAX_VAL) + MIN_VAL; c starts at zero because the
// multiply below accumulates into it.
a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
// This is the region the compiler diagnostics quoted after the listing refer to.
#pragma acc kernels //HERE
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
// c[i][j] is read and written on every k iteration (running sum).
// NOTE(review): with the value range chosen above the products and sums
// can exceed the range of int (assuming 32-bit int) - confirm.
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
// Convert clock ticks to seconds.
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}当我在PGI中运行以下命令时:pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c,我收到以下命令:
59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization
我能否得到一些帮助,了解为什么会发生这种情况,以及如何并行化矩阵乘法的循环?
谢谢
发布于 2020-07-25 05:26:06
编译器无法确定您的3个指针变量(a、b、c)是否互为别名(即是否可能指向同一块内存)。如果它们之间存在别名关系,编译器就无法确认各个c[i][j]的计算彼此独立,也就无法安全地并行化其中任何一个循环。
解决这一问题的一种可能方法是告知编译器:您作为程序员保证第一个(最外层)循环的各次迭代彼此独立。为此,可以在第一个for循环语句之前加上#pragma acc loop independent。对于您在这里选择的矩阵大小(以及multicore目标),这样暴露出来的并行度已经足够大。(编译器仍然会针对其他循环发出无法并行化的Minfo消息,但这很可能是正确的。对于multicore目标来说,拥有1000个并行工作项应该足以获得良好的性能。)
请注意,按照您选择的初始化取值范围,计算结果很容易超出int的存储范围(发生溢出),从而得到毫无意义的结果。
以下代码有解决上述问题的可能方法:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
// Variant of the question's program: the row-pointer arrays are declared with
// restrict, the outer multiply loop is marked "loop independent", and the
// matrices are filled with the constant 1 instead of random values.
int main()
{
// seed the random number generator
srand(42);
// Pick some arbitrary constraints to make the problem harder
const int SIZE_XY = 1000;
// MIN_VAL and MAX_VAL are no longer referenced now that the fill is constant
// (the compiler warns about this in the transcript that follows the listing).
const int MIN_VAL = 5000;
const int MAX_VAL = 7000000;
int i, j, k; // iterators
double time_spent = 0.0;
// Timing starts before allocation, so it covers setup as well as the multiply.
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
// restrict is the programmer's promise to the compiler that these pointers
// do not alias one another.
int * restrict a[SIZE_XY];
int * restrict b[SIZE_XY];
int * restrict c[SIZE_XY];
for (i = 0; i < SIZE_XY; i++)
{
// One separate allocation per row.
// NOTE(review): the malloc results are not checked and never freed.
a[i] = (int *)malloc(SIZE_XY * sizeof(int));
b[i] = (int *)malloc(SIZE_XY * sizeof(int));
c[i] = (int *)malloc(SIZE_XY * sizeof(int));
}
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
// Constant fill replaces (rand() % MAX_VAL) + MIN_VAL; with all ones every
// element of the product equals SIZE_XY, as the sample output shows.
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
// The messages still say "random numbers" although the fill above is constant.
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels //HERE
{
// Programmer's guarantee that the iterations of the i loop are independent
// of each other, so the compiler may parallelise that loop.
#pragma acc loop independent
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
// c[i][j] is read and written on every k iteration (running sum).
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
// NOTE(review): clock() reports processor time; in the multicore run this
// appears to track the "user" time rather than the "real" time - confirm.
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict a[SIZE_XY];
^
"t1.c", line 22: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict b[SIZE_XY];
^
"t1.c", line 23: warning: use of a const variable in a constant expression is
nonstandard in C
int * restrict c[SIZE_XY];
^
"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
const int MIN_VAL = 5000;
^
"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
const int MAX_VAL = 7000000;
^
main:
33, Loop is parallelizable
Generating Multicore code
33, #pragma acc loop gang
35, Loop is parallelizable
52, Loop is parallelizable
Generating Multicore code
52, #pragma acc loop gang
54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real 0m9.079s
user 0m9.019s
sys 0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real 0m0.563s
user 0m20.053s
sys 0m0.132s
$
在我的机器上,用gcc编译的代码大约需要9秒,而用PGI OpenACC编译器编译的代码大约需要0.5秒(以time输出的real时间为准)。程序自身打印的时间来自clock(),它统计的是CPU时间,在多核运行时大致等于user时间,因此OpenACC版本打印出约20秒。
顺便说一句,我个人通常会避免使用您选择的数组分配方法,因为多次独立的malloc调用并不保证得到相邻/连续的内存。不过,对于multicore目标,这段代码可以正常工作。
为了解决这个问题,我建议对您的代码进行如下修改:
$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
// Element type of the matrices.
typedef int mt;
// Matrix dimension (number of rows and of columns).
#define SIZE_XY 1000
// One matrix row: an array of SIZE_XY elements of type mt.
typedef mt mat[SIZE_XY];
// Variant that stores each matrix in a single allocation addressed through a
// restrict-qualified pointer to rows (mat *), with no "loop independent"
// pragma on the multiply loops.
int main()
{
// seed the random number generator
srand(42);
// The matrix size comes from the SIZE_XY macro; no value-range constants are
// defined in this version.
int i, j, k; // iterators
double time_spent = 0.0;
// Timing starts before allocation, so it covers setup as well as the multiply.
clock_t begin = clock();
// Generate two 2D arrays to be filled with random numbers
// and an array, c, with all 0s
// restrict is the programmer's promise to the compiler that a, b and c do
// not alias one another.
mat * restrict a;
mat * restrict b;
mat * restrict c;
// One block of SIZE_XY*SIZE_XY elements per matrix, indexed as a[i][j].
// NOTE(review): the malloc results are not checked and never freed.
a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
// Constant fill; with all ones every element of the product equals SIZE_XY,
// as the sample output shows.
a[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
b[i][j] = 1; //(rand() % MAX_VAL) + MIN_VAL;
c[i][j] = 0;
}
}
}
// The messages still say "random numbers" although the fill above is constant.
printf("Array A allocated and filled with random numbers ...\n");
printf("Array B allocated and filled with random numbers ...\n");
printf("Array C initialized ...\n");
// Dot product the two arrays together into c
#pragma acc kernels
{
for (i = 0; i < SIZE_XY; i++)
{
for (j = 0; j < SIZE_XY; j++)
{
for (k = 0; k < SIZE_XY; k++)
{
// c[i][j] is read and written on every k iteration (running sum).
c[i][j] = c[i][j] + a[i][k] * b[k][j];
}
}
}
}
printf("Matrices multiplied ...\n");
printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);
clock_t end = clock();
// NOTE(review): clock() reports processor time; in the multicore run this
// appears to track the "user" time rather than the "real" time - confirm.
time_spent += (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
32, Loop is parallelizable
Generating Multicore code
32, #pragma acc loop gang
34, Loop is parallelizable
51, Loop is parallelizable
Generating Multicore code
51, #pragma acc loop gang
53, Loop is parallelizable
55, Complex loop carried dependence of c-> prevents parallelization
Loop carried dependence of c-> prevents parallelization
Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real 0m0.708s
user 0m0.663s
sys 0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real 0m0.499s
user 0m17.466s
sys 0m0.093s
$
(gcc 4.8.5,pgcc 20.5-0,Xeon E5-2690 v2,共40个核心)
有几个优点:
- 使用restrict关键字即可向编译器传达我们的意图,而无需使用额外的pragma指令。
- a、b和c采用连续分配,如果您决定从multicore切换到加速器目标,行为会更简单。
- 此时gcc生成的代码运行速度几乎与OpenACC代码一样快(约0.7秒 vs. 约0.5秒)。
https://stackoverflow.com/questions/63084212
复制相似问题