我在论坛上看到了下面这段代码,但编译时出现了一些错误。我想把从 #pragma scop 到 #pragma endscop 之间的区域并行化。
/* Main computational kernel. The whole function will be timed,
including the call and return.

For each time step t in [0, _PB_TMAX) the kernel:
  1. overwrites row 0 of ey with _fict_[t];
  2. updates ey[i][j] for i >= 1 from the difference hz[i][j] - hz[i-1][j];
  3. updates ex[i][j] for j >= 1 from the difference hz[i][j] - hz[i][j-1];
  4. updates hz[i][j] (excluding the last row and column) from ex and ey.
All three field arrays are modified in place.

The loop bounds use the _PB_TMAX / _PB_NX / _PB_NY macros rather than the
tmax / nx / ny parameters directly; presumably those macros map onto the
parameters, but their definitions are not visible here -- TODO confirm.

NOTE(review): as written this does not build with OpenMP enabled. Every
`omp for` and `omp barrier` below sits inside the `omp master` region, and
the compiler rejects that nesting ("may not be closely nested inside of
work-sharing, critical, ordered, master or explicit task region"). */
static
void kernel_fdtd_2d(int tmax,
int nx,
int ny,
DATA_TYPE POLYBENCH_2D(ex,NX,NY,nx,ny),
DATA_TYPE POLYBENCH_2D(ey,NX,NY,nx,ny),
DATA_TYPE POLYBENCH_2D(hz,NX,NY,nx,ny),
DATA_TYPE POLYBENCH_1D(_fict_,TMAX,tmax))
{
int t, i, j;
#pragma scop
/* One parallel region around the whole time loop; the loop counters are
   private to each thread. */
#pragma omp parallel private (t,i,j)
{
/* NOTE(review): this master region is what triggers the diagnostics --
   work-sharing loops and barriers may not be nested inside it. */
#pragma omp master
{
for (t = 0; t < _PB_TMAX; t++)
{
/* Step 1: boundary row of ey takes the value _fict_[t]. */
#pragma omp for
for (j = 0; j < _PB_NY; j++)
ey[0][j] = _fict_[t];
#pragma omp barrier
/* Step 2: update the remaining rows of ey from hz (reads hz[i-1][j]). */
#pragma omp for collapse(2) schedule(static)
for (i = 1; i < _PB_NX; i++)
for (j = 0; j < _PB_NY; j++)
ey[i][j] = ey[i][j] - 0.5*(hz[i][j]-hz[i-1][j]);
#pragma omp barrier
/* Step 3: update ex from hz (reads hz[i][j-1], hence j starts at 1). */
#pragma omp for collapse(2) schedule(static)
for (i = 0; i < _PB_NX; i++)
for (j = 1; j < _PB_NY; j++)
ex[i][j] = ex[i][j] - 0.5*(hz[i][j]-hz[i][j-1]);
#pragma omp barrier
/* Step 4: update hz from the ex and ey values written above; the upper
   bounds are reduced by one because ex[i][j+1] and ey[i+1][j] are read. */
#pragma omp for collapse(2) schedule(static)
for (i = 0; i < _PB_NX - 1; i++)
for (j = 0; j < _PB_NY - 1; j++)
hz[i][j] = hz[i][j] - 0.7* (ex[i][j+1] - ex[i][j] + ey[i+1][j] - ey[i][j]);
#pragma omp barrier
}
}
}
#pragma endscop
}
/* Benchmark driver: declares the field arrays, initializes them, runs
   kernel_fdtd_2d between the start/stop instrumentation macros, passes the
   result arrays to print_array via polybench_prevent_dce, and frees the
   arrays. argc and argv are not used. Always returns 0. */
int main(int argc, char** argv)
{
/* Retrieve problem size. */
int tmax = TMAX;
int nx = NX;
int ny = NY;
/* Variable declaration/allocation. */
POLYBENCH_2D_ARRAY_DECL(ex,DATA_TYPE,NX,NY,nx,ny);
POLYBENCH_2D_ARRAY_DECL(ey,DATA_TYPE,NX,NY,nx,ny);
POLYBENCH_2D_ARRAY_DECL(hz,DATA_TYPE,NX,NY,nx,ny);
POLYBENCH_1D_ARRAY_DECL(_fict_,DATA_TYPE,TMAX,tmax);
/* Initialize array(s). */
init_array (tmax, nx, ny,
POLYBENCH_ARRAY(ex),
POLYBENCH_ARRAY(ey),
POLYBENCH_ARRAY(hz),
POLYBENCH_ARRAY(_fict_));
/* Start timer. */
polybench_start_instruments;
/* Run kernel. */
kernel_fdtd_2d (tmax, nx, ny,
POLYBENCH_ARRAY(ex),
POLYBENCH_ARRAY(ey),
POLYBENCH_ARRAY(hz),
POLYBENCH_ARRAY(_fict_));
/* Stop and print timer. */
polybench_stop_instruments;
polybench_print_instruments;
/* Prevent dead-code elimination. All live-out data must be printed
by the function call in argument. */
polybench_prevent_dce(print_array(nx, ny, POLYBENCH_ARRAY(ex),
POLYBENCH_ARRAY(ey),
POLYBENCH_ARRAY(hz)));
/* Be clean. */
POLYBENCH_FREE_ARRAY(ex);
POLYBENCH_FREE_ARRAY(ey);
POLYBENCH_FREE_ARRAY(hz);
POLYBENCH_FREE_ARRAY(_fict_);
return 0;
}错误如下:
stencils/fdtd-2d/fdtd-2dp.c:80:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp for ^
stencils/fdtd-2d/fdtd-2dp.c:83:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp barrier ^
stencils/fdtd-2d/fdtd-2dp.c:84:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp for collapse(2) schedule(static) ^
stencils/fdtd-2d/fdtd-2dp.c:88:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp barrier ^
stencils/fdtd-2d/fdtd-2dp.c:89:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp for collapse(2) schedule(static) ^
stencils/fdtd-2d/fdtd-2dp.c:93:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp barrier ^
stencils/fdtd-2d/fdtd-2dp.c:94:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp for collapse(2) schedule(static) ^
stencils/fdtd-2d/fdtd-2dp.c:98:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region #pragma omp barrier ^
如果有人能帮我让这段代码通过编译,我将不胜感激。
发布于 2015-12-31 13:10:14
老实说,这是一段相当糟糕的OpenMP代码。它没有考虑整个算法中数据是如何被使用的。你想要的大概是这样:
// Loop counters; each thread gets its own copy via the private clause.
int t, i, j;
// A single parallel region encloses the whole time loop. There is no
// work-sharing directive on the t loop, so every thread iterates over t
// itself and the threads split only the inner loops.
#pragma omp parallel private (t,i,j)
{
for (t = 0; t < _PB_TMAX; t++)
{
// Writes only row 0 of ey. nowait: the next two loops neither read nor
// write that row, so threads may run ahead.
#pragma omp for nowait
for (j = 0; j < _PB_NY; j++)
ey[0][j] = _fict_[t];
// Writes ey rows >= 1 and reads only hz. nowait: the following loop
// touches ex and hz only, so no thread depends on these results yet.
#pragma omp for collapse(2) nowait schedule(static)
for (i = 1; i < _PB_NX; i++)
for (j = 0; j < _PB_NY; j++)
ey[i][j] = ey[i][j] - 0.5*(hz[i][j]-hz[i-1][j]);
// Writes ex and reads only hz. No nowait here: the implicit barrier at the
// end of this loop is what makes all ex and ey writes of this time step
// complete before any thread starts the hz update.
#pragma omp for collapse(2) schedule(static)
for (i = 0; i < _PB_NX; i++)
for (j = 1; j < _PB_NY; j++)
ex[i][j] = ex[i][j] - 0.5*(hz[i][j]-hz[i][j-1]);
// No explicit barrier needed: an omp for without nowait ends with an
// implicit one.
// Writes hz from ex and ey. The implicit barrier at the end of this loop
// keeps the next time step from reading hz before it is fully updated.
#pragma omp for collapse(2) schedule(static)
for (i = 0; i < _PB_NX - 1; i++)
for (j = 0; j < _PB_NY - 1; j++)
hz[i][j] = hz[i][j] - 0.7*(ex[i][j+1] - ex[i][j] + ey[i+1][j] - ey[i][j]);
// No explicit barrier needed: an omp for without nowait ends with an
// implicit one.
}
}这些障碍应该被移除,因为它们在for循环结束后是隐式的,而没有指定nowait。
此外,我认为前两个屏障应该完全去掉,因为前三个循环之间不存在跨线程的数据依赖——即使某个线程完成了自己那部分循环后立即开始执行下一个循环中属于它的部分,也不会出现竞态条件。您可以添加nowait子句来取消omp for指令末尾的隐式屏障。
最后,如果_PB_NX和_PB_NY都很大,那么您不太可能通过折叠嵌套循环来获得任何好处。我可以想象,删除collapse(2)可以略微提高整体函数的性能。
希望这能有所帮助。
发布于 2015-12-31 06:08:33
从代码中删除#pragma omp master语句,这样就能解决编译问题。您应该也不希望这个代码块“只”在主线程中运行,因为那样就无法从OpenMP中获得任何性能提升。
https://stackoverflow.com/questions/34535765
复制相似问题