我有这段 C++ 代码,我想把它移植到 CUDA 上:
// Nearest-cluster assignment and accumulation.
// For every position p of every image im that has at least one non-zero
// feature value, find the bin i whose clusterPoint column is closest in
// squared Euclidean distance over all features, then add that position's
// features, one hit, and the distance to the per-bin totals
// csum, ccount and averageDist.
// NOTE(review): p, f, i, dist, minDist and tmp are not declared in this
// fragment; they must be declared by the surrounding code.
// NOTE(review): p presumably indexes pixels (it runs to xsize*ysize) - confirm.
for (int im = 0; im < numImages; im++)
{
for (p = 0; p < xsize*ysize; p++)
{
// Positions whose feature values are all zero are skipped entirely.
bool ok = false;
for (f = 0; f < numFeatures; f++)
{
if (feature[im][f][p] != 0)
{
ok = true;
break;
}
}
if (ok)
{
// 1e9 acts as the "no bin found yet" sentinel for the minimum search.
// NOTE(review): tmp is assigned only when some dist < 1e9. If numBins is 0,
// or no distance compares below 1e9, tmp keeps whatever value it had before
// and is still used as an index below.
minDist = 1e9;
for (i = 0; i < numBins; i++)
{
// dist = sum over features of (feature - clusterPoint)^2 for bin i.
dist = 0;
for (f = 0; f < numFeatures; f++)
{
dist += (float)((feature[im][f][p]-clusterPoint[f][i])*(feature[im][f][p]-clusterPoint[f][i]));
}
// Strict '<' means that on a tie the lowest-index bin wins.
if (dist < minDist)
{
minDist = dist;
tmp = i;
}
}// end for i
// Accumulate this position into the winning bin tmp.
// NOTE(review): different (im, p) iterations can select the same tmp, so
// these three updates are shared across iterations; a parallel (CUDA) port
// needs atomic adds or a per-bin reduction here.
for (f = 0; f < numFeatures; f++)
csum[f][tmp] += feature[im][f][p];
ccount[tmp]++;
// Adds the (non-squared) distance; this is a running sum, and any division
// by ccount to obtain an average is not part of this fragment.
averageDist[tmp] += sqrt(minDist);
} // end if (ok)
} // end for p
}// end for im
// I want to compute csum, ccount and averageDist on the GPU. csum and averageDist are floats; ccount is an integer.
这是一个并行归约(parallel reduction)问题吗?
发布于 2013-01-28 22:27:15
是的,你可以使用 CUDA 进行求和。但是,元素的数量必须足够大,这样在 GPU 上求和所用的时间才会少于在 CPU 上求和所用的时间。This may help you.
https://stackoverflow.com/questions/14558691
复制相似问题