我是库达的新手。我试图用Ricky动量的形式来解初值的波动方程。代码的性能为12 GFlops,尽管我的GPU性能为3900。为什么代码对我来说如此无效,我如何才能修复它呢?
main.cu
#include <iostream>
#include <cmath>
#include "step.cu"
#include <cuda.h>
#include "err.cu"
#include "err.h"
using namespace std;
int main(int argc, char const *argv[])
{
if (argc <= 3)
{
perror("Error in argc: argc<=3 (wait h, tau, C) \n");
exit(1);
}
char *eptr;
errno = 0;
long long int size,tmax;
double tau,cour,h,C, cour2;
h = std::strtod(argv[1], &eptr);
tau = std::strtod(argv[2], &eptr);
C = std::strtod(argv[3], &eptr);
tmax = 2000;
cour = C*tau/h;
cour2 = cour* cour;
size = 18*13*1024;
double *nxt_layer=nullptr;
double *layer_1=nullptr;
double *layer_2=nullptr;
double *rev_layer=nullptr;
dim3 blockSize = dim3(1024);
dim3 gridSize = dim3(size/blockSize.x);
float time;
cudaTimer timer;
cudaError_t ret = cudaMallocManaged(&nxt_layer, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_1, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_2, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
for (int i = 0; i < size; ++i)
{
layer_1[i] = exp(-(i*h-7)*(i*h-7)/2)*((i*h-7)*(i*h-7)-1);
}
for (int i = 1; i < size/2; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
for (int i = size/2; i < size-1; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.25*0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
for (int i = 0; i < size-1; ++i)
{
layer_2[i] = layer_1[i];
layer_1[i] = nxt_layer[i];
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
timer.start();
for (double t = 0; t < tmax; t=t+tau)
{
step<<<gridSize, blockSize>>>(nxt_layer, layer_1, layer_2, cour2, size);
if (CHECK_ERROR(cudaDeviceSynchronize()))
throw(-1);
nxt_layer[size-1]=0;
nxt_layer[0]=0;
}
time = timer.stop();
for (int i = 0; i < size; ++i)
{
cout<<i*h<<" "<<nxt_layer[i]<<endl;
}
}step.cu
inline __device__ double compute(double *layer_1_tmp, double layer_2_tmp, double cour2)
{
return __fmaf_rd(cour2, layer_1_tmp[0]+layer_1_tmp[2], __fmaf_rd(2.0-2*cour2,layer_1_tmp[1],-layer_2_tmp));
}
__global__ void step(double *tmp_layer, double *layer_1, double *layer_2, double cour2, int Nx)
{
int node = threadIdx.x + blockDim.x * blockIdx.x;
if(node >= Nx-1 || node<=0) return;
double layer_1_tmp[3];
layer_1_tmp[0]=layer_1[node-1];
layer_1_tmp[1]=layer_1[node];
layer_1_tmp[2]=layer_1[node+1];
double layer_2_tmp=layer_2[node];
if(node<=Nx/2)
{
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, 0.25*cour2);
}
else
{
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
}
layer_2[node]=layer_1[node];
layer_1[node]=tmp_layer[node];
}我计算GFlops为
long long int perfomance = size*tmax/tau;
long long int perftime = 1000*perfomance/time;
double gflops =(8*perfomance/time)/1000000;我将非常感谢您的任何评论和建议。
发布于 2022-01-09 16:03:23
在内核中,每个工作项只执行几次乘法和加法.与每个cuda线程的内核启动开销和每个layer_1元素的内存访问延迟相比,这是可以忽略的。这相当于在内核时间的微秒内测量几纳秒。尝试计算()函数调用周围的时钟测量。它至少会给出一些“每次计算的周期”度量,并且您可以在计算调用期间找到总的性能。
clock_t c1 = clock();
compute();
clock_t c2 = clock();
timings[node] = c2-c1;甚至这也不是真正的性能度量,因为在一个接一个地进行多个计算调用时,它没有考虑到流水线。您可以在第一个之后添加另一个计算调用,并通过流水线和延迟隐藏获得更高的性能。
发布于 2021-12-23 16:50:16
许多(更多的面向消费者或半专业)图形卡有更好的单精度比双精度性能。GTX 970的单精度性能是其双精度性能的32倍。
将已使用的数据类型从“双”更改为“浮动”。
https://stackoverflow.com/questions/70464693
复制相似问题