我正在尝试找到一种准确的方法来测量两种操作的延迟:1) 双精度 FMA 操作的延迟;2) 从共享内存加载双精度数据的延迟。我使用的是 K20x,想知道下面这段代码能否给出准确的测量结果。
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;
// Benchmark configuration. The hardware figures below are hard-coded for the
// device the benchmark targets; adjust them when running on different hardware.

// Core clock rate. Despite the macro name the value is expressed in Hz
// (732e6 Hz = 732 MHz): it is multiplied by a time in seconds to obtain cycles.
#define MHZ 732e6
// Number of streaming multiprocessors; used as the grid size of the FMA kernel.
#define SMS 14
// Number of double precision units; used as the block size of the FMA kernel.
// Parenthesised so the macro expands safely inside larger expressions
// (e.g. `x / DP_UNITS` must not become `x / 16 * 4`).
#define DP_UNITS (16*4)
// Number of shared memory banks; used as the block size of the shared kernel.
#define SHARED_BANKS 32
// Number of dependent operations each kernel executes inside its timed loop.
#define ITER 100000
// Multiplier slightly above 1.0 used as the constant operand in both kernels.
#define NEARONE 1.0000000000000004
// Executes a chain of ITER dependent double precision fused multiply-adds per
// thread so that the elapsed time is dominated by FMA latency.
//
// Launch layout: 1D grid of 1D blocks; every thread handles one element.
// Preconditions: `in` and `out` must each hold at least gridDim.x*blockDim.x
//                doubles -- there is no bounds check.
// Shared memory: none.
//
// in  - per-thread starting value of the dependency chain.
// out - receives the final value; storing it keeps the loop from being
//       optimised away.
__global__ void fma_latency_kernal(double *in, double *out){
    const int tid = blockIdx.x*blockDim.x+threadIdx.x;
    double val = in[tid];
    #pragma unroll 100
    for(int i=0; i<ITER; i++){
        // Explicit fma() instead of `val += val*NEARONE`: the arithmetic is
        // the same (val*NEARONE + val), but the fused operation no longer
        // depends on the compiler contracting a separate multiply and add,
        // which is disabled when building with -fmad=false.
        val = fma(val, NEARONE, val);
    }
    out[tid]=val;
}
// Repeats ITER rounds of "load a double from shared memory, scale it by
// NEARONE, store it back to this thread's own slot".
//
// Launch layout: 1D grid of 1D blocks; every thread handles one element.
// Shared memory: dynamic; the launch must provide room for at least
//                max(blockDim.x, 32) doubles, because the kernel indexes
//                smem[threadIdx.x] and smem[(threadIdx.x + i) % 32].
// Preconditions: `in` and `out` must each hold at least gridDim.x*blockDim.x
//                doubles -- there is no bounds check.
// NOTE(review): threads read slots written by other threads with no barrier
//               in between; the final values are therefore not meaningful,
//               only the access pattern is.
__global__ void shared_latency_kernel(double *in, double *out){
    // volatile forces every access in the loop to really touch shared memory.
    volatile extern __shared__ double smem[];
    const unsigned int slot = threadIdx.x;
    const int gid = blockIdx.x*blockDim.x + slot;

    smem[slot] = in[gid];

    #pragma unroll 32
    for(int step = 0; step < ITER; ++step){
        // The source slot rotates through the first 32 entries each step.
        const double loaded = smem[(slot + step) % 32];
        smem[slot] = loaded * NEARONE;
    }

    out[gid] = smem[slot];
}
// Prints "Error <what>: <reason>" when `err` signals a failure.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what){
    if(err != cudaSuccess){
        printf("Error %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Times both latency kernels with CUDA events and prints the per-iteration
// cost in clock cycles (elapsed seconds / ITER * clock rate).
// Returns EXIT_SUCCESS when both measurements completed, EXIT_FAILURE when any
// CUDA call failed (no latency is printed for a failed measurement).
int main (int argc , char **argv){
    (void)argc;
    (void)argv;

    // The FMA kernel is launched with SMS blocks of DP_UNITS threads and each
    // thread touches one double, so the buffers must be sized in doubles.
    const size_t numElems = (size_t)SMS * DP_UNITS;
    const size_t bufBytes = numElems * sizeof(double);
    // The shared kernel runs SHARED_BANKS threads and indexes 32 slots.
    const size_t smemBytes = SHARED_BANKS * sizeof(double);

    double *d_A = NULL, *d_B = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    float time = 0.0f;

    // Zero the input so the kernels read defined values.
    bool ok = cuda_ok(cudaMalloc(&d_A, bufBytes), "cudaMalloc(d_A)")
           && cuda_ok(cudaMalloc(&d_B, bufBytes), "cudaMalloc(d_B)")
           && cuda_ok(cudaMemset(d_A, 0, bufBytes), "cudaMemset(d_A)")
           && cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cuda_ok(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // ---- double precision FMA latency ----
    if(ok){
        cudaEventRecord(start, 0);
        fma_latency_kernal<<<SMS, DP_UNITS>>>(d_A, d_B);
        cudaEventRecord(stop, 0);
        // cudaGetLastError reports launch failures; the synchronize call
        // reports failures that happened while the kernel was running.
        ok = cuda_ok(cudaGetLastError(), "FMA")
          && cuda_ok(cudaEventSynchronize(stop), "FMA")
          && cuda_ok(cudaEventElapsedTime(&time, start, stop), "FMA");
    }
    if(ok){
        time/=1000; // milliseconds -> seconds
        printf("Latency of FMA = %3.1f clock cycles\n", (time/(double)ITER)*(double)MHZ);
    }

    // ---- shared memory latency ----
    if(ok){
        // The bank-size setting only tunes the measurement; a failure is
        // reported but not fatal. The error is cleared so that it is not
        // mistaken for a launch failure below.
        if(!cuda_ok(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte),
                    "cudaDeviceSetSharedMemConfig")){
            cudaGetLastError();
        }
        cudaEventRecord(start, 0);
        shared_latency_kernel<<<1, SHARED_BANKS, smemBytes>>>(d_A, d_B );
        cudaEventRecord(stop, 0);
        ok = cuda_ok(cudaGetLastError(), "Shared Memory")
          && cuda_ok(cudaEventSynchronize(stop), "Shared Memory")
          && cuda_ok(cudaEventElapsedTime(&time, start, stop), "Shared Memory");
    }
    if(ok){
        time/=1000; // milliseconds -> seconds
        printf("Latency of Shared Memory = %3.1f clock cycles\n", time/(double)ITER*(double)MHZ);
    }

    // Release everything that was successfully created.
    if(stop) cudaEventDestroy(stop);
    if(start) cudaEventDestroy(start);
    cudaFree(d_B);
    cudaFree(d_A);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
我在K20x上的结果如下: FMA的延迟= 16.4个时钟周期共享内存的延迟= 60.7个时钟周期这对我来说似乎是合理的,但我不确定它有多准确。
发布于 2015-02-10 10:43:18
在我看来,你的延迟值非常高——几乎是我预期的两倍。要测量 GPU 上某段代码耗费的周期数,可以在内核中该段代码的前后各插入一次 clock() 调用。clock 函数以整数形式返回当前的周期计数,因此用第二个值减去第一个值,就可以得到从调度第一条时钟指令到调度第二条时钟指令之间经过的周期数。
注意,从这个方法得到的数字将包括来自时钟指令本身的额外时间;我相信在默认情况下,线程将在每个时钟指令之前和之后立即阻塞几个周期,所以您可能想要尝试一下,看看它增加了多少周期,以便您可以将它们减去。
https://stackoverflow.com/questions/27890080
复制相似问题