
OpenCL transfer rate exceeds PCI-e bandwidth

Asked by a Stack Overflow user on 2013-12-06 11:22:25 · 1 answer · 938 views · 0 followers · 5 votes

I wrote an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR) to get a higher transfer rate from the device to the host.

The transfer rate increased as I expected (I obtained the transfer rate using AMD APP Profiler 2.4). The problem is that for a 4096 x 4096 matrix (64 MB), the reported transfer rate of 93703 GB/s is higher than the PCIe bandwidth.
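
As a quick sanity check, a minimal sketch assuming the theoretical PCIe 2.0 x16 bandwidth of 8 GB/s (that figure is an assumption of this sketch, not from the profiler output): a 64 MB transfer cannot complete in less than roughly 8 ms at that rate, so a reported rate far above 8 GB/s cannot be timing a real bus transfer:

#include <stdio.h>

int main(void)
{
    const double bytes = 4096.0 * 4096.0 * sizeof(float); /* 64 MB of floats       */
    const double pcie2_x16 = 8.0e9;                       /* theoretical bytes/sec */
    /* Lower bound on the wall-clock time of a genuine PCIe 2.0 x16 transfer */
    printf("minimum transfer time: %.2f ms\n", bytes / pcie2_x16 * 1e3);
    return 0;
}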

The same thing happens when I use a zero-copy buffer (CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer). From what I have read, pinned memory and zero-copy buffers do give higher transfer rates, but on a discrete GPU they are still limited by the PCIe bandwidth. So, is it normal for the transfer rate to exceed the PCIe bandwidth (here PCIe 2.0 x16)?

My OS is 64-bit Windows 7. I am using AMD APP SDK 2.6 and a discrete AMD HD 6630M GPU.

Edit: here is the code:

#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
#include <cstdio>
#include <cstdlib>
using namespace std;

#ifdef __APPLE__   
   #include <OpenCL/opencl.h>   
#else  
   #include <CL/cl.h>   
#endif 

#define MAX_SOURCE_SIZE (0x100000)

cl_context context = NULL; 
cl_command_queue queue = NULL; 
cl_program program = NULL; 
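
// NOTE: fillMatrix and printMatrix are called from main() but their
// definitions were not included in the original post. The stand-ins
// below are minimal sketches added so the listing compiles; the real
// implementations may differ.
void fillMatrix(cl_float* matrix, int size)
{
    // Fill with arbitrary small test values
    for (int i = 0; i < size * size; i++)
        matrix[i] = (cl_float)(rand() % 10);
}

void printMatrix(cl_float* matrix, int length, int rowLength)
{
    // Print the flattened matrix, one row per line
    for (int i = 0; i < length; i++)
    {
        cout << matrix[i] << " ";
        if ((i + 1) % rowLength == 0)
            cout << endl;
    }
}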

void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
    cl_int err;
    cl_kernel naive;

    // Create a kernel object bound to the kernel function
    naive = clCreateKernel(program, "naiveAlgorithm", &err);

    // Set the size of the global work items and of the work items in each work group
    int globalsize = size;
    int localsize;

    if (globalsize >= 16)
    {
        localsize = 16;
    }
    else
    {
        localsize = globalsize;
    }

    size_t global_work_items[2] = {globalsize, globalsize};
    size_t local_work_items[2]  = {localsize, localsize};

    // Set up the kernel arguments
    err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
    err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
    err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
    err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);

    // Execute the OpenCL kernel for the naive algorithm
    err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
    clFinish(queue);

    // Release the kernel
    err = clReleaseKernel(naive);
}

void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
    cl_int err;

    // OpenCL device memory for the matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    // Allocate device memory for input and output
    d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*size*size, 0, &err);
    d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*size*size, 0, &err);
    d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, 0, &err);

    // Copy host memory to device memory
    err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);

    MatrixMul(d_A, d_B, d_C, size);

    // Read the result back to the host
    err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);

    err = clReleaseMemObject(d_A);
    err = clReleaseMemObject(d_B);
    err = clReleaseMemObject(d_C);
}



// Main function
int main(int argc, char **argv)
{
    // Size of the matrices
    cl_int size = 4096;

    // Matrices for input and output
    cl_float * matrixA;
    cl_float * matrixB;
    cl_float * matrixC;

    // Allocate and initialize host memory
    matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
    matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
    matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));

    // Fill the input matrices
    fillMatrix(matrixA, size);
    fillMatrix(matrixB, size);

    // Print the input matrices A and B
    cout << "Input for matrix A :" << endl;
    printMatrix(matrixA, size*size, size);
    cout << "Input for matrix B :" << endl;
    printMatrix(matrixB, size*size, size);

    cl_int err; // error code

    cl_platform_id* platforms;
    cl_uint platformCount;

    cl_device_id device;

    int platformtype = 0; // 0 selects the AMD APP SDK platform, 1 the Intel SDK platform

    clGetPlatformIDs(0, NULL, &platformCount); // get the number of platforms
    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
    clGetPlatformIDs(platformCount, platforms, NULL); // get the list of platforms
    clGetDeviceIDs(platforms[platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); // get a GPU device

    const cl_context_properties contextProperties [] =
    {
        CL_CONTEXT_PLATFORM,
        reinterpret_cast<cl_context_properties>(platforms[platformtype]),
        0, 0
    };

    context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);


    // Load the kernel source
    FILE *fp;
    const char fileName[] = "./MatMul_Kernel.cl";
    size_t source_size;
    char *source_str;

    fp = fopen(fileName, "r");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Create the program object
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
        (const size_t *)&source_size, &err);

    // Build the program
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    Naive(matrixA, matrixB, matrixC, size);

    // Clean up all OpenCL resources
    err = clFlush(queue);
    err = clFinish(queue);
    err = clReleaseProgram(program);
    err = clReleaseCommandQueue(queue);
    err = clReleaseContext(context);

    // Display the result of the matrix multiplication
    cout << "Output for matrix C :" << endl;
    printMatrix(matrixC, size*size, size);
    cout << endl;

    free(matrixA);
    free(matrixB);
    free(matrixC);
    free(source_str);
    free(platforms);

    return 0;
}

Here is the kernel code:

__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size)
{
    int tx = get_global_id(0); // 2D thread ID x
    int ty = get_global_id(1); // 2D thread ID y

    float sum = 0;

    // Calculate the result of one element of matrix C
    for (int k = 0; k < size; k++) {
        sum += A[ty*size + k] * B[k*size + tx];
    }
    C[ty*size + tx] = sum;
}

Here is the picture: [image not available]

1 Answer

Answered by a Stack Overflow user on 2014-05-19 20:21:16

I see that your output array is actually located in host memory, because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:

d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, 0, &err);

This means that you should be using clEnqueueMapBuffer, then using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject. Since d_C is already in host memory, the array matrixC is unnecessary.
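
A minimal sketch of that pattern, reusing the buffer and queue names from the question (error checking elided here):

// Map d_C into the host address space. For a buffer created with
// CL_MEM_ALLOC_HOST_PTR the returned pointer aliases host-resident
// memory, so no extra bus copy should be needed at this point.
cl_int err;
cl_float *hostC = (cl_float *)clEnqueueMapBuffer(
    queue, d_C, CL_TRUE, CL_MAP_READ, 0,
    sizeof(cl_float) * size * size, 0, NULL, NULL, &err);

// ... read the results directly through hostC ...

// Release the mapping before the buffer is reused or freed
err = clEnqueueUnmapMemObject(queue, d_C, hostC, 0, NULL, NULL);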

The data transfer from the GPU to the host actually happens while the kernel is running. The map call ensures that all the data has finished moving from the GPU to the CPU. This is why the reported transfer time is so small.

I could not find any documentation on whether clEnqueueReadBuffer works with pinned memory. I also see that you are retrieving the error code of every operation but never checking those codes, so your code may be failing silently.
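
One lightweight way to surface such silent failures is to wrap every call in a checking macro. A sketch (the CL_CHECK name is illustrative, not part of any SDK):

#include <stdio.h>
#include <stdlib.h>

// Abort with the error code and source location if a call fails
#define CL_CHECK(call)                                        \
    do {                                                      \
        cl_int _status = (call);                              \
        if (_status != CL_SUCCESS) {                          \
            fprintf(stderr, "OpenCL error %d at %s:%d\n",     \
                    _status, __FILE__, __LINE__);             \
            exit(EXIT_FAILURE);                               \
        }                                                     \
    } while (0)

// Usage:
// CL_CHECK(clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0,
//          sizeof(cl_float) * size * size, matrixC, 0, NULL, NULL));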

Regarding the large difference between the time taken by clEnqueueReadBuffer and the time it should take to transfer the data: note that enqueued operations are not dispatched to the GPU immediately. One source of delay is the Windows Display Driver Model (WDDM) used by graphics cards. The roughly 20 microseconds taken by your clEnqueueReadBuffer sounds about right for that latency (I have actually seen longer delays).
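
Since the queue in the question is already created with CL_QUEUE_PROFILING_ENABLE, device-side timestamps can separate that submission latency from the copy itself. A sketch, reusing the names from the question:

cl_event readEvt;
cl_ulong t_start, t_end;

// Attach an event to the read and block until it completes
clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0,
                    sizeof(cl_float) * size * size, matrixC,
                    0, NULL, &readEvt);

// Device timestamps in nanoseconds: the START..END window covers the
// copy itself and excludes host-side queueing (e.g. WDDM batching)
clGetEventProfilingInfo(readEvt, CL_PROFILING_COMMAND_START,
                        sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(readEvt, CL_PROFILING_COMMAND_END,
                        sizeof(t_end), &t_end, NULL);
printf("read took %llu ns\n", (unsigned long long)(t_end - t_start));
clReleaseEvent(readEvt);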

1 vote
Source: Stack Overflow.
Original link: https://stackoverflow.com/questions/20415347