
OpenCL transfer rate exceeds PCI-e bandwidth

Asked by a Stack Overflow user on 2013-12-06 11:22:25 · 1 answer · 938 views · 0 followers · 5 votes

I wrote an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR) to get a higher transfer rate from the device to the host.

The transfer rate increased as I expected (I obtained the transfer rate using AMD APP Profiler 2.4). The problem is that for a 4096 x 4096 matrix (64 MB), the reported transfer rate of 93703 GB/s is higher than the PCIe bandwidth.
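
As a quick sanity check, a minimal sketch assuming the theoretical PCIe 2.0 x16 bandwidth of 8 GB/s (that figure is an assumption of this sketch, not from the profiler output): a 64 MB transfer cannot complete in less than roughly 8 ms at that rate, so a reported rate far above 8 GB/s cannot be timing a real bus transfer:

#include <stdio.h>

int main(void)
{
    const double bytes = 4096.0 * 4096.0 * sizeof(float); /* 64 MB of floats       */
    const double pcie2_x16 = 8.0e9;                       /* theoretical bytes/sec */
    /* Lower bound on the wall-clock time of a genuine PCIe 2.0 x16 transfer */
    printf("minimum transfer time: %.2f ms\n", bytes / pcie2_x16 * 1e3);
    return 0;
}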

The same thing happens when I use a zero-copy buffer (CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer). From what I have read, pinned memory and zero-copy buffers do give higher transfer rates, but on a discrete GPU they are still limited by the PCIe bandwidth. So, is it normal for the transfer rate to exceed the PCIe bandwidth (here PCIe 2.0 x16)?

My OS is 64-bit Windows 7. I am using AMD APP SDK 2.6 and a discrete AMD HD 6630M GPU.

Edit: here is the code:

#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
#include <cstdio>
#include <cstdlib>
using namespace std;

#ifdef __APPLE__   
   #include <OpenCL/opencl.h>   
#else  
   #include <CL/cl.h>   
#endif 

#define MAX_SOURCE_SIZE (0x100000)

cl_context context = NULL; 
cl_command_queue queue = NULL; 
cl_program program = NULL; 
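
// NOTE: fillMatrix and printMatrix are called from main() but their
// definitions were not included in the original post. The stand-ins
// below are minimal sketches added so the listing compiles; the real
// implementations may differ.
void fillMatrix(cl_float* matrix, int size)
{
    // Fill with arbitrary small test values
    for (int i = 0; i < size * size; i++)
        matrix[i] = (cl_float)(rand() % 10);
}

void printMatrix(cl_float* matrix, int length, int rowLength)
{
    // Print the flattened matrix, one row per line
    for (int i = 0; i < length; i++)
    {
        cout << matrix[i] << " ";
        if ((i + 1) % rowLength == 0)
            cout << endl;
    }
}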

void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
    cl_int err;
    cl_kernel naive;

    // Create a kernel object bound to the kernel function
    naive = clCreateKernel(program, "naiveAlgorithm", &err);

    // Set the size of the global work items and of the work items in each work group
    int globalsize = size;
    int localsize;

    if (globalsize >= 16)
    {
        localsize = 16;
    }
    else
    {
        localsize = globalsize;
    }

    size_t global_work_items[2] = {globalsize, globalsize};
    size_t local_work_items[2]  = {localsize, localsize};

    // Set up the kernel arguments
    err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
    err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
    err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
    err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);

    // Execute the OpenCL kernel for the naive algorithm
    err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
    clFinish(queue);

    // Release the kernel
    err = clReleaseKernel(naive);
}

void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
    cl_int err;

    // OpenCL device memory for the matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    // Allocate device memory for input and output
    d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*size*size, 0, &err);
    d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*size*size, 0, &err);
    d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, 0, &err);

    // Copy host memory to device memory
    err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);

    MatrixMul(d_A, d_B, d_C, size);

    // Read the result back to the host
    err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);

    err = clReleaseMemObject(d_A);
    err = clReleaseMemObject(d_B);
    err = clReleaseMemObject(d_C);
}



// Main function
int main(int argc, char **argv)
{
    // Size of the matrices
    cl_int size = 4096;

    // Matrices for input and output
    cl_float * matrixA;
    cl_float * matrixB;
    cl_float * matrixC;

    // Allocate and initialize host memory
    matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
    matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
    matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));

    // Fill the input matrices
    fillMatrix(matrixA, size);
    fillMatrix(matrixB, size);

    // Print the input matrices A and B
    cout << "Input for matrix A :" << endl;
    printMatrix(matrixA, size*size, size);
    cout << "Input for matrix B :" << endl;
    printMatrix(matrixB, size*size, size);

    cl_int err; // error code

    cl_platform_id* platforms;
    cl_uint platformCount;

    cl_device_id device;

    int platformtype = 0; // 0 selects the AMD APP SDK platform, 1 the Intel SDK platform

    clGetPlatformIDs(0, NULL, &platformCount); // get the number of platforms
    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
    clGetPlatformIDs(platformCount, platforms, NULL); // get the list of platforms
    clGetDeviceIDs(platforms[platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); // get a GPU device

    const cl_context_properties contextProperties [] =
    {
        CL_CONTEXT_PLATFORM,
        reinterpret_cast<cl_context_properties>(platforms[platformtype]),
        0, 0
    };

    context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);


    // Load the kernel source
    FILE *fp;
    const char fileName[] = "./MatMul_Kernel.cl";
    size_t source_size;
    char *source_str;

    fp = fopen(fileName, "r");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Create the program object
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
        (const size_t *)&source_size, &err);

    // Build the program
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    Naive(matrixA, matrixB, matrixC, size);

    // Clean up all OpenCL resources
    err = clFlush(queue);
    err = clFinish(queue);
    err = clReleaseProgram(program);
    err = clReleaseCommandQueue(queue);
    err = clReleaseContext(context);

    // Display the result of the matrix multiplication
    cout << "Output for matrix C :" << endl;
    printMatrix(matrixC, size*size, size);
    cout << endl;

    free(matrixA);
    free(matrixB);
    free(matrixC);
    free(source_str);
    free(platforms);

    return 0;
}

Here is the kernel code:

__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size)
{
    int tx = get_global_id(0); // 2D thread ID x
    int ty = get_global_id(1); // 2D thread ID y

    float sum = 0;

    // Calculate the result of one element of matrix C
    for (int k = 0; k < size; k++) {
        sum += A[ty*size + k] * B[k*size + tx];
    }
    C[ty*size + tx] = sum;
}

Here is the picture: [image not available]

1 Answer

Answered by a Stack Overflow user on 2014-05-19 20:21:16

I see that your output array is actually located in host memory, because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:

d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, 0, &err);

This means that you should be using clEnqueueMapBuffer, then using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject. Since d_C is already in host memory, the array matrixC is unnecessary.
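
A minimal sketch of that pattern, reusing the buffer and queue names from the question (error checking elided here):

// Map d_C into the host address space. For a buffer created with
// CL_MEM_ALLOC_HOST_PTR the returned pointer aliases host-resident
// memory, so no extra bus copy should be needed at this point.
cl_int err;
cl_float *hostC = (cl_float *)clEnqueueMapBuffer(
    queue, d_C, CL_TRUE, CL_MAP_READ, 0,
    sizeof(cl_float) * size * size, 0, NULL, NULL, &err);

// ... read the results directly through hostC ...

// Release the mapping before the buffer is reused or freed
err = clEnqueueUnmapMemObject(queue, d_C, hostC, 0, NULL, NULL);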

The data transfer from the GPU to the host actually happens while the kernel is running. The map call ensures that all the data has finished moving from the GPU to the CPU. This is why the reported transfer time is so small.

I could not find any documentation on whether clEnqueueReadBuffer works with pinned memory. I also see that you are retrieving the error code of every operation but never checking those codes, so your code may be failing silently.
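
One lightweight way to surface such silent failures is to wrap every call in a checking macro. A sketch (the CL_CHECK name is illustrative, not part of any SDK):

#include <stdio.h>
#include <stdlib.h>

// Abort with the error code and source location if a call fails
#define CL_CHECK(call)                                        \
    do {                                                      \
        cl_int _status = (call);                              \
        if (_status != CL_SUCCESS) {                          \
            fprintf(stderr, "OpenCL error %d at %s:%d\n",     \
                    _status, __FILE__, __LINE__);             \
            exit(EXIT_FAILURE);                               \
        }                                                     \
    } while (0)

// Usage:
// CL_CHECK(clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0,
//          sizeof(cl_float) * size * size, matrixC, 0, NULL, NULL));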

Regarding the large difference between the time taken by clEnqueueReadBuffer and the time it should take to transfer the data: note that enqueued operations are not dispatched to the GPU immediately. One source of delay is the Windows Display Driver Model (WDDM) used by graphics cards. The roughly 20 microseconds taken by your clEnqueueReadBuffer sounds about right for that latency (I have actually seen longer delays).
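
Since the queue in the question is already created with CL_QUEUE_PROFILING_ENABLE, device-side timestamps can separate that submission latency from the copy itself. A sketch, reusing the names from the question:

cl_event readEvt;
cl_ulong t_start, t_end;

// Attach an event to the read and block until it completes
clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0,
                    sizeof(cl_float) * size * size, matrixC,
                    0, NULL, &readEvt);

// Device timestamps in nanoseconds: the START..END window covers the
// copy itself and excludes host-side queueing (e.g. WDDM batching)
clGetEventProfilingInfo(readEvt, CL_PROFILING_COMMAND_START,
                        sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(readEvt, CL_PROFILING_COMMAND_END,
                        sizeof(t_end), &t_end, NULL);
printf("read took %llu ns\n", (unsigned long long)(t_end - t_start));
clReleaseEvent(readEvt);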

1 vote
Source: Stack Overflow.
Original link: https://stackoverflow.com/questions/20415347