这可能只是 .NET 框架分配的内存对象没有正确按页对齐的问题,但我不明白为什么零拷贝比非零拷贝还要慢。
我将在这个问题中包含内联代码,但是完整的源代码可以在这里看到:https://github.com/kwende/ClooMatrixMultiply/blob/master/GiantMatrixOnGPU/GPUMatrixMultiplier.cs。
由于这是我第一次尝试零拷贝,所以我写了一个简单的矩阵乘法示例。我首先初始化OpenCL对象:
/// <summary>
/// Creates the OpenCL objects used by the multiply methods: the Intel platform,
/// a GPU context, a command queue on the context's first device, and the
/// compiled "ComputeMatrix" kernel loaded from "kernel.cl".
/// </summary>
private void Initialize()
{
    // Select the first OpenCL platform whose name mentions "Intel".
    _integratedIntelGPUPlatform = ComputePlatform.Platforms.First(platform => platform.Name.Contains("Intel"));

    // Build a GPU-only context on that platform (no notify callback, no user data).
    var contextProperties = new ComputeContextPropertyList(_integratedIntelGPUPlatform);
    _context = new ComputeContext(ComputeDeviceTypes.Gpu, contextProperties, null, IntPtr.Zero);

    // All device work is submitted through this queue; it targets the first
    // device that matched the context specification and uses no special flags.
    var firstDevice = _context.Devices[0];
    _commandQueue = new ComputeCommandQueue(_context, firstDevice, ComputeCommandQueueFlags.None);

    // Load the kernel source text from disk.
    string kernelSource = File.ReadAllText("kernel.cl");

    // Wrap the source in a program object, compile it, and fetch the entry point.
    _program = new ComputeProgram(_context, new[] { kernelSource });
    _program.Build(null, null, null, IntPtr.Zero);
    _kernel = _program.CreateKernel("ComputeMatrix");
}
如果我的代码没有初始化,...this只执行一次。然后我进入主体。对于非零拷贝,我执行以下操作:
/// <summary>
/// Multiplies two matrices on the GPU. The input arrays are copied into
/// device buffers (CopyHostPointer) and the result is read back with an
/// explicit Read.
/// </summary>
/// <param name="matrix1">Left operand as a flat array (element layout is defined by the kernel, which is not shown here — presumably row-major).</param>
/// <param name="matrix2">Right operand as a flat array (same layout assumption).</param>
/// <param name="matrix1Height">Number of rows of the left operand; second dimension of the global work size.</param>
/// <param name="matrix1WidthMatrix2Height">Shared inner dimension, passed to the kernel as argument 3.</param>
/// <param name="matrix2Width">Number of columns of the right operand; passed to the kernel as argument 4 and used as the first dimension of the global work size.</param>
/// <returns>A new array of matrix1Height * matrix2Width elements filled in by the kernel.</returns>
public float[] MultiplyMatrices(float[] matrix1, float[] matrix2,
    int matrix1Height, int matrix1WidthMatrix2Height, int matrix2Width)
{
    if (!_initialized)
    {
        Initialize();
        _initialized = true;
    }

    float[] ret = new float[matrix1Height * matrix2Width];

    // `using` releases the device buffers even when an OpenCL call throws.
    using (ComputeBuffer<float> matrix1Buffer = new ComputeBuffer<float>(_context,
        ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
        matrix1))
    using (ComputeBuffer<float> matrix2Buffer = new ComputeBuffer<float>(_context,
        ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
        matrix2))
    using (ComputeBuffer<float> retBuffer = new ComputeBuffer<float>(_context,
        ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.CopyHostPointer,
        ret))
    {
        _kernel.SetMemoryArgument(0, matrix1Buffer);
        _kernel.SetMemoryArgument(1, matrix2Buffer);
        _kernel.SetMemoryArgument(2, retBuffer);
        _kernel.SetValueArgument<int>(3, matrix1WidthMatrix2Height);
        _kernel.SetValueArgument<int>(4, matrix2Width);

        // The global work size is two-dimensional, so an offset array would
        // need two elements as well. A null offset means "start at 0 in every
        // dimension" and avoids handing the driver a too-short array.
        _commandQueue.Execute(_kernel,
            null,
            new long[] { matrix2Width, matrix1Height },
            null, null);

        unsafe
        {
            // Keep the destination array pinned until the non-blocking read
            // has completed (Finish) so the GC cannot move it mid-transfer.
            fixed (float* retPtr = ret)
            {
                _commandQueue.Read(retBuffer,
                    false, 0,
                    ret.Length,
                    new IntPtr(retPtr),
                    null);
                _commandQueue.Finish();
            }
        }
    }

    return ret;
}
您可以看到我是如何显式地为我的所有ComputeBuffer分配设置CopyHostPointer的。这样可以很好地执行。
然后,我做了以下调整(包括设置 "UseHostPointer",以及调用 Map/Unmap 而不是 Read):
/// <summary>
/// Multiplies two matrices on the GPU, exposing the result array to OpenCL
/// through UseHostPointer and synchronising it with Map/Unmap instead of an
/// explicit Read.
/// </summary>
/// <param name="matrix1">Left operand as a flat array (element layout is defined by the kernel, which is not shown here — presumably row-major).</param>
/// <param name="matrix2">Right operand as a flat array (same layout assumption).</param>
/// <param name="matrix1Height">Number of rows of the left operand; second dimension of the global work size.</param>
/// <param name="matrix1WidthMatrix2Height">Shared inner dimension, passed to the kernel as argument 3.</param>
/// <param name="matrix2Width">Number of columns of the right operand; passed to the kernel as argument 4 and used as the first dimension of the global work size.</param>
/// <returns>A new array of matrix1Height * matrix2Width elements filled in by the kernel.</returns>
public float[] MultiplyMatricesZeroCopy(float[] matrix1, float[] matrix2,
    int matrix1Height, int matrix1WidthMatrix2Height, int matrix2Width)
{
    if (!_initialized)
    {
        Initialize();
        _initialized = true;
    }

    float[] ret = new float[matrix1Height * matrix2Width];

    // With UseHostPointer the OpenCL runtime keeps using the array's address
    // for as long as the buffer exists, so the array must stay pinned until
    // all queued work has finished; otherwise the GC is free to relocate it.
    System.Runtime.InteropServices.GCHandle retPin =
        System.Runtime.InteropServices.GCHandle.Alloc(
            ret, System.Runtime.InteropServices.GCHandleType.Pinned);
    try
    {
        // `using` releases the device buffers even when an OpenCL call throws.
        using (ComputeBuffer<float> matrix1Buffer = new ComputeBuffer<float>(_context,
            ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
            matrix1))
        using (ComputeBuffer<float> matrix2Buffer = new ComputeBuffer<float>(_context,
            ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
            matrix2))
        using (ComputeBuffer<float> retBuffer = new ComputeBuffer<float>(_context,
            ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
            ret))
        {
            _kernel.SetMemoryArgument(0, matrix1Buffer);
            _kernel.SetMemoryArgument(1, matrix2Buffer);
            _kernel.SetMemoryArgument(2, retBuffer);
            _kernel.SetValueArgument<int>(3, matrix1WidthMatrix2Height);
            _kernel.SetValueArgument<int>(4, matrix2Width);

            // The global work size is two-dimensional, so an offset array
            // would need two elements as well. A null offset means "start at
            // 0 in every dimension".
            _commandQueue.Execute(_kernel,
                null,
                new long[] { matrix2Width, matrix1Height },
                null, null);

            // Non-blocking map followed by unmap; Finish() below waits for
            // the kernel, the map and the unmap to complete.
            IntPtr retPtr = _commandQueue.Map(
                retBuffer,
                false,
                ComputeMemoryMappingFlags.Read,
                0,
                ret.Length, null);
            _commandQueue.Unmap(retBuffer, ref retPtr, null);
            _commandQueue.Finish();
        }
    }
    finally
    {
        retPin.Free();
    }

    return ret;
}
然而,时机说明了一切。我的程序是这样写的:
CPU矩阵乘法: 1178.5ms
GPU矩阵乘法(复制):115.1ms
GPU矩阵乘法(零拷贝):174.1ms
GPU(带拷贝)的速度是 CPU 的 10.23892 倍。
GPU(零拷贝)的速度是 CPU 的 6.769098 倍。
……所以零拷贝反而更慢。
发布于 2017-02-17 05:55:20
多亏了huseyin buyukisik,我才能弄清楚到底是怎么回事。
我需要更新我的英特尔驱动程序。一旦我这样做了,零拷贝就快多了。
为了后人着想,这里是零拷贝代码的最终版本:
/// <summary>
/// Multiplies two matrices on the GPU. The result array is pinned and exposed
/// to OpenCL through UseHostPointer, then synchronised with a blocking
/// Map followed by Unmap instead of an explicit Read.
/// </summary>
/// <param name="matrix1">Left operand as a flat array (element layout is defined by the kernel, which is not shown here — presumably row-major).</param>
/// <param name="matrix2">Right operand as a flat array (same layout assumption).</param>
/// <param name="matrix1Height">Number of rows of the left operand; second dimension of the global work size.</param>
/// <param name="matrix1WidthMatrix2Height">Shared inner dimension, passed to the kernel as argument 3.</param>
/// <param name="matrix2Width">Number of columns of the right operand; passed to the kernel as argument 4 and used as the first dimension of the global work size.</param>
/// <returns>A new array of matrix1Height * matrix2Width elements filled in by the kernel.</returns>
public float[] MultiplyMatricesZeroCopy(float[] matrix1, float[] matrix2,
    int matrix1Height, int matrix1WidthMatrix2Height, int matrix2Width)
{
    if (!_initialized)
    {
        Initialize();
        _initialized = true;
    }

    float[] ret = new float[matrix1Height * matrix2Width];

    // Pin the result array for the whole lifetime of the UseHostPointer
    // buffer; the finally block guarantees the pin is released on every path.
    GCHandle handle = GCHandle.Alloc(ret, GCHandleType.Pinned);
    try
    {
        // `using` releases the device buffers even when an OpenCL call throws.
        using (ComputeBuffer<float> matrix1Buffer = new ComputeBuffer<float>(_context,
            ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
            matrix1))
        using (ComputeBuffer<float> matrix2Buffer = new ComputeBuffer<float>(_context,
            ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
            matrix2))
        using (ComputeBuffer<float> retBuffer = new ComputeBuffer<float>(_context,
            ComputeMemoryFlags.UseHostPointer,
            ret))
        {
            _kernel.SetMemoryArgument(0, matrix1Buffer);
            _kernel.SetMemoryArgument(1, matrix2Buffer);
            _kernel.SetMemoryArgument(2, retBuffer);
            _kernel.SetValueArgument<int>(3, matrix1WidthMatrix2Height);
            _kernel.SetValueArgument<int>(4, matrix2Width);

            // The global work size is two-dimensional, so an offset array
            // would need two elements as well. A null offset means "start at
            // 0 in every dimension".
            _commandQueue.Execute(_kernel,
                null,
                new long[] { matrix2Width, matrix1Height },
                null, null);

            // Blocking map: returns once the kernel has finished and the
            // result is visible to the host.
            IntPtr retPtr = _commandQueue.Map(
                retBuffer,
                true,
                ComputeMemoryMappingFlags.Read,
                0,
                ret.Length, null);
            _commandQueue.Unmap(retBuffer, ref retPtr, null);

            // Unmap is only enqueued; wait for it so the runtime is done with
            // the host pointer before the array is unpinned below.
            _commandQueue.Finish();
        }
    }
    finally
    {
        handle.Free();
    }

    return ret;
}
https://stackoverflow.com/questions/42284262
复制相似问题