文章/答案/技术大牛

发布

社区首页 >问答首页 >使用带有nvcc的-G标志编译到cubin后，Cublas不能在内核中工作

问使用带有nvcc的-G标志编译到cubin后，Cublas不能在内核中工作
EN

Stack Overflow用户

提问于 2015-08-19 17:50:16

回答 1查看 559关注 0票数 0

我有一个CUDA内核，如下所示：

#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>


extern "C" {

    __device__ float ONE = 1.0f;
    __device__ float M_ONE = -1.0f;
    __device__ float ZERO = 0.0f;

    __global__ void kernel(float *W, float *input, int i, float *output, int o) {
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        cublasHandle_t cnpHandle;

        if(idx == 0) {
            cublasCreate(&cnpHandle);
            cublasStatus_t s = cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i, &ONE, W, o, input, 1, &ZERO, output, 1);
            printf("status %d\n", s);
            cudaError_t e = cudaDeviceSynchronize();
            printf("sync %d\n", e);
        }

    }

}

主机码：

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>

extern "C" {
    __global__ void kernel(float *W, float *input, int i, float *output, int o);
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main(int argc, char* argv[])
{

    cuInit(0);
    CUcontext pctx;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&pctx, 0, dev);

    CUmodule module;
    CUresult t = cuModuleLoad(&module, "pathto/src/minimalKernel.cubin");

    CUfunction function;
    CUresult r = cuModuleGetFunction(&function, module, "kernel");

    float *W = new float[2];
    W[0] = 0.1f;
    W[1] = 0.1f;
    float *input = new float[2];
    input[0] = 0.1f;
    input[1] = 0.1f;
    float *out = new float[1];
    out[0] = 0.0f;

    int i = 2;
    int o = 1;

    float *d_W;
    float *d_input;
    float *d_out;
    cudaMalloc((void**)&d_W, 2*sizeof(float));
    cudaMalloc((void**)&d_input, 2*sizeof(float));
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice);
    //kernel<<<1, 2>>>(d_W, d_input, i, d_out, o);

    //cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    //std::cout<<"out:"<<out[0]<<std::endl;

    void * kernelParams[] { &d_W, &d_input, &i, &d_out, &o };

    CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    std::cout<<"out:"<<out[0]<<std::endl;


}

当这个内核运行内联kernel<<<1,2>>>()、构建和链接(在eclipse Nsight中)时，内核运行得非常好，并且out会像预期的那样返回0.02。

如果我使用-G (生成设备调试符号)将内核编译成.cubin，则cublas函数永远不会运行，并且out始终为0.0

我可以在.cubin运行时设置断点，我可以看到进入cublas函数的数据是正确的，但看起来cublas函数根本不会运行。cublas函数也总是返回0 CUDA_SUCCESS。重要的是，只有从.cubin运行时才会发生这种情况

编译为我在-G中使用的多维数据集

nvcc -G -cubin -arch=sm_52 --device-c kernel.cu -o kernel.cubin -dlink -lcudadevrt -lcublas_device

它不返回任何错误。

如果添加了-G选项，为什么.cubin中的cublas函数会停止工作？

CUDA 7.0 linux 14.04 x64 980GTX

cuda

cublas

dynamic-parallelism

回答 1

Stack Overflow用户

发布于 2015-08-23 05:19:10

顺便说一句，不管有没有-G开关，你的代码都不能正常运行。您可以使用cuda-memcheck运行代码来帮助识别错误。(无论是在主机代码中还是在设备代码中，您似乎都没有执行proper CUDA error checking。使用动态并行，您可以在设备代码中使用类似的方法。并且CUBLAS API调用返回错误代码，而您似乎没有检查这些错误代码。)

这是错误的：

    if(idx == 0) {
        cublasCreate(&cnpHandle);
    }

这是一个线程局部变量：

cublasHandle_t cnpHandle;

由于您正在启动一个具有两个线程的内核：

CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);

您的一个线程(0)正在向cublasSgemv调用传递有效的句柄，而另一个线程(1)则不是。

当我修复这个错误时，你的代码为我“工作”。请注意，在这种情况下，您仍然需要为两个线程中的每个线程向cublasSgemv调用传递完全相同的参数。因此，每个调用都写入相同的输出位置。由于在这种情况下线程执行/行为的顺序未指定，因此您可能会看到非常不同的行为:即使另一个cublas调用失败，也能获得有效的输出(因为一个线程写入了正确的值作为成功的cublas调用的结果)。我认为，-G开关可能会影响这种排序，或者以某种方式影响这种行为。

$ cat t889_kern.cu
#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>


extern "C" {

    __device__ float ONE = 1.0f;
    __device__ float M_ONE = -1.0f;
    __device__ float ZERO = 0.0f;

    __global__ void kernel(float *W, float *input, int i, float *output, int o) {
//        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        cublasHandle_t cnpHandle;

        cublasCreate(&cnpHandle);

        cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i, &ONE, W, o, input, 1, &ZERO, output, 1);
        cudaDeviceSynchronize();
    }

}
$ cat t889.cpp
#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>

extern "C" {
    __global__ void kernel(float *W, float *input, int i, float *output, int o);
}

int main(int argc, char* argv[])
{

    cuInit(0);
    CUcontext pctx;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&pctx, 0, dev);

    CUmodule module;
    CUresult t = cuModuleLoad(&module, "kernel.cubin");

    CUfunction function;
    CUresult r = cuModuleGetFunction(&function, module, "kernel");

    float *W = new float[2];
    W[0] = 0.1f;
    W[1] = 0.1f;
    float *input = new float[2];
    input[0] = 0.1f;
    input[1] = 0.1f;
    float *out = new float[1];
    out[0] = 0.0f;

    int i = 2;
    int o = 1;

    float *d_W;
    float *d_input;
    float *d_out;
    cudaMalloc((void**)&d_W, 2*sizeof(float));
    cudaMalloc((void**)&d_input, 2*sizeof(float));
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice);
    //kernel<<<1, 2>>>(d_W, d_input, i, d_out, o);

    //cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    //std::cout<<"out:"<<out[0]<<std::endl;

    void * kernelParams[] { &d_W, &d_input, &i, &d_out, &o };

    CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);

    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    std::cout<<"out:"<<out[0]<<std::endl;


}
$ nvcc -cubin -arch=sm_35 --device-c t889_kern.cu -o kernel.cubin -dlink -lcudadevrt -lcublas_device
ptxas info    : 'device-function-maxrregcount' is a BETA feature
$ g++ -std=c++11 -I/usr/local/cuda/include t889.cpp -o t889 -L/usr/local/cuda/lib64 -lcuda -lcudart
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t889
========= CUDA-MEMCHECK
out:0.02
========= ERROR SUMMARY: 0 errors
$

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/32092093

复制

相似问题

问使用带有nvcc的-G标志编译到cubin后，Cublas不能在内核中工作
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问使用带有nvcc的-G标志编译到cubin后，Cublas不能在内核中工作EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问使用带有nvcc的-G标志编译到cubin后，Cublas不能在内核中工作
EN