首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >大矩阵CUDA cuSolver gesvdj

大矩阵CUDA cuSolver gesvdj
EN

Stack Overflow用户
提问于 2019-08-07 22:29:17
回答 1查看 764关注 0票数 0

我正在一个NVIDIA P6000上运行"G.2. SVD与奇异向量(通过Jacobi方法)“一节下的”查找“代码的稍微修改的版本。轻微的修改是动态地为A、U和V向量在堆中分配内存,并用依赖于索引的值填充指定大小的A向量。我还将从双倍到浮动的所有内容都转换为A。最后一个修改是对gesvdj调用本身进行循环,并检查某些#迭代的收敛性(在我的例子中为10)。

通过这些微小的修改,我能够克服我的第一个障碍,在大于1000x1000大小的对称数组上执行SVD。我最终需要在大小为1048576x20的数组上运行SVD。

目前,该算法对大小为10000x20的数组运行,但当我转到50000x20时失败。

这个问题似乎源于gesvdj电话本身。调用gesvdj后的同步调用失败,并返回一个通用访问错误。

如果我使用运行这个程序,就会得到在同一个块中的不同线程的一系列错误:

代码语言:javascript
复制
Invalid __global__ write of size 4
=========     at 0x00000108 in void eye_kernel<float, int=5, int=3>(int, int, float*, int)
=========     by thread (0,7,0) in block (16,1342,0)
=========     Address 0x7f4ed40414c0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x269e85]
=========     Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x100c822]
=========     Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x100ca17]
=========     Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x1040dd5]
=========     Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 [0x235c5d]
=========     Host Frame:/usr/local/cuda-10.0/lib64/libcusolver.so.10.0 (cusolverDnSgesvdj + 0x508) [0x21f9a8]
=========     Host Frame:./gesvdj_example [0x4518]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x223d5]
=========     Host Frame:./gesvdj_example [0x3ab9]

我想知道我是否碰到了某种缓冲的内部限制?有人有什么想法吗?如果有必要的话,我可以提供我的确切代码,但它非常类似于这个例子,我想我只会指向那里的人。

谢谢!

编辑以添加我链接到的示例中的违规代码,该算法在assert(CUSOLVER_STATUS_SUCCESS == status);行处失败。我对C和CUDA的编码非常陌生,很抱歉,如果我遗漏了一些明显的调试信息。

代码语言:javascript
复制
/* step 5: compute SVD */
    status = cusolverDnDgesvdj(
        cusolverH,
        jobz,  /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
               /* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
        econ,  /* econ = 1 for economy size */
        m,     /* nubmer of rows of A, 0 <= m */
        n,     /* number of columns of A, 0 <= n  */
        d_A,   /* m-by-n */
        lda,   /* leading dimension of A */
        d_S,   /* min(m,n)  */
               /* the singular values in descending order */
        d_U,   /* m-by-m if econ = 0 */
               /* m-by-min(m,n) if econ = 1 */
        lda,   /* leading dimension of U, ldu >= max(1,m) */
        d_V,   /* n-by-n if econ = 0  */
               /* n-by-min(m,n) if econ = 1  */
        lda,   /* leading dimension of V, ldv >= max(1,n) */
        d_work,
        lwork,
        d_info,
        gesvdj_params);
    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);

编辑2添加我的代码..。

代码语言:javascript
复制
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cusolverDn.h>
#include<iostream>
#include<iomanip>
#include<assert.h>
#include <cuda_runtime_api.h>
#include "cuda_runtime.h"

int main(int argc, char*argv[])
{

    //Setting which device to run on
    const int device = 0;
    cudaSetDevice(device);

    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;
    gesvdjInfo_t gesvdj_params = NULL;

    cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;
    const long int m = 50000;
    const long int n = 20;
    const int lda = m;

    // --- Setting the host, Nrows x Ncols matrix
    float *A = (float *)malloc(m * n * sizeof(float));
      for(long int j = 0; j < m; j++)
          for(long int i = 0; i < n; i++)
              A[j + i*m] = sqrt((float)(i + j));

    float *U = (float *)malloc(m * m * sizeof(float)); /* m-by-m unitary matrix, left singular vectors  */
    float *V = (float *)malloc(m * n * sizeof(float)); /* n-by-n unitary matrix, right singular vectors */
    float S[n];     /* numerical singular value */ 
    float *d_A = NULL;  /* device copy of A */
    float *d_S = NULL;  /* singular values */
    float *d_U = NULL;  /* left singular vectors */
    float *d_V = NULL;  /* right singular vectors */
    int *d_info = NULL;  /* error info */
    int lwork = 0;       /* size of workspace */
    float *d_work = NULL; /* devie workspace for gesvdj */
    int info = 0;        /* host copy of error info */

/* configuration of gesvdj  */
    const double tol = 1.e-7;
    const int max_sweeps = 100;
    const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
    const int econ = 0 ; /* econ = 1 for economy size */

/* numerical results of gesvdj  */
    double residual = 0;
    int executed_sweeps = 0;

    printf("tol = %E, default value is machine zero \n", tol);
    printf("max. sweeps = %d, default value is 100\n", max_sweeps);
    printf("econ = %d \n", econ);
    printf("=====\n");

/* step 1: create cusolver handle, bind a stream */
    status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);

    status = cusolverDnSetStream(cusolverH, stream);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* step 2: configuration of gesvdj */
    status = cusolverDnCreateGesvdjInfo(&gesvdj_params);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* default value of tolerance is machine zero */
    status = cusolverDnXgesvdjSetTolerance(
        gesvdj_params,
        tol);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* default value of max. sweeps is 100 */
    status = cusolverDnXgesvdjSetMaxSweeps(
        gesvdj_params,
        max_sweeps);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* step 3: copy A to device */
    cudaStat1 = cudaMalloc ((void**)&d_A   , sizeof(float)*lda*n);
    cudaStat2 = cudaMalloc ((void**)&d_S   , sizeof(float)*n);
    cudaStat3 = cudaMalloc ((void**)&d_U   , sizeof(float)*lda*m);
    cudaStat4 = cudaMalloc ((void**)&d_V   , sizeof(float)*lda*n);
    cudaStat5 = cudaMalloc ((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);

    cudaStat1 = cudaMemcpy(d_A, A, sizeof(float)*lda*n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    /* step 4: query workspace of SVD */
    status = cusolverDnSgesvdj_bufferSize(
        cusolverH,
        jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
              /* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
        econ, /* econ = 1 for economy size */
        m,    /* nubmer of rows of A, 0 <= m */
        n,    /* number of columns of A, 0 <= n  */
        d_A,  /* m-by-n */
        lda,  /* leading dimension of A */
        d_S,  /* min(m,n) */
              /* the singular values in descending order */
        d_U,  /* m-by-m if econ = 0 */
              /* m-by-min(m,n) if econ = 1 */
        lda,  /* leading dimension of U, ldu >= max(1,m) */
        d_V,  /* n-by-n if econ = 0  */
              /* n-by-min(m,n) if econ = 1  */
        lda,  /* leading dimension of V, ldv >= max(1,n) */
        &lwork,
        gesvdj_params);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaStat1 = cudaMalloc((void**)&d_work , sizeof(float)*lwork);
    assert(cudaSuccess == cudaStat1);

/* step 5: compute SVD */
    //Iterating over SVD calculation, not part of example
    int iters;
    for (iters = 10; iters > 0; iters--){

    status = cusolverDnSgesvdj(
        cusolverH,
        jobz,  /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
               /* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
        econ,  /* econ = 1 for economy size */
        m,     /* nubmer of rows of A, 0 <= m */
        n,     /* number of columns of A, 0 <= n  */
        d_A,   /* m-by-n */
        lda,   /* leading dimension of A */
        d_S,   /* min(m,n)  */
               /* the singular values in descending order */
        d_U,   /* m-by-m if econ = 0 */
               /* m-by-min(m,n) if econ = 1 */
        lda,   /* leading dimension of U, ldu >= max(1,m) */
        d_V,   /* n-by-n if econ = 0  */
               /* n-by-min(m,n) if econ = 1  */
        lda,   /* leading dimension of V, ldv >= max(1,n) */
        d_work,
        lwork,
        d_info,
        gesvdj_params);

    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);

    cudaStat1 = cudaMemcpy(U, d_U, sizeof(float)*lda*m, cudaMemcpyDeviceToHost);
    cudaStat2 = cudaMemcpy(V, d_V, sizeof(float)*lda*n, cudaMemcpyDeviceToHost);
    cudaStat3 = cudaMemcpy(S, d_S, sizeof(float)*n    , cudaMemcpyDeviceToHost);
    cudaStat4 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    cudaStat5 = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);

    if ( 0 == info ){
        printf("gesvdj converges \n");
    }else if ( 0 > info ){
        printf("%d-th parameter is wrong \n", -info);
        exit(1);
    }else{
        printf("WARNING: info = %d : gesvdj does not converge \n", info );
    }
    }
/* step 6: measure error of singular value */
    status = cusolverDnXgesvdjGetSweeps(
        cusolverH,
        gesvdj_params,
        &executed_sweeps);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    status = cusolverDnXgesvdjGetResidual(
        cusolverH,
        gesvdj_params,
        &residual);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    printf("residual |A - U*S*V**H|_F = %E \n", residual );
    printf("number of executed sweeps = %d \n\n", executed_sweeps );

/*  free resources  */
    if (A      ) free(A);
    if (V      ) free(V);
    if (U      ) free(U);
    if (d_A    ) cudaFree(d_A);
    if (d_S    ) cudaFree(d_S);
    if (d_U    ) cudaFree(d_U);
    if (d_V    ) cudaFree(d_V);
    if (d_info ) cudaFree(d_info);
    if (d_work ) cudaFree(d_work);

    if (cusolverH    ) cusolverDnDestroy(cusolverH);
    if (stream       ) cudaStreamDestroy(stream);
    if (gesvdj_params) cusolverDnDestroyGesvdjInfo(gesvdj_params);

    cudaDeviceReset();
    return 0;
}
EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2019-08-08 15:42:44

感谢@talonmies在诊断这个问题上的帮助。缓冲器的gesvdj方法具有一种经济模式,它将U矩阵和V矩阵存储在更经济的阵列中。我为使代码工作所做的修改很简单。

代码语言:javascript
复制
econ = 1
U array size (mxn)
V array size (nxn)
ldv paramater = n

代码如下:

代码语言:javascript
复制
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cusolverDn.h>
#include<iostream>
#include<iomanip>
#include<assert.h>
#include <cuda_runtime_api.h>
#include "cuda_runtime.h"

int main(int argc, char*argv[])
{

    //Setting which device to run on
    const int device = 0;
    cudaSetDevice(device);

    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;
    gesvdjInfo_t gesvdj_params = NULL;

    cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;
    const long int m = 1048576;
    const long int n = 20;
    const int lda = m;

    // --- Setting the host, Nrows x Ncols matrix
    float *A = (float *)malloc(m * n * sizeof(float));
      for(long int j = 0; j < m; j++)
          for(long int i = 0; i < n; i++)
              A[j + i*m] = sqrt((float)(i + j));

    float *U = (float *)malloc(m * n * sizeof(float)); /* m-by-m unitary matrix, left singular vectors  */
    float *V = (float *)malloc(n * n * sizeof(float)); /* n-by-n unitary matrix, right singular vectors */
    float S[n];     /* numerical singular value */ 
    float *d_A = NULL;  /* device copy of A */
    float *d_S = NULL;  /* singular values */
    float *d_U = NULL;  /* left singular vectors */
    float *d_V = NULL;  /* right singular vectors */
    int *d_info = NULL;  /* error info */
    int lwork = 0;       /* size of workspace */
    float *d_work = NULL; /* devie workspace for gesvdj */
    int info = 0;        /* host copy of error info */

/* configuration of gesvdj  */
    const double tol = 1.e-7;
    const int max_sweeps = 100;
    const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
    const int econ = 1 ; /* econ = 1 for economy size */

/* numerical results of gesvdj  */
    double residual = 0;
    int executed_sweeps = 0;

    printf("tol = %E, default value is machine zero \n", tol);
    printf("max. sweeps = %d, default value is 100\n", max_sweeps);
    printf("econ = %d \n", econ);
    printf("=====\n");

/* step 1: create cusolver handle, bind a stream */
    status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);

    status = cusolverDnSetStream(cusolverH, stream);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* step 2: configuration of gesvdj */
    status = cusolverDnCreateGesvdjInfo(&gesvdj_params);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* default value of tolerance is machine zero */
    status = cusolverDnXgesvdjSetTolerance(
        gesvdj_params,
        tol);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* default value of max. sweeps is 100 */
    status = cusolverDnXgesvdjSetMaxSweeps(
        gesvdj_params,
        max_sweeps);
    assert(CUSOLVER_STATUS_SUCCESS == status);

/* step 3: copy A to device */
    cudaStat1 = cudaMalloc ((void**)&d_A   , sizeof(float)*lda*n);
    cudaStat2 = cudaMalloc ((void**)&d_S   , sizeof(float)*n);
    cudaStat3 = cudaMalloc ((void**)&d_U   , sizeof(float)*lda*n);
    cudaStat4 = cudaMalloc ((void**)&d_V   , sizeof(float)*n*n);
    cudaStat5 = cudaMalloc ((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);

    cudaStat1 = cudaMemcpy(d_A, A, sizeof(float)*lda*n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    /* step 4: query workspace of SVD */
    status = cusolverDnSgesvdj_bufferSize(
        cusolverH,
        jobz, /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
              /* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
        econ, /* econ = 1 for economy size */
        m,    /* nubmer of rows of A, 0 <= m */
        n,    /* number of columns of A, 0 <= n  */
        d_A,  /* m-by-n */
        lda,  /* leading dimension of A */
        d_S,  /* min(m,n) */
              /* the singular values in descending order */
        d_U,  /* m-by-m if econ = 0 */
              /* m-by-min(m,n) if econ = 1 */
        lda,  /* leading dimension of U, ldu >= max(1,m) */
        d_V,  /* n-by-n if econ = 0  */
              /* n-by-min(m,n) if econ = 1  */
        n,  /* leading dimension of V, ldv >= max(1,n) */
        &lwork,
        gesvdj_params);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    cudaStat1 = cudaMalloc((void**)&d_work , sizeof(float)*lwork);
    assert(cudaSuccess == cudaStat1);

/* step 5: compute SVD */
    //Iterating over SVD calculation, not part of example
    int iters;
    for (iters = 10; iters > 0; iters--){ 
    status = cusolverDnSgesvdj(
        cusolverH,
        jobz,  /* CUSOLVER_EIG_MODE_NOVECTOR: compute singular values only */
               /* CUSOLVER_EIG_MODE_VECTOR: compute singular value and singular vectors */
        econ,  /* econ = 1 for economy size */
        m,     /* nubmer of rows of A, 0 <= m */
        n,     /* number of columns of A, 0 <= n  */
        d_A,   /* m-by-n */
        lda,   /* leading dimension of A */
        d_S,   /* min(m,n)  */
               /* the singular values in descending order */
        d_U,   /* m-by-m if econ = 0 */
               /* m-by-min(m,n) if econ = 1 */
        lda,   /* leading dimension of U, ldu >= max(1,m) */
        d_V,   /* n-by-n if econ = 0  */
               /* n-by-min(m,n) if econ = 1  */
        n,   /* leading dimension of V, ldv >= max(1,n) */
        d_work,
        lwork,
        d_info,
        gesvdj_params);

    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);

    cudaStat1 = cudaMemcpy(U, d_U, sizeof(float)*lda*n, cudaMemcpyDeviceToHost);
    cudaStat2 = cudaMemcpy(V, d_V, sizeof(float)*n*n, cudaMemcpyDeviceToHost);
    cudaStat3 = cudaMemcpy(S, d_S, sizeof(float)*n    , cudaMemcpyDeviceToHost);
    cudaStat4 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    cudaStat5 = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);

    if ( 0 == info ){
        printf("gesvdj converges \n");
    }else if ( 0 > info ){
        printf("%d-th parameter is wrong \n", -info);
        exit(1);
    }else{
        printf("WARNING: info = %d : gesvdj does not converge \n", info );
    }
    }
/* step 6: measure error of singular value */
    status = cusolverDnXgesvdjGetSweeps(
        cusolverH,
        gesvdj_params,
        &executed_sweeps);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    status = cusolverDnXgesvdjGetResidual(
        cusolverH,
        gesvdj_params,
        &residual);
    assert(CUSOLVER_STATUS_SUCCESS == status);

    printf("residual |A - U*S*V**H|_F = %E \n", residual );
    printf("number of executed sweeps = %d \n\n", executed_sweeps );

/*  free resources  */
    if (A      ) free(A);
    if (V      ) free(V);
    if (U      ) free(U);
    if (d_A    ) cudaFree(d_A);
    if (d_S    ) cudaFree(d_S);
    if (d_U    ) cudaFree(d_U);
    if (d_V    ) cudaFree(d_V);
    if (d_info ) cudaFree(d_info);
    if (d_work ) cudaFree(d_work);

    if (cusolverH    ) cusolverDnDestroy(cusolverH);
    if (stream       ) cudaStreamDestroy(stream);
    if (gesvdj_params) cusolverDnDestroyGesvdjInfo(gesvdj_params);

    cudaDeviceReset();
    return 0;
}
票数 2
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/57403017

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档