
Multiple processes sharing / reading from one large block of CUDA device memory

Stack Overflow user
Asked on 2022-09-27 17:09:46
1 answer · viewed 75 times · 0 followers · 0 votes

I have a multi-process application sharing a single GPU through the CUDA Multi-Process Service (MPS). Each process creates several device arrays; one of them is large (~5 GB) and constant, so I figured I could allocate that memory once in a single process and have the other processes read from that block using inter-process communication (similar to the CUDA sample shown here).
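
Conceptually, all I need is the two-call IPC pattern from that sample: the process that owns the allocation exports a handle, and every other process opens that handle to get a device pointer into the same memory. A rough sketch of the intended pattern (not runnable on its own: it leaves out how the handle travels between processes, which turns out to be exactly where my test program goes wrong):

// Owning process: allocate once and export an opaque, fixed-size handle
double *in;
cudaMalloc((void **)&in, N * sizeof(double));       // the big (~5 GB) constant array
cudaIpcMemHandle_t handle;
cudaIpcGetMemHandle(&handle, (void *)in);
// ... deliver `handle` to the other processes somehow ...

// Every other process: open the handle to map the owner's allocation
double *in;
cudaIpcOpenMemHandle((void **)&in, handle, cudaIpcMemLazyEnablePeerAccess);
// ... read `in` from kernels ...
cudaIpcCloseMemHandle(in);                          // unmap when finished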

Following the linked CUDA sample, I tried to write a simple test program, but I keep hitting an API error: the call to cudaIpcOpenMemHandle apparently receives an invalid argument. I'm posting the code below in the hope that someone can easily spot the cause of the error, or suggest a better way to use the CUDA API for what I'm trying to do.

#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <sys/mman.h>

#define blockSize 128
#define N 1000
#define gpuErr(ans) { gpuAssert((ans), __FILE__, __LINE__); }


__global__ void kernel(double* out, double* in, double val){
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int thread_stride = blockDim.x * gridDim.x;
    for (int i=tid; i < N; i+=thread_stride){
        out[i] = in[i]*val;
    }
}

static void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

static void error_msg(cudaError_t err, int rank){
    if (err != cudaSuccess){
        printf("RANK %d recvd CUDA error message: %s\n", rank, cudaGetErrorString(err));
        exit(err);
    }
}

void check_access(){
    cudaDeviceProp prop;
    gpuErr(cudaGetDeviceProperties(&prop, 0));
    if (prop.unifiedAddressing)
        printf("> GPU%d = is capable of UVA\n", 0);

    // NOTE: only interested in enabling intra-device peer2peer, so I think this test doesnt matter ?
    //int can_access=-1;
    //int num_dev=2;
    //// note, here I was confused, I want the ability to have a process on device 0 access
    //for (peer_dev=0; peer_dev <num_dev, peer_dev++){
    //    int peer_dev=0; // note if peer_dev is 1
    //    gpuErr(cudaDeviceCanAccessPeer(&can_access, 0,peer_dev));
    //    if (can_access)
    //        printf("device 0 has peerdev=%d access\n", peer_dev);
    //    else
    //        printf("device 0 has no peerdev=%d access\n", peer_dev);
    //}
}

int main(){
    MPI_Init(NULL,NULL);
    int size,rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank==0)
        check_access();
    gpuErr(cudaSetDevice(0));

    double* out;
    double * in;
    gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));

    cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
        mmap(NULL, sizeof(cudaIpcMemHandle_t),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);

    assert(MAP_FAILED != memHand);
    memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank==0){
        gpuErr(cudaMalloc((void **)&in, N*sizeof(double)));
        gpuErr(cudaIpcGetMemHandle((cudaIpcMemHandle_t *) &memHand[0], (void *)in));
        
        double * temp = new double[N];
        for (int i=0; i < N; i++)
            temp[i] = 1;
        gpuErr(cudaMemcpy(in, temp, N*sizeof(double), cudaMemcpyHostToDevice));
        delete[] temp;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // the following is throwing a CUDAerror, invalid
    if (rank >0 )
        gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
    
    MPI_Barrier(MPI_COMM_WORLD);

    int numBlocks = (N + blockSize - 1) / blockSize;
    double rank_val=(double) rank;
    kernel<<<numBlocks, blockSize>>>(out, in, rank_val);
    error_msg(cudaGetLastError(), rank);
    gpuErr(cudaDeviceSynchronize());
    MPI_Barrier(MPI_COMM_WORLD);

    // test the kernel results
    double sum = 0;
    for (int i=0; i < N; i++)
        sum += out[i];
    printf("mpirank=%d, comm.size=%d, result=%f\n", rank, size, sum);
    assert(sum==N*rank);

    // cleanup
    if (rank>0)
        cudaIpcCloseMemHandle(in);
    cudaFree(out);
    if (rank==0)
        cudaFree(in);

    return 0;
}

I compiled with:

 nvcc -I/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/include  --compiler-options=-march=skylake-avx512 -L/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib -lmpi ipc_tester.cu

Single-process job output (looks correct):

$ srun -n1 -c2 ./a.out
> GPU0 = is capable of UVA
mpirank=0, comm.size=1, result=0.000000

Multi-process job output (hits an error in the call to cudaIpcOpenMemHandle):

$ srun -n2 -c2 ./a.out
GPUassert: invalid argument ipc_tester.cu 92

compute-sanitizer output:

$ srun -n2 -c2 compute-sanitizer ./a.out
========= COMPUTE-SANITIZER
========= COMPUTE-SANITIZER
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:cuCtxGetDevice [0x155550d083eb]
=========                in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
=========     Host Frame:uct_cuda_base_query_devices [0x15553e03f170]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
=========     Host Frame:uct_md_query_tl_resources [0x15553e6c44c6]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
=========     Host Frame: [0x15553e9095a9]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame: [0x15553e90a7f9]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame: [0x15553e90abfd]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame:ucp_init_version [0x15553e90b7f3]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame:mca_pml_ucx_open [0x15553edc7e70]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
=========     Host Frame:mca_base_framework_components_open [0x15555299ef2d]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
=========     Host Frame: [0x155554472ec7]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame:mca_base_framework_open [0x1555529a8b31]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
=========     Host Frame:ompi_mpi_init [0x15555447fb5b]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame:MPI_Init [0x15555442dc01]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame: [0x403f04]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame:__libc_start_main [0x1555531173ea]
=========                in /lib64/libc.so.6
=========     Host Frame: [0x403d1a]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= 
========= [further copies of this same "invalid device context" backtrace, reported repeatedly by both ranks during MPI_Init and interleaved in the original output, are omitted here]
========= Program hit invalid argument (error 1) on CUDA API call to cudaIpcOpenMemHandle.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x155550dde1b3]
=========                in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
=========     Host Frame: [0x433fac]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame: [0x40412e]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame:__libc_start_main [0x1555531173ea]
=========                in /lib64/libc.so.6
=========     Host Frame: [0x403d1a]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= 
GPUassert: invalid argument ipc_tester.cu 92
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 4 errors

System info:

$ lsb_release  -a
LSB Version:    n/a
Distributor ID: SUSE
Description:    SUSE Linux Enterprise Server 15 SP2
Release:        15.2
Codename:       n/a

$ nvidia-smi 
Tue Sep 27 10:05:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |
| N/A   34C    P0    38W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

1 Answer

Stack Overflow user

Answer accepted

Posted on 2022-09-27 21:02:52

As pointed out in the comments, the memory handle (memHand) was not being set correctly on ranks > 0.

After learning how to broadcast the handle with MPI, I found a solution. The patch below results in working code.

@@ -66,12 +66,7 @@ int main(){
     double * in;
     gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));
 
-    cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
-        mmap(NULL, sizeof(cudaIpcMemHandle_t),
-            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
-
-    assert(MAP_FAILED != memHand);
-    memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));
+    cudaIpcMemHandle_t memHand[1];
 
     MPI_Barrier(MPI_COMM_WORLD);
 
@@ -87,6 +82,21 @@ int main(){
     }
     MPI_Barrier(MPI_COMM_WORLD);
 
+//  Broadcast the CUDA IPC memory handle to all ranks
+//  first get the size of the handle container needed for the broadcast
+    int hand_size[1];
+    if (rank==0)
+        hand_size[0]= sizeof(memHand[0]);
+    MPI_Bcast(&hand_size[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    // create the char container for memHandler broadcast
+    char memHand_C[hand_size[0]];
+    if (rank==0)
+        memcpy(&memHand_C, &memHand[0], hand_size[0]);
+    MPI_Bcast(&memHand_C, hand_size[0], MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (rank >0)
+        memcpy(&memHand[0], &memHand_C, hand_size[0]);
+
     // the following is throwing a CUDAerror, invalid
     if (rank >0 )
         gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
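
As an aside, cudaIpcMemHandle_t is a plain fixed-size struct, so the intermediate char buffer (and the size broadcast) is not strictly necessary: the handle can be broadcast directly as raw bytes. A sketch of that simpler variant, using the same variables as the program above (untested, but equivalent to what the patch does):

    if (rank == 0){
        gpuErr(cudaMalloc((void **)&in, N*sizeof(double)));
        gpuErr(cudaIpcGetMemHandle(&memHand[0], (void *)in));
        // ... fill `in` from the host as before ...
    }
    // broadcast the opaque IPC handle from rank 0 to all other ranks
    MPI_Bcast(&memHand[0], sizeof(cudaIpcMemHandle_t), MPI_BYTE, 0, MPI_COMM_WORLD);

    if (rank > 0)
        gpuErr(cudaIpcOpenMemHandle((void **)&in, memHand[0], cudaIpcMemLazyEnablePeerAccess));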
Votes: 1
Original content provided by Stack Overflow. Original link:

https://stackoverflow.com/questions/73871402
