首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >针对多个GPU的SLI

针对多个GPU的SLI
EN

Stack Overflow用户
提问于 2012-06-21 00:59:32
回答 2查看 9.6K关注 0票数 26

我是CUDA编程的新手,我正在解决一个需要在一台机器上使用多个GPU的问题。我知道为了更好地进行图形编程,需要通过SLI组合多个GPU。但是,对于CUDA编程,我是否也需要通过SLI组合GPU?

EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2012-06-21 01:15:46

不,通常情况下,如果您计划使用GPU进行计算而不是纯图形应用程序,则不希望使用SLI。您将能够从CUDA程序中以独立设备的形式访问这两个GPU。请注意,您需要显式地在GPU之间划分工作。

我无法确切解释为什么SLI不适合计算类应用程序，但这是我在Nvidia论坛上读到的说法，也从IRC频道的其他人那里听到过同样的结论。

票数 27
EN

Stack Overflow用户

发布于 2022-02-25 20:38:25

即使没有SLI，您也可以在多个GPU上使用CUDA，甚至可以混用不同架构的GPU，但您必须编写额外的代码来划分工作并同步各个子任务。下面是一个简单的程序，在3个GPU上对示例内核vectorAdd做负载均衡（一个Pascal架构的GT1030 GPU加上两个Kepler架构的K420 GPU，它们在同一个任务池中协同工作没有问题）：

代码语言：cuda (C++)
复制
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>

// for load balancing between 3 different GPUs
#include "LoadBalancerX.h"

/**
 * CUDA kernel: element-wise vector addition, C = A + B.
 *
 * Expects a 1-D launch covering at least numElements threads in total;
 * any surplus threads fall through the bounds check and do nothing, so
 * the grid may safely over-cover the arrays.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Flat global thread index across the 1-D grid.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard: the grid rarely divides the data exactly, so trailing
    // threads must not touch memory past the end of the arrays.
    if (idx < numElements)
        C[idx] = A[idx] + B[idx];
}


#include<iostream>
#include<map>
int
main(void)
{
    // Total problem size and the size of each unit of work ("grain")
    // handed to the load balancer. numElements is a multiple of
    // numElementsPerGrain, so every grain copies exactly
    // numElementsPerGrain elements and the per-grain offsets stay in
    // bounds. NOTE(review): production code should CUDA-error-check
    // every runtime call; checks are omitted here to keep the sample
    // close to the original tutorial.
    int numElements = 1500000;
    int numElementsPerGrain = 50000;
    size_t size = numElements * sizeof(float);

    // Pinned (page-locked) host buffers: required for cudaMemcpyAsync
    // to actually overlap with kernel execution.
    float *h_A; cudaMallocHost((void**)&h_A,size);
    float *h_B; cudaMallocHost((void**)&h_B,size);
    float *h_C; cudaMallocHost((void**)&h_C,size);

    // Fill the inputs with values in [0, 1].
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    /*
     * default tutorial vecAdd logic (single-GPU, kept for reference)

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);


    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    cudaGetLastError();


    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    */

    /* load-balanced 3-GPU version setup */

    // Per-grain device-side state: one device buffer triple and one
    // stream per GPU id that ever processed this grain. The destructor
    // releases everything so grains clean up after themselves.
    class GrainState
    {
    public:
        int offset;   // first element of this grain in the host arrays
        int range;    // number of elements in this grain
        std::map<int,float *> d_A;           // per-GPU device input A
        std::map<int,float *> d_B;           // per-GPU device input B
        std::map<int,float *> d_C;           // per-GPU device output C
        std::map<int,cudaStream_t> stream;   // per-GPU work stream
        ~GrainState(){
            for(auto a:d_A)
                cudaFree(a.second);
            for(auto b:d_B)
                cudaFree(b.second);
            for(auto c:d_C)
                cudaFree(c.second);
            for(auto s:stream)
                cudaStreamDestroy(s.second);
        }
    };

    // Per-device state: which CUDA device to select, and whether the
    // "device" is a GPU at all (amIgpu==0 means run on the host CPU).
    class DeviceState
    {
    public:
        int gpuId;
        int amIgpu;
    };

    LoadBalanceLib::LoadBalancerX<DeviceState,GrainState> lb;
    lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({0,1})); // 1st cuda gpu in computer
    lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({1,1})); // 2nd cuda gpu in computer
    lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({2,1})); // 3rd cuda gpu in computer
    //lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({3,0})); // CPU single core

    // Queue one grain per numElementsPerGrain-sized slice. Each grain is
    // defined by five callbacks: init, upload, compute, download, sync.
    for(int i=0;i<numElements;i+=numElementsPerGrain)
    {
        lb.addWork(LoadBalanceLib::GrainOfWork<DeviceState,GrainState>(
                // init: lazily allocate this grain's stream and buffers
                // on whichever GPU the balancer assigned it to.
                [&,i](DeviceState gpu, GrainState& grain){
                    if(gpu.amIgpu)
                    {
                        cudaSetDevice(gpu.gpuId);
                        cudaStreamCreate(&grain.stream[gpu.gpuId]);
                        cudaMalloc((void **)&grain.d_A[gpu.gpuId], numElementsPerGrain*sizeof(float));
                        cudaMalloc((void **)&grain.d_B[gpu.gpuId], numElementsPerGrain*sizeof(float));
                        cudaMalloc((void **)&grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float));
                    }
                },
                // upload: async host->device copy of this grain's slice.
                [&,i](DeviceState gpu, GrainState& grain){
                    if(gpu.amIgpu)
                    {
                        cudaSetDevice(gpu.gpuId);
                        cudaMemcpyAsync(grain.d_A[gpu.gpuId], h_A+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
                        cudaMemcpyAsync(grain.d_B[gpu.gpuId], h_B+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
                    }
                },
                // compute: run the kernel on the GPU, or a plain loop on
                // the CPU fallback device.
                [&,i](DeviceState gpu, GrainState& grain){
                    if(gpu.amIgpu)
                    {
                        // Ceil-division keeps the launch correct even if
                        // the grain size is not a multiple of the block
                        // size (the kernel bounds-checks the tail).
                        int threadsPerBlock = 256;
                        int blocksPerGrid = (numElementsPerGrain + threadsPerBlock - 1) / threadsPerBlock;
                        // Fix: the element count must be the grain size,
                        // not numElements-i — the per-grain device
                        // buffers only hold numElementsPerGrain floats,
                        // so a larger bound invites out-of-range access.
                        vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, grain.stream[gpu.gpuId]>>>(grain.d_A[gpu.gpuId], grain.d_B[gpu.gpuId], grain.d_C[gpu.gpuId], numElementsPerGrain);
                    }
                    else
                    {
                        for(int j=0;j<numElementsPerGrain;j++)
                        {
                            const int index = j+i;
                            h_C[index]=h_A[index]+h_B[index];
                        }
                    }
                },
                // download: async device->host copy of the result slice.
                [&,i](DeviceState gpu, GrainState& grain){
                    if(gpu.amIgpu)
                    {
                       cudaMemcpyAsync(h_C+i, grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float), cudaMemcpyDeviceToHost,grain.stream[gpu.gpuId]);
                    }
                },
                // sync: block until this grain's stream has drained.
                [&,i](DeviceState gpu, GrainState& grain){
                    if(gpu.amIgpu)
                    {
                        cudaStreamSynchronize(grain.stream[gpu.gpuId]);
                    }
                }
        ));
    }

    /* load-balance setup end*/

    // Run 100 iterations and average the wall time.
    size_t nanoseconds=0;

    for(int i=0;i<100;i++)
    {
        nanoseconds += lb.run();

    }

    // 12 bytes moved per element: read A (4) + read B (4) + write C (4).
    std::cout<<nanoseconds/100.0<<"ns  ("<<((numElements*12.0/(nanoseconds/100.0)))<<"GB/s)"<<std::endl;


    std::cout<<"??"<<std::endl;

    // Spot-check one element per grain.
    for (int i = 0; i < numElements; i+=numElementsPerGrain)
    {
        std::cout<<h_A[i]<<" + "<<h_B[i]<<" = "<<h_C[i]<<std::endl;
    }
    auto z = lb.getRelativePerformancesOfDevices();
    std::cout<<"work distribution to devices:"<<std::endl;
    for(auto zz:z)
    {
        std::cout<<zz<<"% ";
    }
    std::cout<<std::endl;
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);

    return 0;
}
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/11124494

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档