我目前正在研究一种GPU呈现算法,在该算法中,我需要对这个结构的数组进行排序:
// One element of the radiosity buffer that is sorted every frame.
// 4*4 + 4*4 + 4 + 4 + 2*4 = 48 bytes per element.
struct RadiosityData {
// vec4: GLSL-style 4-float vector -- presumably 16 bytes; TODO confirm the host-side definition
vec4 emission;
vec4 radiosity;
float nPixLight;
float nPixCam; // sort key (see the operator< below)
float __padding[2];
};我使用以下代码对数组进行排序:
thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
thrust::sort(dev_ptr, dev_ptr + N);其中GPUpointer_ssbo是来自cudaOpenGL互操作的GPU指针,N等于~300 k。通过以下方法进行比较:
__host__ __device__ bool operator<(const RadiosityData& lhs, const RadiosityData& rhs) { return (lhs.nPixCam > rhs.nPixCam); };在我的GTX960M上排序非常慢:没有排序时,我的应用程序每帧耗时10 ms,而加上排序则需要35 ms左右。这意味着排序本身大约需要25 ms。我正在用VS测量执行时间。
我知道这个问题可能是GPU同步问题,因为我在调用thrust之前正在执行OpenGL操作。然而,我不相信这个论点,因为如果我使用未排序的数组直接用OpenGL显示数据,每帧总耗时仍然是10 ms,这意味着OpenGL代码本身没有同步问题。
对于这样的“小”数组,这种性能是预期的吗?对于这类问题,是否有更好的GPU排序算法?
驱动程序API (NVCC编译类型为.cubin,.gpu,或.ptx)设置CUDAFE_ -ccbin =--sdk_dir "C:\Program (x86)\Windows \10\“"C:\Program \NVIDIA计算Toolkit\CUDA\v10.2\bin\nvcc.exe”-使用-local-ccbin "C:\Program (x86)\Microsoft Visual -x -x cu -备存-dir x 64\Release -maxrregcount=0 ---machine 64 -编译-cudart静态-o x64\Release\sortBufferCUDA.cu.obj
运行时API (NVCC编译类型是混合对象或.c文件)设置CUDAFE_ -ccbin = -- sdk_dir "C:\Program (x86)\Windows Kits\10“"C:\Program \NVIDIA计算Toolkit\CUDA\v10.2\bin\nvcc.exe”--使用-local-ccbin "C:\Program (x86)\Microsoft Visual -ccbin -x cu --保存-dir x 64 \Release -maxrregcount=0 --机器64-编译-cudart静态-Xcompiler "/EHsc /nologo /Fd /FS /Zi“-o x64\Release\sortBufferCUDA.cu.obj /Zi
-编辑2:
以下是一个最低限度的工作示例:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/device_vector.h>
// One element of the radiosity buffer that is sorted every frame (48 bytes).
struct RadiosityData {
float emission[4];
float radiosity[4];
float nPixLight;
float nPixCam; // sort key (see the operator< below)
float __padding[2];
};
// C-linkage entry point so the sort can be called from non-CUDA translation units.
extern "C" void CUDAsort();
// NOTE: named operator< but compares with '>', so thrust::sort yields
// DESCENDING order by nPixCam.
__host__ __device__ bool operator<(const RadiosityData& lhs, const RadiosityData& rhs) { return (lhs.nPixCam > rhs.nPixCam); };
// First-call flag used for lazy allocation inside CUDAsort().
int pri = 1;
// Global device buffer. NOTE(review): its destructor runs at program exit,
// possibly after the CUDA runtime has been torn down -- a known hazard with
// file-scope thrust::device_vector objects; confirm the teardown order.
thrust::device_vector<RadiosityData> dev;
// Sorts the global device buffer every call; the buffer itself is allocated
// lazily (300,000 elements) on the very first call, latched by `pri`.
void CUDAsort() {
// one-shot initialization
if (pri != 0) {
dev.resize(300000);
pri = 0;
}
// descending by nPixCam, via the operator< defined above
thrust::sort(dev.begin(), dev.end());
}
// Benchmark driver: repeatedly times CUDAsort() with CUDA events and prints
// the per-iteration elapsed time in milliseconds. Runs forever (Ctrl+C to stop).
int main()
{
float time = 0.0f;
cudaEvent_t start, stop;
// Create the events ONCE. The original created a fresh pair on every
// iteration of the infinite loop and never called cudaEventDestroy,
// leaking event handles each frame.
cudaEventCreate(&start);
cudaEventCreate(&stop);
while (true) {
cudaEventRecord(start, 0);
CUDAsort();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // block the host until the sort has finished
cudaEventElapsedTime(&time, start, stop);
printf("Time to generate: %3.1f ms \n", time);
}
// Unreachable, but preserves the original contract.
return 0;
}
发布于 2020-06-07 02:55:15
在排序过程中,请求推力移动48字节结构当然是可能的,但可能不是最有效的方法。
我们可以尝试的做法是:间接地对结构体数组(AoS)进行排序:
把用于排序的 float 值从结构体数组中提取到一个单独的 float 数组中,同时生成一个索引数组 0 1 2 3 …;
对该 float 数组按键排序(sort_by_key),并同步携带索引数组;
再利用排好序的索引数组,对原结构体数组做一次置换复制(permuted copy)到新数组中。
这看起来是很多工作,但根据我的测试,它实际上要快一些:
$ cat t30.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/execution_policy.h>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>
#define USECPSEC 1000000ULL
// Wall-clock helper: returns (current time-of-day in microseconds) minus
// `start`. Call with 0 to obtain a timestamp, then pass that timestamp back
// in to obtain the elapsed interval in microseconds.
long long dtime_usec(unsigned long long start){
struct timeval now;
gettimeofday(&now, 0);
const unsigned long long now_usec = (unsigned long long)now.tv_sec * USECPSEC + (unsigned long long)now.tv_usec;
return (long long)(now_usec - start);
}
// 48-byte payload element; sorted by nPixCam (descending -- see operator< below).
struct RadiosityData {
#ifdef USE_VEC
// float4 is a CUDA vector type; per the answer's measurements the vector
// form makes moving the AoS noticeably faster than the 4-element arrays.
float4 emission;
float4 radiosity;
#else
float emission[4];
float radiosity[4];
#endif
float nPixLight;
float nPixCam; // sort key
float __padding[2]; // pads the struct to 48 bytes
};
// Extraction kernel for the indirect sort: writes each element's sort key
// (nPixCam) into f and the identity permutation into i.
// Launch over a 1-D grid covering at least n threads; surplus threads exit.
__global__ void copyKernel(RadiosityData *d, float *f, int *i, int n){
const int gid = threadIdx.x + blockDim.x * blockIdx.x;
if (gid >= n) return;
f[gid] = d[gid].nPixCam;
i[gid] = gid;
}
__host__ __device__ bool operator<(const RadiosityData &lhs, const RadiosityData &rhs) { return (lhs.nPixCam > rhs.nPixCam); };
// Comparator functor equivalent to operator< above (descending by nPixCam).
// NOTE(review): declared but never referenced in main() below -- the
// sort_by_key call relies on thrust's default (ascending) float comparison.
struct my_sort_functor
{
template <typename T1, typename T2>
__host__ __device__ bool operator()(T1 lhs, T2 rhs) { return (lhs.nPixCam > rhs.nPixCam); };
};
// Element count, matching the ~300k of the question.
const int N = 300000;
// Benchmark comparing two ways to sort the AoS:
//   method 1: thrust::sort directly on the 48-byte structs (operator< above);
//   method 2: extract float key + index array, sort_by_key, then one
//             permuted gather of the AoS.
// NOTE(review): method 1 runs before the rand() fill below, so it sorts
// value-initialized keys while method 2 sorts random keys -- the two timings
// are not taken on identical key distributions. Also, method 1 orders
// descending while method 2 uses ascending default comparison -- confirm
// whether the exact ordering direction matters to the caller.
int main(){
RadiosityData *GPUpointer_ssbo, *o;
int sz = N*sizeof(RadiosityData);
// Device-side AoS; the raw pointer stands in for the question's
// OpenGL-interop SSBO pointer.
thrust::device_vector<RadiosityData> ii(N);
GPUpointer_ssbo = thrust::raw_pointer_cast(ii.data());
thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
//method 1: ordinary thrust sort
long long dt = dtime_usec(0);
thrust::sort(dev_ptr, dev_ptr+N);
cudaDeviceSynchronize(); // thrust::sort is asynchronous w.r.t. the host; sync before stopping the clock
dt = dtime_usec(dt);
std::cout << "ordinary sort time: " << dt/(float)USECPSEC << "s" << std::endl;
//method 2: reduced sort-and-copy
cudaMalloc(&o, sz); // destination of the permuted gather (never freed; process exits right after)
thrust::device_ptr<RadiosityData> dev_optr = thrust::device_pointer_cast(o);
for (int i = 0; i < N; i++) {RadiosityData q{0}; q.nPixCam = rand(); ii[i] = q;} // host-side fill: one element-wise H2D assignment per iteration (setup only, not timed)
float *d;
int *k;
cudaMalloc(&d, N*sizeof(float)); // extracted sort keys
cudaMalloc(&k, N*sizeof(int));   // index / permutation array
thrust::device_ptr<int> dev_kptr = thrust::device_pointer_cast(k);
cudaDeviceSynchronize();
dt = dtime_usec(0);
copyKernel<<<(N+511)/512, 512>>>(GPUpointer_ssbo, d, k, N); // extract keys + identity indices
thrust::sort_by_key(thrust::device, d, d+N, k); // sorts 4-byte keys, dragging only 4-byte indices along
thrust::copy(thrust::make_permutation_iterator(dev_ptr, dev_kptr), thrust::make_permutation_iterator(dev_ptr, dev_kptr+N), dev_optr); // gather the AoS in sorted order
cudaMemcpy(GPUpointer_ssbo, o, sz, cudaMemcpyDeviceToDevice); // copy the sorted AoS back in place
cudaDeviceSynchronize();
dt = dtime_usec(dt);
std::cout << "sort+copy time: " << dt/(float)USECPSEC << "s" << std::endl;
}
$ nvcc -o t30 t30.cu -arch=sm_52
$ ./t30
ordinary sort time: 0.009527s
sort+copy time: 0.003143s
$ nvcc -o t30 t30.cu -arch=sm_52 -DUSE_VEC
$ ./t30
ordinary sort time: 0.004409s
sort+copy time: 0.002859s
$(CUDA 10.1.105,GTX960,fedora core 29)
因此,使用改进的方法,我们观察到大约50%或更高的提速。
如果您只想返回排序的top-M元素,使用这种解构的复制方法,我们可以通过缩小复制操作的大小来进行进一步的改进。整个排序是对浮点数进行的,但在复制AoS结果时,只复制了top-M值:
$ cat t30.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/execution_policy.h>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>
#define USECPSEC 1000000ULL
// Microsecond wall-clock helper: with start == 0 it returns the current
// time-of-day in microseconds; pass a previous return value to get the
// elapsed microseconds since that point.
long long dtime_usec(unsigned long long start){
struct timeval t;
gettimeofday(&t, 0);
const unsigned long long stamp = (unsigned long long)t.tv_sec * USECPSEC + (unsigned long long)t.tv_usec;
return (long long)(stamp - start);
}
// 48-byte payload element; sorted by nPixCam (descending -- see operator< below).
struct RadiosityData {
#ifdef USE_VEC
// CUDA vector type; per the answer's measurements the float4 form makes
// moving the AoS faster than 4-element float arrays.
float4 emission;
float4 radiosity;
#else
float emission[4];
float radiosity[4];
#endif
float nPixLight;
float nPixCam; // sort key
float __padding[2]; // pads the struct to 48 bytes
};
// Extraction kernel for the indirect sort: copies each element's sort key
// (nPixCam) into the flat array f and records the identity permutation in i.
// Launch over a 1-D grid with at least n threads; extra threads return early.
__global__ void copyKernel(RadiosityData *d, float *f, int *i, int n){
const int gid = threadIdx.x + blockDim.x * blockIdx.x;
if (gid >= n) return;
f[gid] = d[gid].nPixCam;
i[gid] = gid;
}
__host__ __device__ bool operator<(const RadiosityData &lhs, const RadiosityData &rhs) { return (lhs.nPixCam > rhs.nPixCam); };
// Comparator functor equivalent to operator< above (descending by nPixCam).
// NOTE(review): declared but never referenced in main() below -- sort_by_key
// relies on thrust's default (ascending) float comparison.
struct my_sort_functor
{
template <typename T1, typename T2>
__host__ __device__ bool operator()(T1 lhs, T2 rhs) { return (lhs.nPixCam > rhs.nPixCam); };
};
// Element count, matching the ~300k of the question.
const int N = 300000;
const int M = 1000; // identifies top-M values to be returned by sort
// Variant of the benchmark where only the top-M sorted elements are needed:
// the full N-element key sort is still performed, but the expensive AoS
// gather/copy-back is shrunk from N to M elements.
// NOTE(review): method 1 runs before the rand() fill, so it sorts
// value-initialized keys while method 2 sorts random ones; method 1 orders
// descending while method 2's sort_by_key uses the ascending default --
// confirm whether the ordering direction matters to the caller.
int main(){
RadiosityData *GPUpointer_ssbo, *o;
int sz = N*sizeof(RadiosityData);
// Device-side AoS; the raw pointer stands in for the question's
// OpenGL-interop SSBO pointer.
thrust::device_vector<RadiosityData> ii(N);
GPUpointer_ssbo = thrust::raw_pointer_cast(ii.data());
thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
//method 1: ordinary thrust sort
long long dt = dtime_usec(0);
thrust::sort(dev_ptr, dev_ptr+N);
cudaDeviceSynchronize(); // sort is asynchronous w.r.t. the host; sync before stopping the clock
dt = dtime_usec(dt);
std::cout << "ordinary sort time: " << dt/(float)USECPSEC << "s" << std::endl;
//method 2: reduced sort-and-copy
cudaMalloc(&o, sz); // destination buffer (never freed; process exits right after)
thrust::device_ptr<RadiosityData> dev_optr = thrust::device_pointer_cast(o);
for (int i = 0; i < N; i++) {RadiosityData q{0}; q.nPixCam = rand(); ii[i] = q;} // host-side fill (setup only, not timed)
float *d;
int *k;
cudaMalloc(&d, N*sizeof(float)); // extracted sort keys
cudaMalloc(&k, N*sizeof(int));   // index / permutation array
thrust::device_ptr<int> dev_kptr = thrust::device_pointer_cast(k);
cudaDeviceSynchronize();
dt = dtime_usec(0);
copyKernel<<<(N+511)/512, 512>>>(GPUpointer_ssbo, d, k, N); // extract keys + identity indices
thrust::sort_by_key(thrust::device, d, d+N, k); // full N-element key sort
thrust::copy_n(thrust::make_permutation_iterator(dev_ptr, dev_kptr), M, dev_optr); // gather only the first M sorted elements
cudaMemcpy(GPUpointer_ssbo, o, M*sizeof(RadiosityData), cudaMemcpyDeviceToDevice); // only the first M elements of the SSBO are overwritten
cudaDeviceSynchronize();
dt = dtime_usec(dt);
std::cout << "sort+copy time: " << dt/(float)USECPSEC << "s" << std::endl;
}
$ nvcc -o t30 t30.cu -arch=sm_52 -lineinfo -DUSE_VEC
$ ./t30
ordinary sort time: 0.004425s
sort+copy time: 0.001042s
$
还有几个注意事项:
还观察到,当4-float量用向量类型(float4)而不是4元素数组表示时,AoS的处理效率更高。还注意到,根据我的测试,为正确的GPU体系结构编译(在我的例子中是sm_52)似乎是一个小小的改进。YMMV.我不主张这段代码或我发布的任何其他代码的正确性。任何使用我发布的代码的人都会冒着自己的风险。我只是声称我曾试图解决原文中的问题,并对此作了一些解释。我并不是说我的代码是无缺陷的,或者它适合于任何特定的用途。使用它(或不使用)由你自己承担风险。
https://stackoverflow.com/questions/62238424
复制相似问题