在C++中,不可能将命名空间作为某种参数(通过模板或实际的函数参数)传递给类或函数。这同样适用于CUDA(至少据我所知)。以下问题解释了其中的一些原因:为什么命名空间不能作为模板参数?
下面是一个用例示例:
// Experiment 1: one self-contained group of functions sharing the names and
// signatures that a generic test driver is expected to call.
namespace experiment1
{
// Returns the repetition count for this experiment (2).
int repetitions() { return 2; }
// Body elided in the post; presumably prepares the experiment before f runs.
void setup() { ... }
// Body elided in the post.
// NOTE(review): the surrounding text treats f as the CUDA kernel (__global__) — confirm.
void f() { ... }
// Body elided in the post; presumably releases whatever setup() prepared.
void teardown() { ... }
}
// Experiment 2: same function names and signatures as experiment1, so a
// generic test driver could treat both uniformly.
namespace experiment2
{
// Returns the repetition count for this experiment (4).
int repetitions() { return 4; }
// Body elided in the post; presumably prepares the experiment before f runs.
void setup() { ... }
// Body elided in the post.
// NOTE(review): the surrounding text treats f as the CUDA kernel (__global__) — confirm.
void f() { ... }
// Body elided in the post; presumably releases whatever setup() prepared.
void teardown() { ... }
}
// Beware, this is invalid C++ and invalid CUDA
template<namespace NS>
void do_test()
{
// Do something with NS::repetitions(), NS::setup(), ...
}
这在C++中无效的原因之一是,这种方法能做到的事情,没有哪一件是用类做不到的。您确实可以将每个命名空间转换为类,将函数转换为成员函数,然后将类作为模板参数传递给do_test函数,或者将类的实例作为参数传递给同一个函数(前一种情况下可以使用静态函数,后一种情况下可以使用虚函数)。
我同意这一点。但是,在CUDA的具体情况下,有些事情用命名空间可以做到,用类却做不到。假设f是一个内核,即一个__global__函数,而setup或其他函数用来指定(例如)要分配给该内核的共享内存大小。内核不能是类的成员(请参阅这个问题的答案:CUDA内核可以是虚函数吗?)。但是,您可以将它与同一实验的其他函数一起放在同一个namespace中。
考虑上面代码中所示的情况:do_test是设置计时器、准备输入、检查输出、测量时间和执行其他操作的函数。每个实验都是由几个函数组成的集合,具有相同的名称和相同的接口,其中一个是内核。您希望do_test具有足够的通用性来处理所有这些实验。您希望每个实验的代码都以某种封装形式自成一体,如命名空间、结构体、类等。
这个问题能否解决呢?
根据“魔芋”的要求(顺便说一句,非常感谢你的评论),我会让这个问题更加具体。
我有几个非常简单的内核,它们执行类似的操作。它们从一个大数组中加载值,对其应用模板(stencil)操作,并将结果写入输出数组(与输入数组不同)。所谓模板操作,是指线程idx对输入值idx及其相邻值(例如从idx-3到idx+3)执行的操作。其中最简单的内核只执行从输入到输出的复制:每个线程读取input[idx]并写入output[idx]。另一个例子是执行output[idx] = input[idx+1] - input[idx-1]的差分模板。(我省略了一些细节,但您应该明白了。)
我想对这些内核进行基准测试,以获得性能模型。对于每个内核,我还需要一个能够检查结果的主机函数。在每一种情况下,我都有一个内核,它通过优化以稍微不同的方式执行相同的操作,但从结果的角度来看是等价的。最后,我有一个主机函数,它打印内核的名称。下面是代码中的摘要:
// Experiment "copy": groups the name, two kernel variants and the result
// checker for one benchmark under a single namespace.
namespace copy
{
// Returns the label for this experiment, "copy".
std::string name() { return "copy"; }
// Kernel declaration only; the definition is not shown here.
__global__ void kernel(const float* input, float* output, int size);
// Second kernel variant with the same signature as kernel(); declaration only.
__global__ void kernelOptimized(const float* input, float* output, int size);
// Host-side checker taking the input and output arrays; declaration only.
bool check(const float* input, const float* output);
}
namespace difference
{
std::string name() { return "difference"; }
__global__ void kernel(const float* input, float* output, int size);
__global__ void kernelOptimized(const float* input, float* output, int size);
bool check(const float* input, const float* output);
}
我有一个函数do_test,我把它参数化为泛型:
// Signature of an experiment's name(): takes nothing and returns the label.
// This was previously a copy of CheckFunction's signature, which does not
// match `std::string name()` as declared in the experiment namespaces.
typedef std::string NameFunction();
// Signature of an experiment's host-side result checker.
typedef bool CheckFunction(const float* input, const float* output);
// Signature shared by the baseline and the optimized kernel.
typedef void KernelFunction(const float* input, float* output, int size);

// Generic benchmark driver for one experiment.
//   name    - returns the experiment's label
//   kernel1 - first kernel variant
//   kernel2 - second kernel variant
//   check   - host-side checker for the kernels' output
// The body is an outline only; the steps are listed as comments.
void do_test(NameFunction name, KernelFunction kernel1, KernelFunction kernel2, CheckFunction check)
{
    // Set up input and output array
    // Set up CUDA events
    // Warm up kernels
    // Run kernels
    // Check results
    // Measure time
    // Do standard output
}

// Compile-time flavour of do_test: the four functions are supplied as template
// arguments, so calls written as do_test<ns::name, ns::kernel, ...>() resolve.
// It simply forwards to the function-argument overload above.
template <NameFunction name, KernelFunction kernel1, KernelFunction kernel2, CheckFunction check>
void do_test()
{
    do_test(name, kernel1, kernel2, check);
}
int main()
{
do_test<copy::name, copy::kernel, copy::kernelOptimized, copy::check>()
do_test<difference::name, difference::kernel, difference::kernelOptimized, difference::check>()
}
现在,当然这种方式已经相当不错了。但是,如果我再引入一个每个实验都必须提供的函数,我将需要修改我调用do_test的所有这些行。我更喜欢传递这个命名空间或包含这些函数的某种对象。
发布于 2015-08-10 11:36:54
您可以将内核修改为“只是”__device__函数,然后通过kernel_wrapper调用该函数。
#include <iostream>
#include <stdio.h>
// Pointer to a function with the signature every experiment kernel shares.
typedef void (*kernel_ptr)(const float* input, float* output, int size);

// Generic __global__ entry point. The function to run is chosen at compile
// time through the template argument; the wrapper just forwards its
// arguments to it.
template <kernel_ptr device_fn>
__global__ void kernel_wrapper(const float* input, float* output, int size)
{
    device_fn(input, output, size);
}
// Experiment "copy" expressed as a type so it can be passed as a template
// argument. The two kernels are static __device__ functions; in this demo
// they only print the calling thread's x index and ignore their arguments.
struct copy
{
    // Label for this experiment.
    std::string name()
    {
        return "copy";
    }

    // Baseline variant.
    __device__ static void kernel(const float* input, float* output, int size)
    {
        printf("copy: %d\n", threadIdx.x);
    }

    // Optimized variant.
    __device__ static void kernelOptimized(const float* input, float* output, int size)
    {
        printf("copy optimized: %d\n", threadIdx.x);
    }
};
// Experiment "difference" expressed as a type so it can be passed as a
// template argument. The two kernels are static __device__ functions; in this
// demo they only print the calling thread's x index and ignore their arguments.
struct difference
{
    // Label for this experiment.
    std::string name() { return "difference"; }

    // Baseline variant. (The parameter list previously read `i nt size`,
    // a typo that made the declaration ill-formed.)
    __device__ static void kernel(const float* input, float* output, int size)
    {
        printf("difference: %d\n", threadIdx.x);
    }

    // Optimized variant.
    __device__ static void kernelOptimized(const float* input, float* output, int size)
    {
        printf("difference optimized: %d\n", threadIdx.x);
    }
};
template <typename Experiment>
void do_test()
{
dim3 dimBlock( 4, 1 );
dim3 dimGrid( 1, 1 );
Experiment e;
std::cout << "running experiment " << e.name() << std::endl;
std::cout << "launching the normal kernel" << std::endl;
kernel_wrapper<Experiment::kernel><<<dimGrid, dimBlock>>>(0,0,0);
cudaDeviceSynchronize();
std::cout << "launching the optimized kernel" << std::endl;
kernel_wrapper<Experiment::kernelOptimized><<<dimGrid, dimBlock>>>(0,0,0);
cudaDeviceSynchronize();
}
int main()
{
do_test<copy>();
do_test<difference>();
return 0;
}
输出
running experiment copy
launching the normal kernel
copy: 0
copy: 1
copy: 2
copy: 3
launching the optimized kernel
copy optimized: 0
copy optimized: 1
copy optimized: 2
copy optimized: 3
running experiment difference
launching the normal kernel
difference: 0
difference: 1
difference: 2
difference: 3
launching the optimized kernel
difference optimized: 0
difference optimized: 1
difference optimized: 2
difference optimized: 3
或者,您可以使用CRTP和模板特化的组合:
#include <iostream>
#include <stdio.h>
template <typename Experiment>
__global__ void f();
template <typename Derived>
struct experiment
{
void run()
{
int blocksize = static_cast<Derived*>(this)->blocksize();
int reps = static_cast<Derived*>(this)->repetitions();
for (int i = 0; i<reps; ++i)
{
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
f<Derived><<<dimGrid, dimBlock>>>();
}
cudaDeviceSynchronize();
}
};
// Experiment 1: two launches of four threads each. The constructor and
// destructor print a message when the object is created and destroyed.
struct experiment1 : experiment<experiment1>
{
    experiment1()
    {
        std::cout << "setting up experiment 1" << std::endl;
    }

    ~experiment1()
    {
        std::cout << "shutting down experiment 1" << std::endl;
    }

    int blocksize()
    {
        return 4;
    }

    int repetitions()
    {
        return 2;
    }
};

// Kernel for experiment 1: prints the calling thread's x index.
template <>
__global__ void f<experiment1>()
{
    printf("experiment1: %d\n", threadIdx.x);
}
// Experiment 2: four launches of two threads each. The constructor and
// destructor print a message when the object is created and destroyed.
struct experiment2 : experiment<experiment2>
{
    experiment2()
    {
        std::cout << "setting up experiment 2" << std::endl;
    }

    ~experiment2()
    {
        std::cout << "shutting down experiment 2" << std::endl;
    }

    int blocksize()
    {
        return 2;
    }

    int repetitions()
    {
        return 4;
    }
};

// Kernel for experiment 2: prints the calling thread's x index.
template <>
__global__ void f<experiment2>()
{
    printf("experiment2: %d\n", threadIdx.x);
}
// Creates an Experiment, calls run() on it, and destroys it again, so the
// order is always: constructor, run(), destructor.
template <typename Experiment>
void do_test()
{
    Experiment().run();
}
#include <iostream>
#include <stdio.h>
template <typename Experiment>
__global__ void f();
template <typename Derived>
struct experiment
{
void run()
{
int blocksize = static_cast<Derived*>(this)->blocksize();
int reps = static_cast<Derived*>(this)->repetitions();
for (int i = 0; i<reps; ++i)
{
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
f<Derived><<<dimGrid, dimBlock>>>();
}
cudaDeviceSynchronize();
}
};
// Experiment 1: two launches of four threads each. The constructor and
// destructor print a message when the object is created and destroyed.
struct experiment1 : experiment<experiment1>
{
    experiment1()
    {
        std::cout << "setting up experiment 1" << std::endl;
    }

    ~experiment1()
    {
        std::cout << "shutting down experiment 1" << std::endl;
    }

    int blocksize()
    {
        return 4;
    }

    int repetitions()
    {
        return 2;
    }
};

// Kernel for experiment 1: prints the calling thread's x index.
template <>
__global__ void f<experiment1>()
{
    printf("experiment1: %d\n", threadIdx.x);
}
// Experiment 2: four launches of two threads each. The constructor and
// destructor print a message when the object is created and destroyed.
struct experiment2 : experiment<experiment2>
{
    experiment2()
    {
        std::cout << "setting up experiment 2" << std::endl;
    }

    ~experiment2()
    {
        std::cout << "shutting down experiment 2" << std::endl;
    }

    int blocksize()
    {
        return 2;
    }

    int repetitions()
    {
        return 4;
    }
};

// Kernel for experiment 2: prints the calling thread's x index.
template <>
__global__ void f<experiment2>()
{
    printf("experiment2: %d\n", threadIdx.x);
}
// Creates an Experiment, calls run() on it, and destroys it again, so the
// order is always: constructor, run(), destructor.
template <typename Experiment>
void do_test()
{
    Experiment().run();
}
int main()
{
do_test<experiment1>();
do_test<experiment2>();
return 0;
}
输出
setting up experiment 1
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
shutting down experiment 1
setting up experiment 2
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
shutting down experiment 2
https://stackoverflow.com/questions/31917461
复制相似问题