每当我用cuFFT绘制程序得到的值,并将结果与Matlab的结果进行比较时,就得到了相同形状的图,最大值和极小值是在相同的点上得到的。然而,cuFFT产生的值比Matlab产生的值大得多。Matlab代码是
fs = 1000; % sample freq
D = [0:1:4]'; % pulse delay times
t = 0 : 1/fs : 4000/fs; % signal evaluation time
w = 0.5; % width of each pulse
yp = pulstran(t,D,'rectpuls',w);
filt = conj(fliplr(yp));
xx = fft(yp,1024).*fft(filt,1024);
xx = (abs(ifft(xx))); 输入相同的数据自动化系统代码如下:
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);
cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal, CUFFT_FORWARD);
ComplexPointwiseMul<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX);
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);cuFFT还执行批处理大小为2的1024点FFT。
使用NX=1024的缩放因子,这些值不正确。请告诉我该怎么做。
发布于 2014-05-29 16:40:24
这是从未回答的列表中删除此问题的延迟回答。
您没有提供足够的信息来诊断您的问题,因为您缺少指定设置cuFFT计划的方式。您甚至没有为Matlab和cuFFT的信号指定完全相同的形状(所以您只是缩放),或者您的形状大致相同。不过,让我提出以下两点意见:
yp向量有4000元素;与fft(yp,1024)相反的是,通过截断信号到1024元素来执行快速傅立叶变换;为了方便(它可能对其他用户有用),我在下面报告一个简单的FFT-IFFT方案,其中还包括使用CUDA推力库执行的缩放。
#include <cufft.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
/*********************/
/* SCALE BY CONSTANT */
/*********************/
class Scale_by_constant
{
private:
float c_;
public:
Scale_by_constant(float c) { c_ = c; };
__host__ __device__ float2 operator()(float2 &a) const
{
float2 output;
output.x = a.x / c_;
output.y = a.y / c_;
return output;
}
};
int main(void){
const int N=4;
// --- Setting up input device vector
thrust::device_vector<float2> d_vec(N,make_cuComplex(1.f,2.f));
cufftHandle plan;
cufftPlan1d(&plan, N, CUFFT_C2C, 1);
// --- Perform in-place direct Fourier transform
cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_FORWARD);
// --- Perform in-place inverse Fourier transform
cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_INVERSE);
thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), Scale_by_constant((float)(N)));
// --- Setting up output host vector
thrust::host_vector<float2> h_vec(d_vec);
for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_vec[i].x,h_vec[i].y);
getchar();
}发布于 2014-10-07 21:37:09
通过引入cuFFT回调功能,通过将cuFFT的规范化操作定义为__device__函数,可以将cufftExecC2C调用直接嵌入到cufftExecC2C调用中。
除了cuFFT用户指南,有关cuFFT回调特性,请参见
CUDA Pro提示:使用cuFFT回调进行自定义数据处理
下面是通过cuFFT回调实现IFFT规范化的一个示例。
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 16;
cufftHandle plan;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
for (int i=0; i<N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));
cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}编辑
CUFFT_CB_ST_COMPLEX在对cufftXtSetCallback的调用中指定了执行回调操作的“时刻”。注意,您可以使用相同的cuFFT计划加载和存储回调。
发布于 2016-12-05 16:10:04
性能
我还添加了一个进一步的答案来比较这个特定的IFFT扩展情况下的回调性能和相同代码的非回调版本。我使用的代码是
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
#include <thrust/device_vector.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define DISPLAY
/*******************************/
/* THRUST FUNCTOR IFFT SCALING */
/*******************************/
class Scale_by_constant
{
private:
float c_;
public:
Scale_by_constant(float c) { c_ = c; };
__host__ __device__ float2 operator()(float2 &a) const
{
float2 output;
output.x = a.x / c_;
output.y = a.y / c_;
return output;
}
};
/**********************************/
/* IFFT SCALING CALLBACK FUNCTION */
/**********************************/
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 100000000;
cufftHandle plan; cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
TimingGPU timerGPU;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
// --- Callback function parameters
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
// --- Initializing the input on the host and moving it to the device
for (int i = 0; i < N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N * sizeof(float2), cudaMemcpyHostToDevice));
// --- Execute direct FFT on the device and move the results to the host
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output1, d_output1, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
#endif
// --- Execute inverse FFT with subsequent scaling on the device and move the results to the host
timerGPU.StartCounter();
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
thrust::transform(thrust::device_pointer_cast(d_output2), thrust::device_pointer_cast(d_output2) + N, thrust::device_pointer_cast(d_output2), Scale_by_constant((float)(N)));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
printf("Timing NO callback %f\n", timerGPU.GetCounter());
// --- Setup store callback
// timerGPU.StartCounter();
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
// --- Execute inverse callback FFT on the device and move the results to the host
timerGPU.StartCounter();
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
#ifdef DISPLAY
gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
printf("Timing callback %f\n", timerGPU.GetCounter());
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}对于如此大的一维阵列和简单的处理(缩放),开普勒K20c的定时如下
Non-callback 69.029762 ms
Callback 65.868607 ms因此,情况并没有多大改善。我希望您所看到的改进是由于在非回调情况下避免了单独的内核调用。对于较小的一维数组,要么没有改进,要么非回调情况运行得更快。
https://stackoverflow.com/questions/14441142
复制相似问题