经过长时间的手动调优和零零碎碎的调试,我终于找到了导致内核崩溃的原因。难道内核中的 for 循环不能执行太多次迭代吗?下面是一个最小、完整且可验证的示例(MCVE)。运行时报告的错误是:unspecified launch failure(未指明的启动失败)。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
using namespace std;
typedef unsigned char uchar;

/**
 * Small record describing a link to another node.
 *
 * TreeNode uses it for its `father` member and for each entry of its
 * `children` array.
 */
struct NodePointer
{
    int   id;        // starts at 0
    uchar dist;      // starts at 0
    int   idintree;  // starts at -1; NOTE(review): presumably "no position assigned yet" -- confirm

    // Default state: id 0, dist 0, idintree -1.
    NodePointer()
        : id(0),
          dist(0),
          idintree(-1)
    {
    }
};
/**
 * One element of the flat node array that is handed to the kernel.
 *
 * Holds a link to the father, four fixed child slots and three integer
 * fields. `father` and every entry of `children` are default-constructed
 * NodePointer objects.
 */
struct TreeNode
{
    NodePointer father;       // link record for the parent
    NodePointer children[4];  // always four slots; the count is kept in childrenNum
    int         id;           // starts at 0
    int         childrenNum;  // starts at 0
    int         idintree;     // starts at -1; NOTE(review): presumably "no position assigned yet" -- confirm

    // Default state: id 0, childrenNum 0, idintree -1.
    TreeNode()
        : id(0),
          childrenNum(0),
          idintree(-1)
    {
    }
};
/**
 * Debug kernel that dumps the node array with device-side printf.
 *
 * Every thread whose global 1D index is below NumTN walks the first `size`
 * entries of `tempthistree` and prints, for each entry, the entry index, the
 * thread's own index, and the entry's `idintree` and `childrenNum` fields.
 * Threads with an index >= NumTN do nothing.
 *
 * @param NumTN         upper bound (exclusive) on the thread indices that do work
 * @param tempthistree  node array; must be readable from the device
 * @param size          number of entries of `tempthistree` to print
 */
__global__ void
kernel4(int NumTN, TreeNode* tempthistree, int size)
{
    const int tree = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads leave immediately.
    if (tree >= NumTN)
    {
        return;
    }

    int pos = 0;
    while (pos < size)
    {
        const TreeNode& node = tempthistree[pos];
        printf(" node %d in tree %d, its id in tree is %d, its child num is %d\n", pos, tree, node.idintree, node.childrenNum);
        ++pos;
    }
}
/**
 * Host driver for the reproduction.
 *
 * Allocates n1 TreeNode entries in managed memory, fills them on the host,
 * launches kernel4 with a single thread to print every entry, and reports any
 * CUDA error.
 *
 * @return 0 on success, 1 if the allocation, the launch or the kernel
 *         execution failed.
 */
int main()
{
    const int n1 = 33417;

    TreeNode* testtree = NULL;
    cudaError_t err = cudaMallocManaged(&testtree, n1 * sizeof(TreeNode));
    if (err != cudaSuccess)
    {
        // Without this check a failed allocation would only show up later as
        // a crash in the host fill loop below.
        printf("cudaMallocManaged error :%s\n", cudaGetErrorString(err));
        system("pause");
        return 1;
    }

    // Fill the array on the host; managed memory is directly writable here.
    for (int i = 0; i < n1; i++)
    {
        TreeNode c;
        c.idintree = i;
        c.id = i;
        c.father.id = i - 1;
        c.father.dist = 1;
        c.childrenNum = i % 4;
        for (int j = 0; j < 4; j++)
        {
            c.children[j].dist = j;
            c.children[j].id = 1;
            c.children[j].idintree = 10;
        }
        testtree[i] = c;
    }

    // One block of one thread: only global index 0 passes the kernel's
    // `index < NumTN` guard, so that single thread prints all n1 entries.
    // NOTE(review): n1 device-side printf calls from one thread make this a
    // long-running kernel; on a GPU that also drives a display the OS
    // watchdog may abort it, which would surface as "unspecified launch
    // failure" -- confirm on the target machine.
    kernel4<<<1, 1>>>(4000, testtree, n1);

    // Launch-configuration errors are reported by cudaGetLastError();
    // faults during execution are reported by the synchronizing call.
    err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        err = cudaDeviceSynchronize();
    }

    // Release the allocation on every path. The result is ignored on purpose:
    // after a failed kernel this call may report the same error again.
    cudaFree(testtree);

    if (err != cudaSuccess)
    {
        printf("kernel4 error :%s\n", cudaGetErrorString(err));
        system("pause");
        return 1;
    }
    return 0;
}
// Source: https://stackoverflow.com/questions/44456629
复制相似问题