问CUDA可以并行存储8个无符号字符数据吗
EN

Stack Overflow用户

提问于 2019-09-13 09:10:24

回答 1查看 120关注 0票数 0

我正在尝试将8个无符号字符数据存储到全局内存中。然而，cuda构建的向量类型只支持uchar4。所以，我自己编写uchar_8。然而，当我分析代码时，我发现store行为需要两条STG.E指令，而不是一条STG.E.64。我知道cuda有STG.E.64，那么我如何修改我的代码来让编译器这样做呢？

我已经在我的结构上添加了__align(8)__指令，但它仍然不起作用。我的cuda版本是cuda8.0

typedef struct __align__(8){
    unsigned char x0;
    unsigned char y0;
    unsigned char z0;
    unsigned char w0;
    unsigned char x1;
    unsigned char y1;
    unsigned char z1;
    unsigned char w1;
}uchar_8;

_global__ void yuv420_to_rgb_gpu_(
        const uchar2*  y_component,
        const unsigned char*   u_component,
        const unsigned char*  v_component,
        uchar_8*  rgb_data,
        uint len,
        uint width
){
    uint bx = blockIdx.x;
    uint tx = threadIdx.x;
    uint current_index = bx*BLOCK_SIZE + tx;
    uchar2 y_tmp;
    uchar_8 rgb_tmp;

    if(current_index < len){
        unsigned char u_data = u_component[current_index];
        uint current_line = current_index / width ;
        unsigned char v_data = v_component[current_index];
        uint current_col = current_index - current_line * width;
        uint index_00 = 2*current_line*width + current_col;
        //uint index_01 = 2*current_line*width + 2*current_col + 1;
        y_tmp = y_component[index_00];
        unsigned char y_data_00 = y_tmp.x;
        unsigned char y_data_01 = y_tmp.y;
        uint index_10 = index_00 + width;
       // uint index_11 = 2*current_line*width + 2*current_col + width + 1;
        y_tmp = y_component[index_10];
        unsigned char y_data_10 = y_tmp.x;
        unsigned char y_data_11 = y_tmp.y;

        float r_component_0;
        float g_component_0;
        float b_component_0;
        float r_component_1;
        float g_component_1;
        float b_component_1;



        float r_v_tmp = (1.4075f * (v_data-128.0f));
        float g_v_tmp = (0.7169f * (v_data-128.0f));
        float g_u_tmp = (0.3455f * (u_data-128.0f));
        float y_u_tmp = (1.7790f * (u_data-128.0f));


        r_component_0 = y_data_00 + r_v_tmp;
        g_component_0 = y_data_00 - g_v_tmp - g_u_tmp;
        b_component_0 = y_data_00 + y_u_tmp;
        r_component_1 = y_data_01 + r_v_tmp;
        g_component_1 = y_data_01 - g_v_tmp - g_u_tmp;
        b_component_1 = y_data_01 + y_u_tmp;

        rgb_tmp.x0 = float_to_char(r_component_0);
        rgb_tmp.y0 = float_to_char(g_component_0);
        rgb_tmp.z0 = float_to_char(b_component_0);
        rgb_tmp.w0 = 0;
        rgb_tmp.x1 = float_to_char(r_component_1);
        rgb_tmp.y1 = float_to_char(g_component_1);
        rgb_tmp.z1 = float_to_char(b_component_1);
        rgb_tmp.w1 = 0;
        rgb_data[index_00] = rgb_tmp;




        r_component_0 = y_data_10 + r_v_tmp;
        g_component_0 = y_data_10 - g_v_tmp - g_u_tmp;
        b_component_0 = y_data_10 + y_u_tmp;
        r_component_1 = y_data_11 + r_v_tmp;
        g_component_1 = y_data_11 - g_v_tmp - g_u_tmp;
        b_component_1 = y_data_11 + y_u_tmp;

        rgb_tmp.x0 = float_to_char(r_component_0);
        rgb_tmp.y0 = float_to_char(g_component_0);
        rgb_tmp.z0 = float_to_char(b_component_0);
        rgb_tmp.w0 = 0;
        rgb_tmp.x1 = float_to_char(r_component_1);
        rgb_tmp.y1 = float_to_char(g_component_1);
        rgb_tmp.z1 = float_to_char(b_component_1);
        rgb_tmp.w1 = 0;
       // tmp.w = 0;
        rgb_data[index_10] = rgb_tmp;




    }
}

内存存储只发生在rgb_data[index_00] = rgb_tmp;和rgb_data[index_10] = rgb_tmp;上，反汇编代码如下

        BFI R3, R7, 0x808, R14;
        BFI R5, R9, 0x808, R8;
        LEA R4.CC, R2.reuse, c[0x0][0x158], 0x3;
        BFI R6, R6, 0x810, R3;
        BFI R5, R0, 0x810, R5;
        LEA.HI.X R3, R2, c[0x0][0x15c], RZ, 0x3;
        MOV R2, R4;
        BFI R0, RZ, 0x818, R6;
        {         BFI R4, RZ, 0x818, R5;
        STG.E [R2], R0;        }
        STG.E [R2+0x4], R4;
        EXIT;

cuda

回答 1

Stack Overflow用户

发布于 2019-09-13 17:04:34

只需将评论扩展为答案：

我测试的每个旧版本的编译器(8.0，9.1，10.0)都会在PTX中为内核末尾的uchar_8赋值发出两个st.global.v4.u8指令(即两个32位写入)。另一方面，CUDA10.1发出单个st.global.v4.u16指令来处理写操作。

因此，该解决方案将升级到CUDA 10.1。在此之前的任何工具包都不支持64位写请求。

票数 2

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/57916173

复制

相似问题

问CUDA可以并行存储8个无符号字符数据吗
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问CUDA可以并行存储8个无符号字符数据吗EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问CUDA可以并行存储8个无符号字符数据吗
EN