首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >共享内存和不共享内存的CUDA C++ API的矩阵乘法实现

共享内存和不共享内存的CUDA C++ API的矩阵乘法实现
EN

Code Review用户
提问于 2018-04-16 01:25:53
回答 2查看 551关注 0票数 1

matrix.hpp

通过创建这个结构,我希望保持整洁,避免以后将许多参数传递给函数和内核。

代码语言:javascript
复制
#pragma once

// Lightweight descriptor for a dense, row-major matrix whose element storage
// is allocated externally (e.g. via cudaMallocManaged). Instances are passed
// by value to kernels, so the type must remain trivially copyable — it does
// NOT own `elements`; the caller allocates and frees that buffer.
struct matrix {
    // rows/cols: matrix dimensions; size caches rows * cols so callers can
    // compute allocation sizes without repeating the multiplication.
    matrix(int rows, int cols)
        : elements(nullptr), rows(rows), cols(cols), size(rows * cols) {}
    double *elements;  // not owned; nullptr until the caller allocates it
    int rows;
    int cols;
    int size;          // total element count (rows * cols)
};

kernels.cuh

在这里,我已经放置了内核原型。我写了两个版本的矩阵乘法:一个使用共享内存,一个不使用共享内存。

代码语言:javascript
复制
#pragma once

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "matrix.hpp"

// Kernel prototypes. The SHARED preprocessor symbol (expected to be defined
// on the compiler command line) selects which implementation is compiled:
// SHARED == 1 builds the shared-memory tiled kernel, SHARED == 0 builds the
// naive global-memory kernel. Note: if SHARED is undefined, neither prototype
// is declared.
#if SHARED == 1
    // tile_size: edge length of the square shared-memory tile. The launch
    // must pass 2 * tile_size * tile_size * sizeof(double) bytes of dynamic
    // shared memory, and blockDim is expected to be tile_size x tile_size.
    __global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size);
#elif SHARED == 0
    __global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c);
#endif

kernels.cu

下面是内核的实际实现。

代码语言:javascript
复制
#include "kernels.cuh"

#if SHARED == 1
// Tiled matrix multiplication: c = a * b using shared memory.
// Each thread computes one element of c. The block loads tile_size x tile_size
// sub-tiles of a and b into dynamic shared memory, so the launch must supply
// 2 * tile_size * tile_size * sizeof(double) bytes of dynamic shared memory,
// and blockDim must equal tile_size x tile_size (assumed, not checked).
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size) {
    int bx = blockIdx.x;
    int by = blockIdx.y;

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Global row/column of c this thread is responsible for.
    int row = by * blockDim.y + ty;
    int col = bx * blockDim.x + tx;

    // Dynamic shared memory split into two tiles: a's tile first, b's after.
    extern __shared__ double buffer[];
    double *a_shared = &buffer[0];
    double *b_shared = &buffer[tile_size * tile_size];

    double sum = 0;

    // Iterate over ceil(a.cols / tile_size) tiles along the shared dimension.
    // The loop bound is thread-independent, so all threads reach every
    // __syncthreads() below.
    for (int k = 0; k < (tile_size + a.cols - 1) / tile_size; k++) {
        // Stage this thread's element of a's tile; pad with 0.0 outside the
        // matrix so the inner product below needs no bounds checks.
        if (k * tile_size + tx < a.cols && row < a.rows) {
            a_shared[ty * tile_size + tx] = a.elements[row * a.cols + (k * tile_size + tx)];
        } else {
            a_shared[ty * tile_size + tx] = 0.0;
        }
        // Same staging for b's tile.
        if (k * tile_size + ty < b.rows && col < b.cols) {
            b_shared[ty * tile_size + tx] = b.elements[(k * tile_size + ty) * b.cols + col];
        } else {
            b_shared[ty * tile_size + tx] = 0.0;
        }
        // Barrier: tiles must be fully written before anyone reads them.
        __syncthreads();
#pragma unroll
        for (int n = 0; n < tile_size; ++n) {
            sum += a_shared[ty * tile_size + n] * b_shared[n * tile_size + tx];
        }
        // Barrier: all reads of this tile must finish before the next
        // iteration overwrites the shared buffers.
        __syncthreads();
    }
    // Only threads mapped inside c write a result.
    if (row < c.rows && col < c.cols) {
        c.elements[row * c.cols + col] = sum;
    }
}
#elif SHARED == 0
// Naive matrix multiplication: c = a * b. Each thread computes exactly one
// element of c straight from global memory (no shared-memory tiling).
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c) {
    // Global coordinates of the output element this thread owns.
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads mapped outside c (grid overshoot) do nothing.
    if (row >= c.rows || col >= c.cols) {
        return;
    }

    double acc = 0;
#pragma unroll
    for (int k = 0; k < a.cols && k < b.rows; k++) {
        acc += a.elements[row * a.cols + k] * b.elements[k * b.cols + col];
    }
    c.elements[row * c.cols + col] = acc;
}
#endif

wrappers.cu

我在这个文件中创建了一些wrapper functions,以保持main函数的干净,并向用户提供某种高级抽象。

代码语言:javascript
复制
#include "wrappers.cuh"
#include <iostream>

// Launches the matrix multiplication kernel for c = a * b and prints the
// kernel execution time measured with CUDA events.
// a, b: input matrices; c: output matrix (device-accessible storage);
// block_size: edge length of the square thread block (and, when SHARED == 1,
// the shared-memory tile size).
void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size) {
    cudaError_t error;
    dim3 dimBlock(block_size, block_size, 1);
    // Ceiling division so the grid covers c even when its dimensions are not
    // multiples of the block size; the kernels bounds-check the overshoot.
    dim3 dimGrid((c.cols - 1) / dimBlock.x + 1,
                 (c.rows - 1) / dimBlock.y + 1,
                 1);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float milliseconds = 0;

    cudaEventRecord(start);
#if SHARED == 1
    unsigned int tile_size = block_size;
    // Dynamic shared memory holds one tile of a plus one tile of b.
    matrix_multiplication_kernel <<<dimGrid, dimBlock, 2 * tile_size * tile_size * sizeof(double)>>> (a, b, c, tile_size);
#elif SHARED == 0
    matrix_multiplication_kernel <<<dimGrid, dimBlock>>> (a, b, c);
#endif
    // Launch-configuration errors (bad grid/block/shared-memory sizes) are
    // only reported through cudaGetLastError(), never by the launch itself.
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(error) << std::endl;
    }
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "kernel execution time" << " " << milliseconds << " " << "ms" << std::endl;

    // Surfaces asynchronous execution errors (e.g. illegal memory accesses).
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        std::cerr << cudaGetErrorString(error) << std::endl;
    }

    // Events are GPU resources; destroy them so repeated calls do not leak.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

wrappers.cuh

下面是wrapper functions的原型。

代码语言:javascript
复制
#pragma once

#include "kernels.cuh"

// Host-side wrapper: launches the SHARED-selected kernel to compute
// c = a * b using block_size x block_size thread blocks, and prints the
// measured kernel execution time. See wrappers.cu for the implementation.
void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size);

main.cpp

这是main函数。

代码语言:javascript
复制
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "wrappers.cuh"

#include <iostream>
#include <string>

// Writes matrix m to stdout: a "label[RxC] =" header, then one line per row
// with tab-separated element values.
void print(matrix m, std::string label) {
    std::cout << label << "[" << m.rows << "x" << m.cols << "] = " << std::endl;
    // Row-major storage lets us walk the buffer with a single cursor.
    const double *cursor = m.elements;
    for (int r = 0; r < m.rows; ++r) {
        for (int c = 0; c < m.cols; ++c, ++cursor) {
            std::cout << *cursor << "\t";
        }
        std::cout << std::endl;
    }
}

int main(int argc, char **argv) {
    if (argc != 8) {
        std::cout << "NAME" << std::endl;
        std::cout << "\t" << "matrix-multiplication" << std::endl;
        std::cout << std::endl;
        return 0;
    }

    int nDevices;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::cout << "GPU #" << prop.pciDeviceID << " " << prop.name;
        std::cout << std::endl;
    }

    int a_rows = std::stoi(argv[1]);
    int a_cols = std::stoi(argv[2]);

    int b_rows = std::stoi(argv[3]);
    int b_cols = std::stoi(argv[4]);

    int c_rows = std::stoi(argv[5]);
    int c_cols = std::stoi(argv[6]);

    int block_size = std::stoi(argv[7]);

    matrix a(a_rows, a_cols);
    matrix b(b_rows, b_cols);
    matrix c(c_rows, c_cols);

    cudaMallocManaged(&a.elements, a.size * sizeof(double));
    cudaMallocManaged(&b.elements, b.size * sizeof(double));
    cudaMallocManaged(&c.elements, c.size * sizeof(double));

    fill_col(a, block_size); // Implementation not shown here
    fill_row(b, block_size); // Implementation not shown here

    matrix_multiplication(a, b, c, block_size);

    print(a, "a");
    print(b, "b");
    print(c, "c");

    cudaFree(a.elements);
    cudaFree(b.elements);
    cudaFree(c.elements);

    return 0;
}

所以..。你认为如何?看上去不错吗?你有什么建议要提吗?

EN

回答 2

Code Review用户

回答已采纳

发布于 2018-04-16 12:03:34

代码语言:javascript
复制
matrix(int rows, int cols) {
    this->rows = rows;
    this->cols = cols;
    this->size = rows * cols;
}

0_0

你的意思是

代码语言:javascript
复制
matrix(int rows, int cols): rows(rows), cols(cols), size(rows * cols) {}

(此外,还要注意整数溢出;size_t在这里会更好。)

代码语言:javascript
复制
cudaMallocManaged(&a.elements, a.size * sizeof(double));

在外部调用中管理对象的资源通常不是一个好主意。这里可以做两件事:

  1. 在 matrix 的构造函数内部调用 cudaMallocManaged。
  2. 这样,cudaFree 就可以从 matrix 的析构函数中调用;但更好的解决方案是将 elements 改为 unique_ptr,并在其自定义删除器中调用 cudaFree。除了提高一致性之外,这还使得 matrix 可以默认移动(default-moveable)。
票数 2
EN

Code Review用户

发布于 2018-04-18 06:21:17

除了 bipll 注意到的内容外,elements 成员没有被构造函数初始化,因此处于未定义(垃圾)状态。至少应使用类内成员初始化器将其初始化为 nullptr。

而且它没有析构函数。难道它不应该释放内存吗?我认为您确实需要一个带有自定义删除器的 unique_ptr。

编译器会为您生成复制构造和赋值成员,但它们会做错误的事情(浅拷贝指针)。您应该将它们标记为 =delete 以禁用该功能。

票数 1
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/192154

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档