通过创建这个结构,我希望保持整洁,避免以后将许多参数传递给函数和内核。
#pragma once
// Lightweight descriptor of a dense row-major matrix. The element buffer is
// allocated externally (e.g. with cudaMallocManaged) so the struct can be
// passed to kernels by value; it does NOT own the buffer.
struct matrix {
    // Fix: use a member-initializer list instead of assignments in the body.
    matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}
    // Fix: the original left this pointer uninitialized (garbage) until the
    // caller assigned it; default it to nullptr.
    double *elements = nullptr;
    int rows;
    int cols;
    int size; // rows * cols — NOTE(review): can overflow int for large matrices; size_t would be safer
};在这里,我已经放置了内核原型。我写了两个版本的矩阵乘法。一个使用共享内存的人,一个不使用共享内存的人。
#pragma once
// Kernel declarations for matrix multiplication (c = a * b).
// The variant is selected at compile time with -DSHARED=1 or -DSHARED=0.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "matrix.hpp"
#if SHARED == 1
// Tiled variant. Launch contract: blockDim.x == blockDim.y == tile_size and
// 2 * tile_size * tile_size * sizeof(double) bytes of dynamic shared memory
// (one tile for a, one for b).
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size);
#elif SHARED == 0
// Naive variant: one thread per output element, reading global memory directly.
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c);
#endif下面是内核的实际实现。
#include "kernels.cuh"
#if SHARED == 1
// Tiled matrix multiply: c = a * b, one output element per thread.
// Assumes blockDim.x == blockDim.y == tile_size and that the launch supplies
// 2 * tile_size * tile_size * sizeof(double) bytes of dynamic shared memory.
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size) {
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
// Output coordinates handled by this thread.
int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;
// Dynamic shared memory, split into two tile_size x tile_size tiles:
// the first holds a tile of a, the second a tile of b.
extern __shared__ double buffer[];
double *a_shared = &buffer[0];
double *b_shared = &buffer[tile_size * tile_size];
double sum = 0;
// Walk the tiles along the shared dimension: ceil(a.cols / tile_size) steps.
for (int k = 0; k < (tile_size + a.cols - 1) / tile_size; k++) {
// Stage one element of a into shared memory; zero-pad past the matrix edge
// so the inner product below needs no per-element bounds checks.
if (k * tile_size + tx < a.cols && row < a.rows) {
a_shared[ty * tile_size + tx] = a.elements[row * a.cols + (k * tile_size + tx)];
} else {
a_shared[ty * tile_size + tx] = 0.0;
}
// Same staging for b.
if (k * tile_size + ty < b.rows && col < b.cols) {
b_shared[ty * tile_size + tx] = b.elements[(k * tile_size + ty) * b.cols + col];
} else {
b_shared[ty * tile_size + tx] = 0.0;
}
// Every thread must finish writing its tile entries before any thread reads.
__syncthreads();
#pragma unroll
for (int n = 0; n < tile_size; ++n) {
sum += a_shared[ty * tile_size + n] * b_shared[n * tile_size + tx];
}
// Keep the next iteration from overwriting tiles that are still being read.
__syncthreads();
}
// Guard the grid tail: c's dimensions need not be multiples of the block size.
if (row < c.rows && col < c.cols) {
c.elements[row * c.cols + col] = sum;
}
}
#elif SHARED == 0
// Naive matrix multiply: c = a * b, one thread per output element, all reads
// served straight from global memory.
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard the grid tail: blocks may extend past the edges of c.
    if (row >= c.rows || col >= c.cols) {
        return;
    }
    // Inner product of row `row` of a with column `col` of b, bounded by the
    // shorter of the two shared dimensions.
    const int depth = (a.cols < b.rows) ? a.cols : b.rows;
    double acc = 0;
    #pragma unroll
    for (int k = 0; k < depth; ++k) {
        acc += a.elements[row * a.cols + k] * b.elements[k * b.cols + col];
    }
    c.elements[row * c.cols + col] = acc;
}
#endif我在这个文件中创建了一些wrapper functions,以保持main函数的干净,并向用户提供某种高级抽象。
#include "wrappers.cuh"
#include <iostream>
// Host wrapper: configures the launch, runs the selected kernel variant, and
// reports the kernel execution time. block_size is the edge of a square
// thread block (and, for SHARED == 1, the tile edge).
void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size) {
    // Square blocks; grid sized with ceiling division so every element of c
    // is covered even when its dimensions aren't multiples of block_size.
    dim3 dimBlock(block_size, block_size, 1);
    dim3 dimGrid((c.cols - 1) / dimBlock.x + 1, (c.rows - 1) / dimBlock.y + 1, 1);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
#if SHARED == 1
    // The tiled kernel needs two tile_size x tile_size double tiles of
    // dynamic shared memory (one for a, one for b).
    unsigned int tile_size = block_size;
    matrix_multiplication_kernel <<<dimGrid, dimBlock, 2 * tile_size * tile_size * sizeof(double)>>> (a, b, c, tile_size);
#elif SHARED == 0
    matrix_multiplication_kernel <<<dimGrid, dimBlock>>> (a, b, c);
#endif
    cudaEventRecord(stop);

    // Fix: a kernel launch does not return an error itself; bad launch
    // configurations (block too large, too much shared memory) surface only
    // through cudaGetLastError(), which the original never called.
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(error) << std::endl;
    }

    // Blocks until the stop event (and therefore the kernel) has completed.
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "kernel execution time" << " " << milliseconds << " " << "ms" << std::endl;

    // Surfaces asynchronous execution errors (e.g. illegal address).
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        std::cerr << cudaGetErrorString(error) << std::endl;
    }

    // Fix: the original leaked both timing events.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}下面是wrapper functions的原型。
#pragma once
#include "kernels.cuh"
void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size);这是main函数。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "wrappers.cuh"
#include <iostream>
#include <string>
// Write a labelled matrix to stdout: a header line with its dimensions,
// then one tab-separated line per row.
void print(matrix m, std::string label) {
    std::cout << label << "[" << m.rows << "x" << m.cols << "] = " << std::endl;
    for (int r = 0; r < m.rows; ++r) {
        const double *row_begin = m.elements + r * m.cols;
        for (int c = 0; c < m.cols; ++c) {
            std::cout << row_begin[c] << "\t";
        }
        std::cout << std::endl;
    }
}
// Entry point. Usage:
//   matrix-multiplication a_rows a_cols b_rows b_cols c_rows c_cols block_size
int main(int argc, char **argv) {
    if (argc != 8) {
        std::cout << "NAME" << std::endl;
        std::cout << "\t" << "matrix-multiplication" << std::endl;
        std::cout << std::endl;
        return 0;
    }
    // List the visible CUDA devices.
    int nDevices;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::cout << "GPU #" << prop.pciDeviceID << " " << prop.name;
        std::cout << std::endl;
    }
    // NOTE(review): std::stoi throws on non-numeric input; that is acceptable
    // here but callers get an unhandled exception rather than a usage message.
    int a_rows = std::stoi(argv[1]);
    int a_cols = std::stoi(argv[2]);
    int b_rows = std::stoi(argv[3]);
    int b_cols = std::stoi(argv[4]);
    int c_rows = std::stoi(argv[5]);
    int c_cols = std::stoi(argv[6]);
    int block_size = std::stoi(argv[7]);
    matrix a(a_rows, a_cols);
    matrix b(b_rows, b_cols);
    matrix c(c_rows, c_cols);
    // Fix: the original ignored allocation failures; a null elements pointer
    // would only fail later, inside the kernel, as an illegal-address error.
    cudaError_t err;
    if ((err = cudaMallocManaged(&a.elements, a.size * sizeof(double))) != cudaSuccess ||
        (err = cudaMallocManaged(&b.elements, b.size * sizeof(double))) != cudaSuccess ||
        (err = cudaMallocManaged(&c.elements, c.size * sizeof(double))) != cudaSuccess) {
        std::cerr << "allocation failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(a.elements);
        cudaFree(b.elements);
        cudaFree(c.elements);
        return 1;
    }
    fill_col(a, block_size); // Implementation not shown here
    fill_row(b, block_size); // Implementation not shown here
    matrix_multiplication(a, b, c, block_size);
    print(a, "a");
    print(b, "b");
    print(c, "c");
    cudaFree(a.elements);
    cudaFree(b.elements);
    cudaFree(c.elements);
    return 0;
}所以..。你认为如何?看上去不错吗?你有什么建议要提吗?
发布于 2018-04-16 12:03:34
matrix(int rows, int cols) {
this->rows = rows;
this->cols = cols;
this->size = rows * cols;
}
0_0
你的意思是
matrix(int rows, int cols): rows(rows), cols(cols), size(rows * cols) {}?
(此外,还要注意整数溢出;size_t在这里会更好。)
cudaMallocManaged(&a.elements, a.size * sizeof(double)); 在对象外部管理对象的资源通常不是一个好主意。这里可以做两件事:
cudaMallocManaged 可以从 matrix 的构造函数内部调用;cudaFree 可以从 matrix 的析构函数中调用。但更好的解决方案是把 elements 改为 unique_ptr,并在它的删除器(deleter)中调用 cudaFree。除了提高一致性之外,这还使你的 matrix 默认可移动(default-moveable)。发布于 2018-04-18 06:21:17
除了bipll注意到的内容外,elements成员不是由构造函数初始化的,而是处于垃圾状态。至少,使用内联数据成员初始化器使其成为nullptr。
而且它没有析构函数。难道它不应该释放内存吗?我认为您真的需要一个带有自定义删除器的unique_ptr。
编译器为您生成赋值和复制成员,但是他们会做错误的事情。您应该将它们标记为=delete以禁用该功能。
https://codereview.stackexchange.com/questions/192154
复制相似问题