我一直在优化矩阵乘法n上玩克里尔的视频,但我不明白他的速度。原因是什么?下面是我用来测试的程序。它有三种功能:原始增殖、B的原位转置和B+阻塞的原位转置.我在n= 4000和块大小为1、10、20、50、100、200的情况下运行这个程序。我的缓存是32 L1的L1D、256 L1的L2、4MB的共享L3,所以块大小10应该是20 * 20 *8*2= 6.4KB,并且适合于L1缓存。无论块大小,它需要50,这是相同的,因为只有转置。我和gcc -O3 -mavx2一起编译。
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
void matmul(size_t n, double A[n][n], double B[n][n], double result[n][n])
{
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < n; j++) {
double acc = 0;
for (size_t k = 0; k < n; k++) {
acc += A[i][k] * B[k][j];
}
result[i][j] = acc;
}
}
}
void transpose(size_t n, double matrix[n][n])
{
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < i; j++) {
double temp = matrix[i][j];
matrix[i][j] = matrix[j][i];
matrix[j][i] = temp;
}
}
}
void matmulTrans(size_t n, double A[n][n], double B[n][n], double result[n][n])
{
transpose(n, B);
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < n; j++) {
double acc = 0;
for (size_t k = 0; k < n; k++) {
acc += A[i][k] * B[j][k];
}
result[i][j] = acc;
}
}
}
void matmulBlock(size_t n, double A[n][n], double B[n][n],
double result[n][n], size_t blockSize)
{
transpose(n, B);
for (size_t i = 0; i < n; i += blockSize) {
for (size_t j = 0; j < n; j += blockSize) {
for (size_t iBlock = i; iBlock < i + blockSize; iBlock++) {
for (size_t jBlock = j; jBlock < j + blockSize; jBlock++) {
double acc = 0;
for (size_t k = 0; k < n; k++) {
acc += A[iBlock][k] * B[jBlock][k];
}
result[iBlock][jBlock] = acc;
}
}
}
}
}
int main(int argc, char **argv)
{
if (argc != 3) {
printf("Provide two arguments!\n");
return 1;
}
int n = atoi(argv[1]);
int blockSize = atoi(argv[2]);
double (*A)[n] = malloc(n * n * sizeof(double));
double (*B)[n] = malloc(n * n * sizeof(double));
double (*result)[n] = malloc(n * n * sizeof(double));
clock_t time1 = clock();
matmulBlock(n, A, B, result, blockSize);
clock_t time2 = clock();
// matmul(n, A, B, result);
clock_t time3 = clock();
matmulTrans(n, A, B, result);
clock_t time4 = clock();
printf("Blocked version: %lfs.\nNaive version: %lfs.\n"
"Transposed version: %lfs.\n",
(double) (time2 - time1) / CLOCKS_PER_SEC,
(double) (time3 - time2) / CLOCKS_PER_SEC,
(double) (time4 - time3) / CLOCKS_PER_SEC);
free(A);
free(B);
free(result);
return 0;
}发布于 2022-07-12 19:17:26
如果缓存实际上是瓶颈,阻塞只会缩短执行时间。问题是当前的代码应该是compute-bound.实际上,GCC没有将代码向量化,因为浮点操作不是关联的,默认情况下也不会做出这种假设(它可以破坏某些代码)。您可以通过启用-ffast-math来解决这个问题,这也为自动向量化启用了其他有用的标志(但这也更加不安全:例如,假定不使用NaN值)。实际上,matmulBlock的热循环的一般汇编代码效率很低:
.L81:
vmovupd ymm4, YMMWORD PTR [rdx+rax]
vmulpd ymm2, ymm4, YMMWORD PTR [rcx+rax]
add rsi, 1
add rax, 32
vaddsd xmm0, xmm2, xmm0
vunpckhpd xmm3, xmm2, xmm2
vextractf128 xmm1, ymm2, 0x1
vaddsd xmm3, xmm3, xmm0
vaddsd xmm0, xmm1, xmm3
vunpckhpd xmm1, xmm1, xmm1
vaddsd xmm0, xmm0, xmm1
cmp rsi, r13
jne .L81使用-ffast-math的情况要好得多,但仍然不太理想:
.L79:
vmovupd ymm4, YMMWORD PTR [rdx+rax]
vmulpd ymm0, ymm4, YMMWORD PTR [rcx+rax]
add rsi, 1
add rax, 32
vaddpd ymm1, ymm1, ymm0
cmp rsi, r13
jne .L79为了获得更好的性能,您可以启用FMA指令集,这是AFAIK通常可以在机器上支持AVX-2 (特别是在最近的处理器)。然后,展开可以用来使代码更具表现力。
https://stackoverflow.com/questions/72952612
复制相似问题