I need to optimize a matrix-vector multiplication. The data look like this:
- the vector has 81 elements;
- the matrix has 90,000 rows and 81 columns, stored row-major, so a row-wise dot product can be used;
- the output is hence a vector with 90,000 elements.

The routine also has to meet some non-functional requirements: use as little of the standard library as possible (no std::vector) and no third-party libraries (so no Eigen or BLAS). Here is my code (for readability, I assume the input is perfectly blocked, i.e. the loops can consume whole 16-float blocks):
// input_height = 90000
// input_width  = 81
for (uint32_t y = 0; y < input_height; y += 4) {
    float32x4_t sum0 = vmovq_n_f32(0);
    float32x4_t sum1 = vmovq_n_f32(0);
    float32x4_t sum2 = vmovq_n_f32(0);
    float32x4_t sum3 = vmovq_n_f32(0);
    // Assumes "perfectly blocked" input: each row is consumed in whole 16-float blocks.
    for (uint32_t x = 0; x < input_width; x += 16) {
        float32x4x4_t A  = load_matrix_transpose(kernel + x);
        float32x4x4_t B0 = load_matrix_transpose(input + y * input_width + x);
        float32x4x4_t B1 = load_matrix_transpose(input + (y + 1) * input_width + x);
        float32x4x4_t B2 = load_matrix_transpose(input + (y + 2) * input_width + x);
        float32x4x4_t B3 = load_matrix_transpose(input + (y + 3) * input_width + x);
        matrix_element_wise_multiplication(A, B0, sum0);
        matrix_element_wise_multiplication(A, B1, sum1);
        matrix_element_wise_multiplication(A, B2, sum2);
        matrix_element_wise_multiplication(A, B3, sum3);
    }
    output[y]     = vaddvq_f32(sum0);
    output[y + 1] = vaddvq_f32(sum1);
    output[y + 2] = vaddvq_f32(sum2);
    output[y + 3] = vaddvq_f32(sum3);
}

where load_matrix_transpose and matrix_element_wise_multiplication are the following functions:
inline float32x4x4_t load_matrix_transpose(float *a) {
    float32x4x4_t ret;
    // simd_load is a thin wrapper around vld1q_f32 (its definition is omitted here).
    ret.val[0] = simd_load(a);
    ret.val[1] = simd_load(a + 4);
    ret.val[2] = simd_load(a + 8);
    ret.val[3] = simd_load(a + 12);
    return ret;
}

// Accumulates the four lane-wise products of A and B into the single accumulator C.
inline void matrix_element_wise_multiplication(float32x4x4_t &A, float32x4x4_t &B, float32x4_t &C) {
    C = vmlaq_f32(C, A.val[0], B.val[0]);
    C = vmlaq_f32(C, A.val[1], B.val[1]);
    C = vmlaq_f32(C, A.val[2], B.val[2]);
    C = vmlaq_f32(C, A.val[3], B.val[3]);
}
On my Raspberry Pi 4 (ARMv8, 8 GB RAM, 4 cores), this code takes about 60 ms at optimization level -O3.
Long story short (over many runs), the NEON version is exactly twice as fast as the plain code.
My question is: is there a way to optimize this code further? I have tried many things, but could not make any further improvement relative to the normal code.
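For context, the "normal code" baseline is not shown in the question; a plain scalar version computing the same row-wise dot products would look roughly like this (a sketch for comparison, not the original poster's code):

#include <cstdint>

// Hypothetical scalar baseline: one dot product per matrix row, no blocking needed.
void mat_vec_mult_scalar(float *output, const float *input, const float *kernel) {
    const uint32_t input_height = 90000;
    const uint32_t input_width  = 81;
    for (uint32_t y = 0; y < input_height; ++y) {
        float sum = 0.0f;
        for (uint32_t x = 0; x < input_width; ++x)
            sum += input[y * input_width + x] * kernel[x];
        output[y] = sum;
    }
}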
Posted on 2021-03-17 14:21:55
When it comes to optimization, data locality is the top priority, and you should know the register capacity, since registers are by far the fastest and scarcest resource.

aarch64: 32 x 128-bit NEON registers (512 bytes)
aarch32: 16 x 128-bit NEON registers (256 bytes)

An 81x90000 matrix, when transposed, requires holding 90,000 intermediate values for the multiplication, and since those 90,000 floats (360,000 bytes) do not fit into a 512-byte register bank, there will be tons of memory swapping, which translates into huge performance hits.
The vector's 4 * 81 = 324 bytes, on the other hand, fit nicely into the 512 bytes:
void matVecMult81x90000(float *pDst, float *pMat, float *pVec)
{
    register float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
    register float32x4_t mat0, mat1, mat2, mat3, mat4, rslt;
    register float32x2_t drslt;
    register uint32_t nRows = 90000;

    vec80 = vdupq_n_f32(0.0f);
    mat4 = vdupq_n_f32(0.0f);

    // Load the whole 81-element vector once; it stays in registers for the entire loop.
    vec0_3 = vld1q_f32(pVec); pVec += 4;
    vec4_7 = vld1q_f32(pVec); pVec += 4;
    vec8_11 = vld1q_f32(pVec); pVec += 4;
    vec12_15 = vld1q_f32(pVec); pVec += 4;
    vec16_19 = vld1q_f32(pVec); pVec += 4;
    vec20_23 = vld1q_f32(pVec); pVec += 4;
    vec24_27 = vld1q_f32(pVec); pVec += 4;
    vec28_31 = vld1q_f32(pVec); pVec += 4;
    vec32_35 = vld1q_f32(pVec); pVec += 4;
    vec36_39 = vld1q_f32(pVec); pVec += 4;
    vec40_43 = vld1q_f32(pVec); pVec += 4;
    vec44_47 = vld1q_f32(pVec); pVec += 4;
    vec48_51 = vld1q_f32(pVec); pVec += 4;
    vec52_55 = vld1q_f32(pVec); pVec += 4;
    vec56_59 = vld1q_f32(pVec); pVec += 4;
    vec60_63 = vld1q_f32(pVec); pVec += 4;
    vec64_67 = vld1q_f32(pVec); pVec += 4;
    vec68_71 = vld1q_f32(pVec); pVec += 4;
    vec72_75 = vld1q_f32(pVec); pVec += 4;
    vec76_79 = vld1q_f32(pVec); pVec += 4;
    vec80 = vld1q_lane_f32(pVec, vec80, 0);

    do {
        // Note: operator+= on float32x4_t relies on the GCC/Clang vector extensions.
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt = vmulq_f32(mat0, vec0_3);
        rslt += vmulq_f32(mat1, vec4_7);
        rslt += vmulq_f32(mat2, vec8_11);
        rslt += vmulq_f32(mat3, vec12_15);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec16_19);
        rslt += vmulq_f32(mat1, vec20_23);
        rslt += vmulq_f32(mat2, vec24_27);
        rslt += vmulq_f32(mat3, vec28_31);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec32_35);
        rslt += vmulq_f32(mat1, vec36_39);
        rslt += vmulq_f32(mat2, vec40_43);
        rslt += vmulq_f32(mat3, vec44_47);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        rslt += vmulq_f32(mat0, vec48_51);
        rslt += vmulq_f32(mat1, vec52_55);
        rslt += vmulq_f32(mat2, vec56_59);
        rslt += vmulq_f32(mat3, vec60_63);
        mat0 = vld1q_f32(pMat); pMat += 4;
        mat1 = vld1q_f32(pMat); pMat += 4;
        mat2 = vld1q_f32(pMat); pMat += 4;
        mat3 = vld1q_f32(pMat); pMat += 4;
        mat4 = vld1q_lane_f32(pMat, mat4, 0); pMat += 1;
        rslt += vmulq_f32(mat0, vec64_67);
        rslt += vmulq_f32(mat1, vec68_71);
        rslt += vmulq_f32(mat2, vec72_75);
        rslt += vmulq_f32(mat3, vec76_79);
        rslt += vmulq_f32(mat4, vec80);
        *pDst++ = vaddvq_f32(rslt);
    } while (--nRows);
}

Unfortunately, the compilers do not play along nicely (GCC and Clang).
The generated code shows some stack swapping of the vectors inside the loop. Below is the same function in hand-written assembly, without any stack swapping:
    .arch   armv8-a
    .global matVecMult81x90000_asm
    .text
    .balign 64
    .func
matVecMult81x90000_asm:
    // init loop counter
    mov     w3, #90000 & 0xffff
    movk    w3, #90000>>16, lsl #16

    // preserve registers
    stp     d8, d9, [sp, #-48]!
    stp     d10, d11, [sp, #1*16]
    stp     d12, d13, [sp, #2*16]

    // load vectors
    ldp     q0, q1, [x2, #0*32]
    ldp     q2, q3, [x2, #1*32]
    ldp     q4, q5, [x2, #2*32]
    ldp     q6, q7, [x2, #3*32]
    ldp     q8, q9, [x2, #4*32]
    ldp     q10, q11, [x2, #5*32]
    ldp     q12, q13, [x2, #6*32]
    ldp     q16, q17, [x2, #7*32]
    ldp     q18, q19, [x2, #8*32]
    ldp     q20, q21, [x2, #9*32]
    ldr     s22, [x2, #10*32]

    // loop
    .balign 64
1:
    ldp     q24, q25, [x1, #0*32]
    ldp     q26, q27, [x1, #1*32]
    ldp     q28, q29, [x1, #2*32]
    ldp     q30, q31, [x1, #3*32]
    subs    w3, w3, #1
    fmul    v23.4s, v24.4s, v0.4s
    fmla    v23.4s, v25.4s, v1.4s
    fmla    v23.4s, v26.4s, v2.4s
    fmla    v23.4s, v27.4s, v3.4s
    fmla    v23.4s, v28.4s, v4.4s
    fmla    v23.4s, v29.4s, v5.4s
    fmla    v23.4s, v30.4s, v6.4s
    fmla    v23.4s, v31.4s, v7.4s
    ldp     q24, q25, [x1, #4*32]
    ldp     q26, q27, [x1, #5*32]
    ldp     q28, q29, [x1, #6*32]
    ldp     q30, q31, [x1, #7*32]
    fmla    v23.4s, v24.4s, v8.4s
    fmla    v23.4s, v25.4s, v9.4s
    fmla    v23.4s, v26.4s, v10.4s
    fmla    v23.4s, v27.4s, v11.4s
    fmla    v23.4s, v28.4s, v12.4s
    fmla    v23.4s, v29.4s, v13.4s
    fmla    v23.4s, v30.4s, v16.4s
    fmla    v23.4s, v31.4s, v17.4s
    ldp     q24, q25, [x1, #8*32]
    ldp     q26, q27, [x1, #9*32]
    ldr     s28, [x1, #10*32]
    fmla    v23.4s, v24.4s, v18.4s
    fmla    v23.4s, v25.4s, v19.4s
    fmla    v23.4s, v26.4s, v20.4s
    fmla    v23.4s, v27.4s, v21.4s
    fmla    v23.4s, v28.4s, v22.4s
    add     x1, x1, #81*4
    faddp   v23.4s, v23.4s, v23.4s
    faddp   v23.2s, v23.2s, v23.2s
    str     s23, [x0], #4
    b.ne    1b

    .balign 8
    // restore registers
    ldp     d10, d11, [sp, #1*16]
    ldp     d12, d13, [sp, #2*16]
    ldp     d8, d9, [sp], #48

    // return
    ret
    .endfunc
    .end

Test results on an RK3368:
Clang intrinsics: 10.41 ms
Assembly: 9.59 ms
The compilers didn't do that badly in this case, but more often than not they are unbelievably stupid. I strongly recommend learning assembly.
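For illustration, here is one way the assembly routine might be wired up from C++; the extern declaration and file names below are my assumptions, not part of the answer:

// Hypothetical glue code: the AAPCS64 argument registers map to
// x0 = pDst, x1 = pMat, x2 = pVec, matching the intrinsics version above.
extern "C" void matVecMult81x90000_asm(float *pDst, float *pMat, float *pVec);

// Build sketch, assuming the listing is saved as matvec81.S:
//   g++ -O3 main.cpp matvec81.S -o bench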
Posted on 2021-03-17 22:43:16
This is an optimization over Jake's answer.

Using 4 accumulators instead of a single one helps, because FMA instructions have a latency much higher than their throughput. According to the Cortex-A72 optimization guide, the latency of the FMLA instruction is 7 cycles for the complete thing, or 3 cycles when the dependency is on the accumulator (if you are wondering what Q-form and D-form are: Q means 16-byte vectors, D means 8-byte vectors). The throughput is much better, just one cycle: the CPU can run one FMA per cycle.

The version below uses 4 independent accumulators instead of a single one, and should improve throughput despite the 3 extra instructions needed at the end of the loop to add up the accumulators.

I also used a few macros to help with the repetitive code. Untested.
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
    // 30 vector registers in total; ARM64 has 32 of them, so we're good.
    float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
    float32x4_t mat0, mat1, mat2, mat3, mat4;
    float32x4_t res0, res1, res2, res3;
    vec80 = mat4 = vdupq_n_f32( 0.0f );

// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
    v0 = vld1q_f32( pVec ); pVec += 4; \
    v1 = vld1q_f32( pVec ); pVec += 4; \
    v2 = vld1q_f32( pVec ); pVec += 4; \
    v3 = vld1q_f32( pVec ); pVec += 4

    // Load the complete vector into registers using the above macro
    LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
    LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
    LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
    LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
    LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
    // Load the final scalar of the vector
    vec80 = vld1q_lane_f32( pVec, vec80, 0 );
#undef LOAD_VEC_16

// Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16() \
    mat0 = vld1q_f32( pMat ); pMat += 4; \
    mat1 = vld1q_f32( pMat ); pMat += 4; \
    mat2 = vld1q_f32( pMat ); pMat += 4; \
    mat3 = vld1q_f32( pMat ); pMat += 4

// Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3
// Multiple accumulators are critical for performance: the 4 instructions produced by this macro have no data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 ) \
    res0 = vfmaq_f32( res0, mat0, v0 ); \
    res1 = vfmaq_f32( res1, mat1, v1 ); \
    res2 = vfmaq_f32( res2, mat2, v2 ); \
    res3 = vfmaq_f32( res3, mat3, v3 )

    const float* const pMatEnd = pMat + nRows * 81;
    while( pMat < pMatEnd )
    {
        // The initial 16 elements only need multiplication.
        LOAD_MATRIX_16();
        res0 = vmulq_f32( mat0, vec0_3 );
        res1 = vmulq_f32( mat1, vec4_7 );
        res2 = vmulq_f32( mat2, vec8_11 );
        res3 = vmulq_f32( mat3, vec12_15 );
        // Handle the rest of the row using FMA instructions.
        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );
        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );
        LOAD_MATRIX_16();
        HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );
        // The final block of the row has 17 scalars instead of 16
        LOAD_MATRIX_16();
        mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;
        HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
        res0 = vfmaq_f32( res0, mat4, vec80 );
        // Vertically add the 4 accumulators into res0
        res1 = vaddq_f32( res1, res2 );
        res0 = vaddq_f32( res3, res0 );
        res0 = vaddq_f32( res1, res0 );
        // Store the horizontal sum of the accumulator
        *pDst = vaddvq_f32( res0 );
        pDst++;
    }
#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}

The assembly generated from this source with GCC 10.1 looks more or less OK.
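A minimal harness to sanity-check the routine could look like the sketch below; the buffer sizes, test values, and aligned allocation are my additions, not part of the answer:

#include <arm_neon.h>
#include <cstdio>
#include <cstdlib>

int main()
{
    const size_t rows = 90000, cols = 81;
    // aligned_alloc requires the size to be a multiple of the alignment,
    // so the 81-float vector buffer is rounded up to 96 floats (384 bytes).
    float *mat = static_cast<float *>( aligned_alloc( 64, rows * cols * sizeof( float ) ) );
    float *vec = static_cast<float *>( aligned_alloc( 64, 96 * sizeof( float ) ) );
    float *dst = static_cast<float *>( aligned_alloc( 64, rows * sizeof( float ) ) );
    for( size_t i = 0; i < rows * cols; i++ ) mat[ i ] = 1.0f;
    for( size_t i = 0; i < cols; i++ ) vec[ i ] = 1.0f;
    matVecMult81( dst, mat, vec );  // uses the default nRows = 90000
    printf( "%f\n", dst[ 0 ] );     // all-ones input: every output should be 81.0
    free( mat ); free( vec ); free( dst );
    return 0;
}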
https://stackoverflow.com/questions/66662113