我想知道Raspberry Pi 1中的ARM1176JZF-S核心和Raspberry Pi 2中的Cortex-A7核心每个周期的峰值FLOPs。
从ARM1176JZF-S技术参考手册来看,VFPv2每个时钟周期可以执行一次SP MAC,每两个时钟周期可以执行一次DP MAC。此外,还有三条可以并行工作的流水线:MAC流水线(FMAC)、除法和平方根流水线(DS)以及加载/存储流水线(LS)。基于此,Raspberry Pi 1的ARM1176JZF-S看来至少可以(通过FMAC流水线)达到1 DP FLOP/周期(每两个周期一次MAC,一次MAC计为2个FLOP)。
维基百科称Raspberry Pi 1的浮点性能是0.041 DP GFLOPS。除以0.700 GHz后,得到不到0.06 DP FLOPs/周期。这比我估计的1 DP FLOP/周期小了大约17倍。
那么正确的答案是什么?
对于Raspberry Pi 2中的Cortex-A7处理器,我认为它和Cortex-A9是一样的。Cortex-A9的FLOPs/周期/核心是:
Raspberry Pi 2的FLOPs/周期/核心是否与Cortex-A9相同?如果不同,正确的答案是什么?
编辑:
Cortex-A9与Cortex-A7的主要区别(就峰值FLOPs/周期而言)是:
我不确定为什么乱序执行(OoO)会影响峰值FLOPS。双发射(dual issue)当然会有影响。我认为这会使峰值FLOPS减半。
编辑:基于Stephen Canon在评论中给出的表格 http://hardwarebug.org/2014/05/15/cortex-a7-instruction-cycle-timings/ ,以下是我对Cortex-A7峰值FLOPs的新估计。
发布于 2015-06-23 22:18:14
示例1是编译得到的程序MFLOPSPiNeon,它在900 MHz的RPi 2上获得>647 MFLOPS(数据量为3.2k至3.2M个字)。不使用线程时,反汇编看起来是一样的。下面给出所用的编译/链接命令以及每个数据字执行32次运算的C代码,也许有人能建议更快的编译选项。
MP-MFLOPS Compiled NEON v1.0
gcc mpmflops.c cpuidc.c -lrt -lc -lm -O3 -mcpu=cortex-a7
-mfloat-abi=hard -mfpu=neon-vfpv4 -funsafe-math-optimizations -lpthread -o MP-MFLOPSPiNeon
32 OPs/Word 1 CPU 692 MFLOPS
/*
 * triadplus2 - overwrite each of the first n elements of x in place.
 *
 * For every element the new value is a sum of eleven terms of the form
 * (x[i] + offset) * scale, taken from the argument pairs (a,b), (c,d),
 * (e,f), (g,h), (j,k), (l,m), (o,p), (q,r), (s,t), (u,v), (w,y).
 * The terms built from (c,d), (g,h), (l,m), (q,r) and (u,v) are
 * subtracted; the other six are added.  That is 11 additions,
 * 11 multiplications and 10 add/subtract combinations per element.
 *
 * The loop body does not run when n <= 0, so x is left untouched then.
 */
void triadplus2(int n, float a, float b, float c, float d,
float e, float f, float g, float h, float j,
float k, float l, float m, float o, float p,
float q, float r, float s, float t, float u,
float v, float w, float y, float *x)
{
int i;
/* x[i] is read for every term and then replaced by the combined result. */
for(i=0; i<n; i++)
x[i] = (x[i]+a)*b-(x[i]+c)*d+(x[i]+e)*f-(x[i]+g)*h+(x[i]+j)*k
-(x[i]+l)*m+(x[i]+o)*p-(x[i]+q)*r+(x[i]+s)*t-(x[i]+u)*v+(x[i]+w)*y;
}以下是复杂的拆卸。注:高亮显示了与过多负载合并、累加或减除指令。
@ triadplus2 -- entry sequence of the compiler output for the C function of
@ the same name.  It saves registers, reserves the 272-byte frame, fetches
@ the six stack-passed floats and decides how many leading elements the
@ scalar code at .L59 has to handle.
@ Register use visible below: r0 is the element count that is tested and
@ compared, r1 is the base address used by the float loads and stores.
triadplus2:
@ args = 24, pretend = 0, frame = 272
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
stmfd sp!, {r4, r5, r6, r7}
@ The flags from this compare are consumed by "ble .L57" further down; the
@ instructions in between do not set flags.
cmp r0, #0
fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
sub sp, sp, #272
@ 352 = 272 (frame) + 64 (d8-d15) + 16 (r4-r7), so the six loads below read
@ the 24 bytes of arguments that were passed on the stack.
flds s21, [sp, #352]
flds s18, [sp, #356]
flds s19, [sp, #360]
flds s16, [sp, #364]
flds s20, [sp, #368]
flds s17, [sp, #372]
@ Nothing to do when r0 <= 0.
ble .L57
@ r3 = bit 2 of the address in r1, sign-extended (0 or -1), then masked
@ with 3: the result is 0 or 3.
sbfx r3, r1, #2, #1
and r3, r3, #3
@ Clamp r3 to r0 (unsigned comparison).
cmp r3, r0
movcs r3, r0
@ When r0 <= 4 every element is handled by the scalar code (r3 = r0) and
@ execution continues at .L59; otherwise go to .L80.
cmp r0, #4
movls r3, r0
bhi .L80
LOOP HERE
@ .L59: scalar handling of the first r3 elements (at most four), fully
@ unrolled.  Each step loads one float from r1 + 4*k into s23, forms eleven
@ (s23 + coefficient) sums, combines them with one fmuls, five vfma.f32 and
@ five vfms.f32 into s22, and stores s22 back to the same address.
@ r2 is set to the number of elements handled when control leaves for .L61.
.L59:
@ Element 0.
flds s23, [r1]
@ The flags set here are still intact at the "bls .L61" after the store:
@ the floating-point instructions in between do not write them.
cmp r3, #1
fadds s22, s23, s4
movls r2, #1
fadds s24, s23, s0
fadds s31, s23, s8
fadds s30, s23, s12
fmuls s22, s22, s5
fadds s29, s23, s21
fadds s28, s23, s20
fadds s27, s23, s6
vfma.f32 s22, s24, s1
fadds s26, s23, s2
fadds s25, s23, s10
fadds s24, s23, s14
fadds s23, s23, s19
vfma.f32 s22, s31, s9
vfma.f32 s22, s30, s13
vfma.f32 s22, s29, s18
vfma.f32 s22, s28, s17
vfms.f32 s22, s27, s7
vfms.f32 s22, s26, s3
vfms.f32 s22, s25, s11
vfms.f32 s22, s24, s15
vfms.f32 s22, s23, s16
fsts s22, [r1]
@ Done with the leading elements if r3 <= 1.
bls .L61
@ Element 1 (byte offset 4); same instruction pattern as element 0.
flds s23, [r1, #4]
cmp r3, #2
fadds s22, s23, s4
movls r2, #2
fadds s24, s23, s0
fadds s31, s23, s8
fadds s30, s23, s12
fmuls s22, s22, s5
fadds s29, s23, s21
fadds s28, s23, s20
fadds s27, s23, s6
vfma.f32 s22, s24, s1
fadds s26, s23, s2
fadds s25, s23, s10
fadds s24, s23, s14
fadds s23, s23, s19
vfma.f32 s22, s31, s9
vfma.f32 s22, s30, s13
vfma.f32 s22, s29, s18
vfma.f32 s22, s28, s17
vfms.f32 s22, s27, s7
vfms.f32 s22, s26, s3
vfms.f32 s22, s25, s11
vfms.f32 s22, s24, s15
vfms.f32 s22, s23, s16
fsts s22, [r1, #4]
@ Done with the leading elements if r3 <= 2.
bls .L61
@ Element 2 (byte offset 8); same instruction pattern again.
flds s23, [r1, #8]
cmp r3, #3
fadds s22, s23, s4
movls r2, #3
fadds s24, s23, s0
fadds s31, s23, s8
fadds s30, s23, s12
fmuls s22, s22, s5
fadds s29, s23, s21
fadds s28, s23, s20
fadds s27, s23, s6
vfma.f32 s22, s24, s1
fadds s26, s23, s2
fadds s25, s23, s10
fadds s24, s23, s14
fadds s23, s23, s19
vfma.f32 s22, s31, s9
vfma.f32 s22, s30, s13
vfma.f32 s22, s29, s18
vfma.f32 s22, s28, s17
vfms.f32 s22, s27, s7
vfms.f32 s22, s26, s3
vfms.f32 s22, s25, s11
vfms.f32 s22, s24, s15
vfms.f32 s22, s23, s16
fsts s22, [r1, #8]
@ Done with the leading elements if r3 <= 3.
bls .L61
@ Element 3 (byte offset 12).  r2 is set to 4 unconditionally.  The same
@ eleven coefficient pairs are used, but the additive terms are accumulated
@ in a different order than in the three steps above.
flds s23, [r1, #12]
mov r2, #4
fadds s22, s23, s20
fadds s24, s23, s21
fadds s31, s23, s12
fadds s30, s23, s8
fmuls s22, s22, s17
fadds s29, s23, s4
fadds s28, s23, s0
fadds s27, s23, s6
vfma.f32 s22, s24, s18
fadds s26, s23, s2
fadds s25, s23, s10
fadds s24, s23, s14
fadds s23, s23, s19
vfma.f32 s22, s31, s13
vfma.f32 s22, s30, s9
vfma.f32 s22, s29, s5
vfma.f32 s22, s28, s1
vfms.f32 s22, s27, s7
vfms.f32 s22, s26, s3
vfms.f32 s22, s25, s11
vfms.f32 s22, s24, s15
vfms.f32 s22, s23, s16
fsts s22, [r1, #12]
@ .L61: the leading scalar elements are done (r3 of them; r2 holds the
@ number handled so far).
.L61:
@ Return if that already covered all r0 elements.
cmp r3, r0
beq .L57
@ r6 = elements still to do, r4 = number of whole 4-element groups,
@ r7 = elements covered by those groups (4 * r4).  movs sets the flags
@ used by the branch below.
rsb r6, r3, r0
mov r4, r6, lsr #2
movs r7, r4, asl #2
@ No complete group of four: go straight to the scalar tail.
beq .L63
@ .L81: set-up for the vector loop at .L69.  Every scalar coefficient is
@ duplicated into all four lanes of a q register with vdup.32.  Seventeen of
@ the resulting vectors are stored into the 272-byte frame (16-byte slots at
@ sp+0, sp+16, ... sp+256); five stay in registers (q6, q7, q13, q14, q15).
@ q8, q10, q11 and q12 are reused as staging registers, which is why the
@ vdup and vstr instructions are interleaved.
.L81:
vdup.32 q12, d1[1]
vdup.32 q8, d0[0]
vdup.32 q10, d0[1]
vdup.32 q11, d1[0]
vstr d24, [sp, #64]
vstr d25, [sp, #72]
vdup.32 q12, d3[1]
vstr d16, [sp, #16]
vstr d17, [sp, #24]
vstr d20, [sp, #32]
vstr d21, [sp, #40]
vdup.32 q8, d2[0]
vdup.32 q10, d2[1]
vstr d22, [sp, #48]
vstr d23, [sp, #56]
vstr d24, [sp, #128]
vstr d25, [sp, #136]
vdup.32 q11, d3[0]
vdup.32 q12, d5[1]
vstr d16, [sp, #80]
vstr d17, [sp, #88]
vstr d20, [sp, #96]
vstr d21, [sp, #104]
vdup.32 q8, d4[0]
vdup.32 q10, d4[1]
vstr d22, [sp, #112]
vstr d23, [sp, #120]
vstr d24, [sp, #192]
vstr d25, [sp, #200]
vdup.32 q11, d5[0]
vdup.32 q12, d10[0]
vstr d16, [sp, #144]
vstr d17, [sp, #152]
vstr d20, [sp, #160]
vstr d21, [sp, #168]
vstr d22, [sp, #176]
vstr d23, [sp, #184]
vdup.32 q8, d6[0]
vdup.32 q10, d9[1]
vdup.32 q11, d8[0]
vstr d24, [sp, #256]
vstr d25, [sp, #264]
vdup.32 q12, d8[1]
vstr d16, [sp, #208]
vstr d17, [sp, #216]
@ These five broadcasts are not spilled; the loop reads q6, q7, q13, q14
@ and q15 directly.
vdup.32 q7, d6[1]
vdup.32 q6, d7[0]
vdup.32 q15, d7[1]
vdup.32 q14, d10[1]
vdup.32 q13, d9[0]
vstr d20, [sp, #224]
vstr d21, [sp, #232]
vstr d22, [sp, #240]
vstr d23, [sp, #248]
@ The last staged vector goes to the slot at sp+0.
vst1.64 {d24-d25}, [sp:64]
@ r3 = r1 + 4*r3: address of the first element for the vector loop.
@ ip = 0 is the pass counter; r5 is a second copy of the address, used for
@ the stores.
add r3, r1, r3, asl #2
mov ip, #0
mov r5, r3
@ .L69: vector loop, four floats per pass.  r3 is the load pointer and r5
@ the store pointer (both post-incremented), ip counts passes and r4 is the
@ number of passes to run.  q9 holds the loaded elements and q8 the running
@ result.  Most coefficient vectors are re-read on every pass from the
@ stack slots filled at .L81; q6, q7, q13, q14 and q15 are used directly.
.L69:
@ NOTE(review): the next line is a free-text annotation in the listing, not
@ an instruction.
vfma FUSED MULTIPLY ACCUMULATE or vfms SUBTRACT QUAD WORDS
vld1.64 {d18-d19}, [r3:64]!
vldr d20, [sp, #80]
vldr d21, [sp, #88]
vldr d22, [sp, #16]
vldr d23, [sp, #24]
vadd.f32 q8, q9, q10
vldr d24, [sp, #96]
vldr d25, [sp, #104]
vadd.f32 q10, q9, q11
@ First product initialises the accumulator q8.
vmul.f32 q8, q8, q12
vldr d22, [sp, #32]
vldr d23, [sp, #40]
vldr d24, [sp, #144]
vldr d25, [sp, #152]
vfma.f32 q8, q10, q11
@ Loop bookkeeping is interleaved with the arithmetic: ip is advanced here
@ and compared a few instructions later.
add ip, ip, #1
vadd.f32 q11, q9, q12
vldr d24, [sp, #208]
vldr d25, [sp, #216]
@ The flags from this compare are used by "bhi .L69" at the bottom; the
@ vector instructions in between do not write them.
cmp r4, ip
vadd.f32 q10, q9, q12
vldr d24, [sp, #160]
vldr d25, [sp, #168]
vfma.f32 q8, q11, q12
vadd.f32 q11, q9, q14
vldr d24, [sp, #256]
vldr d25, [sp, #264]
vfma.f32 q8, q10, q7
vadd.f32 q10, q9, q12
vldr d24, [sp, #112]
vldr d25, [sp, #120]
vfma.f32 q8, q11, q13
vadd.f32 q11, q9, q12
vld1.64 {d24-d25}, [sp:64]
vfma.f32 q8, q10, q12
vldr d24, [sp, #48]
vldr d25, [sp, #56]
vadd.f32 q10, q9, q12
vldr d24, [sp, #128]
vldr d25, [sp, #136]
@ From here on the products are subtracted from the accumulator.
vfms.f32 q8, q11, q12
vldr d24, [sp, #176]
vldr d25, [sp, #184]
vadd.f32 q11, q9, q12
vldr d24, [sp, #64]
vldr d25, [sp, #72]
vfms.f32 q8, q10, q12
vldr d24, [sp, #224]
vldr d25, [sp, #232]
vadd.f32 q10, q9, q6
@ q9 (the loaded elements) is overwritten here; this is its last use as an
@ input in this pass.
vadd.f32 q9, q9, q12
vldr d24, [sp, #192]
vldr d25, [sp, #200]
vfms.f32 q8, q11, q12
vfms.f32 q8, q10, q15
vldr d20, [sp, #240]
vldr d21, [sp, #248]
vfms.f32 q8, q9, q10
vst1.64 {d16-d17}, [r5:64]!
@ Repeat while r4 > ip (unsigned).
bhi .L69
@ NOTE(review): the next line is a free-text annotation in the listing, not
@ an instruction.
END vfma FUSED MULTIPLY ACCUMULATE or vfms SUBTRACT QUAD WORDS
@ r7 == r6 means the groups of four covered everything that was left.
@ Otherwise r2 is advanced past the vector part and the scalar tail at .L63
@ runs next.  The add does not change the flags set by the compare.
cmp r7, r6
add r2, r2, r7
beq .L57
@ .L63: scalar tail for the elements left over after the groups of four,
@ starting at element index r2.  Up to three elements are processed; after
@ each of the first two the code returns through .L57 once the next index
@ has reached r0.
.L63:
@ ip = address of element r2.  r3 = r2 + 1 is compared with r0 now; the
@ result is used by the "ble .L57" after the store.
add ip, r1, r2, asl #2
add r3, r2, #1
cmp r0, r3
flds s23, [ip]
fadds s22, s23, s4
fadds s24, s23, s0
fadds s31, s23, s8
fadds s30, s23, s12
fmuls s22, s22, s5
fadds s29, s23, s21
fadds s28, s23, s20
fadds s27, s23, s2
vfma.f32 s22, s24, s1
fadds s26, s23, s6
fadds s25, s23, s14
fadds s24, s23, s10
fadds s23, s23, s19
vfma.f32 s22, s31, s9
vfma.f32 s22, s30, s13
vfma.f32 s22, s29, s18
vfma.f32 s22, s28, s17
vfms.f32 s22, s27, s3
vfms.f32 s22, s26, s7
vfms.f32 s22, s25, s15
vfms.f32 s22, s24, s11
vfms.f32 s22, s23, s16
fsts s22, [ip]
ble .L57
@ Second tail element, at the index held in r3.  r2 becomes r2 + 2 and is
@ compared with r0 for the next exit test.
add r3, r1, r3, asl #2
add r2, r2, #2
cmp r0, r2
flds s23, [r3]
fadds s22, s23, s4
fadds s24, s23, s0
fadds s31, s23, s8
fadds s30, s23, s12
fmuls s22, s22, s5
fadds s29, s23, s21
fadds s28, s23, s20
fadds s27, s23, s6
vfma.f32 s22, s24, s1
fadds s26, s23, s2
fadds s25, s23, s10
fadds s24, s23, s14
fadds s23, s23, s19
vfma.f32 s22, s31, s9
vfma.f32 s22, s30, s13
vfma.f32 s22, s29, s18
vfma.f32 s22, s28, s17
vfms.f32 s22, s27, s7
vfms.f32 s22, s26, s3
vfms.f32 s22, s25, s11
vfms.f32 s22, s24, s15
vfms.f32 s22, s23, s16
fsts s22, [r3]
ble .L57
@ Third tail element.  Execution falls through to the exit at .L57
@ afterwards, so the sums and the result overwrite the coefficient
@ registers themselves instead of using s22-s31 as temporaries.
add r2, r1, r2, asl #2
flds s22, [r2]
fadds s4, s22, s4
fadds s0, s22, s0
fadds s8, s22, s8
fadds s12, s22, s12
fmuls s5, s4, s5
fadds s21, s22, s21
fadds s20, s22, s20
fadds s6, s22, s6
vfma.f32 s5, s0, s1
fadds s2, s22, s2
fadds s10, s22, s10
fadds s14, s22, s14
fadds s19, s22, s19
vfma.f32 s5, s8, s9
vfma.f32 s5, s12, s13
vfma.f32 s5, s21, s18
vfma.f32 s5, s20, s17
vfms.f32 s5, s6, s7
vfms.f32 s5, s2, s3
vfms.f32 s5, s10, s11
vfms.f32 s5, s14, s15
vfms.f32 s5, s19, s16
fsts s5, [r2]
@ .L57: common exit.  Release the 272-byte frame, restore d8-d15 and r4-r7
@ that were pushed on entry, and return to the caller.
.L57:
add sp, sp, #272
@ sp needed
fldmfdd sp!, {d8-d15}
ldmfd sp!, {r4, r5, r6, r7}
bx lr
@ .L80: reached from the entry code when r0 > 4.
.L80:
@ r3 != 0: run the leading scalar elements at .L59 first.
@ r3 == 0: there are none, so the element index r2 starts at 0.
cmp r3, #0
moveq r2, r3
bne .L59
@ Same computation as at .L61: r6 = remaining elements, r4 = whole groups
@ of four, r7 = 4 * r4 (movs sets the flags for the branch).
rsb r6, r3, r0
mov r4, r6, lsr #2
movs r7, r4, asl #2
@ At least one group: set up and run the vector loop; otherwise scalar tail.
bne .L81
b .L63
.size triadplus2, .-triadplus2
示例2:使用NEON内在函数(intrinsics)(在我了解融合乘加指令之前),结果>700 MFLOPS。首先是C代码:
32 Operations per word
C NEON Intrinsics
n = words 3.2k, 32k, 3.2M
similar results > 700 MFLOPS.
/*
 * NEON-intrinsics loop, four floats per iteration.  x41 is the vector
 * loaded from ptrx1, z42 holds the current (x41 + offset) * scale term and
 * z41 accumulates the result, which is stored back to ptrx1 before the
 * pointer advances by four floats.  Eleven terms are formed: the first
 * initialises z41, then terms are alternately subtracted and added.
 * NOTE(review): i advances by 4 and every iteration loads and stores four
 * floats, so this presumably relies on n being a multiple of 4 -- confirm
 * against the caller.
 */
for(i=0; i<n; i=i+4)
{
x41 = vld1q_f32(ptrx1);
/* Term 1 goes straight into the accumulator. */
z41 = vaddq_f32(x41, a41);
z41 = vmulq_f32(z41, b41);
/* Term 2, subtracted. */
z42 = vaddq_f32(x41, c41);
z42 = vmulq_f32(z42, d41);
z41 = vsubq_f32(z41, z42);
/* Term 3, added. */
z42 = vaddq_f32(x41, e41);
z42 = vmulq_f32(z42, f41);
z41 = vaddq_f32(z41, z42);
/* Term 4, subtracted. */
z42 = vaddq_f32(x41, g41);
z42 = vmulq_f32(z42, h41);
z41 = vsubq_f32(z41, z42);
/* Term 5, added. */
z42 = vaddq_f32(x41, j41);
z42 = vmulq_f32(z42, k41);
z41 = vaddq_f32(z41, z42);
/* Term 6, subtracted. */
z42 = vaddq_f32(x41, l41);
z42 = vmulq_f32(z42, m41);
z41 = vsubq_f32(z41, z42);
/* Term 7, added. */
z42 = vaddq_f32(x41, o41);
z42 = vmulq_f32(z42, p41);
z41 = vaddq_f32(z41, z42);
/* Term 8, subtracted. */
z42 = vaddq_f32(x41, q41);
z42 = vmulq_f32(z42, r41);
z41 = vsubq_f32(z41, z42);
/* Term 9, added. */
z42 = vaddq_f32(x41, s41);
z42 = vmulq_f32(z42, t41);
z41 = vaddq_f32(z41, z42);
/* Term 10, subtracted. */
z42 = vaddq_f32(x41, u41);
z42 = vmulq_f32(z42, v41);
z41 = vsubq_f32(z41, z42);
/* Term 11, added. */
z42 = vaddq_f32(x41, w41);
z42 = vmulq_f32(z42, y41);
z41 = vaddq_f32(z41, z42);
/* Write the four results back over the inputs and move to the next four. */
vst1q_f32(ptrx1, z41);
ptrx1 = ptrx1 + 4;
}接下来是反汇编,再一次使用过多的加载指令。
Assembly Code
@ .L26: loop body of the listing for the NEON-intrinsics C loop.  Each pass
@ loads four floats from [ip] into q8, builds eleven
@ (q8 + coefficient) * coefficient products, combines them with alternating
@ vsub.f32 / vadd.f32, and stores the result back through ip with
@ post-increment.
@ Eleven coefficient vectors are read from registers (q0-q7, q13-q15) and
@ eleven from 16-byte stack slots at sp+0 .. sp+160; the stack ones are
@ re-read on every pass.
@ NOTE(review): the code that fills those registers and stack slots is not
@ part of this excerpt.
.L26:
vld1.32 {d16-d17}, [ip]
vld1.64 {d20-d21}, [sp:64]
vadd.f32 q9, q8, q14
vadd.f32 q11, q8, q10
vldr d24, [sp, #16]
vldr d25, [sp, #24]
vmul.f32 q11, q11, q13
vmul.f32 q9, q9, q12
vldr d24, [sp, #32]
vldr d25, [sp, #40]
@ q11 is the running result from here until the last few instructions.
vsub.f32 q11, q11, q9
vadd.f32 q10, q8, q12
vldr d18, [sp, #48]
vldr d19, [sp, #56]
vldr d24, [sp, #64]
vldr d25, [sp, #72]
vmul.f32 q10, q10, q9
vadd.f32 q9, q8, q12
vadd.f32 q11, q11, q10
vldr d20, [sp, #80]
vldr d21, [sp, #88]
vldr d24, [sp, #96]
vldr d25, [sp, #104]
vmul.f32 q9, q9, q10
vadd.f32 q10, q8, q12
vsub.f32 q11, q11, q9
vldr d18, [sp, #112]
vldr d19, [sp, #120]
vldr d24, [sp, #128]
vldr d25, [sp, #136]
vmul.f32 q10, q10, q9
vadd.f32 q9, q8, q12
vadd.f32 q11, q11, q10
vldr d24, [sp, #160]
vldr d25, [sp, #168]
vldr d20, [sp, #144]
vldr d21, [sp, #152]
@ Loop bookkeeping: r3 advances by 4 and is compared with r0.  The flags are
@ consumed by the branch that follows the final store; the vector
@ instructions in between do not write them.
add r3, r3, #4
cmp r0, r3
vmul.f32 q9, q9, q10
vadd.f32 q10, q8, q12
vsub.f32 q11, q11, q9
@ The remaining terms use coefficient vectors held in registers only.
vmul.f32 q10, q10, q15
vadd.f32 q9, q8, q3
vadd.f32 q11, q11, q10
vmul.f32 q9, q9, q2
vadd.f32 q10, q8, q1
vsub.f32 q11, q11, q9
vmul.f32 q10, q10, q0
vadd.f32 q9, q8, q4
@ The running result moves from q11 to q10 here.
vadd.f32 q10, q11, q10
vmul.f32 q9, q9, q5
vadd.f32 q8, q8, q6
vsub.f32 q10, q10, q9
vmul.f32 q8, q8, q7
vadd.f32 q10, q10, q8
vst1.32 {d20-d21}, [ip]!
bgt .L26
https://stackoverflow.com/questions/30976071
复制相似问题