我正在学习Rust和汇编(asm),并为此使用Godbolt(Compiler Explorer)。
我有一个程序看起来是:
/// Returns the sum of three local integer bindings (1 + 2 + 3).
///
/// All operands are compile-time constants; the surrounding text reports that
/// the emitted code is simply `mov eax, 6`.
pub fn test() -> i32 {
let a = 1;
let b = 2;
let c = 3;
a + b + c
}我希望输出结果类似于
example::test:
subq $16, %rsp
movl $1, (%rsp)
movl $2, 4(%rsp)
movl $3, 8(%rsp)
movl (%rsp), %eax
addl 4(%rsp), %eax
addl 8(%rsp), %eax
addq $16, %rsp
retq但实际上我得到了:
example::test:
mov eax, 6
ret当试图演示堆栈分配、加法等操作时,这毫无用处。
我正在使用编译器标志:-Z mir-opt-level=0 -C opt-level=0 -C overflow-checks=off
所以加法并不是在MIR阶段被优化掉的。MIR的输出是:
// WARNING: This output format is intended for human consumers only
// and is subject to change without notice. Knock yourself out.
fn test() -> i32 {
let mut _0: i32; // return place in scope 0 at /app/example.rs:2:18: 2:21
let _1: i32; // in scope 0 at /app/example.rs:3:9: 3:10
let mut _4: i32; // in scope 0 at /app/example.rs:6:5: 6:10
let mut _5: i32; // in scope 0 at /app/example.rs:6:5: 6:6
let mut _6: i32; // in scope 0 at /app/example.rs:6:9: 6:10
let mut _7: i32; // in scope 0 at /app/example.rs:6:13: 6:14
scope 1 {
debug a => _1; // in scope 1 at /app/example.rs:3:9: 3:10
let _2: i32; // in scope 1 at /app/example.rs:4:9: 4:10
scope 2 {
debug b => _2; // in scope 2 at /app/example.rs:4:9: 4:10
let _3: i32; // in scope 2 at /app/example.rs:5:9: 5:10
scope 3 {
debug c => _3; // in scope 3 at /app/example.rs:5:9: 5:10
}
}
}
bb0: {
StorageLive(_1); // scope 0 at /app/example.rs:3:9: 3:10
_1 = const 1_i32; // scope 0 at /app/example.rs:3:13: 3:14
StorageLive(_2); // scope 1 at /app/example.rs:4:9: 4:10
_2 = const 2_i32; // scope 1 at /app/example.rs:4:13: 4:14
StorageLive(_3); // scope 2 at /app/example.rs:5:9: 5:10
_3 = const 3_i32; // scope 2 at /app/example.rs:5:13: 5:14
StorageLive(_4); // scope 3 at /app/example.rs:6:5: 6:10
StorageLive(_5); // scope 3 at /app/example.rs:6:5: 6:6
_5 = _1; // scope 3 at /app/example.rs:6:5: 6:6
StorageLive(_6); // scope 3 at /app/example.rs:6:9: 6:10
_6 = _2; // scope 3 at /app/example.rs:6:9: 6:10
_4 = Add(move _5, move _6); // scope 3 at /app/example.rs:6:5: 6:10
StorageDead(_6); // scope 3 at /app/example.rs:6:9: 6:10
StorageDead(_5); // scope 3 at /app/example.rs:6:9: 6:10
StorageLive(_7); // scope 3 at /app/example.rs:6:13: 6:14
_7 = _3; // scope 3 at /app/example.rs:6:13: 6:14
_0 = Add(move _4, move _7); // scope 3 at /app/example.rs:6:5: 6:14
StorageDead(_7); // scope 3 at /app/example.rs:6:13: 6:14
StorageDead(_4); // scope 3 at /app/example.rs:6:13: 6:14
StorageDead(_3); // scope 2 at /app/example.rs:7:1: 7:2
StorageDead(_2); // scope 1 at /app/example.rs:7:1: 7:2
StorageDead(_1); // scope 0 at /app/example.rs:7:1: 7:2
return; // scope 0 at /app/example.rs:7:2: 7:2
}
}而LLVM IR输出是:
define i32 @_ZN7example4test17h2e9277ab15e59fbdE() unnamed_addr #0 !dbg !5 {
start:
ret i32 6, !dbg !10
}
attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }因此,加法是在MIR->LLVM这一层被优化掉的。
我怎么才能阻止这一切?
谢谢!
备注
如果我使用元组,优化就不会发生。e.g
/// Variant that keeps the three constants in a single tuple and sums its fields.
pub fn test() -> i32 {
let a = (1,2,3);
a.0 + a.1 + a.2
}变成:
example::test:
subq $16, %rsp
movl $1, (%rsp)
movl $2, 4(%rsp)
movl $3, 8(%rsp)
movl (%rsp), %eax
addl 4(%rsp), %eax
addl 8(%rsp), %eax
addq $16, %rsp
retq发布于 2022-06-09 16:23:54
可以使用black_box提示来防止计算在编译时完成。
请注意,在撰写本文时,它仅在nightly版本中可用。
#![feature(bench_black_box)]
/// Sums three constants, each passed through `std::hint::black_box` first.
///
/// `black_box` is an optimization-barrier hint: the compiler is asked to treat
/// each returned value as unknown, so the additions are not folded at compile time.
pub fn test() -> i32 {
let a = std::hint::black_box(1);
let b = std::hint::black_box(2);
let c = std::hint::black_box(3);
a + b + c
}example::test:
sub rsp, 12
mov dword ptr [rsp], 1
mov rax, rsp
mov eax, dword ptr [rsp]
mov dword ptr [rsp + 4], 2
lea rcx, [rsp + 4]
add eax, dword ptr [rsp + 4]
mov dword ptr [rsp + 8], 3
lea rcx, [rsp + 8]
add eax, dword ptr [rsp + 8]
add rsp, 12
ret用rust nightly和-C opt-level=3编译。
发布于 2022-06-09 18:07:53
将对它们的可变引用传递给其他函数(或内联asm),以强制它们具有内存地址。在没有定义函数的情况下声明函数的一种方法是extern "C"。
// Declaration only: `ext` has no definition in this snippet, so the compiler
// cannot see its body when compiling the caller.
extern "C" {
fn ext(x: &i32); // void ext(const int *x);
}
/// Adds two arguments and a local constant, after passing `&b` to the external `ext`.
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
// Taking `&b` for an opaque call forces `b` to have a memory address.
// SAFETY: not verifiable here — `ext` is only declared; soundness depends on its real definition.
unsafe{ ext(&b); }
//dummy(&c, &a); // alternative, declare as non-inline and use std::hint::black_box
a + b + c
}example::test:
push rax // align the stack and reserve 8 bytes
mov dword ptr [rsp], edi
mov dword ptr [rsp + 4], esi
lea rdi, [rsp + 4] // &b
call qword ptr [rip + ext@GOTPCREL] // function call -fno-plt style
mov eax, dword ptr [rsp] // reload a and b
add eax, dword ptr [rsp + 4]
add eax, 3 // constant-propagation for c
pop rcx // dealloc stack space with a dummy pop
ret在不禁用优化的情况下,LLVM按预期保存/恢复一个调用保留寄存器,以在函数调用中保持a。
example::test:
push rbx // save a call-preserved reg
sub rsp, 16
mov ebx, edi // use it to hold a
mov dword ptr [rsp + 12], esi // spill b
lea rdi, [rsp + 12] // and pass a pointer to it
call qword ptr [rip + ext@GOTPCREL]
mov eax, dword ptr [rsp + 12]
add eax, ebx
add eax, 3
add rsp, 16 // epilogue
pop rbx
ret或者具体地阻止常量折叠,使用函数args而不是常量。见:How to remove "noise" from GCC/clang assembly output?
/// Adds two function arguments and the local constant 3.
///
/// Because `a` and `b` are arguments rather than constants, the sum cannot be
/// computed at compile time.
pub fn test2(a: i32, b: i32) -> i32 {
let c = 3;
a + b + c
}但即使在-C opt-level=0 -C overflow-checks=off上,
Rustc仍然不会像clang -O0那样溢出/重新加载到堆栈空间。
example::test2:
mov eax, edi
add eax, esi
add eax, 3
ret(opt-level=3当然使用LEA而不是MOV+ADD,但仍然使用一条单独的ADD来加常数3,这是针对延迟而不是吞吐量进行优化,适用于像Skylake这样的CPU——在这些CPU上,3分量LEA有3个周期延迟,而不是1个。Alder Lake则不同,在它上面lea eax, [rsi+rdi+3]是1个周期延迟,并且有一个缩放索引。在Zen或Alder Lake的E-核心上则是两个周期,所以与单独的LEA/ADD持平,但uop更少。https://uops.info/)
#[inline(never)]
这是在How to declare a function without implementing it?上建议的一种获得非内联函数调用的方法。我们可以使用@Finomnis建议的std::hint::black_box来实际使用args,并强制调用方在传递引用时在内存中物化一个值。
在上面的Godbolt链接中取消相应的注释,即可尝试它。
#![feature(bench_black_box)]
/// Adds two arguments and a local constant, after passing `&c` and `&a` to `dummy`.
pub fn test(a: i32, b: i32) -> i32 {
let c = 3;
// Passing references to a non-inlined function makes the caller materialize `c` and `a` in memory.
dummy(&c, &a);
a + b + c
}
/// Non-inlined sink for two references; it only feeds them to `std::hint::black_box`.
#[inline(never)]
pub extern fn dummy(_a: &i32, _b: &i32) {
//use std::sync::atomic::*;
//compiler_fence(Ordering::Release); // make the function non-empty even without args
// `black_box` counts as a use of each reference, so the arguments are not treated as dead.
std::hint::black_box(_a);
std::hint::black_box(_b);
}https://stackoverflow.com/questions/72563619
复制相似问题