文章/答案/技术大牛

发布

社区首页 >问答首页 >如何在 Godbolt 中禁用 Rust 的 LLVM 优化(用于asm教育目的)

问如何在 Godbolt 中禁用 Rust 的 LLVM 优化(用于asm教育目的)
EN

Stack Overflow用户

提问于 2022-06-09 16:18:03

回答 2查看 202关注 0票数 1

我正在学习 Rust 和汇编(asm)，并为此使用 Godbolt(Compiler Explorer)。

我有一个程序看起来是：

pub fn test() -> i32 {
    let a = 1;
    let b = 2;
    let c = 3;
    a + b + c
}

我希望输出结果类似于

example::test:
        subq    $16, %rsp
        movl    $1, (%rsp)
        movl    $2, 4(%rsp)
        movl    $3, 8(%rsp)
        movl    (%rsp), %eax
        addl    4(%rsp), %eax
        addl    8(%rsp), %eax
        addq    $16, %rsp
        retq

但实际上我得到了：

example::test:
        mov     eax, 6
        ret

当试图演示栈分配、加法等操作时，这样的输出毫无用处。

我正在使用编译器标志：-Z mir-opt-level=0 -C opt-level=0 -C overflow-checks=off

所以并不是 MIR 把加法优化掉了。MIR 的输出是：

// WARNING: This output format is intended for human consumers only
// and is subject to change without notice. Knock yourself out.
fn test() -> i32 {
    let mut _0: i32;                     // return place in scope 0 at /app/example.rs:2:18: 2:21
    let _1: i32;                         // in scope 0 at /app/example.rs:3:9: 3:10
    let mut _4: i32;                     // in scope 0 at /app/example.rs:6:5: 6:10
    let mut _5: i32;                     // in scope 0 at /app/example.rs:6:5: 6:6
    let mut _6: i32;                     // in scope 0 at /app/example.rs:6:9: 6:10
    let mut _7: i32;                     // in scope 0 at /app/example.rs:6:13: 6:14
    scope 1 {
        debug a => _1;                   // in scope 1 at /app/example.rs:3:9: 3:10
        let _2: i32;                     // in scope 1 at /app/example.rs:4:9: 4:10
        scope 2 {
            debug b => _2;               // in scope 2 at /app/example.rs:4:9: 4:10
            let _3: i32;                 // in scope 2 at /app/example.rs:5:9: 5:10
            scope 3 {
                debug c => _3;           // in scope 3 at /app/example.rs:5:9: 5:10
            }
        }
    }

    bb0: {
        StorageLive(_1);                 // scope 0 at /app/example.rs:3:9: 3:10
        _1 = const 1_i32;                // scope 0 at /app/example.rs:3:13: 3:14
        StorageLive(_2);                 // scope 1 at /app/example.rs:4:9: 4:10
        _2 = const 2_i32;                // scope 1 at /app/example.rs:4:13: 4:14
        StorageLive(_3);                 // scope 2 at /app/example.rs:5:9: 5:10
        _3 = const 3_i32;                // scope 2 at /app/example.rs:5:13: 5:14
        StorageLive(_4);                 // scope 3 at /app/example.rs:6:5: 6:10
        StorageLive(_5);                 // scope 3 at /app/example.rs:6:5: 6:6
        _5 = _1;                         // scope 3 at /app/example.rs:6:5: 6:6
        StorageLive(_6);                 // scope 3 at /app/example.rs:6:9: 6:10
        _6 = _2;                         // scope 3 at /app/example.rs:6:9: 6:10
        _4 = Add(move _5, move _6);      // scope 3 at /app/example.rs:6:5: 6:10
        StorageDead(_6);                 // scope 3 at /app/example.rs:6:9: 6:10
        StorageDead(_5);                 // scope 3 at /app/example.rs:6:9: 6:10
        StorageLive(_7);                 // scope 3 at /app/example.rs:6:13: 6:14
        _7 = _3;                         // scope 3 at /app/example.rs:6:13: 6:14
        _0 = Add(move _4, move _7);      // scope 3 at /app/example.rs:6:5: 6:14
        StorageDead(_7);                 // scope 3 at /app/example.rs:6:13: 6:14
        StorageDead(_4);                 // scope 3 at /app/example.rs:6:13: 6:14
        StorageDead(_3);                 // scope 2 at /app/example.rs:7:1: 7:2
        StorageDead(_2);                 // scope 1 at /app/example.rs:7:1: 7:2
        StorageDead(_1);                 // scope 0 at /app/example.rs:7:1: 7:2
        return;                          // scope 0 at /app/example.rs:7:2: 7:2
    }
}

而 LLVM IR 输出是：

define i32 @_ZN7example4test17h2e9277ab15e59fbdE() unnamed_addr #0 !dbg !5 {
start:
  ret i32 6, !dbg !10
}

attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }

因此，加法是在 MIR->LLVM 这一层被优化掉的。

我怎么才能阻止这一切？

谢谢!

备注

如果我使用元组，优化就不会发生。e.g

pub fn test() -> i32 {
    let a = (1,2,3);
    a.0 + a.1 + a.2
}

变成：

example::test:
        subq    $16, %rsp
        movl    $1, (%rsp)
        movl    $2, 4(%rsp)
        movl    $3, 8(%rsp)
        movl    (%rsp), %eax
        addl    4(%rsp), %eax
        addl    8(%rsp), %eax
        addq    $16, %rsp
        retq

rust

llvm

assembly

optimization

回答 2

Stack Overflow用户

发布于 2022-06-09 16:23:54

有一个black_box提示可以防止计算在编译时发生。

请注意，在撰写本文时，它只在 nightly 版本的 Rust 中可用。

#![feature(bench_black_box)]

pub fn test() -> i32 {
    let a = std::hint::black_box(1);
    let b = std::hint::black_box(2);
    let c = std::hint::black_box(3);
    a + b + c
}

example::test:
        sub     rsp, 12
        mov     dword ptr [rsp], 1
        mov     rax, rsp
        mov     eax, dword ptr [rsp]
        mov     dword ptr [rsp + 4], 2
        lea     rcx, [rsp + 4]
        add     eax, dword ptr [rsp + 4]
        mov     dword ptr [rsp + 8], 3
        lea     rcx, [rsp + 8]
        add     eax, dword ptr [rsp + 8]
        add     rsp, 12
        ret

用rust nightly和-C opt-level=3编译。

https://rust.godbolt.org/z/rMWhao11W

票数 2

Stack Overflow用户

发布于 2022-06-09 18:07:53

将对它们的可变引用传递给其他函数(或内联asm)，以强制它们具有内存地址。在没有定义函数的情况下声明函数的一种方法是extern "C"。

extern "C" {
    fn ext(x: &i32);   // void ext(const int *x);
}

pub fn test(a: i32, b: i32) -> i32 {
    let c = 3;
    unsafe{ ext(&b); }
    //dummy(&c, &a);   // alternative, declare as non-inline an use std::hint::black_box
    a + b + c
}

使用 -C opt-level=0 -C overflow-checks=off(见原文中的 Godbolt 链接)，编译器会把两个参数都溢出(spill)到函数调用前后的内存中。

example::test:
        push    rax                          // align the stack and reserve 8 bytes
        mov     dword ptr [rsp], edi
        mov     dword ptr [rsp + 4], esi
        lea     rdi, [rsp + 4]               // &b
        call    qword ptr [rip + ext@GOTPCREL]  // function call  -fno-plt style
        mov     eax, dword ptr [rsp]         // reload a and b
        add     eax, dword ptr [rsp + 4]
        add     eax, 3                       // constant-propagation for c
        pop     rcx                          // dealloc stack space with a dummy pop
        ret

在不禁用优化的情况下，LLVM按预期保存/恢复一个调用保留寄存器，以在函数调用中保持a。

example::test:
        push    rbx                          // save a call-preserved reg
        sub     rsp, 16

        mov     ebx, edi                     // use it to hold a
        mov     dword ptr [rsp + 12], esi    // spill b
        lea     rdi, [rsp + 12]              // and pass a pointer to it
        call    qword ptr [rip + ext@GOTPCREL]
        mov     eax, dword ptr [rsp + 12]
        add     eax, ebx
        add     eax, 3

        add     rsp, 16                      // epilogue
        pop     rbx
        ret

或者具体地阻止常量折叠，使用函数args而不是常量。见：How to remove "noise" from GCC/clang assembly output?

pub fn test2(a: i32, b: i32) -> i32 {
    let c = 3;
    a + b + c
}

但即使在-C opt-level=0 -C overflow-checks=off上，

Rustc仍然不会像clang -O0那样溢出/重新加载到堆栈空间。

example::test2:
        mov     eax, edi
        add     eax, esi
        add     eax, 3
        ret

(opt-level=3 当然会使用 LEA 而不是 MOV+ADD，但仍然用一条单独的 ADD 来加常数 3，这是在为延迟而不是吞吐量做优化：在 Skylake 这样的 CPU 上，3 分量的 LEA 有 3 个周期延迟，而不是 1 个。而在 Alder Lake 上则不同，lea eax, [rsi+rdi+3] 只有 1 个周期延迟，并且有一个缩放索引。在 Zen 或 Alder Lake 的 E 核上则是 2 个周期，因此与单独的 LEA/ADD 持平，但 uop 更少。https://uops.info/)

#[inline(never)]

这是在How to declare a function without implementing it?上建议的一种获得非内联函数调用的方法。我们可以使用@Finomnis建议的std::hint::black_box来实际使用args，并强制调用方在传递引用时在内存中物化一个值。

在上面的 Godbolt 链接中取消相应代码的注释，即可尝试。

#![feature(bench_black_box)]

pub fn test(a: i32, b: i32) -> i32 {
    let c = 3;
    dummy(&c, &a);
    a + b + c
}


#[inline(never)]
pub extern fn dummy(_a: &i32, _b: &i32) {
    //use std::sync::atomic::*;
    //compiler_fence(Ordering::Release);   // make the function non-empty even without args
    std::hint::black_box(_a);
    std::hint::black_box(_b);
}

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/72563619

复制

相似问题

问如何在 Godbolt 中禁用 Rust 的 LLVM 优化(用于asm教育目的)
EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在 Godbolt 中禁用 Rust 的 LLVM 优化(用于asm教育目的) EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在 Godbolt 中禁用 Rust 的 LLVM 优化(用于asm教育目的)
EN