我在visual中用程序集编写了两个函数,用于我的C++项目。它们是一个无符号64位乘法函数,产生128位结果,而无符号128位除法函数产生128位商,并返回32位余数。
我需要的是函数的签名版本,但我不知道如何实现。
下面是带有无符号函数的.asm文件的代码:
.MODEL flat, stdcall
.CODE
MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
push EAX
push EDX
push EBX
push ECX
push EDI
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
MUL EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
MUL EDX
ADD EAX,ECX
ADC EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
MUL EDX
ADD EAX,EBX
ADC ECX,EDX
PUSHFD ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
MUL EDX
POPFD ; Retrieve carry from above.
ADC EAX,ECX
ADC EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop ECX
pop EBX
pop EDX
pop EAX
ret 20
MUL64 ENDP
IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
; How to make this work?
ret 20
IMUL64 ENDP
DIV128 PROC, pDividend128:DWORD, Divisor:DWORD, pQuotient128:DWORD
push EDX
push EBX
push ESI
push EDI
MOV ESI,pDividend128
MOV EDI,pQuotient128
MOV EBX,Divisor
XOR EDX,EDX
MOV EAX,[ESI+12]
DIV EBX
MOV [EDI+12],EAX
MOV EAX,[ESI+8]
DIV EBX
MOV [EDI+8],EAX
MOV EAX,[ESI+4]
DIV EBX
MOV [EDI+4],EAX
MOV EAX,[ESI]
DIV EBX
MOV [EDI],EAX
MOV EAX,EDX
pop EDI
pop ESI
pop EBX
pop EDX
ret 12
DIV128 ENDP
IDIV128 PROC, pDividend128:DWORD, Divisor:DWORD, pQuotient128:DWORD
; How to make this work?
ret 12
IDIV128 ENDP
END如果您发现这在任何情况下都有帮助,请通过帮助编写函数的签名版本来帮助项目。
发布于 2018-02-17 06:00:40
首先,MUL64函数不能100%工作。
如果尝试执行0xFFFFFFFFFFFFF x 0xFFFFFFFFFFFFFFFFFFF,Hi 64位结果为0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
要解决这个问题,POPFD指令后的进位标志应该添加到EDX中,这是结果中最高的32位部分。现在,按照Peter的建议,删除EAX/ECX/EDX的推送和弹出。最后,使用setc BL和movzx EBX,BL保存标志。注意:您不能轻易地使用xor EBX,EBX对其进行零设置,因为xor会影响标志。我们使用movzx,因为它比add BL,0xFF和add更快,比基于Skylake规范的adc更快。
结果:
MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
push EBX
push EDI
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
mul EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
mul EDX
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
mul EDX
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
movzx EBX,BL ; Zero-Extend carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
mul EDX
add EDX,EBX ; Add carry from above.
add EAX,ECX
adc EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop EBX
ret 20
MUL64 ENDP现在,要使函数的有符号版本使用以下公式:
my128.Hi -= (((A < 0) ? B : 0) + ((B < 0) ? A : 0));
结果:
IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
push EBX
push EDI
mov EDI,pi128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
mul EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
mul EDX
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
mul EDX
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
movzx EBX,BL ; Zero-Extend carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
mul EDX
add EDX,EBX ; Add carry from above.
add EAX,ECX
adc EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
; Signed version only:
cmp DWORD PTR A+4,0
jg zero_b
jl use_b
cmp DWORD PTR A,0
jae zero_b
use_b:
mov ECX,DWORD PTR B
mov EBX,DWORD PTR B+4
jmp test_b
zero_b:
xor ECX,ECX
mov EBX,ECX
test_b:
cmp DWORD PTR B+4,0
jg zero_a
jl use_a
cmp DWORD PTR B,0
jae zero_a
use_a:
mov EAX,DWORD PTR A
mov EDX,DWORD PTR A+4
jmp do_last_op
zero_a:
xor EAX,EAX
mov EDX,EAX
do_last_op:
add EAX,ECX
adc EDX,EBX
sub [EDI+8],EAX
sbb [EDI+12],EDX
; End of signed version!
pop EDI
pop EBX
ret 20
IMUL64 ENDP对于从32位除数中获得128位商来说,DIV128函数应该很好(可能也是最快的),但是如果您需要使用128位除数,那么看看这个代码https://www.codeproject.com/Tips/785014/UInt-Division-Modulus,它有一个使用二进制移位算法进行128位除法的例子。如果在程序集中编写,它可能会快3倍。
要制作DIV128的签名版本,首先确定除数和派息的符号是相同的还是不同的。如果它们是相同的,那么结果应该是积极的。如果他们是不同的,那么结果应该是负面的。所以..。如果分红和除数为负值,则将其设为正,并调用DIV128,然后,如果符号不同,则否定结果。
下面是用C++编写的一些示例代码
VOID IDIV128(PSDQWORD Dividend, PSDQWORD Divisor, PSDQWORD Quotient, PSDQWORD Remainder)
{
BOOL Negate;
DQWORD DD, DV;
Negate = TRUE;
// Use local DD and DV so Dividend and Divisor dont get currupted.
DD.Lo = Dividend->Lo;
DD.Hi = Dividend->Hi;
DV.Lo = Divisor->Lo;
DV.Hi = Divisor->Hi;
// if the signs are the same then: Negate = FALSE;
if ((DD.Hi & 0x8000000000000000) == (DV.Hi & 0x8000000000000000)) Negate = FALSE;
// Covert Dividend and Divisor to possitive if negative: (negate)
if (DD.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DD);
if (DV.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DV);
DIV128(&DD, &DV, (PDQWORD)Quotient, (PDQWORD)Remainder);
if (Negate == TRUE)
{
NEG128(Quotient);
NEG128(Remainder);
}
}编辑:
按照Peter的建议,我们可以更多地优化MUL64/IMUL64。请查看评论,了解正在进行的具体更改。我还将MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD替换为MUL64@20:和IMUL64@20:,以消除masm添加的不必要的EBP使用。我还优化了IMUL64的标记修复工作。
MUL64/IMUL64 64的当前.asm文件
.MODEL flat, stdcall
EXTERNDEF MUL64@20 :PROC
EXTERNDEF IMUL64@20 :PROC
.CODE
MUL64@20:
push EBX
push EDI
; -----------------
; | pu128 |
; |---------------|
; | B |
; |---------------|
; | A |
; |---------------|
; | ret address |
; |---------------|
; | EBX |
; |---------------|
; ESP---->| EDI |
; -----------------
A TEXTEQU <[ESP+12]>
B TEXTEQU <[ESP+20]>
pu128 TEXTEQU <[ESP+28]>
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mul DWORD PTR B
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mul DWORD PTR B+4
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B+4
add EAX,ECX
movzx ECX,BL ; Zero-Extend saved carry from above.
adc EDX,ECX
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop EBX
ret 20
IMUL64@20:
push EBX
push EDI
; -----------------
; | pi128 |
; |---------------|
; | B |
; |---------------|
; | A |
; |---------------|
; | ret address |
; |---------------|
; | EBX |
; |---------------|
; ESP---->| EDI |
; -----------------
A TEXTEQU <[ESP+12]>
B TEXTEQU <[ESP+20]>
pi128 TEXTEQU <[ESP+28]>
mov EDI,pi128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mul DWORD PTR B
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mul DWORD PTR B+4
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B+4
add EAX,ECX
movzx ECX,BL ; Zero-Extend saved carry from above.
adc EDX,ECX
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
; Signed version only:
mov BL,BYTE PTR B+7
and BL,80H
jz zero_a
mov EAX,DWORD PTR A
mov EDX,DWORD PTR A+4
jmp test_a
zero_a:
xor EAX,EAX
mov EDX,EAX
test_a:
mov BL,BYTE PTR A+7
and BL,80H
jz do_last_op
add EAX,DWORD PTR B
adc EDX,DWORD PTR B+4
do_last_op:
sub [EDI+8],EAX
sbb [EDI+12],EDX
; End of signed version!
pop EDI
pop EBX
ret 20
ENDhttps://stackoverflow.com/questions/48669484
复制相似问题