我在学习AVX-512。我有个关于VORPS的问题。
文档上写着:
EVEX.512.0F.W0 56 /r VORPS zmm1 {k1}{z}, zmm2, zmm3/m512/m32bcst
返回zmm2和zmm3/m512/m32bcst中打包的单精度浮点值的按位逻辑或,受写掩码k1的限制。
EVEX编码版本:第一个源操作数是ZMM/YMM/XMM寄存器。第二个源操作数可以是ZMM/YMM/XMM寄存器、512/256/128位内存位置,或从32位内存位置广播得到的512/256/128位向量。目标操作数是一个ZMM/YMM/XMM寄存器,根据写掩码k1有条件地更新。
参考文献:https://www.felixcloutier.com/x86/orps
“受写掩码k1的限制”(subject to writemask k1)是什么意思?
有人能给出一个具体例子,说明k1在这条指令中起什么作用吗?
我编写了这段代码来对VORPS做一些实验:https://godbolt.org/z/fMcqoa
代码
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
/*
 * Experiment from the question: OR a 64-byte block of 0x01 with a 64-byte
 * block of 0x02 using VORPS on ZMM registers, then hex-dump the two inputs
 * and the result (three 64-byte blocks, 8 bytes per printed row).
 *
 * The scratch buffer lives on the stack and is carved out by hand inside
 * the inline asm; st_data receives its address.
 *
 * NOTE(review): the vorps below carries no {%k1} operand, so none of the
 * mask registers loaded just before it take part in the operation.
 * NOTE(review): rsp is modified inside the asm and never restored; the
 * program leaves through a raw exit syscall instead of returning.
 */
int main()
{
// Output pointer pinned to rbx; it is walked byte by byte in the print loop.
register uint8_t *st_data asm("rbx");
asm volatile(
// Round rsp down to a 64-byte boundary (required by the aligned
// vmovdqa64 loads/stores below)
"andq $~0x3f, %%rsp\n\t"
// Reserve 0x100 bytes of stack as the scratch buffer
"subq $0x100, %%rsp\n\t"
// Hand the buffer address to C through the st_data output operand
"movq %%rsp, %[st_data]\n\t"
// Bytes 0x00..0x3f of the buffer: fill with 0x01
// (rep stosb stores al to [rdi], rcx times)
"movq %%rsp, %%rdi\n\t"
"movl $0x40, %%ecx\n\t"
"movl $0x1, %%eax\n\t"
"rep stosb\n\t"
// Bytes 0x40..0x7f of the buffer: fill with 0x02 (eax bumped from 1 to 2)
"incl %%eax\n\t"
"leaq 0x40(%%rsp), %%rdi\n\t"
"movl $0x40, %%ecx\n\t"
"rep stosb\n\t"
// Load the 0x01 block into ZMM0 and the 0x02 block into ZMM1
"vmovdqa64 (%%rsp), %%zmm0\n\t"
"vmovdqa64 0x40(%%rsp), %%zmm1\n\t"
// Load the same test pattern into k0, k1 and k2
// NOTE(review): k0/k1/k2 are written here but are not in the clobber list.
"movq $0x123456, %%rax\n\t"
"kmovq %%rax, %%k0\n\t"
"kmovq %%rax, %%k1\n\t"
"kmovq %%rax, %%k2\n\t"
// ZMM2 = ZMM1 | ZMM0. No mask register is named on this instruction.
"vorps %%zmm0, %%zmm1, %%zmm2\n\t"
// Store the result into bytes 0x80..0xbf of the buffer
"vmovdqa64 %%zmm2, 0x80(%%rsp)\n\t"
"vzeroupper"
: [st_data]"=r"(st_data)
:
: "rax", "rcx", "rdi", "zmm0", "zmm1",
"zmm2", "memory", "cc"
);
// One label per 64-byte block, in buffer order.
static const char *x[] = {
"Data 1:", "Data 2:", "Result:"
};
// Dump each block as 8 rows of 8 hex bytes; st_data advances through
// all three blocks (192 bytes in total).
for (size_t i = 0; i < 3; i++) {
printf("%s\n", x[i]);
for (size_t j = 0; j < 8; j++) {
for (size_t k = 0; k < 8; k ++) {
printf("%02x ", *st_data++);
}
printf("\n");
}
printf("\n");
}
// Flush explicitly: the raw syscall below bypasses the normal return path.
fflush(stdout);
// Basic asm (no operand list), hence single-% register names.
asm volatile(
// sys_exit (syscall number 0x3c) with exit status 0 in edi
"movl $0x3c, %eax\n\t"
"xorl %edi, %edi\n\t"
"syscall"
);
}在这里,我试图更改k0、k1、k2的值。但结果总是一样的。
Result:
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03

发布于 2021-02-25 04:11:11
掩码寄存器不影响结果,是因为我没有在vorps的目标操作数上指定掩码寄存器。
在AT&T语法中,用法类似于:
# Without z-bit (merge-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}
# With z-bit (zero-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}{z}

在GCC的内联asm中,必须像这样对{}进行转义:
# Without z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}
# With z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}

在这种情况下,z位可以用来将目标操作数中未被掩码选中的元素清零。
带z位
例如,如果在vorps操作之前,zmm2的值是:
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff

并且zmm0和zmm1的值与问题中的相同。
执行以下指令之后:
// Set write mask
"movq $0b11111111, %%rax\n\t"
"kmovq %%rax, %%k1\n\t"
// Execute vorps, store the result to ZMM2
"vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"
// Plug back the result to memory
"vmovdqa64 %%zmm2, 0x80(%[buf])\n\t"

结果是:
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00

如果没有z位,结果将是:
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff

代码示例
链接:https://godbolt.org/z/4rq5M8
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>
/*
 * Example from the answer: demonstrate VORPS with a write mask.
 *
 * ZMM0 is filled with 0x01 bytes, ZMM1 with 0x02 bytes and ZMM2 (the
 * destination) with 0xff bytes. VORPS is then executed under mask k1 and
 * the two inputs plus the result are hex-dumped from buf, 64 bytes each,
 * 8 bytes per printed row.
 */
int main()
{
// 64-byte alignment is needed by the aligned vmovdqa64 stores below.
alignas(64) uint8_t buf[0x100];
// Read cursor used by the print loop.
uint8_t *st_data = buf;
asm(
// Pre-fill the destination ZMM2 with 0xff bytes so that lanes left
// untouched by the masked operation are visible in the output.
"vpternlogd $0xff, %%zmm2, %%zmm2, %%zmm2\n\t"
// Broadcast 0x01010101 to every dword of ZMM0 (all bytes 0x01)
"movl $0x01010101, %%eax\n\t"
"vpbroadcastd %%eax, %%zmm0\n\t"
// Broadcast 0x02020202 to every dword of ZMM1 (all bytes 0x02)
"movl $0x02020202, %%eax\n\t"
"vpbroadcastd %%eax, %%zmm1\n\t"
// Store both inputs to buf[0x00..0x3f] and buf[0x40..0x7f] for printing
"vmovdqa64 %%zmm0, %[buf_0x00]\n\t"
"vmovdqa64 %%zmm1, %[buf_0x40]\n\t"
// k1 = 0b11111111: selects the low 8 of the 16 dword lanes
// (movl into eax also clears the upper half of rax read by kmovq)
"movl $0b11111111, %%eax\n\t"
"kmovq %%rax, %%k1\n\t"
// Merge-masking (no z-bit): selected lanes get ZMM1 | ZMM0, the other
// lanes keep the previous contents of ZMM2
"vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}\n\t"
// Alternative, zero-masking (z-bit): unselected lanes are zeroed
// instead of preserved. Swap with the line above to try it.
// "vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"
// Store the result to buf[0x80..0xbf]
"vmovdqa64 %%zmm2, %[buf_0x80]\n\t"
#ifndef __AVX__
/*
 * Note:
 * If we pass -mavx or -mavx2 or -mavx512* and then we clobber
 * AVX register(s) with inline assembly, then the compiler will
 * yield "vzeroupper" after the inline assembly.
 *
 * So we should only put vzeroupper when there is no AVX flag
 * to prevent duplicate vzeroupper.
 */
"vzeroupper"
#endif
: [buf_0x00]"=m"(*(uint8_t (*)[0x40])(buf + 0x00)),
[buf_0x40]"=m"(*(uint8_t (*)[0x40])(buf + 0x40)),
[buf_0x80]"=m"(*(uint8_t (*)[0x40])(buf + 0x80))
/*
 * Each output is typed as an array of 0x40 bytes so the compiler
 * knows the asm writes a full 64-byte block through each operand.
 */
:
: "rax", "zmm0", "zmm1", "zmm2", "k1"
);
// One label per 64-byte block, in buffer order.
static const char *x[] = {
"Data 1:", "Data 2:", "Result:"
};
// Dump each block as 8 rows of 8 hex bytes; st_data advances through
// the first 192 bytes of buf.
for (size_t i = 0; i < 3; i++) {
printf("%s\n", x[i]);
for (size_t j = 0; j < 8; j++) {
for (size_t k = 0; k < 8; k ++) {
printf("%02x ", *st_data++);
}
printf("\n");
}
printf("\n");
}
return 0;
}https://stackoverflow.com/questions/66355144
复制相似问题