CodeGen: Implement support for vdpps AVX instruction

This will help optimize lowering of vector.dot and vector.normalize
This commit is contained in:
Arseny Kapoulkine 2024-11-08 09:46:13 -08:00
parent 26b2307a8b
commit d2c008c3a8
3 changed files with 9 additions and 0 deletions

View file

@ -167,6 +167,8 @@ public:
void vpshufps(RegisterX64 dst, RegisterX64 src1, OperandX64 src2, uint8_t shuffle);
void vpinsrd(RegisterX64 dst, RegisterX64 src1, OperandX64 src2, uint8_t offset);
void vdpps(OperandX64 dst, OperandX64 src1, OperandX64 src2, uint8_t mask);
// Run final checks
bool finalize();

View file

@ -946,6 +946,11 @@ void AssemblyBuilderX64::vpinsrd(RegisterX64 dst, RegisterX64 src1, OperandX64 s
placeAvx("vpinsrd", dst, src1, src2, offset, 0x22, false, AVX_0F3A, AVX_66);
}
void AssemblyBuilderX64::vdpps(OperandX64 dst, OperandX64 src1, OperandX64 src2, uint8_t mask)
{
placeAvx("vdpps", dst, src1, src2, mask, 0x40, false, AVX_0F3A, AVX_66);
}
bool AssemblyBuilderX64::finalize()
{
code.resize(codePos - code.data());

View file

@ -577,6 +577,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXTernaryInstructionForms")
SINGLE_COMPARE(vpshufps(xmm7, xmm12, xmmword[rcx + r10], 0b11010100), 0xc4, 0xa1, 0x18, 0xc6, 0x3c, 0x11, 0xd4);
SINGLE_COMPARE(vpinsrd(xmm7, xmm12, xmmword[rcx + r10], 2), 0xc4, 0xa3, 0x19, 0x22, 0x3c, 0x11, 0x02);
SINGLE_COMPARE(vdpps(xmm7, xmm12, xmmword[rcx + r10], 2), 0xc4, 0xa3, 0x19, 0x40, 0x3c, 0x11, 0x02);
}
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "MiscInstructions")