CodeGen: Implement SIMD math and vector moves

This change extends fadd/fsub/fmul/fdiv/fneg and mov to work on qN registers.
For arithmetic we assume the .4s format (4 single-precision floats).

Alternatively, we could also add new fmul_4s et al., but it seems unlikely that
we'll need alternative variants in the future; if we do, we can always rework
the code.

The placeVR mnemonic is the same as placeR3, but I split it out so that it is
easier to modify in the future. For fneg I've reused placeR1 (it won't print the
.4s suffix in the text disassembly, but it's unclear that we care).

Vector moves are weird on A64: they are an alias for ORR (bitwise OR) of a register with itself.
This commit is contained in:
Arseny Kapoulkine 2024-02-13 15:35:17 -08:00
parent d6c2472f0c
commit 0338e0e52d
3 changed files with 73 additions and 10 deletions

View file

@ -230,6 +230,7 @@ private:
void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op);
void placeBFM(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op, int immr, int imms);
void placeER(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift);
// Places a three-register vector instruction (operands are logged as vN.4s); op and op2 are the opcode fields merged into the instruction word
void placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2);
void place(uint32_t word);

View file

@ -63,13 +63,22 @@ AssemblyBuilderA64::~AssemblyBuilderA64()
// Emits a register-to-register move.
//
// General-purpose operands (w, x, sp): dst and src must be of the same kind, except that sp may be paired with an x register.
// Moves that involve sp are placed via placeR1; every other general-purpose move is placed via placeSR2.
//
// Vector operands (q): both registers must be q. A64 expresses a vector move as a bitwise OR of the source with itself, so the
// source index is additionally merged into the opcode at bit 6.
// NOTE(review): this presumably lands in the second source-register field once placeR1 positions the opcode - confirm against placeR1.
void AssemblyBuilderA64::mov(RegisterA64 dst, RegisterA64 src)
{
    if (dst.kind != KindA64::q)
    {
        LUAU_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp);
        LUAU_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x));

        if (dst == sp || src == sp)
            placeR1("mov", dst, src, 0b00'100010'0'000000000000);
        else
            placeSR2("mov", dst, src, 0b01'01010);
    }
    else
    {
        LUAU_ASSERT(dst.kind == src.kind);

        placeR1("mov", dst, src, 0b10'01110'10'1'00000'00011'1 | (src.index << 6));
    }
}
void AssemblyBuilderA64::mov(RegisterA64 dst, int src)
@ -575,12 +584,18 @@ void AssemblyBuilderA64::fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
placeR3("fadd", dst, src1, src2, 0b11110'01'1, 0b0010'10);
}
else
else if (dst.kind == KindA64::s)
{
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
placeR3("fadd", dst, src1, src2, 0b11110'00'1, 0b0010'10);
}
else
{
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
placeVR("fadd", dst, src1, src2, 0b0'01110'0'0'1, 0b11010'1);
}
}
void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
@ -591,12 +606,18 @@ void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
placeR3("fdiv", dst, src1, src2, 0b11110'01'1, 0b0001'10);
}
else
else if (dst.kind == KindA64::s)
{
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
placeR3("fdiv", dst, src1, src2, 0b11110'00'1, 0b0001'10);
}
else
{
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
placeVR("fdiv", dst, src1, src2, 0b1'01110'00'1, 0b11111'1);
}
}
void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
@ -607,12 +628,18 @@ void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
placeR3("fmul", dst, src1, src2, 0b11110'01'1, 0b0000'10);
}
else
else if (dst.kind == KindA64::s)
{
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
placeR3("fmul", dst, src1, src2, 0b11110'00'1, 0b0000'10);
}
else
{
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
placeVR("fmul", dst, src1, src2, 0b1'01110'00'1, 0b11011'1);
}
}
void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
@ -623,12 +650,18 @@ void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
placeR1("fneg", dst, src, 0b000'11110'01'1'0000'10'10000);
}
else
else if (dst.kind == KindA64::s)
{
LUAU_ASSERT(dst.kind == KindA64::s && src.kind == KindA64::s);
placeR1("fneg", dst, src, 0b000'11110'00'1'0000'10'10000);
}
else
{
LUAU_ASSERT(dst.kind == KindA64::q && src.kind == KindA64::q);
placeR1("fneg", dst, src, 0b011'01110'1'0'10000'01111'10);
}
}
void AssemblyBuilderA64::fsqrt(RegisterA64 dst, RegisterA64 src)
@ -646,12 +679,18 @@ void AssemblyBuilderA64::fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
placeR3("fsub", dst, src1, src2, 0b11110'01'1, 0b0011'10);
}
else
else if (dst.kind == KindA64::s)
{
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
placeR3("fsub", dst, src1, src2, 0b11110'00'1, 0b0011'10);
}
else
{
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
placeVR("fsub", dst, src1, src2, 0b0'01110'10'1, 0b11010'1);
}
}
void AssemblyBuilderA64::ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index)
@ -1226,6 +1265,17 @@ void AssemblyBuilderA64::placeER(const char* name, RegisterA64 dst, RegisterA64
commit();
}
// Places one vector instruction that takes three q registers.
//
// name     - mnemonic used for the text log; operands are printed as vN.4s
// dst      - destination register, stored in bits 0..4
// src1     - first source register, stored in bits 5..9
// src2     - second source register, stored starting at bit 16
// op       - upper opcode field, stored starting at bit 21
// op2      - lower opcode field, stored starting at bit 10
//
// Bit 30 of the emitted word is always set.
void AssemblyBuilderA64::placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2)
{
    if (logText)
    {
        logAppend(" %-12sv%d.4s,v%d.4s,v%d.4s\n", name, dst.index, src1.index, src2.index);
    }

    LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);

    // Assemble the instruction word field by field, from the highest bits down
    uint32_t word = static_cast<uint32_t>(1) << 30;
    word |= static_cast<uint32_t>(op) << 21;
    word |= static_cast<uint32_t>(src2.index) << 16;
    word |= static_cast<uint32_t>(op2) << 10;
    word |= static_cast<uint32_t>(src1.index) << 5;
    word |= static_cast<uint32_t>(dst.index);

    place(word);
    commit();
}
void AssemblyBuilderA64::place(uint32_t word)
{
LUAU_ASSERT(codePos < codeEnd);

View file

@ -218,6 +218,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves")
{
SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0);
SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0);
SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20);
SINGLE_COMPARE(movz(x0, 42), 0xD2800540);
SINGLE_COMPARE(movz(w0, 42), 0x52800540);
@ -501,6 +502,15 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing")
SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420);
}
// Checks the instruction words emitted when the floating-point arithmetic helpers are called with q registers
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath")
{
// Each line pairs one builder call with the single 32-bit word it is expected to produce
// Three-operand forms: destination q0, sources q1 and q2
SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420);
SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420);
SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20);
SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20);
// Two-operand form: destination q0, source q1
SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820);
}
TEST_CASE("LogTest")
{
AssemblyBuilderA64 build(/* logText= */ true);
@ -552,6 +562,7 @@ TEST_CASE("LogTest")
build.ins_4s(q31, 1, q29, 2);
build.dup_4s(s29, q31, 2);
build.dup_4s(q29, q30, 0);
build.fmul(q0, q1, q2);
build.setLabel(l);
build.ret();
@ -594,6 +605,7 @@ TEST_CASE("LogTest")
ins v31.s[1],v29.s[2]
dup s29,v31.s[2]
dup v29.4s,v30.s[0]
fmul v0.4s,v1.4s,v2.4s
.L1:
ret
)";