mirror of
https://github.com/luau-lang/luau.git
synced 2025-05-04 10:33:46 +01:00
CodeGen: Implement SIMD math and vector moves
This change extends fadd/fsub/fmul/fdiv/fneg and mov to work on qN registers. For arithmetic we assume the .4s format (4 single-precision floats). Alternatively, we could also add new fmul_4s et al., but it seems unlikely that we'll need alternative variants in the future; if we do, we can always rework the code. placeVR mnemonic is the same as placeR3, but I split it out so that it is easier to modify in the future. For fneg I've reused placeR1 (it won't print the .4s suffix in the text disassembly, but it's unclear that we care). Vector moves are weird on A64: they are an alias for an OR of a register with itself.
This commit is contained in:
parent
d6c2472f0c
commit
0338e0e52d
3 changed files with 73 additions and 10 deletions
|
@ -230,6 +230,7 @@ private:
|
|||
void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op);
|
||||
void placeBFM(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op, int immr, int imms);
|
||||
void placeER(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift);
|
||||
void placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2);
|
||||
|
||||
void place(uint32_t word);
|
||||
|
||||
|
|
|
@ -63,13 +63,22 @@ AssemblyBuilderA64::~AssemblyBuilderA64()
|
|||
|
||||
// Emits a register-to-register move.
// - w/x registers (and sp) take the integer path: moves that involve sp are placed via placeR1,
//   all other integer moves via placeSR2.
// - q registers take the vector path; both operands must be q registers.
void AssemblyBuilderA64::mov(RegisterA64 dst, RegisterA64 src)
{
    if (dst.kind != KindA64::q)
    {
        LUAU_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp);
        LUAU_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x));

        if (dst == sp || src == sp)
            placeR1("mov", dst, src, 0b00'100010'0'000000000000);
        else
            placeSR2("mov", dst, src, 0b01'01010);
    }
    else
    {
        LUAU_ASSERT(dst.kind == src.kind);

        // A64 has no dedicated vector move: it is an alias for OR-ing a register with itself,
        // which is why src.index is folded into the opcode a second time here.
        placeR1("mov", dst, src, 0b10'01110'10'1'00000'00011'1 | (src.index << 6));
    }
}
|
||||
|
||||
void AssemblyBuilderA64::mov(RegisterA64 dst, int src)
|
||||
|
@ -575,12 +584,18 @@ void AssemblyBuilderA64::fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
|
|||
|
||||
placeR3("fadd", dst, src1, src2, 0b11110'01'1, 0b0010'10);
|
||||
}
|
||||
else
|
||||
else if (dst.kind == KindA64::s)
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
|
||||
|
||||
placeR3("fadd", dst, src1, src2, 0b11110'00'1, 0b0010'10);
|
||||
}
|
||||
else
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
|
||||
|
||||
placeVR("fadd", dst, src1, src2, 0b0'01110'0'0'1, 0b11010'1);
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
|
||||
|
@ -591,12 +606,18 @@ void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
|
|||
|
||||
placeR3("fdiv", dst, src1, src2, 0b11110'01'1, 0b0001'10);
|
||||
}
|
||||
else
|
||||
else if (dst.kind == KindA64::s)
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
|
||||
|
||||
placeR3("fdiv", dst, src1, src2, 0b11110'00'1, 0b0001'10);
|
||||
}
|
||||
else
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
|
||||
|
||||
placeVR("fdiv", dst, src1, src2, 0b1'01110'00'1, 0b11111'1);
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
|
||||
|
@ -607,12 +628,18 @@ void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
|
|||
|
||||
placeR3("fmul", dst, src1, src2, 0b11110'01'1, 0b0000'10);
|
||||
}
|
||||
else
|
||||
else if (dst.kind == KindA64::s)
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
|
||||
|
||||
placeR3("fmul", dst, src1, src2, 0b11110'00'1, 0b0000'10);
|
||||
}
|
||||
else
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
|
||||
|
||||
placeVR("fmul", dst, src1, src2, 0b1'01110'00'1, 0b11011'1);
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
|
||||
|
@ -623,12 +650,18 @@ void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
|
|||
|
||||
placeR1("fneg", dst, src, 0b000'11110'01'1'0000'10'10000);
|
||||
}
|
||||
else
|
||||
else if (dst.kind == KindA64::s)
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::s && src.kind == KindA64::s);
|
||||
|
||||
placeR1("fneg", dst, src, 0b000'11110'00'1'0000'10'10000);
|
||||
}
|
||||
else
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::q && src.kind == KindA64::q);
|
||||
|
||||
placeR1("fneg", dst, src, 0b011'01110'1'0'10000'01111'10);
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::fsqrt(RegisterA64 dst, RegisterA64 src)
|
||||
|
@ -646,12 +679,18 @@ void AssemblyBuilderA64::fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
|
|||
|
||||
placeR3("fsub", dst, src1, src2, 0b11110'01'1, 0b0011'10);
|
||||
}
|
||||
else
|
||||
else if (dst.kind == KindA64::s)
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
|
||||
|
||||
placeR3("fsub", dst, src1, src2, 0b11110'00'1, 0b0011'10);
|
||||
}
|
||||
else
|
||||
{
|
||||
LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
|
||||
|
||||
placeVR("fsub", dst, src1, src2, 0b0'01110'10'1, 0b11010'1);
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index)
|
||||
|
@ -1226,6 +1265,17 @@ void AssemblyBuilderA64::placeER(const char* name, RegisterA64 dst, RegisterA64
|
|||
commit();
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2)
|
||||
{
|
||||
if (logText)
|
||||
logAppend(" %-12sv%d.4s,v%d.4s,v%d.4s\n", name, dst.index, src1.index, src2.index);
|
||||
|
||||
LUAU_ASSERT(dst.kind == KindA64::q && dst.kind == src1.kind && dst.kind == src2.kind);
|
||||
|
||||
place(dst.index | (src1.index << 5) | (op2 << 10) | (src2.index << 16) | (op << 21) | (1 << 30));
|
||||
commit();
|
||||
}
|
||||
|
||||
void AssemblyBuilderA64::place(uint32_t word)
|
||||
{
|
||||
LUAU_ASSERT(codePos < codeEnd);
|
||||
|
|
|
@ -218,6 +218,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves")
|
|||
{
|
||||
SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0);
|
||||
SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0);
|
||||
SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20);
|
||||
|
||||
SINGLE_COMPARE(movz(x0, 42), 0xD2800540);
|
||||
SINGLE_COMPARE(movz(w0, 42), 0x52800540);
|
||||
|
@ -501,6 +502,15 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing")
|
|||
SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420);
|
||||
}
|
||||
|
||||
// Pins the instruction words expected for the q-register forms of fadd/fsub/fmul/fdiv/fneg.
// Each SINGLE_COMPARE pairs one builder call with the 32-bit word it is expected to produce.
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath")
|
||||
{
|
||||
// Three-operand vector arithmetic: dst = q0, src1 = q1, src2 = q2
SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420);
|
||||
SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420);
|
||||
SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20);
|
||||
SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20);
|
||||
// Two-operand vector negate: dst = q0, src = q1
SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820);
|
||||
}
|
||||
|
||||
TEST_CASE("LogTest")
|
||||
{
|
||||
AssemblyBuilderA64 build(/* logText= */ true);
|
||||
|
@ -552,6 +562,7 @@ TEST_CASE("LogTest")
|
|||
build.ins_4s(q31, 1, q29, 2);
|
||||
build.dup_4s(s29, q31, 2);
|
||||
build.dup_4s(q29, q30, 0);
|
||||
build.fmul(q0, q1, q2);
|
||||
|
||||
build.setLabel(l);
|
||||
build.ret();
|
||||
|
@ -594,6 +605,7 @@ TEST_CASE("LogTest")
|
|||
ins v31.s[1],v29.s[2]
|
||||
dup s29,v31.s[2]
|
||||
dup v29.4s,v30.s[0]
|
||||
fmul v0.4s,v1.4s,v2.4s
|
||||
.L1:
|
||||
ret
|
||||
)";
|
||||
|
|
Loading…
Add table
Reference in a new issue