From 0338e0e52d65557b61233f19d982c8db95be3dee Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 13 Feb 2024 15:35:17 -0800 Subject: [PATCH] CodeGen: Implement SIMD math and vector moves This change extends fadd/fsub/fmul/fdiv/fneg and mov to work on qN registers. For arithmetic we assume .4s format (4 single-precision floats). Alternatively we could also make new fmul_4s et al but it seems unlikely that we'll need alternative variants in the future; if we do, we could always rework the code. placeVR mnemonic is the same as placeR3 but I split it so that it is easier to modify in the future. For fneg I've reused placeR1 (it won't print .4s suffix in text disassembly but... it's unclear that we care). Vector moves are weird on A64: they are an alias for ORR (bitwise OR) of a register with itself. --- CodeGen/include/Luau/AssemblyBuilderA64.h | 1 + CodeGen/src/AssemblyBuilderA64.cpp | 70 +++++++++++++++++++---- tests/AssemblyBuilderA64.test.cpp | 12 ++++ 3 files changed, 73 insertions(+), 10 deletions(-) diff --git a/CodeGen/include/Luau/AssemblyBuilderA64.h b/CodeGen/include/Luau/AssemblyBuilderA64.h index 78251012..0c827f64 100644 --- a/CodeGen/include/Luau/AssemblyBuilderA64.h +++ b/CodeGen/include/Luau/AssemblyBuilderA64.h @@ -230,6 +230,7 @@ private: void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op); void placeBFM(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op, int immr, int imms); void placeER(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift); + void placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2); void place(uint32_t word); diff --git a/CodeGen/src/AssemblyBuilderA64.cpp b/CodeGen/src/AssemblyBuilderA64.cpp index de0eb0cd..44bbe5e8 100644 --- a/CodeGen/src/AssemblyBuilderA64.cpp +++ b/CodeGen/src/AssemblyBuilderA64.cpp @@ -63,13 +63,22 @@ AssemblyBuilderA64::~AssemblyBuilderA64() void 
AssemblyBuilderA64::mov(RegisterA64 dst, RegisterA64 src) { - LUAU_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp); - LUAU_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x)); + if (dst.kind != KindA64::q) + { + LUAU_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp); + LUAU_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x)); - if (dst == sp || src == sp) - placeR1("mov", dst, src, 0b00'100010'0'000000000000); + if (dst == sp || src == sp) + placeR1("mov", dst, src, 0b00'100010'0'000000000000); + else + placeSR2("mov", dst, src, 0b01'01010); + } else - placeSR2("mov", dst, src, 0b01'01010); + { + LUAU_ASSERT(dst.kind == src.kind); + + placeR1("mov", dst, src, 0b10'01110'10'1'00000'00011'1 | (src.index << 6)); + } } void AssemblyBuilderA64::mov(RegisterA64 dst, int src) @@ -575,12 +584,18 @@ void AssemblyBuilderA64::fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src placeR3("fadd", dst, src1, src2, 0b11110'01'1, 0b0010'10); } - else + else if (dst.kind == KindA64::s) { LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s); placeR3("fadd", dst, src1, src2, 0b11110'00'1, 0b0010'10); } + else + { + LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q); + + placeVR("fadd", dst, src1, src2, 0b0'01110'0'0'1, 0b11010'1); + } } void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2) @@ -591,12 +606,18 @@ void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src placeR3("fdiv", dst, src1, src2, 0b11110'01'1, 0b0001'10); } - else + else if (dst.kind == KindA64::s) { LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s); placeR3("fdiv", dst, src1, src2, 0b11110'00'1, 0b0001'10); } + else + { + LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && 
src2.kind == KindA64::q); + + placeVR("fdiv", dst, src1, src2, 0b1'01110'00'1, 0b11111'1); + } } void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2) @@ -607,12 +628,18 @@ void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src placeR3("fmul", dst, src1, src2, 0b11110'01'1, 0b0000'10); } - else + else if (dst.kind == KindA64::s) { LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s); placeR3("fmul", dst, src1, src2, 0b11110'00'1, 0b0000'10); } + else + { + LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q); + + placeVR("fmul", dst, src1, src2, 0b1'01110'00'1, 0b11011'1); + } } void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src) @@ -623,12 +650,18 @@ void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src) placeR1("fneg", dst, src, 0b000'11110'01'1'0000'10'10000); } - else + else if (dst.kind == KindA64::s) { LUAU_ASSERT(dst.kind == KindA64::s && src.kind == KindA64::s); placeR1("fneg", dst, src, 0b000'11110'00'1'0000'10'10000); } + else + { + LUAU_ASSERT(dst.kind == KindA64::q && src.kind == KindA64::q); + + placeR1("fneg", dst, src, 0b011'01110'1'0'10000'01111'10); + } } void AssemblyBuilderA64::fsqrt(RegisterA64 dst, RegisterA64 src) @@ -646,12 +679,18 @@ void AssemblyBuilderA64::fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src placeR3("fsub", dst, src1, src2, 0b11110'01'1, 0b0011'10); } - else + else if (dst.kind == KindA64::s) { LUAU_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s); placeR3("fsub", dst, src1, src2, 0b11110'00'1, 0b0011'10); } + else + { + LUAU_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q); + + placeVR("fsub", dst, src1, src2, 0b0'01110'10'1, 0b11010'1); + } } void AssemblyBuilderA64::ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index) @@ -1226,6 +1265,17 @@ void AssemblyBuilderA64::placeER(const char* name, 
RegisterA64 dst, RegisterA64 commit(); } +void AssemblyBuilderA64::placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2) +{ + if (logText) + logAppend(" %-12sv%d.4s,v%d.4s,v%d.4s\n", name, dst.index, src1.index, src2.index); + + LUAU_ASSERT(dst.kind == KindA64::q && dst.kind == src1.kind && dst.kind == src2.kind); + + place(dst.index | (src1.index << 5) | (op2 << 10) | (src2.index << 16) | (op << 21) | (1 << 30)); + commit(); +} + void AssemblyBuilderA64::place(uint32_t word) { LUAU_ASSERT(codePos < codeEnd); diff --git a/tests/AssemblyBuilderA64.test.cpp b/tests/AssemblyBuilderA64.test.cpp index 6657d889..320a7a6a 100644 --- a/tests/AssemblyBuilderA64.test.cpp +++ b/tests/AssemblyBuilderA64.test.cpp @@ -218,6 +218,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves") { SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0); SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0); + SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20); SINGLE_COMPARE(movz(x0, 42), 0xD2800540); SINGLE_COMPARE(movz(w0, 42), 0x52800540); @@ -501,6 +502,15 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing") SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420); } +TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath") +{ + SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420); + SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420); + SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20); + SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20); + SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820); +} + TEST_CASE("LogTest") { AssemblyBuilderA64 build(/* logText= */ true); @@ -552,6 +562,7 @@ TEST_CASE("LogTest") build.ins_4s(q31, 1, q29, 2); build.dup_4s(s29, q31, 2); build.dup_4s(q29, q30, 0); + build.fmul(q0, q1, q2); build.setLabel(l); build.ret(); @@ -594,6 +605,7 @@ TEST_CASE("LogTest") ins v31.s[1],v29.s[2] dup s29,v31.s[2] dup v29.4s,v30.s[0] + fmul v0.4s,v1.4s,v2.4s .L1: ret )";