From f666594fb6784a7a4986d04e8fc6e834954c0b23 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 12 Mar 2024 11:10:40 -0700
Subject: [PATCH] CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants

When the input is a constant, we currently use a fairly inefficient sequence of fmov+fcvt+dup or, when the double isn't encodable in fmov, adr+ldr+fcvt+dup.

Instead, we can use the same lowering as X64 when the input is a constant, and load the vector from memory. However, if the constant is encodable via fmov, we can use a vector fmov instead (which is just one instruction and doesn't need constant space).

Fortunately, the bit encoding of fmov for 32-bit floating point numbers matches that of 64-bit: the decoding algorithm is a little different because it expands into a larger exponent, but the values are compatible, so if a double can be encoded into a scalar fmov with a given abcdefgh pattern, the same pattern should encode the same float; due to the very limited number of mantissa and exponent bits, all values that are encodable are also exact in both 32-bit and 64-bit floats.

This strategy is roughly the same as what gcc uses. For complex vectors, we previously used 4 instructions and 8 bytes of constant storage, and now we use 2 instructions and 16 bytes of constant storage, so the memory footprint is the same; for simple vectors we just need 1 instruction (4 bytes).

clang lowers vector constants a little differently, opting to synthesize a 64-bit integer using 4 instructions (mov/movk) and then move it to the vector register - this requires 5 instructions and 20 bytes, vs. 2 instructions and 8+16=24 bytes for ours/gcc. I tried a simpler version of this that would be more compact - synthesize a 32-bit integer constant with mov+movk, and move it to vector register via dup.4s - but this was a little slower on M2, so for now we prefer the slightly larger version as it's not a regression vs. the current implementation.
--- CodeGen/include/Luau/AssemblyBuilderA64.h | 5 +-- CodeGen/src/AssemblyBuilderA64.cpp | 20 ++++++++--- CodeGen/src/IrLoweringA64.cpp | 44 +++++++++++++++++------ tests/AssemblyBuilderA64.test.cpp | 4 +++ 4 files changed, 56 insertions(+), 17 deletions(-) diff --git a/CodeGen/include/Luau/AssemblyBuilderA64.h b/CodeGen/include/Luau/AssemblyBuilderA64.h index bea70fd0..a4d857a4 100644 --- a/CodeGen/include/Luau/AssemblyBuilderA64.h +++ b/CodeGen/include/Luau/AssemblyBuilderA64.h @@ -125,12 +125,12 @@ public: // Address of code (label) void adr(RegisterA64 dst, Label& label); - // Floating-point scalar moves + // Floating-point scalar/vector moves // Note: constant must be compatible with immediate floating point moves (see isFmovSupported) void fmov(RegisterA64 dst, RegisterA64 src); void fmov(RegisterA64 dst, double src); - // Floating-point scalar math + // Floating-point scalar/vector math void fabs(RegisterA64 dst, RegisterA64 src); void fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2); void fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2); @@ -139,6 +139,7 @@ public: void fsqrt(RegisterA64 dst, RegisterA64 src); void fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2); + // Vector component manipulation void ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index); void ins_4s(RegisterA64 dst, uint8_t dstIndex, RegisterA64 src, uint8_t srcIndex); void dup_4s(RegisterA64 dst, RegisterA64 src, uint8_t index); diff --git a/CodeGen/src/AssemblyBuilderA64.cpp b/CodeGen/src/AssemblyBuilderA64.cpp index ffb0a774..9d0522c0 100644 --- a/CodeGen/src/AssemblyBuilderA64.cpp +++ b/CodeGen/src/AssemblyBuilderA64.cpp @@ -557,16 +557,26 @@ void AssemblyBuilderA64::fmov(RegisterA64 dst, RegisterA64 src) void AssemblyBuilderA64::fmov(RegisterA64 dst, double src) { - CODEGEN_ASSERT(dst.kind == KindA64::d); + CODEGEN_ASSERT(dst.kind == KindA64::d || dst.kind == KindA64::q); int imm = getFmovImm(src); CODEGEN_ASSERT(imm >= 0 && imm <= 256); - // fmov can't 
encode 0, but movi can; movi is otherwise not useful for 64-bit fp immediates because it encodes repeating patterns - if (imm == 256) - placeFMOV("movi", dst, src, 0b001'0111100000'000'1110'01'00000); + // fmov can't encode 0, but movi can; movi is otherwise not useful for fp immediates because it encodes repeating patterns + if (dst.kind == KindA64::d) + { + if (imm == 256) + placeFMOV("movi", dst, src, 0b001'0111100000'000'1110'01'00000); + else + placeFMOV("fmov", dst, src, 0b000'11110'01'1'00000000'100'00000 | (imm << 8)); + } else - placeFMOV("fmov", dst, src, 0b000'11110'01'1'00000000'100'00000 | (imm << 8)); + { + if (imm == 256) + placeFMOV("movi.4s", dst, src, 0b010'0111100000'000'0000'01'00000); + else + placeFMOV("fmov.4s", dst, src, 0b010'0111100000'000'1111'0'1'00000 | ((imm >> 5) << 11) | (imm & 31)); + } } void AssemblyBuilderA64::fabs(RegisterA64 dst, RegisterA64 src) diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp index 2a296949..284cef4d 100644 --- a/CodeGen/src/IrLoweringA64.cpp +++ b/CodeGen/src/IrLoweringA64.cpp @@ -12,6 +12,7 @@ #include "lgc.h" LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false) +LUAU_FASTFLAGVARIABLE(LuauCodeGenOptVecA64, false) LUAU_FASTFLAG(LuauCodegenVectorTag2) @@ -1176,17 +1177,40 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regA64 = regs.allocReg(KindA64::q, index); - RegisterA64 tempd = tempDouble(inst.a); - RegisterA64 temps = castReg(KindA64::s, tempd); - RegisterA64 tempw = regs.allocTemp(KindA64::w); - - build.fcvt(temps, tempd); - build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0); - - if (!FFlag::LuauCodegenVectorTag2) + if (FFlag::LuauCodeGenOptVecA64 && FFlag::LuauCodegenVectorTag2 && inst.a.kind == IrOpKind::Constant) { - build.mov(tempw, LUA_TVECTOR); - build.ins_4s(inst.regA64, tempw, 3); + float value = float(doubleOp(inst.a)); + uint32_t asU32; + static_assert(sizeof(asU32) == sizeof(value), "Expecting float to be 32-bit"); + 
memcpy(&asU32, &value, sizeof(value)); + + if (AssemblyBuilderA64::isFmovSupported(value)) + { + build.fmov(inst.regA64, value); + } + else + { + RegisterA64 temp = regs.allocTemp(KindA64::x); + + uint32_t vec[4] = { asU32, asU32, asU32, 0 }; + build.adr(temp, vec, sizeof(vec)); + build.ldr(inst.regA64, temp); + } + } + else + { + RegisterA64 tempd = tempDouble(inst.a); + RegisterA64 temps = castReg(KindA64::s, tempd); + RegisterA64 tempw = regs.allocTemp(KindA64::w); + + build.fcvt(temps, tempd); + build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0); + + if (!FFlag::LuauCodegenVectorTag2) + { + build.mov(tempw, LUA_TVECTOR); + build.ins_4s(inst.regA64, tempw, 3); + } } break; } diff --git a/tests/AssemblyBuilderA64.test.cpp b/tests/AssemblyBuilderA64.test.cpp index 320a7a6a..f7cab8a1 100644 --- a/tests/AssemblyBuilderA64.test.cpp +++ b/tests/AssemblyBuilderA64.test.cpp @@ -452,6 +452,10 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPImm") SINGLE_COMPARE(fmov(d0, 0.125), 0x1E681000); SINGLE_COMPARE(fmov(d0, -0.125), 0x1E781000); + SINGLE_COMPARE(fmov(q0, 0), 0x4F000400); + SINGLE_COMPARE(fmov(q0, 0.125), 0x4F02F400); + SINGLE_COMPARE(fmov(q0, -0.125), 0x4F06F400); + CHECK(!AssemblyBuilderA64::isFmovSupported(-0.0)); CHECK(!AssemblyBuilderA64::isFmovSupported(0.12389)); }