From 24cacc94edaae6311f668e8c8cb4b223ff39cf24 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 16 Jan 2025 10:48:27 -0800 Subject: [PATCH] CodeGen: Implement support for math.lerp lowering (#1609) To implement math.lerp without branches, we add SELECT_NUM which selects one of the two inputs based on the comparison condition. For simplicity, we only support C == D for now; this can be extended to a more generic version with an IrCondition operand E, but that requires more work on the SSE side (to flip the comparison for some conditions like Greater, and expose a more generic vcmpsd). Note: On AArch64 this will effectively result in a change in floating point behavior between native code and non-native code: clang synthesizes fmadd (because floating point contraction is allowed by default, and the arch always has the instruction), whereas this change will use fmul+fadd. I am not sure if this is good or bad, and if this is a problem in C or not. Specifically, clang's behavior results in different results between X64 and AArch64 when *not* using codegen, and with this change the behavior when using codegen is... the same? :) Fixing this will require either using LERP_NUM instead and hand-coding lowering, or exposing some sort of "quasi" MADD_NUM (which would lower to fma on AArch64 and mul+add on X64). A small benefit to the current approach is that `lerp(1, 5, t)` constant-folds the subtraction. With LERP_NUM this optimization will need to be implemented manually as a partial constant-folding for LERP_NUM. A similar problem exists today for vector.cross & vector.dot. So maybe this is not something we need to fix, unsure. 
--- CodeGen/include/Luau/AssemblyBuilderX64.h | 1 + CodeGen/include/Luau/IrData.h | 5 +++ CodeGen/include/Luau/IrUtils.h | 1 + CodeGen/src/AssemblyBuilderX64.cpp | 5 +++ CodeGen/src/IrDump.cpp | 2 ++ CodeGen/src/IrLoweringA64.cpp | 15 +++++++++ CodeGen/src/IrLoweringX64.cpp | 25 +++++++++++++++ CodeGen/src/IrTranslateBuiltins.cpp | 39 +++++++++++++++++++++++ CodeGen/src/IrUtils.cpp | 12 +++++++ CodeGen/src/OptimizeConstProp.cpp | 1 + tests/AssemblyBuilderX64.test.cpp | 1 + tests/conformance/math.lua | 1 + 12 files changed, 108 insertions(+) diff --git a/CodeGen/include/Luau/AssemblyBuilderX64.h b/CodeGen/include/Luau/AssemblyBuilderX64.h index 30790ee5..ca5fa7a9 100644 --- a/CodeGen/include/Luau/AssemblyBuilderX64.h +++ b/CodeGen/include/Luau/AssemblyBuilderX64.h @@ -160,6 +160,7 @@ public: void vmaxsd(OperandX64 dst, OperandX64 src1, OperandX64 src2); void vminsd(OperandX64 dst, OperandX64 src1, OperandX64 src2); + void vcmpeqsd(OperandX64 dst, OperandX64 src1, OperandX64 src2); void vcmpltsd(OperandX64 dst, OperandX64 src1, OperandX64 src2); void vblendvpd(RegisterX64 dst, RegisterX64 src1, OperandX64 mask, RegisterX64 src3); diff --git a/CodeGen/include/Luau/IrData.h b/CodeGen/include/Luau/IrData.h index 779fe012..44f2495b 100644 --- a/CodeGen/include/Luau/IrData.h +++ b/CodeGen/include/Luau/IrData.h @@ -185,6 +185,11 @@ enum class IrCmd : uint8_t // A: double SIGN_NUM, + // Select B if C == D, otherwise select A + // A, B: double (endpoints) + // C, D: double (condition arguments) + SELECT_NUM, + // Add/Sub/Mul/Div/Idiv two vectors // A, B: TValue ADD_VEC, diff --git a/CodeGen/include/Luau/IrUtils.h b/CodeGen/include/Luau/IrUtils.h index 773b23a6..1afa1a34 100644 --- a/CodeGen/include/Luau/IrUtils.h +++ b/CodeGen/include/Luau/IrUtils.h @@ -174,6 +174,7 @@ inline bool hasResult(IrCmd cmd) case IrCmd::SQRT_NUM: case IrCmd::ABS_NUM: case IrCmd::SIGN_NUM: + case IrCmd::SELECT_NUM: case IrCmd::ADD_VEC: case IrCmd::SUB_VEC: case IrCmd::MUL_VEC: diff --git 
a/CodeGen/src/AssemblyBuilderX64.cpp b/CodeGen/src/AssemblyBuilderX64.cpp index 803732e2..1fb1b671 100644 --- a/CodeGen/src/AssemblyBuilderX64.cpp +++ b/CodeGen/src/AssemblyBuilderX64.cpp @@ -927,6 +927,11 @@ void AssemblyBuilderX64::vminsd(OperandX64 dst, OperandX64 src1, OperandX64 src2 placeAvx("vminsd", dst, src1, src2, 0x5d, false, AVX_0F, AVX_F2); } +void AssemblyBuilderX64::vcmpeqsd(OperandX64 dst, OperandX64 src1, OperandX64 src2) +{ + placeAvx("vcmpeqsd", dst, src1, src2, 0x00, 0xc2, false, AVX_0F, AVX_F2); +} + void AssemblyBuilderX64::vcmpltsd(OperandX64 dst, OperandX64 src1, OperandX64 src2) { placeAvx("vcmpltsd", dst, src1, src2, 0x01, 0xc2, false, AVX_0F, AVX_F2); diff --git a/CodeGen/src/IrDump.cpp b/CodeGen/src/IrDump.cpp index fe6a2397..dcc9d879 100644 --- a/CodeGen/src/IrDump.cpp +++ b/CodeGen/src/IrDump.cpp @@ -169,6 +169,8 @@ const char* getCmdName(IrCmd cmd) return "ABS_NUM"; case IrCmd::SIGN_NUM: return "SIGN_NUM"; + case IrCmd::SELECT_NUM: + return "SELECT_NUM"; case IrCmd::ADD_VEC: return "ADD_VEC"; case IrCmd::SUB_VEC: diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp index c7fcac27..d29755f1 100644 --- a/CodeGen/src/IrLoweringA64.cpp +++ b/CodeGen/src/IrLoweringA64.cpp @@ -13,6 +13,7 @@ LUAU_FASTFLAG(LuauVectorLibNativeDot) LUAU_FASTFLAG(LuauCodeGenVectorDeadStoreElim) +LUAU_FASTFLAG(LuauCodeGenLerp) namespace Luau { @@ -703,6 +704,20 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) build.fcsel(inst.regA64, temp1, inst.regA64, getConditionFP(IrCondition::Less)); break; } + case IrCmd::SELECT_NUM: + { + LUAU_ASSERT(FFlag::LuauCodeGenLerp); + inst.regA64 = regs.allocReuse(KindA64::d, index, {inst.a, inst.b, inst.c, inst.d}); + + RegisterA64 temp1 = tempDouble(inst.a); + RegisterA64 temp2 = tempDouble(inst.b); + RegisterA64 temp3 = tempDouble(inst.c); + RegisterA64 temp4 = tempDouble(inst.d); + + build.fcmp(temp3, temp4); + build.fcsel(inst.regA64, temp2, temp1, 
getConditionFP(IrCondition::Equal)); + break; + } case IrCmd::ADD_VEC: { inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b}); diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp index 814c6d8c..c1a84c8e 100644 --- a/CodeGen/src/IrLoweringX64.cpp +++ b/CodeGen/src/IrLoweringX64.cpp @@ -17,6 +17,7 @@ LUAU_FASTFLAG(LuauVectorLibNativeDot) LUAU_FASTFLAG(LuauCodeGenVectorDeadStoreElim) +LUAU_FASTFLAG(LuauCodeGenLerp) namespace Luau { @@ -622,6 +623,30 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) build.vblendvpd(inst.regX64, tmp1.reg, build.f64x2(1, 1), inst.regX64); break; } + case IrCmd::SELECT_NUM: + { + LUAU_ASSERT(FFlag::LuauCodeGenLerp); + inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.c, inst.d}); // can't reuse b if a is a memory operand + + ScopedRegX64 tmp{regs, SizeX64::xmmword}; + + if (inst.c.kind == IrOpKind::Inst) + build.vcmpeqsd(tmp.reg, regOp(inst.c), memRegDoubleOp(inst.d)); + else + { + build.vmovsd(tmp.reg, memRegDoubleOp(inst.c)); + build.vcmpeqsd(tmp.reg, tmp.reg, memRegDoubleOp(inst.d)); + } + + if (inst.a.kind == IrOpKind::Inst) + build.vblendvpd(inst.regX64, regOp(inst.a), memRegDoubleOp(inst.b), tmp.reg); + else + { + build.vmovsd(inst.regX64, memRegDoubleOp(inst.a)); + build.vblendvpd(inst.regX64, inst.regX64, memRegDoubleOp(inst.b), tmp.reg); + } + break; + } case IrCmd::ADD_VEC: { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b}); diff --git a/CodeGen/src/IrTranslateBuiltins.cpp b/CodeGen/src/IrTranslateBuiltins.cpp index ebded522..a5fa3ad0 100644 --- a/CodeGen/src/IrTranslateBuiltins.cpp +++ b/CodeGen/src/IrTranslateBuiltins.cpp @@ -15,6 +15,7 @@ static const int kBit32BinaryOpUnrolledParams = 5; LUAU_FASTFLAGVARIABLE(LuauVectorLibNativeCodegen); LUAU_FASTFLAGVARIABLE(LuauVectorLibNativeDot); +LUAU_FASTFLAGVARIABLE(LuauCodeGenLerp); namespace Luau { @@ -284,6 +285,42 @@ static BuiltinImplResult 
translateBuiltinMathClamp( return {BuiltinImplType::UsesFallback, 1}; } +static BuiltinImplResult translateBuiltinMathLerp( + IrBuilder& build, + int nparams, + int ra, + int arg, + IrOp args, + IrOp arg3, + int nresults, + IrOp fallback, + int pcpos +) +{ + LUAU_ASSERT(FFlag::LuauCodeGenLerp); + + if (nparams < 3 || nresults > 1) + return {BuiltinImplType::None, -1}; + + builtinCheckDouble(build, build.vmReg(arg), pcpos); + builtinCheckDouble(build, args, pcpos); + builtinCheckDouble(build, arg3, pcpos); + + IrOp a = builtinLoadDouble(build, build.vmReg(arg)); + IrOp b = builtinLoadDouble(build, args); + IrOp t = builtinLoadDouble(build, arg3); + + IrOp l = build.inst(IrCmd::ADD_NUM, a, build.inst(IrCmd::MUL_NUM, build.inst(IrCmd::SUB_NUM, b, a), t)); + IrOp r = build.inst(IrCmd::SELECT_NUM, l, b, t, build.constDouble(1.0)); // select on t==1.0 + + build.inst(IrCmd::STORE_DOUBLE, build.vmReg(ra), r); + + if (ra != arg) + build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER)); + + return {BuiltinImplType::Full, 1}; +} + static BuiltinImplResult translateBuiltinMathUnary(IrBuilder& build, IrCmd cmd, int nparams, int ra, int arg, int nresults, int pcpos) { if (nparams < 1 || nresults > 1) @@ -1387,6 +1424,8 @@ BuiltinImplResult translateBuiltin( case LBF_VECTOR_MAX: return FFlag::LuauVectorLibNativeCodegen ? translateBuiltinVectorMap2(build, IrCmd::MAX_NUM, nparams, ra, arg, args, arg3, nresults, pcpos) : noneResult; + case LBF_MATH_LERP: + return FFlag::LuauCodeGenLerp ? 
translateBuiltinMathLerp(build, nparams, ra, arg, args, arg3, nresults, fallback, pcpos) : noneResult; default: return {BuiltinImplType::None, -1}; } diff --git a/CodeGen/src/IrUtils.cpp b/CodeGen/src/IrUtils.cpp index 5f384807..74bbc6d7 100644 --- a/CodeGen/src/IrUtils.cpp +++ b/CodeGen/src/IrUtils.cpp @@ -13,6 +13,7 @@ #include LUAU_FASTFLAG(LuauVectorLibNativeDot); +LUAU_FASTFLAG(LuauCodeGenLerp); namespace Luau { @@ -70,6 +71,7 @@ IrValueKind getCmdValueKind(IrCmd cmd) case IrCmd::SQRT_NUM: case IrCmd::ABS_NUM: case IrCmd::SIGN_NUM: + case IrCmd::SELECT_NUM: return IrValueKind::Double; case IrCmd::ADD_VEC: case IrCmd::SUB_VEC: @@ -656,6 +658,16 @@ void foldConstants(IrBuilder& build, IrFunction& function, IrBlock& block, uint3 substitute(function, inst, build.constDouble(v > 0.0 ? 1.0 : v < 0.0 ? -1.0 : 0.0)); } break; + case IrCmd::SELECT_NUM: + LUAU_ASSERT(FFlag::LuauCodeGenLerp); + if (inst.c.kind == IrOpKind::Constant && inst.d.kind == IrOpKind::Constant) + { + double c = function.doubleOp(inst.c); + double d = function.doubleOp(inst.d); + + substitute(function, inst, c == d ? 
inst.b : inst.a); + } + break; case IrCmd::NOT_ANY: if (inst.a.kind == IrOpKind::Constant) { diff --git a/CodeGen/src/OptimizeConstProp.cpp b/CodeGen/src/OptimizeConstProp.cpp index 5920f7cc..ce44f5d1 100644 --- a/CodeGen/src/OptimizeConstProp.cpp +++ b/CodeGen/src/OptimizeConstProp.cpp @@ -1382,6 +1382,7 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction& case IrCmd::SQRT_NUM: case IrCmd::ABS_NUM: case IrCmd::SIGN_NUM: + case IrCmd::SELECT_NUM: case IrCmd::NOT_ANY: state.substituteOrRecord(inst, index); break; diff --git a/tests/AssemblyBuilderX64.test.cpp b/tests/AssemblyBuilderX64.test.cpp index 504e40e4..fd1deccf 100644 --- a/tests/AssemblyBuilderX64.test.cpp +++ b/tests/AssemblyBuilderX64.test.cpp @@ -506,6 +506,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXBinaryInstructionForms") SINGLE_COMPARE(vmaxsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x5f, 0xc6); SINGLE_COMPARE(vminsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x5d, 0xc6); + SINGLE_COMPARE(vcmpeqsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0xc2, 0xc6, 0x00); SINGLE_COMPARE(vcmpltsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0xc2, 0xc6, 0x01); } diff --git a/tests/conformance/math.lua b/tests/conformance/math.lua index fbd8f9dd..586023ed 100644 --- a/tests/conformance/math.lua +++ b/tests/conformance/math.lua @@ -408,6 +408,7 @@ assert(math.lerp(1, 5, 1) == 5) assert(math.lerp(1, 5, 0.5) == 3) assert(math.lerp(1, 5, 1.5) == 7) assert(math.lerp(1, 5, -0.5) == -1) +assert(math.lerp(1, 5, noinline(0.5)) == 3) -- lerp properties local sq2, sq3 = math.sqrt(2), math.sqrt(3)