diff --git a/CodeGen/include/Luau/AssemblyBuilderA64.h b/CodeGen/include/Luau/AssemblyBuilderA64.h
index a86403d4..bea70fd0 100644
--- a/CodeGen/include/Luau/AssemblyBuilderA64.h
+++ b/CodeGen/include/Luau/AssemblyBuilderA64.h
@@ -211,7 +211,6 @@ private:
     void placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift = 0, int N = 0);
     void placeSR2(const char* name, RegisterA64 dst, RegisterA64 src, uint8_t op, uint8_t op2 = 0);
     void placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t op2);
-    void placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t sizes, uint8_t op, uint8_t op2);
     void placeR1(const char* name, RegisterA64 dst, RegisterA64 src, uint32_t op);
     void placeI12(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op);
     void placeI16(const char* name, RegisterA64 dst, int src, uint8_t op, int shift = 0);
@@ -230,6 +229,7 @@ private:
     void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op);
     void placeBFM(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op, int immr, int imms);
     void placeER(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift);
+    void placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2);
 
     void place(uint32_t word);
 
diff --git a/CodeGen/include/Luau/IrData.h b/CodeGen/include/Luau/IrData.h
index 129536d1..1d9bbc73 100644
--- a/CodeGen/include/Luau/IrData.h
+++ b/CodeGen/include/Luau/IrData.h
@@ -304,7 +304,11 @@ enum class IrCmd : uint8_t
 
     // Converts a double number to a vector with the value in X/Y/Z
     // A: double
-    NUM_TO_VECTOR,
+    NUM_TO_VEC,
+
+    // Adds VECTOR type tag to a vector, preserving X/Y/Z components
+    // A: TValue
+    TAG_VECTOR,
 
     // Adjust stack top (L->top) to point at 'B' TValues *after* the specified register
     // This is used to return multiple values
diff --git a/CodeGen/include/Luau/IrUtils.h b/CodeGen/include/Luau/IrUtils.h
index 47ef505b..0c8495e8 100644
--- a/CodeGen/include/Luau/IrUtils.h
+++ b/CodeGen/include/Luau/IrUtils.h
@@ -186,7 +186,8 @@ inline bool hasResult(IrCmd cmd)
     case IrCmd::UINT_TO_NUM:
     case IrCmd::NUM_TO_INT:
     case IrCmd::NUM_TO_UINT:
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
+    case IrCmd::TAG_VECTOR:
     case IrCmd::SUBSTITUTE:
     case IrCmd::INVOKE_FASTCALL:
     case IrCmd::BITAND_UINT:
diff --git a/CodeGen/src/AssemblyBuilderA64.cpp b/CodeGen/src/AssemblyBuilderA64.cpp
index 96d17192..42374ca5 100644
--- a/CodeGen/src/AssemblyBuilderA64.cpp
+++ b/CodeGen/src/AssemblyBuilderA64.cpp
@@ -66,10 +66,10 @@ void AssemblyBuilderA64::mov(RegisterA64 dst, RegisterA64 src)
     CODEGEN_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp);
     CODEGEN_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x));
 
-    if (dst == sp || src == sp)
-        placeR1("mov", dst, src, 0b00'100010'0'000000000000);
-    else
-        placeSR2("mov", dst, src, 0b01'01010);
+    if (dst == sp || src == sp)
+        placeR1("mov", dst, src, 0b00'100010'0'000000000000);
+    else
+        placeSR2("mov", dst, src, 0b01'01010);
 }
 
 void AssemblyBuilderA64::mov(RegisterA64 dst, int src)
@@ -575,12 +575,18 @@ void AssemblyBuilderA64::fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
 
         placeR3("fadd", dst, src1, src2, 0b11110'01'1, 0b0010'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
         CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fadd", dst, src1, src2, 0b11110'00'1, 0b0010'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fadd", dst, src1, src2, 0b0'01110'0'0'1, 0b11010'1);
+    }
 }
 
 void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
@@ -591,12 +597,18 @@ void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
 
         placeR3("fdiv", dst, src1, src2, 0b11110'01'1, 0b0001'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
         CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fdiv", dst, src1, src2, 0b11110'00'1, 0b0001'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fdiv", dst, src1, src2, 0b1'01110'00'1, 0b11111'1);
+    }
 }
 
 void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
@@ -607,12 +619,18 @@ void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
 
         placeR3("fmul", dst, src1, src2, 0b11110'01'1, 0b0000'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
         CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fmul", dst, src1, src2, 0b11110'00'1, 0b0000'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fmul", dst, src1, src2, 0b1'01110'00'1, 0b11011'1);
+    }
 }
 
 void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
@@ -623,12 +641,18 @@ void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
 
         placeR1("fneg", dst, src, 0b000'11110'01'1'0000'10'10000);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
         CODEGEN_ASSERT(dst.kind == KindA64::s && src.kind == KindA64::s);
 
         placeR1("fneg", dst, src, 0b000'11110'00'1'0000'10'10000);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src.kind == KindA64::q);
+
+        placeR1("fneg", dst, src, 0b011'01110'1'0'10000'01111'10);
+    }
 }
 
 void AssemblyBuilderA64::fsqrt(RegisterA64 dst, RegisterA64 src)
@@ -646,12 +670,18 @@ void AssemblyBuilderA64::fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
 
         placeR3("fsub", dst, src1, src2, 0b11110'01'1, 0b0011'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
         CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fsub", dst, src1, src2, 0b11110'00'1, 0b0011'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fsub", dst, src1, src2, 0b0'01110'10'1, 0b11010'1);
+    }
 }
 
 void AssemblyBuilderA64::ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index)
@@ -1226,6 +1256,17 @@ void AssemblyBuilderA64::placeER(const char* name, RegisterA64 dst, RegisterA64
     commit();
 }
 
+void AssemblyBuilderA64::placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2)
+{
+    if (logText)
+        logAppend(" %-12sv%d.4s,v%d.4s,v%d.4s\n", name, dst.index, src1.index, src2.index);
+
+    CODEGEN_ASSERT(dst.kind == KindA64::q && dst.kind == src1.kind && dst.kind == src2.kind);
+
+    place(dst.index | (src1.index << 5) | (op2 << 10) | (src2.index << 16) | (op << 21) | (1 << 30));
+    commit();
+}
+
 void AssemblyBuilderA64::place(uint32_t word)
 {
     CODEGEN_ASSERT(codePos < codeEnd);
diff --git a/CodeGen/src/IrDump.cpp b/CodeGen/src/IrDump.cpp
index de7a7fa4..9a115953 100644
--- a/CodeGen/src/IrDump.cpp
+++ b/CodeGen/src/IrDump.cpp
@@ -205,8 +205,10 @@ const char* getCmdName(IrCmd cmd)
         return "NUM_TO_INT";
     case IrCmd::NUM_TO_UINT:
         return "NUM_TO_UINT";
-    case IrCmd::NUM_TO_VECTOR:
-        return "NUM_TO_VECTOR";
+    case IrCmd::NUM_TO_VEC:
+        return "NUM_TO_VEC";
+    case IrCmd::TAG_VECTOR:
+        return "TAG_VECTOR";
     case IrCmd::ADJUST_STACK_TO_REG:
         return "ADJUST_STACK_TO_REG";
     case IrCmd::ADJUST_STACK_TO_TOP:
diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp
index 681c56ec..6a5703d1 100644
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@@ -12,6 +12,9 @@
 #include "lgc.h"
 
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
+LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
+
+LUAU_FASTFLAG(LuauCodegenVectorTag)
 
 namespace Luau
 {
@@ -673,15 +676,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fadd(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b));
+
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fadd(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -689,15 +706,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fsub(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b));
+
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fsub(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -705,15 +736,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fmul(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b));
+
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fmul(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -721,15 +766,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fdiv(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b));
+
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fdiv(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -737,13 +796,27 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.fneg(tempa, tempa);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            build.fneg(inst.regA64, regOp(inst.a));
+
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.fneg(tempa, tempa);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -1100,7 +1173,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.fcvtzs(castReg(KindA64::x, inst.regA64), temp);
         break;
     }
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
     {
         inst.regA64 = regs.allocReg(KindA64::q, index);
 
@@ -1111,6 +1184,23 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.fcvt(temps, tempd);
         build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0);
 
+        if (!FFlag::LuauCodegenVectorTag)
+        {
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        break;
+    }
+    case IrCmd::TAG_VECTOR:
+    {
+        inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
+
+        RegisterA64 reg = regOp(inst.a);
+        RegisterA64 tempw = regs.allocTemp(KindA64::w);
+
+        if (inst.regA64 != reg)
+            build.mov(inst.regA64, reg);
+
         build.mov(tempw, LUA_TVECTOR);
         build.ins_4s(inst.regA64, tempw, 3);
         break;
diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp
index babfdf46..c5188dc4 100644
--- a/CodeGen/src/IrLoweringX64.cpp
+++ b/CodeGen/src/IrLoweringX64.cpp
@@ -15,6 +15,8 @@
 #include "lstate.h"
 #include "lgc.h"
 
+LUAU_FASTFLAG(LuauCodegenVectorTag)
+
 namespace Luau
 {
 namespace CodeGen
@@ -608,7 +610,9 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vaddps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
+
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
     }
     case IrCmd::SUB_VEC:
@@ -622,7 +626,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vsubps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
     }
     case IrCmd::MUL_VEC:
@@ -636,7 +641,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vmulps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
     }
     case IrCmd::DIV_VEC:
@@ -650,7 +656,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vdivps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         break;
     }
     case IrCmd::UNM_VEC:
@@ -669,7 +676,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
             build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0));
         }
 
-        build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         break;
     }
     case IrCmd::NOT_ANY:
@@ -964,7 +972,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
 
         build.vcvttsd2si(qwordReg(inst.regX64), memRegDoubleOp(inst.a));
         break;
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
         inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
 
         if (inst.a.kind == IrOpKind::Constant)
@@ -974,15 +982,25 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
             static_assert(sizeof(asU32) == sizeof(value), "Expecting float to be 32-bit");
             memcpy(&asU32, &value, sizeof(value));
 
-            build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, LUA_TVECTOR));
+            if (FFlag::LuauCodegenVectorTag)
+                build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, 0));
+            else
+                build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, LUA_TVECTOR));
         }
         else
         {
             build.vcvtsd2ss(inst.regX64, inst.regX64, memRegDoubleOp(inst.a));
             build.vpshufps(inst.regX64, inst.regX64, inst.regX64, 0b00'00'00'00);
-            build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
+
+            if (!FFlag::LuauCodegenVectorTag)
+                build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         }
         break;
+    case IrCmd::TAG_VECTOR:
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
+
+        build.vpinsrd(inst.regX64, regOp(inst.a), build.i32(LUA_TVECTOR), 3);
+        break;
     case IrCmd::ADJUST_STACK_TO_REG:
     {
         ScopedRegX64 tmp{regs, SizeX64::qword};
diff --git a/CodeGen/src/IrTranslation.cpp b/CodeGen/src/IrTranslation.cpp
index 44d0a264..686d5130 100644
--- a/CodeGen/src/IrTranslation.cpp
+++ b/CodeGen/src/IrTranslation.cpp
@@ -14,6 +14,7 @@
 
 LUAU_FASTFLAGVARIABLE(LuauCodegenLuData, false)
 LUAU_FASTFLAGVARIABLE(LuauCodegenVector, false)
+LUAU_FASTFLAGVARIABLE(LuauCodegenVectorTag, false)
 
 namespace Luau
 {
@@ -380,9 +381,12 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
             result = build.inst(IrCmd::DIV_VEC, vb, vc);
             break;
         default:
-            break;
+            CODEGEN_ASSERT(!"Unknown TM op");
         }
 
+        if (FFlag::LuauCodegenVectorTag)
+            result = build.inst(IrCmd::TAG_VECTOR, result);
+
         build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result);
         return;
     }
@@ -393,7 +397,7 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
 
         build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos));
 
-        IrOp vb = build.inst(IrCmd::NUM_TO_VECTOR, loadDoubleOrConstant(build, opb));
+        IrOp vb = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opb));
         IrOp vc = build.inst(IrCmd::LOAD_TVALUE, opc);
         IrOp result;
 
@@ -406,9 +410,12 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
             result = build.inst(IrCmd::DIV_VEC, vb, vc);
             break;
         default:
-            break;
+            CODEGEN_ASSERT(!"Unknown TM op");
         }
 
+        if (FFlag::LuauCodegenVectorTag)
+            result = build.inst(IrCmd::TAG_VECTOR, result);
+
         build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result);
         return;
     }
@@ -420,7 +427,7 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
         build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TNUMBER), build.vmExit(pcpos));
 
         IrOp vb = build.inst(IrCmd::LOAD_TVALUE, opb);
-        IrOp vc = build.inst(IrCmd::NUM_TO_VECTOR, loadDoubleOrConstant(build, opc));
+        IrOp vc = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opc));
         IrOp result;
 
         switch (tm)
@@ -432,9 +439,12 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
             result = build.inst(IrCmd::DIV_VEC, vb, vc);
             break;
         default:
-            break;
+            CODEGEN_ASSERT(!"Unknown TM op");
         }
 
+        if (FFlag::LuauCodegenVectorTag)
+            result = build.inst(IrCmd::TAG_VECTOR, result);
+
         build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result);
         return;
     }
@@ -596,6 +606,8 @@ void translateInstMinus(IrBuilder& build, const Instruction* pc, int pcpos)
 
         IrOp vb = build.inst(IrCmd::LOAD_TVALUE, build.vmReg(rb));
         IrOp va = build.inst(IrCmd::UNM_VEC, vb);
+        if (FFlag::LuauCodegenVectorTag)
+            va = build.inst(IrCmd::TAG_VECTOR, va);
         build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), va);
         return;
     }
diff --git a/CodeGen/src/IrUtils.cpp b/CodeGen/src/IrUtils.cpp
index b49e974f..caa6b178 100644
--- a/CodeGen/src/IrUtils.cpp
+++ b/CodeGen/src/IrUtils.cpp
@@ -106,7 +106,8 @@ IrValueKind getCmdValueKind(IrCmd cmd)
     case IrCmd::NUM_TO_INT:
     case IrCmd::NUM_TO_UINT:
         return IrValueKind::Int;
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
+    case IrCmd::TAG_VECTOR:
         return IrValueKind::Tvalue;
     case IrCmd::ADJUST_STACK_TO_REG:
     case IrCmd::ADJUST_STACK_TO_TOP:
diff --git a/CodeGen/src/OptimizeConstProp.cpp b/CodeGen/src/OptimizeConstProp.cpp
index 0c543572..4214d015 100644
--- a/CodeGen/src/OptimizeConstProp.cpp
+++ b/CodeGen/src/OptimizeConstProp.cpp
@@ -18,6 +18,7 @@ LUAU_FASTINTVARIABLE(LuauCodeGenMinLinearBlockPath, 3)
 LUAU_FASTINTVARIABLE(LuauCodeGenReuseSlotLimit, 64)
 LUAU_FASTFLAGVARIABLE(DebugLuauAbortingChecks, false)
 LUAU_FASTFLAG(LuauCodegenVector)
+LUAU_FASTFLAG(LuauCodegenVectorTag)
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenCheckGcEffectFix, false)
 
 namespace Luau
@@ -715,9 +716,17 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
         {
             if (IrInst* arg = function.asInstOp(inst.b))
             {
-                if (arg->cmd == IrCmd::ADD_VEC || arg->cmd == IrCmd::SUB_VEC || arg->cmd == IrCmd::MUL_VEC || arg->cmd == IrCmd::DIV_VEC ||
-                    arg->cmd == IrCmd::UNM_VEC)
-                    tag = LUA_TVECTOR;
+                if (FFlag::LuauCodegenVectorTag)
+                {
+                    if (arg->cmd == IrCmd::TAG_VECTOR)
+                        tag = LUA_TVECTOR;
+                }
+                else
+                {
+                    if (arg->cmd == IrCmd::ADD_VEC || arg->cmd == IrCmd::SUB_VEC || arg->cmd == IrCmd::MUL_VEC || arg->cmd == IrCmd::DIV_VEC ||
+                        arg->cmd == IrCmd::UNM_VEC)
+                        tag = LUA_TVECTOR;
+                }
             }
         }
 
@@ -1250,6 +1259,28 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
         if (int(state.checkSlotMatchCache.size()) < FInt::LuauCodeGenReuseSlotLimit)
             state.checkSlotMatchCache.push_back(index);
         break;
+
+    case IrCmd::ADD_VEC:
+    case IrCmd::SUB_VEC:
+    case IrCmd::MUL_VEC:
+    case IrCmd::DIV_VEC:
+        if (FFlag::LuauCodegenVectorTag)
+        {
+            if (IrInst* a = function.asInstOp(inst.a); a && a->cmd == IrCmd::TAG_VECTOR)
+                inst.a = a->a;
+            if (IrInst* b = function.asInstOp(inst.b); b && b->cmd == IrCmd::TAG_VECTOR)
+                inst.b = b->a;
+        }
+        break;
+
+    case IrCmd::UNM_VEC:
+        if (FFlag::LuauCodegenVectorTag)
+        {
+            if (IrInst* a = function.asInstOp(inst.a); a && a->cmd == IrCmd::TAG_VECTOR)
+                inst.a = a->a;
+        }
+        break;
+
     case IrCmd::CHECK_NODE_NO_NEXT:
     case IrCmd::CHECK_NODE_VALUE:
     case IrCmd::BARRIER_TABLE_BACK:
@@ -1278,12 +1309,8 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
     case IrCmd::GET_TYPE:
     case IrCmd::GET_TYPEOF:
     case IrCmd::FINDUPVAL:
-    case IrCmd::ADD_VEC:
-    case IrCmd::SUB_VEC:
-    case IrCmd::MUL_VEC:
-    case IrCmd::DIV_VEC:
-    case IrCmd::UNM_VEC:
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
+    case IrCmd::TAG_VECTOR:
         break;
 
     case IrCmd::DO_ARITH:
diff --git a/tests/AssemblyBuilderA64.test.cpp b/tests/AssemblyBuilderA64.test.cpp
index 6657d889..320a7a6a 100644
--- a/tests/AssemblyBuilderA64.test.cpp
+++ b/tests/AssemblyBuilderA64.test.cpp
@@ -218,6 +218,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves")
 {
     SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0);
     SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0);
+    SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20);
 
     SINGLE_COMPARE(movz(x0, 42), 0xD2800540);
     SINGLE_COMPARE(movz(w0, 42), 0x52800540);
@@ -501,6 +502,15 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing")
     SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420);
 }
 
+TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath")
+{
+    SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420);
+    SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420);
+    SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20);
+    SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20);
+    SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820);
+}
+
 TEST_CASE("LogTest")
 {
     AssemblyBuilderA64 build(/* logText= */ true);
@@ -552,6 +562,7 @@ TEST_CASE("LogTest")
     build.ins_4s(q31, 1, q29, 2);
     build.dup_4s(s29, q31, 2);
     build.dup_4s(q29, q30, 0);
+    build.fmul(q0, q1, q2);
 
     build.setLabel(l);
     build.ret();
@@ -594,6 +605,7 @@ TEST_CASE("LogTest")
  ins v31.s[1],v29.s[2]
  dup s29,v31.s[2]
  dup v29.4s,v30.s[0]
+ fmul v0.4s,v1.4s,v2.4s
 .L1:
  ret
 )";
diff --git a/tests/IrLowering.test.cpp b/tests/IrLowering.test.cpp
index bdb7e38c..13f44dca 100644
--- a/tests/IrLowering.test.cpp
+++ b/tests/IrLowering.test.cpp
@@ -13,6 +13,7 @@
 #include
 
 LUAU_FASTFLAG(LuauCodegenVector)
+LUAU_FASTFLAG(LuauCodegenVectorTag)
 LUAU_FASTFLAG(LuauCodegenMathMemArgs)
 
 static std::string getCodegenAssembly(const char* source)
@@ -65,6 +66,7 @@ TEST_SUITE_BEGIN("IrLowering");
 TEST_CASE("VectorReciprocal")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vecrcp(a: vector)
@@ -79,10 +81,11 @@ bb_0:
 bb_2:
   JUMP bb_bytecode_1
 bb_bytecode_1:
-  %6 = NUM_TO_VECTOR 1
+  %6 = NUM_TO_VEC 1
   %7 = LOAD_TVALUE R0
   %8 = DIV_VEC %6, %7
-  STORE_TVALUE R1, %8
+  %9 = TAG_VECTOR %8
+  STORE_TVALUE R1, %9
   INTERRUPT 1u
   RETURN R1, 1i
 )");
@@ -127,6 +130,7 @@ bb_bytecode_1:
 TEST_CASE("VectorAdd")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3add(a: vector, b: vector)
@@ -145,7 +149,8 @@ bb_bytecode_1:
   %10 = LOAD_TVALUE R0
   %11 = LOAD_TVALUE R1
   %12 = ADD_VEC %10, %11
-  STORE_TVALUE R2, %12
+  %13 = TAG_VECTOR %12
+  STORE_TVALUE R2, %13
   INTERRUPT 1u
   RETURN R2, 1i
 )");
@@ -154,6 +159,7 @@ bb_bytecode_1:
 TEST_CASE("VectorMinus")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3minus(a: vector)
@@ -170,7 +176,8 @@ bb_2:
 bb_bytecode_1:
   %6 = LOAD_TVALUE R0
   %7 = UNM_VEC %6
-  STORE_TVALUE R1, %7
+  %8 = TAG_VECTOR %7
+  STORE_TVALUE R1, %8
   INTERRUPT 1u
   RETURN R1, 1i
 )");
@@ -179,6 +186,7 @@ bb_bytecode_1:
 TEST_CASE("VectorSubMulDiv")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3combo(a: vector, b: vector, c: vector, d: vector)
@@ -199,13 +207,16 @@ bb_bytecode_1:
   %14 = LOAD_TVALUE R0
   %15 = LOAD_TVALUE R1
   %16 = MUL_VEC %14, %15
-  STORE_TVALUE R5, %16
-  %22 = LOAD_TVALUE R2
-  %23 = LOAD_TVALUE R3
-  %24 = DIV_VEC %22, %23
-  STORE_TVALUE R6, %24
-  %32 = SUB_VEC %16, %24
-  STORE_TVALUE R4, %32
+  %17 = TAG_VECTOR %16
+  STORE_TVALUE R5, %17
+  %23 = LOAD_TVALUE R2
+  %24 = LOAD_TVALUE R3
+  %25 = DIV_VEC %23, %24
+  %26 = TAG_VECTOR %25
+  STORE_TVALUE R6, %26
+  %34 = SUB_VEC %16, %25
+  %35 = TAG_VECTOR %34
+  STORE_TVALUE R4, %35
   INTERRUPT 3u
   RETURN R4, 1i
 )");
@@ -214,6 +225,7 @@ bb_bytecode_1:
 TEST_CASE("VectorMulDivMixed")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3combo(a: vector, b: vector, c: vector, d: vector)
@@ -232,29 +244,36 @@ bb_2:
   JUMP bb_bytecode_1
 bb_bytecode_1:
   %12 = LOAD_TVALUE R0
-  %13 = NUM_TO_VECTOR 2
+  %13 = NUM_TO_VEC 2
   %14 = MUL_VEC %12, %13
-  STORE_TVALUE R7, %14
-  %18 = LOAD_TVALUE R1
-  %19 = NUM_TO_VECTOR 4
-  %20 = DIV_VEC %18, %19
-  STORE_TVALUE R8, %20
-  %28 = ADD_VEC %14, %20
-  STORE_TVALUE R6, %28
+  %15 = TAG_VECTOR %14
+  STORE_TVALUE R7, %15
+  %19 = LOAD_TVALUE R1
+  %20 = NUM_TO_VEC 4
+  %21 = DIV_VEC %19, %20
+  %22 = TAG_VECTOR %21
+  STORE_TVALUE R8, %22
+  %30 = ADD_VEC %14, %21
+  %31 = TAG_VECTOR %30
+  STORE_TVALUE R6, %31
   STORE_DOUBLE R8, 0.5
   STORE_TAG R8, tnumber
-  %37 = NUM_TO_VECTOR 0.5
-  %38 = LOAD_TVALUE R2
-  %39 = MUL_VEC %37, %38
-  STORE_TVALUE R7, %39
-  %47 = ADD_VEC %28, %39
-  STORE_TVALUE R5, %47
-  %51 = NUM_TO_VECTOR 40
-  %52 = LOAD_TVALUE R3
-  %53 = DIV_VEC %51, %52
-  STORE_TVALUE R6, %53
-  %61 = ADD_VEC %47, %53
-  STORE_TVALUE R4, %61
+  %40 = NUM_TO_VEC 0.5
+  %41 = LOAD_TVALUE R2
+  %42 = MUL_VEC %40, %41
+  %43 = TAG_VECTOR %42
+  STORE_TVALUE R7, %43
+  %51 = ADD_VEC %30, %42
+  %52 = TAG_VECTOR %51
+  STORE_TVALUE R5, %52
+  %56 = NUM_TO_VEC 40
+  %57 = LOAD_TVALUE R3
+  %58 = DIV_VEC %56, %57
+  %59 = TAG_VECTOR %58
+  STORE_TVALUE R6, %59
+  %67 = ADD_VEC %51, %58
+  %68 = TAG_VECTOR %67
+  STORE_TVALUE R4, %68
   INTERRUPT 8u
   RETURN R4, 1i
 )");