// Review notes for the patch below. They are placed ahead of the first "diff --git" header,
// so they are not part of any hunk and the patch text itself is left untouched.
//
// Files touched: CodeGen/src/IrLoweringA64.cpp and CodeGen/src/IrLoweringA64.h.
//
// What the patch does:
//  1. Defines a new fast flag, LuauCodeGenVectorA64, with default value false.
//  2. Changes five hunks inside IrLoweringA64::lowerInst. The `case` labels lie outside the hunk
//     context; the hunks emit fadd, fsub, fmul, fdiv and fneg respectively.
//       - Flag off: the previous code is kept verbatim inside the `else` branch. It loops over
//         i = 0..2, copies element i of each operand into an s-register temp with dup_4s, applies
//         the scalar operation, and writes the result back into element i of inst.regA64 with ins_4s.
//       - Flag on: each operand is obtained through tempVectorPure, one operation is emitted on the
//         q registers writing inst.regA64, then LUA_TVECTOR is moved into a w-register temp and
//         inserted into element 3 of inst.regA64.
//  3. Adds IrLoweringA64::tempVectorPure(IrOp op, uint32_t index) and declares it in the header:
//       - takes regOp(op) and asserts that it equals the regA64 recorded on the source instruction
//         function.instructions[op.index];
//       - if source.lastUse == index, inserts wzr into element 3 of that register in place and
//         returns the same register;
//       - otherwise allocates a q-register temp, copies the value into it, inserts wzr into
//         element 3 of the copy and returns the temp, leaving the source register unmodified.
//
// Assumptions to confirm:
//  - Presumably element 3 of the q register carries a type tag rather than vector data, which would
//    explain zeroing it before the full-width operation and writing LUA_TVECTOR afterwards - TODO confirm.
//  - Presumably `index` is the index of the instruction currently being lowered, so the
//    `lastUse == index` test means "this is the final consumer of the value" - verify against callers.
//
// NOTE(review): the "mov tempw, LUA_TVECTOR; ins_4s(inst.regA64, tempw, 3)" sequence is repeated in
// all five flag-on branches; a small helper would remove the duplication.
// NOTE(review): tempVectorPure indexes function.instructions[op.index] directly, while the regOp
// shown in the trailing hunk context uses function.instOp(op); consider using the same accessor.
// NOTE(review): the in-place path of tempVectorPure modifies the source instruction's register; this
// relies on lastUse being accurate for every consumer of that value.
// NOTE(review): the line structure of this patch appears to have been collapsed (hunk headers sit in
// the middle of lines), so it will likely need its original line breaks restored before it can be applied.
diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp index 04804e67..7d33b3e9 100644 --- a/CodeGen/src/IrLoweringA64.cpp +++ b/CodeGen/src/IrLoweringA64.cpp @@ -12,6 +12,7 @@ #include "lgc.h" LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false) +LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false) namespace Luau { @@ -673,15 +674,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b}); - RegisterA64 tempa = regs.allocTemp(KindA64::s); - RegisterA64 tempb = regs.allocTemp(KindA64::s); - - for (uint8_t i = 0; i < 3; i++) + if (FFlag::LuauCodeGenVectorA64) { - build.dup_4s(tempa, regOp(inst.a), i); - build.dup_4s(tempb, regOp(inst.b), i); - build.fadd(tempa, tempa, tempb); - build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + RegisterA64 tempa = tempVectorPure(inst.a, index); + RegisterA64 tempb = tempVectorPure(inst.b, index); + + build.fadd(inst.regA64, tempa, tempb); + + RegisterA64 tempw = regs.allocTemp(KindA64::w); + build.mov(tempw, LUA_TVECTOR); + build.ins_4s(inst.regA64, tempw, 3); + } + else + { + RegisterA64 tempa = regs.allocTemp(KindA64::s); + RegisterA64 tempb = regs.allocTemp(KindA64::s); + + for (uint8_t i = 0; i < 3; i++) + { + build.dup_4s(tempa, regOp(inst.a), i); + build.dup_4s(tempb, regOp(inst.b), i); + build.fadd(tempa, tempa, tempb); + build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + } } break; } @@ -689,15 +704,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b}); - RegisterA64 tempa = regs.allocTemp(KindA64::s); - RegisterA64 tempb = regs.allocTemp(KindA64::s); - - for (uint8_t i = 0; i < 3; i++) + if (FFlag::LuauCodeGenVectorA64) { - build.dup_4s(tempa, regOp(inst.a), i); - build.dup_4s(tempb, regOp(inst.b), i); - build.fsub(tempa, tempa, tempb); - build.ins_4s(inst.regA64, i, 
castReg(KindA64::q, tempa), 0); + RegisterA64 tempa = tempVectorPure(inst.a, index); + RegisterA64 tempb = tempVectorPure(inst.b, index); + + build.fsub(inst.regA64, tempa, tempb); + + RegisterA64 tempw = regs.allocTemp(KindA64::w); + build.mov(tempw, LUA_TVECTOR); + build.ins_4s(inst.regA64, tempw, 3); + } + else + { + RegisterA64 tempa = regs.allocTemp(KindA64::s); + RegisterA64 tempb = regs.allocTemp(KindA64::s); + + for (uint8_t i = 0; i < 3; i++) + { + build.dup_4s(tempa, regOp(inst.a), i); + build.dup_4s(tempb, regOp(inst.b), i); + build.fsub(tempa, tempa, tempb); + build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + } } break; } @@ -705,15 +734,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b}); - RegisterA64 tempa = regs.allocTemp(KindA64::s); - RegisterA64 tempb = regs.allocTemp(KindA64::s); - - for (uint8_t i = 0; i < 3; i++) + if (FFlag::LuauCodeGenVectorA64) { - build.dup_4s(tempa, regOp(inst.a), i); - build.dup_4s(tempb, regOp(inst.b), i); - build.fmul(tempa, tempa, tempb); - build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + RegisterA64 tempa = tempVectorPure(inst.a, index); + RegisterA64 tempb = tempVectorPure(inst.b, index); + + build.fmul(inst.regA64, tempa, tempb); + + RegisterA64 tempw = regs.allocTemp(KindA64::w); + build.mov(tempw, LUA_TVECTOR); + build.ins_4s(inst.regA64, tempw, 3); + } + else + { + RegisterA64 tempa = regs.allocTemp(KindA64::s); + RegisterA64 tempb = regs.allocTemp(KindA64::s); + + for (uint8_t i = 0; i < 3; i++) + { + build.dup_4s(tempa, regOp(inst.a), i); + build.dup_4s(tempb, regOp(inst.b), i); + build.fmul(tempa, tempa, tempb); + build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + } } break; } @@ -721,15 +764,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b}); - RegisterA64 tempa = 
regs.allocTemp(KindA64::s); - RegisterA64 tempb = regs.allocTemp(KindA64::s); - - for (uint8_t i = 0; i < 3; i++) + if (FFlag::LuauCodeGenVectorA64) { - build.dup_4s(tempa, regOp(inst.a), i); - build.dup_4s(tempb, regOp(inst.b), i); - build.fdiv(tempa, tempa, tempb); - build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + RegisterA64 tempa = tempVectorPure(inst.a, index); + RegisterA64 tempb = tempVectorPure(inst.b, index); + + build.fdiv(inst.regA64, tempa, tempb); + + RegisterA64 tempw = regs.allocTemp(KindA64::w); + build.mov(tempw, LUA_TVECTOR); + build.ins_4s(inst.regA64, tempw, 3); + } + else + { + RegisterA64 tempa = regs.allocTemp(KindA64::s); + RegisterA64 tempb = regs.allocTemp(KindA64::s); + + for (uint8_t i = 0; i < 3; i++) + { + build.dup_4s(tempa, regOp(inst.a), i); + build.dup_4s(tempb, regOp(inst.b), i); + build.fdiv(tempa, tempa, tempb); + build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + } } break; } @@ -737,13 +794,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a}); - RegisterA64 tempa = regs.allocTemp(KindA64::s); - - for (uint8_t i = 0; i < 3; i++) + if (FFlag::LuauCodeGenVectorA64) { - build.dup_4s(tempa, regOp(inst.a), i); - build.fneg(tempa, tempa); - build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + RegisterA64 temp = tempVectorPure(inst.a, index); + build.fneg(inst.regA64, temp); + + RegisterA64 tempw = regs.allocTemp(KindA64::w); + build.mov(tempw, LUA_TVECTOR); + build.ins_4s(inst.regA64, tempw, 3); + } + else + { + RegisterA64 tempa = regs.allocTemp(KindA64::s); + + for (uint8_t i = 0; i < 3; i++) + { + build.dup_4s(tempa, regOp(inst.a), i); + build.fneg(tempa, tempa); + build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0); + } } break; } @@ -2566,6 +2635,27 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp) } } +RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index) 
+{ + RegisterA64 reg = regOp(op); + + IrInst& source = function.instructions[op.index]; + LUAU_ASSERT(source.regA64 == reg); + + if (source.lastUse == index) + { + build.ins_4s(reg, wzr, 3); + return reg; + } + else + { + RegisterA64 temp = regs.allocTemp(KindA64::q); + build.mov(temp, reg); + build.ins_4s(temp, wzr, 3); + return temp; + } +} + RegisterA64 IrLoweringA64::regOp(IrOp op) { IrInst& inst = function.instOp(op); diff --git a/CodeGen/src/IrLoweringA64.h b/CodeGen/src/IrLoweringA64.h index 5fb7f2b8..9d2efb24 100644 --- a/CodeGen/src/IrLoweringA64.h +++ b/CodeGen/src/IrLoweringA64.h @@ -45,6 +45,7 @@ struct IrLoweringA64 RegisterA64 tempUint(IrOp op); AddressA64 tempAddr(IrOp op, int offset); AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp); + RegisterA64 tempVectorPure(IrOp op, uint32_t index); // May emit restore instructions RegisterA64 regOp(IrOp op);