From 8c0115c03ae0e8fabc574860eda4aa37150d8dc3 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 13 Feb 2024 17:24:26 -0800 Subject: [PATCH] CodeGen: Remove tempVectorPure for now On Apple Mn, it looks like denormals do not have an extra cost at least on the benchmarks tested. On old ARM CPUs it looks like denormals are flushed to zero; there might still be some CPUs that would benefit from zeroing out the 4th component, but for now let's assume that they are all well behaved. Removing zero-ins increases performance of some code by ~20% on Apple Mn. --- CodeGen/src/IrLoweringA64.cpp | 44 ++++------------------------------- CodeGen/src/IrLoweringA64.h | 1 - 2 files changed, 5 insertions(+), 40 deletions(-) diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp index 7d33b3e9..10148acf 100644 --- a/CodeGen/src/IrLoweringA64.cpp +++ b/CodeGen/src/IrLoweringA64.cpp @@ -676,10 +676,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) if (FFlag::LuauCodeGenVectorA64) { - RegisterA64 tempa = tempVectorPure(inst.a, index); - RegisterA64 tempb = tempVectorPure(inst.b, index); - - build.fadd(inst.regA64, tempa, tempb); + build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b)); RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -706,10 +703,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) if (FFlag::LuauCodeGenVectorA64) { - RegisterA64 tempa = tempVectorPure(inst.a, index); - RegisterA64 tempb = tempVectorPure(inst.b, index); - - build.fsub(inst.regA64, tempa, tempb); + build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b)); RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -736,10 +730,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) if (FFlag::LuauCodeGenVectorA64) { - RegisterA64 tempa = tempVectorPure(inst.a, index); - RegisterA64 tempb = tempVectorPure(inst.b, index); - - 
build.fmul(inst.regA64, tempa, tempb); + build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b)); RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -766,10 +757,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) if (FFlag::LuauCodeGenVectorA64) { - RegisterA64 tempa = tempVectorPure(inst.a, index); - RegisterA64 tempb = tempVectorPure(inst.b, index); - - build.fdiv(inst.regA64, tempa, tempb); + build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b)); RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -796,8 +784,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) if (FFlag::LuauCodeGenVectorA64) { - RegisterA64 temp = tempVectorPure(inst.a, index); - build.fneg(inst.regA64, temp); + build.fneg(inst.regA64, regOp(inst.a)); RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -2635,27 +2622,6 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp) } } -RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index) -{ - RegisterA64 reg = regOp(op); - - IrInst& source = function.instructions[op.index]; - LUAU_ASSERT(source.regA64 == reg); - - if (source.lastUse == index) - { - build.ins_4s(reg, wzr, 3); - return reg; - } - else - { - RegisterA64 temp = regs.allocTemp(KindA64::q); - build.mov(temp, reg); - build.ins_4s(temp, wzr, 3); - return temp; - } -} - RegisterA64 IrLoweringA64::regOp(IrOp op) { IrInst& inst = function.instOp(op); diff --git a/CodeGen/src/IrLoweringA64.h b/CodeGen/src/IrLoweringA64.h index 9d2efb24..5fb7f2b8 100644 --- a/CodeGen/src/IrLoweringA64.h +++ b/CodeGen/src/IrLoweringA64.h @@ -45,7 +45,6 @@ struct IrLoweringA64 RegisterA64 tempUint(IrOp op); AddressA64 tempAddr(IrOp op, int offset); AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp); - RegisterA64 tempVectorPure(IrOp op, uint32_t index); // May emit restore instructions RegisterA64 regOp(IrOp op);