CodeGen: Remove tempVectorPure for now

On Apple Mn, it looks like denormals do not have an extra cost at least on
the benchmarks tested. On old ARM CPUs it looks like denormals are flushed
to zero; there might still be some CPUs that would benefit from zeroing
out the 4th component, but for now let's assume that they are all well behaved.

Removing zero-ins increases performance of some code by ~20% on Apple Mn.
This commit is contained in:
Arseny Kapoulkine 2024-02-13 17:24:26 -08:00
parent c4da73ecf9
commit 8c0115c03a
2 changed files with 5 additions and 40 deletions

View file

@@ -676,10 +676,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fadd(inst.regA64, tempa, tempb);
build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -706,10 +703,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fsub(inst.regA64, tempa, tempb);
build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -736,10 +730,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fmul(inst.regA64, tempa, tempb);
build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -766,10 +757,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fdiv(inst.regA64, tempa, tempb);
build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -796,8 +784,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 temp = tempVectorPure(inst.a, index);
build.fneg(inst.regA64, temp);
build.fneg(inst.regA64, regOp(inst.a));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -2635,27 +2622,6 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
}
}
RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
{
RegisterA64 reg = regOp(op);
IrInst& source = function.instructions[op.index];
LUAU_ASSERT(source.regA64 == reg);
if (source.lastUse == index)
{
build.ins_4s(reg, wzr, 3);
return reg;
}
else
{
RegisterA64 temp = regs.allocTemp(KindA64::q);
build.mov(temp, reg);
build.ins_4s(temp, wzr, 3);
return temp;
}
}
RegisterA64 IrLoweringA64::regOp(IrOp op)
{
IrInst& inst = function.instOp(op);

View file

@@ -45,7 +45,6 @@ struct IrLoweringA64
RegisterA64 tempUint(IrOp op);
AddressA64 tempAddr(IrOp op, int offset);
AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
RegisterA64 tempVectorPure(IrOp op, uint32_t index);
// May emit restore instructions
RegisterA64 regOp(IrOp op);