mirror of
https://github.com/luau-lang/luau.git
synced 2025-05-04 10:33:46 +01:00
CodeGen: Remove tempVectorPure for now
On Apple Mn, it looks like denormals do not have an extra cost at least on the benchmarks tested. On old ARM CPUs it looks like denormals are flushed to zero; there might still be some CPUs that would benefit from zeroing out the 4th component, but for now let's assume that they are all well behaved. Removing zero-ins increases performance of some code by ~20% on Apple Mn.
This commit is contained in:
parent
c4da73ecf9
commit
8c0115c03a
2 changed files with 5 additions and 40 deletions
|
@ -676,10 +676,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
|
|||
|
||||
if (FFlag::LuauCodeGenVectorA64)
|
||||
{
|
||||
RegisterA64 tempa = tempVectorPure(inst.a, index);
|
||||
RegisterA64 tempb = tempVectorPure(inst.b, index);
|
||||
|
||||
build.fadd(inst.regA64, tempa, tempb);
|
||||
build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b));
|
||||
|
||||
RegisterA64 tempw = regs.allocTemp(KindA64::w);
|
||||
build.mov(tempw, LUA_TVECTOR);
|
||||
|
@ -706,10 +703,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
|
|||
|
||||
if (FFlag::LuauCodeGenVectorA64)
|
||||
{
|
||||
RegisterA64 tempa = tempVectorPure(inst.a, index);
|
||||
RegisterA64 tempb = tempVectorPure(inst.b, index);
|
||||
|
||||
build.fsub(inst.regA64, tempa, tempb);
|
||||
build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b));
|
||||
|
||||
RegisterA64 tempw = regs.allocTemp(KindA64::w);
|
||||
build.mov(tempw, LUA_TVECTOR);
|
||||
|
@ -736,10 +730,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
|
|||
|
||||
if (FFlag::LuauCodeGenVectorA64)
|
||||
{
|
||||
RegisterA64 tempa = tempVectorPure(inst.a, index);
|
||||
RegisterA64 tempb = tempVectorPure(inst.b, index);
|
||||
|
||||
build.fmul(inst.regA64, tempa, tempb);
|
||||
build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b));
|
||||
|
||||
RegisterA64 tempw = regs.allocTemp(KindA64::w);
|
||||
build.mov(tempw, LUA_TVECTOR);
|
||||
|
@ -766,10 +757,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
|
|||
|
||||
if (FFlag::LuauCodeGenVectorA64)
|
||||
{
|
||||
RegisterA64 tempa = tempVectorPure(inst.a, index);
|
||||
RegisterA64 tempb = tempVectorPure(inst.b, index);
|
||||
|
||||
build.fdiv(inst.regA64, tempa, tempb);
|
||||
build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b));
|
||||
|
||||
RegisterA64 tempw = regs.allocTemp(KindA64::w);
|
||||
build.mov(tempw, LUA_TVECTOR);
|
||||
|
@ -796,8 +784,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
|
|||
|
||||
if (FFlag::LuauCodeGenVectorA64)
|
||||
{
|
||||
RegisterA64 temp = tempVectorPure(inst.a, index);
|
||||
build.fneg(inst.regA64, temp);
|
||||
build.fneg(inst.regA64, regOp(inst.a));
|
||||
|
||||
RegisterA64 tempw = regs.allocTemp(KindA64::w);
|
||||
build.mov(tempw, LUA_TVECTOR);
|
||||
|
@ -2635,27 +2622,6 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
|
|||
}
|
||||
}
|
||||
|
||||
// Returns a register holding the vector operand `op` with lane 3 (the 4th 32-bit
// component) overwritten from wzr, i.e. zeroed. `index` is the index of the
// instruction consuming the operand; it is compared against the source's last use
// to decide whether the source register may be modified in place.
// NOTE(review): per the commit title this is the helper being removed by this change.
RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
|
||||
{
|
||||
RegisterA64 reg = regOp(op);
|
||||
|
||||
// Look up the instruction that produced the operand; its assigned register must match what regOp returned
IrInst& source = function.instructions[op.index];
|
||||
LUAU_ASSERT(source.regA64 == reg);
|
||||
|
||||
// The consuming instruction is the last use of the source value, so its register can be modified in place
if (source.lastUse == index)
|
||||
{
|
||||
build.ins_4s(reg, wzr, 3);
|
||||
return reg;
|
||||
}
|
||||
else
|
||||
{
|
||||
// The source value is used again later: copy it into a temporary q register and zero lane 3 of the copy instead
RegisterA64 temp = regs.allocTemp(KindA64::q);
|
||||
build.mov(temp, reg);
|
||||
build.ins_4s(temp, wzr, 3);
|
||||
return temp;
|
||||
}
|
||||
}
|
||||
|
||||
RegisterA64 IrLoweringA64::regOp(IrOp op)
|
||||
{
|
||||
IrInst& inst = function.instOp(op);
|
||||
|
|
|
@ -45,7 +45,6 @@ struct IrLoweringA64
|
|||
RegisterA64 tempUint(IrOp op);
|
||||
AddressA64 tempAddr(IrOp op, int offset);
|
||||
AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
|
||||
RegisterA64 tempVectorPure(IrOp op, uint32_t index);
|
||||
|
||||
// May emit restore instructions
|
||||
RegisterA64 regOp(IrOp op);
|
||||
|
|
Loading…
Add table
Reference in a new issue