CodeGen: Remove tempVectorPure for now

On Apple Mn, it looks like denormals do not have an extra cost at least on
the benchmarks tested. On old ARM CPUs it looks like denormals are flushed
to zero; there might still be some CPUs that would benefit from zeroing
out the 4th component, but for now let's assume that they are all well behaved.

Removing zero-ins increases performance of some code by ~20% on Apple Mn.
This commit is contained in:
Arseny Kapoulkine 2024-02-13 17:24:26 -08:00
parent c4da73ecf9
commit 8c0115c03a
2 changed files with 5 additions and 40 deletions

View file

@@ -676,10 +676,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fadd(inst.regA64, tempa, tempb);
build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -706,10 +703,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fsub(inst.regA64, tempa, tempb);
build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -736,10 +730,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fmul(inst.regA64, tempa, tempb);
build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -766,10 +757,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 tempa = tempVectorPure(inst.a, index);
RegisterA64 tempb = tempVectorPure(inst.b, index);
build.fdiv(inst.regA64, tempa, tempb);
build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -796,8 +784,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
if (FFlag::LuauCodeGenVectorA64)
{
RegisterA64 temp = tempVectorPure(inst.a, index);
build.fneg(inst.regA64, temp);
build.fneg(inst.regA64, regOp(inst.a));
RegisterA64 tempw = regs.allocTemp(KindA64::w);
build.mov(tempw, LUA_TVECTOR);
@@ -2635,27 +2622,6 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
}
}
RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
{
RegisterA64 reg = regOp(op);
IrInst& source = function.instructions[op.index];
LUAU_ASSERT(source.regA64 == reg);
if (source.lastUse == index)
{
build.ins_4s(reg, wzr, 3);
return reg;
}
else
{
RegisterA64 temp = regs.allocTemp(KindA64::q);
build.mov(temp, reg);
build.ins_4s(temp, wzr, 3);
return temp;
}
}
RegisterA64 IrLoweringA64::regOp(IrOp op)
{
IrInst& inst = function.instOp(op);

View file

@@ -45,7 +45,6 @@ struct IrLoweringA64
RegisterA64 tempUint(IrOp op);
AddressA64 tempAddr(IrOp op, int offset);
AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
RegisterA64 tempVectorPure(IrOp op, uint32_t index);
// May emit restore instructions
RegisterA64 regOp(IrOp op);