Mirror of https://github.com/luau-lang/luau.git (synced 2025-05-04 10:33:46 +01:00)
CodeGen: Use vector instructions for A64 vector math
This change uses the newly added vector instructions for A64 lowering of vector ops, which significantly cuts down on redundant instructions. To create vectors that we can work with without worrying about denormals, we patch the last component with 0, copying the value into a fresh register first if necessary.
parent 0338e0e52d
commit c4da73ecf9

2 changed files with 129 additions and 38 deletions
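For orientation before the diff, here is a minimal C++ sketch of what the new lowering computes for a vector add. A boxed vector occupies four 32-bit lanes (x, y, z, type tag), so the tag lane is zeroed before the 4-wide float operation and the tag is written back into lane 3 afterwards. This is not the actual codegen; the names and the tag value below are assumptions, and the scalar loop stands in for a single fadd on .4s registers.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for LUA_TVECTOR; the exact tag value is an assumption here.
constexpr uint32_t kVectorTag = 4;

// One boxed vector viewed as four 32-bit lanes: x, y, z, type tag.
struct Lanes
{
    float f[4];
};

static Lanes addVec(Lanes a, Lanes b)
{
    // "Patch the last component with 0": otherwise the tag bits would be fed
    // through the FP unit as a (denormal) float value.
    uint32_t zero = 0;
    std::memcpy(&a.f[3], &zero, sizeof(zero));
    std::memcpy(&b.f[3], &zero, sizeof(zero));

    // Stands in for a single 4-wide fadd on A64; the old lowering did this
    // one component at a time with dup/fadd/ins per lane.
    Lanes r;
    for (int i = 0; i < 4; i++)
        r.f[i] = a.f[i] + b.f[i];

    // Write the type tag back into lane 3 so the result is a complete boxed
    // vector again (the mov/ins_4s pair in the new lowering).
    std::memcpy(&r.f[3], &kVectorTag, sizeof(kVectorTag));
    return r;
}

int main()
{
    Lanes a = {{1.0f, 2.0f, 3.0f, 0.0f}};
    Lanes b = {{4.0f, 5.0f, 6.0f, 0.0f}};
    Lanes r = addVec(a, b);
    std::printf("%g %g %g\n", r.f[0], r.f[1], r.f[2]);
    return 0;
}

Collapsing the per-component loop into one vector instruction per operation is where the instruction-count saving in this commit comes from.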
@@ -12,6 +12,7 @@
 #include "lgc.h"
 
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
+LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
 
 namespace Luau
 {
@@ -673,15 +674,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fadd(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fadd(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fadd(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -689,15 +704,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fsub(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fsub(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fsub(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -705,15 +734,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fmul(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fmul(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fmul(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -721,15 +764,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fdiv(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fdiv(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fdiv(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -737,13 +794,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.fneg(tempa, tempa);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 temp = tempVectorPure(inst.a, index);
+            build.fneg(inst.regA64, temp);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.fneg(tempa, tempa);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -2566,6 +2635,27 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
     }
 }
 
+RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
+{
+    RegisterA64 reg = regOp(op);
+
+    IrInst& source = function.instructions[op.index];
+    LUAU_ASSERT(source.regA64 == reg);
+
+    if (source.lastUse == index)
+    {
+        build.ins_4s(reg, wzr, 3);
+        return reg;
+    }
+    else
+    {
+        RegisterA64 temp = regs.allocTemp(KindA64::q);
+        build.mov(temp, reg);
+        build.ins_4s(temp, wzr, 3);
+        return temp;
+    }
+}
+
 RegisterA64 IrLoweringA64::regOp(IrOp op)
 {
     IrInst& inst = function.instOp(op);
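A side note on why the new tempVectorPure helper zeroes the fourth lane at all: the 32-bit type tag shares the 128-bit register with x/y/z, and reinterpreted as a float a small tag value is a subnormal, which is exactly the denormal concern the commit description mentions. The helper patches the source register in place when this is its last use and copies into a fresh temp otherwise. A standalone sketch of the bit-pattern point; the tag value used here is an assumption:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    // Hypothetical small type-tag value; any small integer behaves the same way.
    uint32_t tag = 4;

    float lane3;
    std::memcpy(&lane3, &tag, sizeof(lane3));

    // Prints roughly 5.6e-45: a subnormal float, which is the kind of operand
    // the lowering avoids by zeroing lane 3 before the 4-wide fadd/fsub/fmul/fdiv.
    std::printf("tag %u as float: %g\n", tag, lane3);
    return 0;
}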
@@ -45,6 +45,7 @@ struct IrLoweringA64
     RegisterA64 tempUint(IrOp op);
     AddressA64 tempAddr(IrOp op, int offset);
     AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
+    RegisterA64 tempVectorPure(IrOp op, uint32_t index);
 
     // May emit restore instructions
     RegisterA64 regOp(IrOp op);