From c4da73ecf989cce43a41e2a729bba1ac64fabd02 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 13 Feb 2024 15:37:47 -0800
Subject: [PATCH] CodeGen: Use vector instructions for A64 vector math

This change uses newly added vector instructions for A64 lowering of
vector ops. This significantly cuts down on useless instructions.

To create vectors that we can work with without worrying about
denormals, we patch the last component with 0, copying the source into
a fresh register first if necessary.
---
 CodeGen/src/IrLoweringA64.cpp | 166 ++++++++++++++++++++++++++--------
 CodeGen/src/IrLoweringA64.h   |   1 +
 2 files changed, 129 insertions(+), 38 deletions(-)
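Illustration (commentary, not part of the applied patch): the flag-on
path computes the same per-component results as the old scalar loop, but
with one full-width operation. Below is a rough scalar model in C++ of
the new vector-add lowering; float4, vectorPure, and addVec are names
invented for this sketch, not code from the patch:

    #include <cstdint>
    #include <cstring>

    struct float4 { float v[4]; };

    // Models tempVectorPure: lane 3 of a vector value holds type-tag bits,
    // which read back as an arbitrary float (possibly a denormal), so the
    // lane is zeroed before the full-width arithmetic.
    static float4 vectorPure(float4 a)
    {
        a.v[3] = 0.0f;
        return a;
    }

    // Models the flag-on lowering of a vector add: one 4-wide add, then the
    // type tag is re-inserted into lane 3, mirroring
    // build.ins_4s(inst.regA64, tempw, 3) in the diff below.
    static float4 addVec(float4 a, float4 b, uint32_t tag)
    {
        float4 pa = vectorPure(a);
        float4 pb = vectorPure(b);

        float4 r;
        for (int i = 0; i < 4; i++) // stands in for a single vector fadd
            r.v[i] = pa.v[i] + pb.v[i];

        memcpy(&r.v[3], &tag, sizeof(tag)); // lane 3 := tag bits
        return r;
    }

Per binary op, this replaces three iterations of dup, dup, scalar op,
ins (12 instructions) with at most two lane-zeroing ins, one full-width
op, and a mov plus ins to restore the tag (5 to 7 instructions).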
diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp
index 04804e67..7d33b3e9 100644
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@@ -12,6 +12,7 @@
 #include "lgc.h"
 
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
+LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
 
 namespace Luau
 {
@@ -673,15 +674,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fadd(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fadd(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fadd(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -689,15 +704,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fsub(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fsub(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fsub(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -705,15 +734,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fmul(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fmul(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fmul(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -721,15 +764,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fdiv(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fdiv(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fdiv(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -737,13 +794,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.fneg(tempa, tempa);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 temp = tempVectorPure(inst.a, index);
+            build.fneg(inst.regA64, temp);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.fneg(tempa, tempa);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -2566,6 +2635,27 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
     }
 }
 
+RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
+{
+    RegisterA64 reg = regOp(op);
+
+    IrInst& source = function.instructions[op.index];
+    LUAU_ASSERT(source.regA64 == reg);
+
+    if (source.lastUse == index)
+    {
+        build.ins_4s(reg, wzr, 3);
+        return reg;
+    }
+    else
+    {
+        RegisterA64 temp = regs.allocTemp(KindA64::q);
+        build.mov(temp, reg);
+        build.ins_4s(temp, wzr, 3);
+        return temp;
+    }
+}
+
 RegisterA64 IrLoweringA64::regOp(IrOp op)
 {
     IrInst& inst = function.instOp(op);
diff --git a/CodeGen/src/IrLoweringA64.h b/CodeGen/src/IrLoweringA64.h
index 5fb7f2b8..9d2efb24 100644
--- a/CodeGen/src/IrLoweringA64.h
+++ b/CodeGen/src/IrLoweringA64.h
@@ -45,6 +45,7 @@ struct IrLoweringA64
     RegisterA64 tempUint(IrOp op);
    AddressA64 tempAddr(IrOp op, int offset);
     AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
+    RegisterA64 tempVectorPure(IrOp op, uint32_t index);
 
     // May emit restore instructions
     RegisterA64 regOp(IrOp op);
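As context for the denormal remark in the commit message (commentary,
not part of the patch): the 16-byte vector value keeps x, y, z in lanes
0-2 and the type tag in lane 3, which is why the new path re-inserts
LUA_TVECTOR into lane 3 after the arithmetic. A small tag integer
reinterpreted as an IEEE-754 single is a subnormal, and subnormal inputs
can be slow on some cores. A self-contained C++ check; the tag value 4
is a stand-in, not taken from this patch:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        uint32_t tag = 4; // stand-in for a small type-tag constant such as LUA_TVECTOR
        float lane3;
        memcpy(&lane3, &tag, sizeof(lane3));

        // Prints: tag as float: 5.60519e-45, subnormal: 1
        printf("tag as float: %g, subnormal: %d\n", lane3, std::fpclassify(lane3) == FP_SUBNORMAL);
        return 0;
    }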