CodeGen: Use vector instructions for A64 vector math

This change uses the newly added vector instructions for A64 lowering of vector ops.
This significantly cuts down on the number of instructions: each arithmetic op becomes a
single 4-wide instruction instead of a lane-by-lane scalar loop.

To create vector operands that we can work with without worrying about denormals (the type
tag stored in the last lane is a denormal when reinterpreted as a float), we patch the last
component with 0, copying the vector into a fresh register first if it is still needed afterwards.
Arseny Kapoulkine 2024-02-13 15:37:47 -08:00
parent 0338e0e52d
commit c4da73ecf9
2 changed files with 129 additions and 38 deletions
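
Background for the hunks below: a vector TValue keeps its type tag in the fourth 32-bit lane of the q register, and a small tag constant like LUA_TVECTOR reads back as a denormal when reinterpreted as a float, so feeding the raw register into a 4-wide op would mix a denormal into the math. The new lowering therefore zeroes lane 3, runs one full-width instruction, and re-inserts the tag afterwards. A minimal standalone sketch of that sequence in NEON intrinsics (the function name and signature are invented for illustration; this is not Luau code):

```cpp
#include <arm_neon.h>
#include <cstdint>

// Purify both operands, do one 4-wide add, then restore the tag lane.
float32x4_t add_vec(float32x4_t a, float32x4_t b, uint32_t tag /* e.g. LUA_TVECTOR */)
{
    a = vsetq_lane_f32(0.0f, a, 3);             // ins v.s[3], wzr
    b = vsetq_lane_f32(0.0f, b, 3);             // ins v.s[3], wzr
    float32x4_t r = vaddq_f32(a, b);            // fadd v.4s
    uint32x4_t bits = vreinterpretq_u32_f32(r);
    bits = vsetq_lane_u32(tag, bits, 3);        // mov w, tag; ins v.s[3], w
    return vreinterpretq_f32_u32(bits);
}
```

The same purify/operate/retag shape appears in every arithmetic hunk that follows.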

CodeGen/src/IrLoweringA64.cpp

@@ -12,6 +12,7 @@
 #include "lgc.h"
 
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
+LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
 
 namespace Luau
 {
@@ -673,15 +674,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fadd(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fadd(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fadd(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
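
For scale: the scalar fallback in the hunk above emits dup+dup+fadd+ins per component, i.e. 12 instructions per vector add, while the flagged path needs roughly 5-7 (an ins to zero each operand's tag lane, plus a mov when the operand is still live elsewhere, one fadd.4s, then mov+ins to restore the tag). A plain-C++ model of what the scalar fallback computes (the name add_vec_old is invented for illustration):

```cpp
// Model of the old lane-by-lane lowering: three independent scalar adds.
// The tag lane (index 3) is never written, which is why the scalar path
// needs neither the zero-patching nor the tag re-insertion.
void add_vec_old(float r[4], const float a[4], const float b[4])
{
    for (int i = 0; i < 3; i++) // lowered as dup, dup, fadd, ins per lane
        r[i] = a[i] + b[i];
}
```

The fsub, fmul, and fdiv hunks below follow the identical pattern.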
@@ -689,15 +704,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fsub(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fsub(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fsub(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -705,15 +734,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fmul(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fmul(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fmul(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -721,15 +764,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fdiv(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fdiv(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fdiv(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -737,13 +794,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.fneg(tempa, tempa);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 temp = tempVectorPure(inst.a, index);
+            build.fneg(inst.regA64, temp);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.fneg(tempa, tempa);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -2566,6 +2635,27 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
     }
 }
 
+RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
+{
+    RegisterA64 reg = regOp(op);
+
+    IrInst& source = function.instructions[op.index];
+    LUAU_ASSERT(source.regA64 == reg);
+
+    if (source.lastUse == index)
+    {
+        build.ins_4s(reg, wzr, 3);
+        return reg;
+    }
+    else
+    {
+        RegisterA64 temp = regs.allocTemp(KindA64::q);
+        build.mov(temp, reg);
+        build.ins_4s(temp, wzr, 3);
+        return temp;
+    }
+}
+
 RegisterA64 IrLoweringA64::regOp(IrOp op)
 {
     IrInst& inst = function.instOp(op);
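
tempVectorPure is the "copying it into a fresh register if necessary" part of the commit message: when the current instruction is the operand's last use, its register can be clobbered in place; otherwise zeroing lane 3 would destroy the tag for later uses of the same value, so the zeroed copy goes into a temporary. A toy C++ model of that decision (the Reg type and all names are invented for illustration):

```cpp
#include <cstring>

struct Reg { float lane[4]; }; // stand-in for a 128-bit q register

// Return a register whose tag lane is zero, reusing src only when safe.
Reg* purify(Reg* src, bool isLastUse, Reg* scratch)
{
    Reg* out = src;
    if (!isLastUse)
    {
        std::memcpy(scratch, src, sizeof(Reg)); // mov q_temp, q_src
        out = scratch;
    }
    out->lane[3] = 0.0f; // ins v.s[3], wzr
    return out;
}
```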

CodeGen/src/IrLoweringA64.h

@@ -45,6 +45,7 @@ struct IrLoweringA64
     RegisterA64 tempUint(IrOp op);
     AddressA64 tempAddr(IrOp op, int offset);
     AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
+    RegisterA64 tempVectorPure(IrOp op, uint32_t index);
 
     // May emit restore instructions
     RegisterA64 regOp(IrOp op);