Mirror of https://github.com/luau-lang/luau.git (synced 2025-05-04 10:33:46 +01:00)
CodeGen: Use vector instructions for A64 vector math
This change uses the newly added vector instructions for A64 lowering of vector ops, which significantly cuts down on redundant instructions. To create vectors that we can work with without worrying about denormals, we patch the last component with 0, copying the value into a fresh register first if necessary.
parent 0338e0e52d
commit c4da73ecf9

2 changed files with 129 additions and 38 deletions
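For orientation before the diff, here is a minimal C++ sketch of what the new lowering computes for a vector add. A boxed vector occupies four 32-bit lanes (x, y, z, type tag), so the tag lane is zeroed before the 4-wide float operation and the tag is written back into lane 3 afterwards. This is not the actual codegen; the names and the tag value below are assumptions, and the scalar loop stands in for a single fadd on .4s registers.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for LUA_TVECTOR; the exact tag value is an assumption here.
constexpr uint32_t kVectorTag = 4;

// One boxed vector viewed as four 32-bit lanes: x, y, z, type tag.
struct Lanes
{
    float f[4];
};

static Lanes addVec(Lanes a, Lanes b)
{
    // "Patch the last component with 0": otherwise the tag bits would be fed
    // through the FP unit as a (denormal) float value.
    uint32_t zero = 0;
    std::memcpy(&a.f[3], &zero, sizeof(zero));
    std::memcpy(&b.f[3], &zero, sizeof(zero));

    // Stands in for a single 4-wide fadd on A64; the old lowering did this
    // one component at a time with dup/fadd/ins per lane.
    Lanes r;
    for (int i = 0; i < 4; i++)
        r.f[i] = a.f[i] + b.f[i];

    // Write the type tag back into lane 3 so the result is a complete boxed
    // vector again (the mov/ins_4s pair in the new lowering).
    std::memcpy(&r.f[3], &kVectorTag, sizeof(kVectorTag));
    return r;
}

int main()
{
    Lanes a = {{1.0f, 2.0f, 3.0f, 0.0f}};
    Lanes b = {{4.0f, 5.0f, 6.0f, 0.0f}};
    Lanes r = addVec(a, b);
    std::printf("%g %g %g\n", r.f[0], r.f[1], r.f[2]);
    return 0;
}

Collapsing the per-component loop into one vector instruction per operation is where the instruction-count saving in this commit comes from.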
@@ -12,6 +12,7 @@
 #include "lgc.h"
 
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
+LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
 
 namespace Luau
 {
@@ -673,15 +674,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fadd(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fadd(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fadd(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -689,15 +704,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fsub(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fsub(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fsub(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -705,15 +734,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fmul(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fmul(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fmul(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -721,15 +764,29 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fdiv(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = tempVectorPure(inst.a, index);
+            RegisterA64 tempb = tempVectorPure(inst.b, index);
+
+            build.fdiv(inst.regA64, tempa, tempb);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fdiv(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -737,13 +794,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-
-        for (uint8_t i = 0; i < 3; i++)
+        if (FFlag::LuauCodeGenVectorA64)
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.fneg(tempa, tempa);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 temp = tempVectorPure(inst.a, index);
+            build.fneg(inst.regA64, temp);
+
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
         }
+        else
+        {
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.fneg(tempa, tempa);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
+        }
         break;
     }
@@ -2566,6 +2635,27 @@ AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp)
     }
 }
 
+RegisterA64 IrLoweringA64::tempVectorPure(IrOp op, uint32_t index)
+{
+    RegisterA64 reg = regOp(op);
+
+    IrInst& source = function.instructions[op.index];
+    LUAU_ASSERT(source.regA64 == reg);
+
+    if (source.lastUse == index)
+    {
+        build.ins_4s(reg, wzr, 3);
+        return reg;
+    }
+    else
+    {
+        RegisterA64 temp = regs.allocTemp(KindA64::q);
+        build.mov(temp, reg);
+        build.ins_4s(temp, wzr, 3);
+        return temp;
+    }
+}
+
 RegisterA64 IrLoweringA64::regOp(IrOp op)
 {
     IrInst& inst = function.instOp(op);
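A side note on why the new tempVectorPure helper zeroes the fourth lane at all: the 32-bit type tag shares the 128-bit register with x/y/z, and reinterpreted as a float a small tag value is a subnormal, which is exactly the denormal concern the commit description mentions. The helper patches the source register in place when this is its last use and copies into a fresh temp otherwise. A standalone sketch of the bit-pattern point; the tag value used here is an assumption:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    // Hypothetical small type-tag value; any small integer behaves the same way.
    uint32_t tag = 4;

    float lane3;
    std::memcpy(&lane3, &tag, sizeof(lane3));

    // Prints roughly 5.6e-45: a subnormal float, which is the kind of operand
    // the lowering avoids by zeroing lane 3 before the 4-wide fadd/fsub/fmul/fdiv.
    std::printf("tag %u as float: %g\n", tag, lane3);
    return 0;
}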
@@ -45,6 +45,7 @@ struct IrLoweringA64
     RegisterA64 tempUint(IrOp op);
     AddressA64 tempAddr(IrOp op, int offset);
     AddressA64 tempAddrBuffer(IrOp bufferOp, IrOp indexOp);
+    RegisterA64 tempVectorPure(IrOp op, uint32_t index);
 
     // May emit restore instructions
     RegisterA64 regOp(IrOp op);