Improve codegen for a+a and a*a

When the two registers are the same and they come from a load, we only need
to emit one vandps; sub/div probably aren't worth optimizing in this way.
This commit is contained in:
Arseny Kapoulkine 2024-02-26 11:05:31 -08:00
parent 0d1db6a0b0
commit 3eb1a0628a

View file

@@ -608,7 +608,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         ScopedRegX64 tmp2{regs};
         RegisterX64 tmpa = vecOp(inst.a, tmp1);
-        RegisterX64 tmpb = vecOp(inst.b, tmp2);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
         build.vaddps(inst.regX64, tmpa, tmpb);
@@ -639,7 +639,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         ScopedRegX64 tmp2{regs};
         RegisterX64 tmpa = vecOp(inst.a, tmp1);
-        RegisterX64 tmpb = vecOp(inst.b, tmp2);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
         build.vmulps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)