From 3eb1a0628a01ec198910bc3b468122a7aea8603e Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Mon, 26 Feb 2024 11:05:31 -0800 Subject: [PATCH] Improve codegen for a+a and a*a When both operands of an add or multiply are the same value (inst.a == inst.b) and that value comes from a load, we only need to emit one vandps and can reuse its result for both sides; sub/div probably aren't worth optimizing in this way. --- CodeGen/src/IrLoweringX64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp index c036d749..cc29e42f 100644 --- a/CodeGen/src/IrLoweringX64.cpp +++ b/CodeGen/src/IrLoweringX64.cpp @@ -608,7 +608,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) ScopedRegX64 tmp2{regs}; RegisterX64 tmpa = vecOp(inst.a, tmp1); - RegisterX64 tmpb = vecOp(inst.b, tmp2); + RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2); build.vaddps(inst.regX64, tmpa, tmpb); @@ -639,7 +639,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) ScopedRegX64 tmp2{regs}; RegisterX64 tmpa = vecOp(inst.a, tmp1); - RegisterX64 tmpb = vecOp(inst.b, tmp2); + RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2); build.vmulps(inst.regX64, tmpa, tmpb); if (!FFlag::LuauCodegenVectorTag)