From b47cd4521c0d0926d9916df73f996a03c8c0232a Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Sat, 2 Mar 2024 11:07:18 -0800 Subject: [PATCH] CodeGen: Use more efficient lowering for UNM_* UNM_NUM and UNM_VEC were both implemented assuming SSE-style restrictions (2-argument form), but using AVX that doesn't have them. There's no need to copy source to destination separately - we can just vxorpd into destination. Most occurrences of UNM_NUM/UNM_VEC followed the self-xor path, but this saves a couple instructions in trig benchmark and makes it execute ~0.1% fewer instructions (the actual runtime delta is within the noise). --- CodeGen/src/IrLoweringX64.cpp | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp index bf82be52..62fd2b98 100644 --- a/CodeGen/src/IrLoweringX64.cpp +++ b/CodeGen/src/IrLoweringX64.cpp @@ -542,18 +542,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a}); - RegisterX64 src = regOp(inst.a); - - if (inst.regX64 == src) - { - build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0)); - } - else - { - build.vmovsd(inst.regX64, src, src); - build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0)); - } - + build.vxorpd(inst.regX64, regOp(inst.a), build.f64(-0.0)); break; } case IrCmd::FLOOR_NUM: @@ -665,17 +654,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a}); - RegisterX64 src = regOp(inst.a); - - if (inst.regX64 == src) - { - build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0)); - } - else - { - build.vmovsd(inst.regX64, src, src); - build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0)); - } + build.vxorpd(inst.regX64, regOp(inst.a), build.f32x4(-0.0, -0.0, -0.0, -0.0)); if (!FFlag::LuauCodegenVectorTag2) build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);