CodeGen: Use more efficient lowering for UNM_*

UNM_NUM and UNM_VEC were both implemented assuming SSE-style
restrictions (destructive 2-operand form), but we emit AVX, which doesn't
have them. There's no need to copy the source to the destination
separately - we can vxorpd straight into the destination.
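
For context, the trick works because -0.0's bit pattern has only the sign
bit set, so xoring a double with it flips the sign and leaves every other
bit untouched. A minimal standalone sketch of that scalar idea, in plain
C++ rather than the project's assembly builder (negateViaXor is a
hypothetical helper used only for illustration):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Negate a double by flipping its sign bit, mirroring what
// `vxorpd dst, src, [-0.0]` computes in the lowered code.
static double negateViaXor(double x)
{
    uint64_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits ^= 0x8000000000000000ull; // same bit pattern as -0.0: sign bit only
    std::memcpy(&x, &bits, sizeof(bits));
    return x;
}

int main()
{
    std::printf("%g %g\n", negateViaXor(1.5), negateViaXor(-0.25)); // prints -1.5 0.25
}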

Most occurrences of UNM_NUM/UNM_VEC already took the self-xor path
(destination register equal to source), but this saves a couple of
instructions in the trig benchmark and makes it execute ~0.1% fewer
instructions (the actual runtime delta is within the noise).
Arseny Kapoulkine 2024-03-02 11:07:18 -08:00
parent 443903aa00
commit b47cd4521c


@@ -542,18 +542,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
-        RegisterX64 src = regOp(inst.a);
-        if (inst.regX64 == src)
-        {
-            build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0));
-        }
-        else
-        {
-            build.vmovsd(inst.regX64, src, src);
-            build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0));
-        }
+        build.vxorpd(inst.regX64, regOp(inst.a), build.f64(-0.0));
         break;
     }
     case IrCmd::FLOOR_NUM:
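
The UNM_VEC hunk below applies the same simplification; there the f32x4
constant flips the sign bit of each 32-bit lane (xor is purely bitwise, so
using vxorpd on float lanes gives the same result). A rough standalone
illustration of that per-lane negation, using SSE intrinsics directly
rather than the project's emitter:

#include <xmmintrin.h>
#include <cstdio>

int main()
{
    // Same idea as the lowered UNM_VEC: xor every lane with -0.0f so only
    // the sign bits flip and the magnitudes stay intact.
    __m128 v = _mm_setr_ps(1.0f, -2.0f, 3.5f, 0.0f);
    __m128 signMask = _mm_set1_ps(-0.0f);
    __m128 negated = _mm_xor_ps(v, signMask);

    float out[4];
    _mm_storeu_ps(out, negated);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // -1 2 -3.5 -0
}
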
@@ -665,17 +654,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
-        RegisterX64 src = regOp(inst.a);
-        if (inst.regX64 == src)
-        {
-            build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0));
-        }
-        else
-        {
-            build.vmovsd(inst.regX64, src, src);
-            build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0));
-        }
+        build.vxorpd(inst.regX64, regOp(inst.a), build.f32x4(-0.0, -0.0, -0.0, -0.0));
         if (!FFlag::LuauCodegenVectorTag2)
             build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);