From 81b691bf91f2fdd6d6bd01faa27e6d688e6120fb Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Fri, 8 Nov 2024 10:59:44 -0800
Subject: [PATCH] CodeGen: Rewrite DOT_VEC lowering for A64 using faddp

This results in about the same performance as a naive version on M2, but uses fewer registers and is what clang generates for a similar source.
---
 CodeGen/src/IrLoweringA64.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp
index 5621cfd1..45ae5eeb 100644
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@@ -732,17 +732,15 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReg(KindA64::d, index);
 
-        RegisterA64 temp1 = regs.allocTemp(KindA64::q);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::q);
-        RegisterA64 temp3 = regs.allocTemp(KindA64::q);
+        RegisterA64 temp = regs.allocTemp(KindA64::q);
+        RegisterA64 temps = castReg(KindA64::s, temp);
+        RegisterA64 regs = castReg(KindA64::s, inst.regA64);
 
-        build.fmul(temp1, regOp(inst.a), regOp(inst.b));
-        build.dup_4s(temp2, temp1, 1);
-        build.dup_4s(temp3, temp1, 2);
-
-        build.fadd(castReg(KindA64::s, temp1), castReg(KindA64::s, temp1), castReg(KindA64::s, temp2));
-        build.fadd(castReg(KindA64::s, temp1), castReg(KindA64::s, temp1), castReg(KindA64::s, temp3));
-        build.fcvt(inst.regA64, castReg(KindA64::s, temp1));
+        build.fmul(temp, regOp(inst.a), regOp(inst.b));
+        build.faddp(regs, temps); // x+y
+        build.dup_4s(temp, temp, 2);
+        build.fadd(regs, regs, temps); // +z
+        build.fcvt(inst.regA64, regs);
         break;
     }
     case IrCmd::NOT_ANY: