From 8fc458edbdc217b01d72dc142492e6f759fcc5ec Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Nov 2024 10:40:12 -0800 Subject: [PATCH] CodeGen: Implement a naive version of A64 DOT_VEC This is using existing instructions and scalar adds to have a baseline. This is still faster than the original implementation of vector ops. --- CodeGen/src/IrLoweringA64.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp index 2c55fa09..5621cfd1 100644 --- a/CodeGen/src/IrLoweringA64.cpp +++ b/CodeGen/src/IrLoweringA64.cpp @@ -730,7 +730,19 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) } case IrCmd::DOT_VEC: { - CODEGEN_ASSERT(!"DOT_VEC is not implemented for A64"); + inst.regA64 = regs.allocReg(KindA64::d, index); + + RegisterA64 temp1 = regs.allocTemp(KindA64::q); + RegisterA64 temp2 = regs.allocTemp(KindA64::q); + RegisterA64 temp3 = regs.allocTemp(KindA64::q); + + build.fmul(temp1, regOp(inst.a), regOp(inst.b)); + build.dup_4s(temp2, temp1, 1); + build.dup_4s(temp3, temp1, 2); + + build.fadd(castReg(KindA64::s, temp1), castReg(KindA64::s, temp1), castReg(KindA64::s, temp2)); + build.fadd(castReg(KindA64::s, temp1), castReg(KindA64::s, temp1), castReg(KindA64::s, temp3)); + build.fcvt(inst.regA64, castReg(KindA64::s, temp1)); break; } case IrCmd::NOT_ANY: