From 80928acb92d1e4b6db16bada6d21b1fb6fa66265 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Wed, 21 Feb 2024 07:06:11 -0800
Subject: [PATCH] CodeGen: Extract all vector tag patching into TAG_VECTOR
 (#1171)

Instead of patching the tag component with LUA_TVECTOR in every
instruction that produces a vector value, we now use a separate IR
instruction, TAG_VECTOR, to do this. This reduces implementation
redundancy, but more importantly it enables a class of optimizations:

- NUM_TO_VECTOR previously patched the tag component unconditionally,
but its result was only ever consumed by MUL/DIV_VEC instructions that
ignore the tag anyway; that patching can now be removed.

- ADD_VEC et al can now forward the source of a TAG_VECTOR instruction
on either input (see the IR example after this list); this shortens the
latency chain, and in the future it could let us generate an optimal
vector instruction sequence once the temporary stores are marked as
dead.

- In the future on X64, ADD_VEC et al will be able to analyze the input
instruction and remove the tag masking conditionally. This is not part
of this PR, as it requires a decision about the expected FP environment
and/or whether the existing masking is necessary to begin with.
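
To illustrate the new IR shape, here is the updated VectorAdd test
expectation from this PR; the tag is now applied by a dedicated
TAG_VECTOR instruction right before the store:

  %10 = LOAD_TVALUE R0
  %11 = LOAD_TVALUE R1
  %12 = ADD_VEC %10, %11
  %13 = TAG_VECTOR %12
  STORE_TVALUE R2, %13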

I've also renamed NUM_TO_VECTOR to NUM_TO_VEC so that "VEC" always
refers to "3 float values", for consistency with ADD_VEC et al.

Note: ADD_VEC input forwarding is currently performed unconditionally;
it may or may not increase the number of spills of values that can't be
reloaded from the stack.
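
The forwarding is visible in the updated VectorSubMulDiv test
expectation (abridged here): SUB_VEC consumes the untagged MUL_VEC and
DIV_VEC results (%16, %25) directly, bypassing the TAG_VECTOR
instructions that only feed the temporary stores:

  %16 = MUL_VEC %14, %15
  %17 = TAG_VECTOR %16
  STORE_TVALUE R5, %17
  %25 = DIV_VEC %23, %24
  %26 = TAG_VECTOR %25
  STORE_TVALUE R6, %26
  %34 = SUB_VEC %16, %25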

On A64 this makes the Taylor series computation a tiny bit faster
(11.3ns => 11.0ns) as it removes the redundant ins instructions along
the NUM_TO_VEC path. Curiously, forwarding TAG_VECTOR inputs to the
arithmetic instructions actually carries a small penalty: without it,
this PR runs at 10.9ns. I don't know whether that is a property of this
benchmark, though; I just noticed that in this benchmark type inference
actually fails to infer parts of the computation as a vector op. If
desired, I will happily omit this part of the change so we can explore
it separately.
---
 CodeGen/include/Luau/IrData.h     |  6 ++-
 CodeGen/include/Luau/IrUtils.h    |  3 +-
 CodeGen/src/IrDump.cpp            |  6 ++-
 CodeGen/src/IrLoweringA64.cpp     | 66 +++++++++++++++++++------
 CodeGen/src/IrLoweringX64.cpp     | 34 ++++++++++---
 CodeGen/src/IrTranslation.cpp     | 22 +++++++--
 CodeGen/src/IrUtils.cpp           |  3 +-
 CodeGen/src/OptimizeConstProp.cpp | 45 +++++++++++++----
 tests/IrLowering.test.cpp         | 81 +++++++++++++++++++------------
 9 files changed, 192 insertions(+), 74 deletions(-)

diff --git a/CodeGen/include/Luau/IrData.h b/CodeGen/include/Luau/IrData.h
index 129536d1..1d9bbc73 100644
--- a/CodeGen/include/Luau/IrData.h
+++ b/CodeGen/include/Luau/IrData.h
@@ -304,7 +304,11 @@ enum class IrCmd : uint8_t
 
     // Converts a double number to a vector with the value in X/Y/Z
     // A: double
-    NUM_TO_VECTOR,
+    NUM_TO_VEC,
+
+    // Adds VECTOR type tag to a vector, preserving X/Y/Z components
+    // A: TValue
+    TAG_VECTOR,
 
     // Adjust stack top (L->top) to point at 'B' TValues *after* the specified register
     // This is used to return multiple values
diff --git a/CodeGen/include/Luau/IrUtils.h b/CodeGen/include/Luau/IrUtils.h
index 47ef505b..0c8495e8 100644
--- a/CodeGen/include/Luau/IrUtils.h
+++ b/CodeGen/include/Luau/IrUtils.h
@@ -186,7 +186,8 @@ inline bool hasResult(IrCmd cmd)
     case IrCmd::UINT_TO_NUM:
     case IrCmd::NUM_TO_INT:
     case IrCmd::NUM_TO_UINT:
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
+    case IrCmd::TAG_VECTOR:
     case IrCmd::SUBSTITUTE:
     case IrCmd::INVOKE_FASTCALL:
     case IrCmd::BITAND_UINT:
diff --git a/CodeGen/src/IrDump.cpp b/CodeGen/src/IrDump.cpp
index de7a7fa4..9a115953 100644
--- a/CodeGen/src/IrDump.cpp
+++ b/CodeGen/src/IrDump.cpp
@@ -205,8 +205,10 @@ const char* getCmdName(IrCmd cmd)
         return "NUM_TO_INT";
     case IrCmd::NUM_TO_UINT:
         return "NUM_TO_UINT";
-    case IrCmd::NUM_TO_VECTOR:
-        return "NUM_TO_VECTOR";
+    case IrCmd::NUM_TO_VEC:
+        return "NUM_TO_VEC";
+    case IrCmd::TAG_VECTOR:
+        return "TAG_VECTOR";
     case IrCmd::ADJUST_STACK_TO_REG:
         return "ADJUST_STACK_TO_REG";
     case IrCmd::ADJUST_STACK_TO_TOP:
diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp
index 9d9df188..6a5703d1 100644
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@@ -14,6 +14,8 @@
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
 LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
 
+LUAU_FASTFLAG(LuauCodegenVectorTag)
+
 namespace Luau
 {
 namespace CodeGen
@@ -678,9 +680,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         {
             build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-            RegisterA64 tempw = regs.allocTemp(KindA64::w);
-            build.mov(tempw, LUA_TVECTOR);
-            build.ins_4s(inst.regA64, tempw, 3);
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
         }
         else
         {
@@ -705,9 +710,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         {
             build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-            RegisterA64 tempw = regs.allocTemp(KindA64::w);
-            build.mov(tempw, LUA_TVECTOR);
-            build.ins_4s(inst.regA64, tempw, 3);
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
         }
         else
         {
@@ -732,9 +740,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         {
             build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-            RegisterA64 tempw = regs.allocTemp(KindA64::w);
-            build.mov(tempw, LUA_TVECTOR);
-            build.ins_4s(inst.regA64, tempw, 3);
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
         }
         else
         {
@@ -759,9 +770,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         {
             build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-            RegisterA64 tempw = regs.allocTemp(KindA64::w);
-            build.mov(tempw, LUA_TVECTOR);
-            build.ins_4s(inst.regA64, tempw, 3);
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
         }
         else
         {
@@ -786,9 +800,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         {
             build.fneg(inst.regA64, regOp(inst.a));
 
-            RegisterA64 tempw = regs.allocTemp(KindA64::w);
-            build.mov(tempw, LUA_TVECTOR);
-            build.ins_4s(inst.regA64, tempw, 3);
+            if (!FFlag::LuauCodegenVectorTag)
+            {
+                RegisterA64 tempw = regs.allocTemp(KindA64::w);
+                build.mov(tempw, LUA_TVECTOR);
+                build.ins_4s(inst.regA64, tempw, 3);
+            }
         }
         else
         {
@@ -1156,7 +1173,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.fcvtzs(castReg(KindA64::x, inst.regA64), temp);
         break;
     }
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
     {
         inst.regA64 = regs.allocReg(KindA64::q, index);
 
@@ -1167,6 +1184,23 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.fcvt(temps, tempd);
         build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0);
 
+        if (!FFlag::LuauCodegenVectorTag)
+        {
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        break;
+    }
+    case IrCmd::TAG_VECTOR:
+    {
+        inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
+
+        RegisterA64 reg = regOp(inst.a);
+        RegisterA64 tempw = regs.allocTemp(KindA64::w);
+
+        if (inst.regA64 != reg)
+            build.mov(inst.regA64, reg);
+
         build.mov(tempw, LUA_TVECTOR);
         build.ins_4s(inst.regA64, tempw, 3);
         break;
diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp
index babfdf46..c5188dc4 100644
--- a/CodeGen/src/IrLoweringX64.cpp
+++ b/CodeGen/src/IrLoweringX64.cpp
@@ -15,6 +15,8 @@
 #include "lstate.h"
 #include "lgc.h"
 
+LUAU_FASTFLAG(LuauCodegenVectorTag)
+
 namespace Luau
 {
 namespace CodeGen
@@ -608,7 +610,9 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vaddps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
+
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
     }
     case IrCmd::SUB_VEC:
@@ -622,7 +626,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vsubps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
     }
     case IrCmd::MUL_VEC:
@@ -636,7 +641,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vmulps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
     }
     case IrCmd::DIV_VEC:
@@ -650,7 +656,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
         build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
         build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
         build.vdivps(inst.regX64, tmp1.reg, tmp2.reg);
-        build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         break;
     }
     case IrCmd::UNM_VEC:
@@ -669,7 +676,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
             build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0));
         }
 
-        build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
+        if (!FFlag::LuauCodegenVectorTag)
+            build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         break;
     }
     case IrCmd::NOT_ANY:
@@ -964,7 +972,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
 
         build.vcvttsd2si(qwordReg(inst.regX64), memRegDoubleOp(inst.a));
         break;
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
         inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
 
         if (inst.a.kind == IrOpKind::Constant)
@@ -974,15 +982,25 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
             static_assert(sizeof(asU32) == sizeof(value), "Expecting float to be 32-bit");
             memcpy(&asU32, &value, sizeof(value));
 
-            build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, LUA_TVECTOR));
+            if (FFlag::LuauCodegenVectorTag)
+                build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, 0));
+            else
+                build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, LUA_TVECTOR));
         }
         else
         {
             build.vcvtsd2ss(inst.regX64, inst.regX64, memRegDoubleOp(inst.a));
             build.vpshufps(inst.regX64, inst.regX64, inst.regX64, 0b00'00'00'00);
-            build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
+
+            if (!FFlag::LuauCodegenVectorTag)
+                build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         }
         break;
+    case IrCmd::TAG_VECTOR:
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
+
+        build.vpinsrd(inst.regX64, regOp(inst.a), build.i32(LUA_TVECTOR), 3);
+        break;
     case IrCmd::ADJUST_STACK_TO_REG:
     {
         ScopedRegX64 tmp{regs, SizeX64::qword};
diff --git a/CodeGen/src/IrTranslation.cpp b/CodeGen/src/IrTranslation.cpp
index 44d0a264..686d5130 100644
--- a/CodeGen/src/IrTranslation.cpp
+++ b/CodeGen/src/IrTranslation.cpp
@@ -14,6 +14,7 @@
 
 LUAU_FASTFLAGVARIABLE(LuauCodegenLuData, false)
 LUAU_FASTFLAGVARIABLE(LuauCodegenVector, false)
+LUAU_FASTFLAGVARIABLE(LuauCodegenVectorTag, false)
 
 namespace Luau
 {
@@ -380,9 +381,12 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
                 result = build.inst(IrCmd::DIV_VEC, vb, vc);
                 break;
             default:
-                break;
+                CODEGEN_ASSERT(!"Unknown TM op");
             }
 
+            if (FFlag::LuauCodegenVectorTag)
+                result = build.inst(IrCmd::TAG_VECTOR, result);
+
             build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result);
             return;
         }
@@ -393,7 +397,7 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
 
             build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos));
 
-            IrOp vb = build.inst(IrCmd::NUM_TO_VECTOR, loadDoubleOrConstant(build, opb));
+            IrOp vb = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opb));
             IrOp vc = build.inst(IrCmd::LOAD_TVALUE, opc);
             IrOp result;
 
@@ -406,9 +410,12 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
                 result = build.inst(IrCmd::DIV_VEC, vb, vc);
                 break;
             default:
-                break;
+                CODEGEN_ASSERT(!"Unknown TM op");
             }
 
+            if (FFlag::LuauCodegenVectorTag)
+                result = build.inst(IrCmd::TAG_VECTOR, result);
+
             build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result);
             return;
         }
@@ -420,7 +427,7 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
                 build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TNUMBER), build.vmExit(pcpos));
 
             IrOp vb = build.inst(IrCmd::LOAD_TVALUE, opb);
-            IrOp vc = build.inst(IrCmd::NUM_TO_VECTOR, loadDoubleOrConstant(build, opc));
+            IrOp vc = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opc));
             IrOp result;
 
             switch (tm)
@@ -432,9 +439,12 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc,
                 result = build.inst(IrCmd::DIV_VEC, vb, vc);
                 break;
             default:
-                break;
+                CODEGEN_ASSERT(!"Unknown TM op");
             }
 
+            if (FFlag::LuauCodegenVectorTag)
+                result = build.inst(IrCmd::TAG_VECTOR, result);
+
             build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result);
             return;
         }
@@ -596,6 +606,8 @@ void translateInstMinus(IrBuilder& build, const Instruction* pc, int pcpos)
 
         IrOp vb = build.inst(IrCmd::LOAD_TVALUE, build.vmReg(rb));
         IrOp va = build.inst(IrCmd::UNM_VEC, vb);
+        if (FFlag::LuauCodegenVectorTag)
+            va = build.inst(IrCmd::TAG_VECTOR, va);
         build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), va);
         return;
     }
diff --git a/CodeGen/src/IrUtils.cpp b/CodeGen/src/IrUtils.cpp
index b49e974f..caa6b178 100644
--- a/CodeGen/src/IrUtils.cpp
+++ b/CodeGen/src/IrUtils.cpp
@@ -106,7 +106,8 @@ IrValueKind getCmdValueKind(IrCmd cmd)
     case IrCmd::NUM_TO_INT:
     case IrCmd::NUM_TO_UINT:
         return IrValueKind::Int;
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
+    case IrCmd::TAG_VECTOR:
         return IrValueKind::Tvalue;
     case IrCmd::ADJUST_STACK_TO_REG:
     case IrCmd::ADJUST_STACK_TO_TOP:
diff --git a/CodeGen/src/OptimizeConstProp.cpp b/CodeGen/src/OptimizeConstProp.cpp
index 0c543572..4214d015 100644
--- a/CodeGen/src/OptimizeConstProp.cpp
+++ b/CodeGen/src/OptimizeConstProp.cpp
@@ -18,6 +18,7 @@ LUAU_FASTINTVARIABLE(LuauCodeGenMinLinearBlockPath, 3)
 LUAU_FASTINTVARIABLE(LuauCodeGenReuseSlotLimit, 64)
 LUAU_FASTFLAGVARIABLE(DebugLuauAbortingChecks, false)
 LUAU_FASTFLAG(LuauCodegenVector)
+LUAU_FASTFLAG(LuauCodegenVectorTag)
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenCheckGcEffectFix, false)
 
 namespace Luau
@@ -715,9 +716,17 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
             {
                 if (IrInst* arg = function.asInstOp(inst.b))
                 {
-                    if (arg->cmd == IrCmd::ADD_VEC || arg->cmd == IrCmd::SUB_VEC || arg->cmd == IrCmd::MUL_VEC || arg->cmd == IrCmd::DIV_VEC ||
-                        arg->cmd == IrCmd::UNM_VEC)
-                        tag = LUA_TVECTOR;
+                    if (FFlag::LuauCodegenVectorTag)
+                    {
+                        if (arg->cmd == IrCmd::TAG_VECTOR)
+                            tag = LUA_TVECTOR;
+                    }
+                    else
+                    {
+                        if (arg->cmd == IrCmd::ADD_VEC || arg->cmd == IrCmd::SUB_VEC || arg->cmd == IrCmd::MUL_VEC || arg->cmd == IrCmd::DIV_VEC ||
+                            arg->cmd == IrCmd::UNM_VEC)
+                            tag = LUA_TVECTOR;
+                    }
                 }
             }
 
@@ -1250,6 +1259,28 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
         if (int(state.checkSlotMatchCache.size()) < FInt::LuauCodeGenReuseSlotLimit)
             state.checkSlotMatchCache.push_back(index);
         break;
+
+    case IrCmd::ADD_VEC:
+    case IrCmd::SUB_VEC:
+    case IrCmd::MUL_VEC:
+    case IrCmd::DIV_VEC:
+        if (FFlag::LuauCodegenVectorTag)
+        {
+            if (IrInst* a = function.asInstOp(inst.a); a && a->cmd == IrCmd::TAG_VECTOR)
+                inst.a = a->a;
+            if (IrInst* b = function.asInstOp(inst.b); b && b->cmd == IrCmd::TAG_VECTOR)
+                inst.b = b->a;
+        }
+        break;
+
+    case IrCmd::UNM_VEC:
+        if (FFlag::LuauCodegenVectorTag)
+        {
+            if (IrInst* a = function.asInstOp(inst.a); a && a->cmd == IrCmd::TAG_VECTOR)
+                inst.a = a->a;
+        }
+        break;
+
     case IrCmd::CHECK_NODE_NO_NEXT:
     case IrCmd::CHECK_NODE_VALUE:
     case IrCmd::BARRIER_TABLE_BACK:
@@ -1278,12 +1309,8 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
     case IrCmd::GET_TYPE:
     case IrCmd::GET_TYPEOF:
     case IrCmd::FINDUPVAL:
-    case IrCmd::ADD_VEC:
-    case IrCmd::SUB_VEC:
-    case IrCmd::MUL_VEC:
-    case IrCmd::DIV_VEC:
-    case IrCmd::UNM_VEC:
-    case IrCmd::NUM_TO_VECTOR:
+    case IrCmd::NUM_TO_VEC:
+    case IrCmd::TAG_VECTOR:
         break;
 
     case IrCmd::DO_ARITH:
diff --git a/tests/IrLowering.test.cpp b/tests/IrLowering.test.cpp
index bdb7e38c..13f44dca 100644
--- a/tests/IrLowering.test.cpp
+++ b/tests/IrLowering.test.cpp
@@ -13,6 +13,7 @@
 #include <memory>
 
 LUAU_FASTFLAG(LuauCodegenVector)
+LUAU_FASTFLAG(LuauCodegenVectorTag)
 LUAU_FASTFLAG(LuauCodegenMathMemArgs)
 
 static std::string getCodegenAssembly(const char* source)
@@ -65,6 +66,7 @@ TEST_SUITE_BEGIN("IrLowering");
 TEST_CASE("VectorReciprocal")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vecrcp(a: vector)
@@ -79,10 +81,11 @@ bb_0:
 bb_2:
   JUMP bb_bytecode_1
 bb_bytecode_1:
-  %6 = NUM_TO_VECTOR 1
+  %6 = NUM_TO_VEC 1
   %7 = LOAD_TVALUE R0
   %8 = DIV_VEC %6, %7
-  STORE_TVALUE R1, %8
+  %9 = TAG_VECTOR %8
+  STORE_TVALUE R1, %9
   INTERRUPT 1u
   RETURN R1, 1i
 )");
@@ -127,6 +130,7 @@ bb_bytecode_1:
 TEST_CASE("VectorAdd")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3add(a: vector, b: vector)
@@ -145,7 +149,8 @@ bb_bytecode_1:
   %10 = LOAD_TVALUE R0
   %11 = LOAD_TVALUE R1
   %12 = ADD_VEC %10, %11
-  STORE_TVALUE R2, %12
+  %13 = TAG_VECTOR %12
+  STORE_TVALUE R2, %13
   INTERRUPT 1u
   RETURN R2, 1i
 )");
@@ -154,6 +159,7 @@ bb_bytecode_1:
 TEST_CASE("VectorMinus")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3minus(a: vector)
@@ -170,7 +176,8 @@ bb_2:
 bb_bytecode_1:
   %6 = LOAD_TVALUE R0
   %7 = UNM_VEC %6
-  STORE_TVALUE R1, %7
+  %8 = TAG_VECTOR %7
+  STORE_TVALUE R1, %8
   INTERRUPT 1u
   RETURN R1, 1i
 )");
@@ -179,6 +186,7 @@ bb_bytecode_1:
 TEST_CASE("VectorSubMulDiv")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3combo(a: vector, b: vector, c: vector, d: vector)
@@ -199,13 +207,16 @@ bb_bytecode_1:
   %14 = LOAD_TVALUE R0
   %15 = LOAD_TVALUE R1
   %16 = MUL_VEC %14, %15
-  STORE_TVALUE R5, %16
-  %22 = LOAD_TVALUE R2
-  %23 = LOAD_TVALUE R3
-  %24 = DIV_VEC %22, %23
-  STORE_TVALUE R6, %24
-  %32 = SUB_VEC %16, %24
-  STORE_TVALUE R4, %32
+  %17 = TAG_VECTOR %16
+  STORE_TVALUE R5, %17
+  %23 = LOAD_TVALUE R2
+  %24 = LOAD_TVALUE R3
+  %25 = DIV_VEC %23, %24
+  %26 = TAG_VECTOR %25
+  STORE_TVALUE R6, %26
+  %34 = SUB_VEC %16, %25
+  %35 = TAG_VECTOR %34
+  STORE_TVALUE R4, %35
   INTERRUPT 3u
   RETURN R4, 1i
 )");
@@ -214,6 +225,7 @@ bb_bytecode_1:
 TEST_CASE("VectorMulDivMixed")
 {
     ScopedFastFlag luauCodegenVector{FFlag::LuauCodegenVector, true};
+    ScopedFastFlag luauCodegenVectorTag{FFlag::LuauCodegenVectorTag, true};
 
     CHECK_EQ("\n" + getCodegenAssembly(R"(
 local function vec3combo(a: vector, b: vector, c: vector, d: vector)
@@ -232,29 +244,36 @@ bb_2:
   JUMP bb_bytecode_1
 bb_bytecode_1:
   %12 = LOAD_TVALUE R0
-  %13 = NUM_TO_VECTOR 2
+  %13 = NUM_TO_VEC 2
   %14 = MUL_VEC %12, %13
-  STORE_TVALUE R7, %14
-  %18 = LOAD_TVALUE R1
-  %19 = NUM_TO_VECTOR 4
-  %20 = DIV_VEC %18, %19
-  STORE_TVALUE R8, %20
-  %28 = ADD_VEC %14, %20
-  STORE_TVALUE R6, %28
+  %15 = TAG_VECTOR %14
+  STORE_TVALUE R7, %15
+  %19 = LOAD_TVALUE R1
+  %20 = NUM_TO_VEC 4
+  %21 = DIV_VEC %19, %20
+  %22 = TAG_VECTOR %21
+  STORE_TVALUE R8, %22
+  %30 = ADD_VEC %14, %21
+  %31 = TAG_VECTOR %30
+  STORE_TVALUE R6, %31
   STORE_DOUBLE R8, 0.5
   STORE_TAG R8, tnumber
-  %37 = NUM_TO_VECTOR 0.5
-  %38 = LOAD_TVALUE R2
-  %39 = MUL_VEC %37, %38
-  STORE_TVALUE R7, %39
-  %47 = ADD_VEC %28, %39
-  STORE_TVALUE R5, %47
-  %51 = NUM_TO_VECTOR 40
-  %52 = LOAD_TVALUE R3
-  %53 = DIV_VEC %51, %52
-  STORE_TVALUE R6, %53
-  %61 = ADD_VEC %47, %53
-  STORE_TVALUE R4, %61
+  %40 = NUM_TO_VEC 0.5
+  %41 = LOAD_TVALUE R2
+  %42 = MUL_VEC %40, %41
+  %43 = TAG_VECTOR %42
+  STORE_TVALUE R7, %43
+  %51 = ADD_VEC %30, %42
+  %52 = TAG_VECTOR %51
+  STORE_TVALUE R5, %52
+  %56 = NUM_TO_VEC 40
+  %57 = LOAD_TVALUE R3
+  %58 = DIV_VEC %56, %57
+  %59 = TAG_VECTOR %58
+  STORE_TVALUE R6, %59
+  %67 = ADD_VEC %51, %58
+  %68 = TAG_VECTOR %67
+  STORE_TVALUE R4, %68
   INTERRUPT 8u
   RETURN R4, 1i
 )");