luau/CodeGen/src/EmitCommonX64.cpp
Arseny Kapoulkine 89b437bb4e
Add SUBRK and DIVRK bytecode instructions to bytecode v5 (#1115)
Right now, we can compile R*K for all arithmetic instructions, but K*R
gets compiled into two instructions (LOADN/LOADK + arithmetic opcode).

This is problematic since it leads to reduced performance for some code.
However, we'd like to avoid adding reverse variants of ADDK et al for
all opcodes, as that would increase the I$ footprint of the interpreter.

Looking at the arithmetic instructions, % and // don't have interesting
use cases for K*V; ^ is sometimes used with a constant on the left-hand
side, but that would have to call pow() in all cases anyway, so it would
be slow regardless of the dispatch overhead. This leaves the four basic
arithmetic operations.

For + and *, we can implement a compiler-side optimization in the
future that transforms K*R into R*K automatically. This could either be
done unconditionally at -O2, or conditionally based on the type of the
value (driven by type annotations / inference) -- this technically
changes behavior in the presence of metamethods, although it might be
sensible to always do it because non-commutative +/* are evil.

However, for - and / it is impossible for the compiler to optimize this
in the future, so we need dedicated opcodes. This only increases the
interpreter size by ~300 bytes (~1.5%) on X64.
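
As an illustration (a hypothetical snippet, not taken from the change itself or
the benchmarks below), this is the pattern the new opcodes target:

```lua
-- constant on the left-hand side of - and /
local inv = 1 / x -- previously LOADN + DIV, now a single DIVRK
local neg = 1 - x -- previously LOADN + SUB, now a single SUBRK
```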

This makes spectral-norm and math-partial-sums 6% faster; maybe more
importantly, voxelgen gets 1.5% faster (so this change does have
real-world impact).

To avoid the proliferation of bytecode versions, this change piggybacks
on the bytecode version bump that was just made in 604 for vector
constants; we would still be able to enable these independently, but
we'll consider v5 complete when both are enabled.

Related: #626

---------

Co-authored-by: vegorov-rbx <75688451+vegorov-rbx@users.noreply.github.com>
2023-11-28 07:35:01 -08:00

// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
#include "EmitCommonX64.h"
#include "Luau/AssemblyBuilderX64.h"
#include "Luau/IrCallWrapperX64.h"
#include "Luau/IrData.h"
#include "Luau/IrRegAllocX64.h"
#include "Luau/IrUtils.h"
#include "NativeState.h"
#include "lgc.h"
#include "lstate.h"
#include <utility>
namespace Luau
{
namespace CodeGen
{
namespace X64
{
void jumpOnNumberCmp(AssemblyBuilderX64& build, RegisterX64 tmp, OperandX64 lhs, OperandX64 rhs, IrCondition cond, Label& label)
{
// Refresher on comi/ucomi EFLAGS:
// all zero: greater
// CF only: less
// ZF only: equal
// PF+CF+ZF: unordered (NaN)
// To work around the lack of conditional jumps that check 'greater' conditions in an IEEE 754 compliant way, we use the 'less' forms and swap the operands
if (cond == IrCondition::Greater || cond == IrCondition::GreaterEqual || cond == IrCondition::NotGreater || cond == IrCondition::NotGreaterEqual)
std::swap(lhs, rhs);
if (rhs.cat == CategoryX64::reg)
{
build.vucomisd(rhs, lhs);
}
else
{
build.vmovsd(tmp, rhs);
build.vucomisd(tmp, lhs);
}
// Keep in mind that 'Not' conditions want 'true' for comparisons with NaN
// And because of NaN, equivalences that hold for integers, such as 'not less or equal' <-> 'greater', do not hold here
switch (cond)
{
case IrCondition::NotLessEqual:
case IrCondition::NotGreaterEqual:
// (b < a) is the same as !(a <= b). jnae checks CF=1 which means < or NaN
build.jcc(ConditionX64::NotAboveEqual, label);
break;
case IrCondition::LessEqual:
case IrCondition::GreaterEqual:
// (b >= a) is the same as (a <= b). jae checks CF=0 which means >= and not NaN
build.jcc(ConditionX64::AboveEqual, label);
break;
case IrCondition::NotLess:
case IrCondition::NotGreater:
// (b <= a) is the same as !(a < b). jna checks CF=1 or ZF=1 which means <= or NaN
build.jcc(ConditionX64::NotAbove, label);
break;
case IrCondition::Less:
case IrCondition::Greater:
// (b > a) is the same as (a < b). ja checks CF=0 and ZF=0 which means > and not NaN
build.jcc(ConditionX64::Above, label);
break;
case IrCondition::NotEqual:
// ZF=0 or PF=1 means != or NaN
build.jcc(ConditionX64::NotZero, label);
build.jcc(ConditionX64::Parity, label);
break;
default:
LUAU_ASSERT(!"Unsupported condition");
}
}
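
// Maps an IR comparison condition to the x64 condition code used for integer comparisons (including the unsigned variants)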
ConditionX64 getConditionInt(IrCondition cond)
{
switch (cond)
{
case IrCondition::Equal:
return ConditionX64::Equal;
case IrCondition::NotEqual:
return ConditionX64::NotEqual;
case IrCondition::Less:
return ConditionX64::Less;
case IrCondition::NotLess:
return ConditionX64::NotLess;
case IrCondition::LessEqual:
return ConditionX64::LessEqual;
case IrCondition::NotLessEqual:
return ConditionX64::NotLessEqual;
case IrCondition::Greater:
return ConditionX64::Greater;
case IrCondition::NotGreater:
return ConditionX64::NotGreater;
case IrCondition::GreaterEqual:
return ConditionX64::GreaterEqual;
case IrCondition::NotGreaterEqual:
return ConditionX64::NotGreaterEqual;
case IrCondition::UnsignedLess:
return ConditionX64::Below;
case IrCondition::UnsignedLessEqual:
return ConditionX64::BelowEqual;
case IrCondition::UnsignedGreater:
return ConditionX64::Above;
case IrCondition::UnsignedGreaterEqual:
return ConditionX64::AboveEqual;
default:
LUAU_ASSERT(!"Unsupported condition");
return ConditionX64::Zero;
}
}
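
// Loads into 'node' the address of the hash node &h->node[slot], where 'slot' is the cached slot stored
// in the C operand of the instruction at 'pcpos'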
void getTableNodeAtCachedSlot(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 node, RegisterX64 table, int pcpos)
{
LUAU_ASSERT(tmp != node);
LUAU_ASSERT(table != node);
build.mov(node, qword[table + offsetof(Table, node)]);
// compute cached slot
build.mov(tmp, sCode);
build.movzx(dwordReg(tmp), byte[tmp + pcpos * sizeof(Instruction) + kOffsetOfInstructionC]);
build.and_(byteReg(tmp), byte[table + offsetof(Table, nodemask8)]);
// LuaNode* n = &h->node[slot];
build.shl(dwordReg(tmp), kLuaNodeSizeLog2);
build.add(node, tmp);
}
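
// Converts the double in 'numd' to an integer in 'numi' and jumps to 'label' if the value was not an exact integer
// NaN falls through as 0x80000000, an index that is out of bounds anyway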
void convertNumberToIndexOrJump(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 numd, RegisterX64 numi, Label& label)
{
LUAU_ASSERT(numi.size == SizeX64::dword);
// Convert to integer, NaN is converted into 0x80000000
build.vcvttsd2si(numi, numd);
// Convert that integer back to double
build.vcvtsi2sd(tmp, numd, numi);
build.vucomisd(tmp, numd); // Sets ZF=1 if equal or NaN
// We don't need non-integer values
// To skip an extra PF=1 check, we let NaN fall through: the resulting 0x80000000 index is out of bounds anyway
build.jcc(ConditionX64::NotZero, label);
}
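
// Calls the luaV_doarith fallback: applies the arithmetic operation 'tm' to the TValues at 'b' and 'c'
// and stores the result in register 'ra'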
void callArithHelper(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, OperandX64 b, OperandX64 c, TMS tm)
{
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
callWrap.addArgument(SizeX64::qword, b);
callWrap.addArgument(SizeX64::qword, c);
callWrap.addArgument(SizeX64::dword, tm);
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_doarith)]);
emitUpdateBase(build);
}
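
// Calls the luaV_dolen fallback to compute the length of register 'rb' into register 'ra'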
void callLengthHelper(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, int rb)
{
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
callWrap.addArgument(SizeX64::qword, luauRegAddress(rb));
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_dolen)]);
emitUpdateBase(build);
}
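
// Calls the luaV_gettable fallback: ra = rb[key], where 'c' is the address of the key TValue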
void callGetTable(IrRegAllocX64& regs, AssemblyBuilderX64& build, int rb, OperandX64 c, int ra)
{
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::qword, luauRegAddress(rb));
callWrap.addArgument(SizeX64::qword, c);
callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_gettable)]);
emitUpdateBase(build);
}
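
// Calls the luaV_settable fallback: rb[key] = ra, where 'c' is the address of the key TValue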
void callSetTable(IrRegAllocX64& regs, AssemblyBuilderX64& build, int rb, OperandX64 c, int ra)
{
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::qword, luauRegAddress(rb));
callWrap.addArgument(SizeX64::qword, c);
callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_settable)]);
emitUpdateBase(build);
}
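
// Emits the runtime checks that guard a GC write barrier; jumps to 'skip' when the value in 'ra' is not collectable,
// 'object' is not black, or the value is not white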
void checkObjectBarrierConditions(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 object, IrOp ra, int ratag, Label& skip)
{
// The barrier should have been optimized away if we statically know the value is not collectable; this check is kept for correctness
if (ratag == -1 || !isGCO(ratag))
{
// iscollectable(ra)
OperandX64 tag = (ra.kind == IrOpKind::VmReg) ? luauRegTag(vmRegOp(ra)) : luauConstantTag(vmConstOp(ra));
build.cmp(tag, LUA_TSTRING);
build.jcc(ConditionX64::Less, skip);
}
// isblack(obj2gco(o))
build.test(byte[object + offsetof(GCheader, marked)], bitmask(BLACKBIT));
build.jcc(ConditionX64::Zero, skip);
// iswhite(gcvalue(ra))
OperandX64 value = (ra.kind == IrOpKind::VmReg) ? luauRegValue(vmRegOp(ra)) : luauConstantValue(vmConstOp(ra));
build.mov(tmp, value);
build.test(byte[tmp + offsetof(GCheader, marked)], bit2mask(WHITE0BIT, WHITE1BIT));
build.jcc(ConditionX64::Zero, skip);
}
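
// Emits a write barrier for storing the value in 'ra' into collectable 'object'; calls luaC_barrierf when the barrier checks pass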
void callBarrierObject(IrRegAllocX64& regs, AssemblyBuilderX64& build, RegisterX64 object, IrOp objectOp, IrOp ra, int ratag)
{
Label skip;
ScopedRegX64 tmp{regs, SizeX64::qword};
checkObjectBarrierConditions(build, tmp.reg, object, ra, ratag, skip);
{
ScopedSpills spillGuard(regs);
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::qword, object, objectOp);
callWrap.addArgument(SizeX64::qword, tmp);
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaC_barrierf)]);
}
build.setLabel(skip);
}
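
// Emits the fast-path write barrier for tables: if 'table' is black, calls luaC_barrierback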
void callBarrierTableFast(IrRegAllocX64& regs, AssemblyBuilderX64& build, RegisterX64 table, IrOp tableOp)
{
Label skip;
// isblack(obj2gco(t))
build.test(byte[table + offsetof(GCheader, marked)], bitmask(BLACKBIT));
build.jcc(ConditionX64::Zero, skip);
{
ScopedSpills spillGuard(regs);
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::qword, table, tableOp);
callWrap.addArgument(SizeX64::qword, addr[table + offsetof(Table, gclist)]);
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaC_barrierback)]);
}
build.setLabel(skip);
}
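
// Emits an inline GC step check: calls luaC_step when the total allocated bytes have reached the GC threshold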
void callStepGc(IrRegAllocX64& regs, AssemblyBuilderX64& build)
{
Label skip;
{
ScopedRegX64 tmp1{regs, SizeX64::qword};
ScopedRegX64 tmp2{regs, SizeX64::qword};
build.mov(tmp1.reg, qword[rState + offsetof(lua_State, global)]);
build.mov(tmp2.reg, qword[tmp1.reg + offsetof(global_State, totalbytes)]);
build.cmp(tmp2.reg, qword[tmp1.reg + offsetof(global_State, GCthreshold)]);
build.jcc(ConditionX64::Below, skip);
}
{
ScopedSpills spillGuard(regs);
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
callWrap.addArgument(SizeX64::dword, 1);
callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaC_step)]);
emitUpdateBase(build);
}
build.setLabel(skip);
}
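
// Clears the LUA_CALLINFO_NATIVE flag on the current call frame (L->ci)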
void emitClearNativeFlag(AssemblyBuilderX64& build)
{
build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
build.and_(dword[rax + offsetof(CallInfo, flags)], ~LUA_CALLINFO_NATIVE);
}
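
// Leaves native code through gateExit; eax is set to 1 when execution should continue in the VM and to 0 otherwise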
void emitExit(AssemblyBuilderX64& build, bool continueInVm)
{
if (continueInVm)
build.mov(eax, 1);
else
build.xor_(eax, eax);
build.jmp(qword[rNativeContext + offsetof(NativeContext, gateExit)]);
}
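
// Reloads the cached base register from L->base (the stack may have been reallocated by a call)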
void emitUpdateBase(AssemblyBuilderX64& build)
{
build.mov(rBase, qword[rState + offsetof(lua_State, base)]);
}
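
// Emits the interrupt sequence: saves the current pc, calls the interrupt callback if one is installed, and if the callback
// leaves L->status non-zero, rolls the pc back by one instruction and exits native code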
void emitInterrupt(AssemblyBuilderX64& build)
{
// rax = pcpos + 1
// rbx = return address in native code
// note: rbx is non-volatile so it will be saved across the interrupt call automatically
RegisterX64 rArg1 = (build.abi == ABIX64::Windows) ? rcx : rdi;
RegisterX64 rArg2 = (build.abi == ABIX64::Windows) ? rdx : rsi;
Label skip;
// Update L->ci->savedpc; required in case the interrupt handler errors
build.mov(rcx, sCode);
build.lea(rcx, addr[rcx + rax * sizeof(Instruction)]);
build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
build.mov(qword[rax + offsetof(CallInfo, savedpc)], rcx);
// Load interrupt handler; it may be nullptr in case the update raced with the check before we got here
build.mov(rax, qword[rState + offsetof(lua_State, global)]);
build.mov(rax, qword[rax + offsetof(global_State, cb.interrupt)]);
build.test(rax, rax);
build.jcc(ConditionX64::Zero, skip);
// Call interrupt
build.mov(rArg1, rState);
build.mov(dwordReg(rArg2), -1);
build.call(rax);
// Check if we need to exit
build.mov(al, byte[rState + offsetof(lua_State, status)]);
build.test(al, al);
build.jcc(ConditionX64::Zero, skip);
build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
build.sub(qword[rax + offsetof(CallInfo, savedpc)], sizeof(Instruction));
emitExit(build, /* continueInVm */ false);
build.setLabel(skip);
emitUpdateBase(build); // interrupt may have reallocated stack
build.jmp(rbx);
}
void emitFallback(IrRegAllocX64& regs, AssemblyBuilderX64& build, int offset, int pcpos)
{
// fallback(L, instruction, base, k)
IrCallWrapperX64 callWrap(regs, build);
callWrap.addArgument(SizeX64::qword, rState);
RegisterX64 reg = callWrap.suggestNextArgumentRegister(SizeX64::qword);
build.mov(reg, sCode);
callWrap.addArgument(SizeX64::qword, addr[reg + pcpos * sizeof(Instruction)]);
callWrap.addArgument(SizeX64::qword, rBase);
callWrap.addArgument(SizeX64::qword, rConstants);
callWrap.call(qword[rNativeContext + offset]);
emitUpdateBase(build);
}
void emitUpdatePcForExit(AssemblyBuilderX64& build)
{
// edx = pcpos * sizeof(Instruction)
build.add(rdx, sCode);
build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
build.mov(qword[rax + offsetof(CallInfo, savedpc)], rdx);
}
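
// Emits the return sequence: pads missing results with nil, pops the call frame, and either continues in the caller's
// native code or exits to the interpreter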
void emitReturn(AssemblyBuilderX64& build, ModuleHelpers& helpers)
{
// input: res in rdi, number of written values in ecx
RegisterX64 res = rdi;
RegisterX64 written = ecx;
RegisterX64 ci = r8;
RegisterX64 cip = r9;
RegisterX64 nresults = esi;
build.mov(ci, qword[rState + offsetof(lua_State, ci)]);
build.lea(cip, addr[ci - sizeof(CallInfo)]);
// nresults = ci->nresults
build.mov(nresults, dword[ci + offsetof(CallInfo, nresults)]);
Label skipResultCopy;
// Fill the rest of the expected results (nresults - written) with 'nil'
RegisterX64 counter = written;
build.sub(counter, nresults); // counter = -(nresults - written)
build.jcc(ConditionX64::GreaterEqual, skipResultCopy);
Label repeatNilLoop = build.setLabel();
build.mov(dword[res + offsetof(TValue, tt)], LUA_TNIL);
build.add(res, sizeof(TValue));
build.inc(counter);
build.jcc(ConditionX64::NotZero, repeatNilLoop);
build.setLabel(skipResultCopy);
build.mov(qword[rState + offsetof(lua_State, ci)], cip); // L->ci = cip
build.mov(rBase, qword[cip + offsetof(CallInfo, base)]); // sync base = L->base while we have a chance
build.mov(qword[rState + offsetof(lua_State, base)], rBase); // L->base = cip->base
Label skipFixedRetTop;
build.test(nresults, nresults); // test here will set SF=1 for a negative number and it always sets OF to 0
build.jcc(ConditionX64::Less, skipFixedRetTop); // jl jumps if SF != OF
build.mov(res, qword[cip + offsetof(CallInfo, top)]); // res = cip->top
build.setLabel(skipFixedRetTop);
build.mov(qword[rState + offsetof(lua_State, top)], res); // L->top = res
// Unlikely, but this might be the last return from the VM
build.test(byte[ci + offsetof(CallInfo, flags)], LUA_CALLINFO_RETURN);
build.jcc(ConditionX64::NotZero, helpers.exitNoContinueVm);
// Returning back to the previous function is a bit tricky
// Registers alive: r9 (cip)
RegisterX64 proto = rcx;
RegisterX64 execdata = rbx;
// Change closure
build.mov(rax, qword[cip + offsetof(CallInfo, func)]);
build.mov(rax, qword[rax + offsetof(TValue, value.gc)]);
build.mov(sClosure, rax);
build.mov(proto, qword[rax + offsetof(Closure, l.p)]);
build.mov(execdata, qword[proto + offsetof(Proto, execdata)]);
build.test(byte[cip + offsetof(CallInfo, flags)], LUA_CALLINFO_NATIVE);
build.jcc(ConditionX64::Zero, helpers.exitContinueVm); // Continue in interpreter if function has no native data
// Change constants
build.mov(rConstants, qword[proto + offsetof(Proto, k)]);
// Change code
build.mov(rdx, qword[proto + offsetof(Proto, code)]);
build.mov(sCode, rdx);
build.mov(rax, qword[cip + offsetof(CallInfo, savedpc)]);
// To get instruction index from instruction pointer, we need to divide byte offset by 4
// But we will actually need to scale instruction index by 4 back to byte offset later so it cancels out
build.sub(rax, rdx);
// Get new instruction location and jump to it
build.mov(edx, dword[execdata + rax]);
build.add(rdx, qword[proto + offsetof(Proto, exectarget)]);
build.jmp(rdx);
}
} // namespace X64
} // namespace CodeGen
} // namespace Luau