Mirror of https://github.com/luau-lang/luau.git, synced 2025-01-07 11:59:11 +00:00
89b437bb4e
Right now, we can compile R*K for all arithmetic instructions, but K*R gets compiled into two instructions (LOADN/LOADK + arithmetic opcode). This is problematic since it leads to reduced performance for some code.

However, we'd like to avoid adding reverse variants of ADDK et al for all opcodes, to avoid the increase in I$ footprint for the interpreter.

Looking at the arithmetic instructions, % and // don't have interesting use cases for K*V; ^ is sometimes used with a constant on the left hand side, but this would need to call pow() by necessity in all cases, so it would be slow regardless of the dispatch overhead.

This leaves the four basic arithmetic operations. For + and *, we can implement a compiler-side optimization in the future that transforms K*R to R*K automatically. This could either be done unconditionally at -O2, or conditionally based on the type of the value (driven by type annotations / inference) -- this technically changes behavior in the presence of metamethods, although it might be sensible to just always do this because non-commutative +/* are evil.

However, for - and / it is impossible for the compiler to optimize this in the future, so we need dedicated opcodes. This only increases the interpreter size by ~300 bytes (~1.5%) on X64.

This makes spectral-norm and math-partial-sums 6% faster; maybe more importantly, voxelgen gets 1.5% faster (so this change does have real-world impact).

To avoid the proliferation of bytecode versions, this change piggybacks on the bytecode version bump that was just made in 604 for vector constants; we would still be able to enable these independently, but we'll consider v5 complete when both are enabled.

Related: #626

---------

Co-authored-by: vegorov-rbx <75688451+vegorov-rbx@users.noreply.github.com>
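For illustration only (a hedged sketch, not part of the commit message), the source-level difference described above looks roughly like this in Luau; only the R*K/K*R distinction and the LOADN/LOADK observation come from the text, the function names are made up for the example:

local function rk(x: number)
    return x - 1 -- R*K: constant on the right; already compiles to a single arithmetic instruction with a constant operand
end

local function kr(x: number)
    return 1 - x -- K*R: constant on the left; previously LOADN/LOADK followed by the arithmetic opcode, now a dedicated reverse-variant opcode can be used for - and /
end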
467 lines · 16 KiB · C++
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
#include "EmitCommonX64.h"

#include "Luau/AssemblyBuilderX64.h"
#include "Luau/IrCallWrapperX64.h"
#include "Luau/IrData.h"
#include "Luau/IrRegAllocX64.h"
#include "Luau/IrUtils.h"

#include "NativeState.h"

#include "lgc.h"
#include "lstate.h"

#include <utility>

namespace Luau
{
namespace CodeGen
{
namespace X64
{

void jumpOnNumberCmp(AssemblyBuilderX64& build, RegisterX64 tmp, OperandX64 lhs, OperandX64 rhs, IrCondition cond, Label& label)
{
    // Refresher on comi/ucomi EFLAGS:
    // all zero: greater
    // CF only: less
    // ZF only: equal
    // PF+CF+ZF: unordered (NaN)

    // Since there are no conditional jumps that check for "greater" conditions in an IEEE 754 compliant way, we use "less" forms to emulate these
    if (cond == IrCondition::Greater || cond == IrCondition::GreaterEqual || cond == IrCondition::NotGreater || cond == IrCondition::NotGreaterEqual)
        std::swap(lhs, rhs);

    if (rhs.cat == CategoryX64::reg)
    {
        build.vucomisd(rhs, lhs);
    }
    else
    {
        build.vmovsd(tmp, rhs);
        build.vucomisd(tmp, lhs);
    }

    // Keep in mind that 'Not' conditions want 'true' for comparisons with NaN
    // And because of NaN, integer check interchangeability like 'not less or equal' <-> 'greater' does not hold
    switch (cond)
    {
    case IrCondition::NotLessEqual:
    case IrCondition::NotGreaterEqual:
        // (b < a) is the same as !(a <= b). jnae checks CF=1 which means < or NaN
        build.jcc(ConditionX64::NotAboveEqual, label);
        break;
    case IrCondition::LessEqual:
    case IrCondition::GreaterEqual:
        // (b >= a) is the same as (a <= b). jae checks CF=0 which means >= and not NaN
        build.jcc(ConditionX64::AboveEqual, label);
        break;
    case IrCondition::NotLess:
    case IrCondition::NotGreater:
        // (b <= a) is the same as !(a < b). jna checks CF=1 or ZF=1 which means <= or NaN
        build.jcc(ConditionX64::NotAbove, label);
        break;
    case IrCondition::Less:
    case IrCondition::Greater:
        // (b > a) is the same as (a < b). ja checks CF=0 and ZF=0 which means > and not NaN
        build.jcc(ConditionX64::Above, label);
        break;
    case IrCondition::NotEqual:
        // ZF=0 or PF=1 means != or NaN
        build.jcc(ConditionX64::NotZero, label);
        build.jcc(ConditionX64::Parity, label);
        break;
    default:
        LUAU_ASSERT(!"Unsupported condition");
    }
}

ConditionX64 getConditionInt(IrCondition cond)
{
    switch (cond)
    {
    case IrCondition::Equal:
        return ConditionX64::Equal;
    case IrCondition::NotEqual:
        return ConditionX64::NotEqual;
    case IrCondition::Less:
        return ConditionX64::Less;
    case IrCondition::NotLess:
        return ConditionX64::NotLess;
    case IrCondition::LessEqual:
        return ConditionX64::LessEqual;
    case IrCondition::NotLessEqual:
        return ConditionX64::NotLessEqual;
    case IrCondition::Greater:
        return ConditionX64::Greater;
    case IrCondition::NotGreater:
        return ConditionX64::NotGreater;
    case IrCondition::GreaterEqual:
        return ConditionX64::GreaterEqual;
    case IrCondition::NotGreaterEqual:
        return ConditionX64::NotGreaterEqual;
    case IrCondition::UnsignedLess:
        return ConditionX64::Below;
    case IrCondition::UnsignedLessEqual:
        return ConditionX64::BelowEqual;
    case IrCondition::UnsignedGreater:
        return ConditionX64::Above;
    case IrCondition::UnsignedGreaterEqual:
        return ConditionX64::AboveEqual;
    default:
        LUAU_ASSERT(!"Unsupported condition");
        return ConditionX64::Zero;
    }
}

void getTableNodeAtCachedSlot(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 node, RegisterX64 table, int pcpos)
{
    LUAU_ASSERT(tmp != node);
    LUAU_ASSERT(table != node);

    build.mov(node, qword[table + offsetof(Table, node)]);

    // compute cached slot
    build.mov(tmp, sCode);
    build.movzx(dwordReg(tmp), byte[tmp + pcpos * sizeof(Instruction) + kOffsetOfInstructionC]);
    build.and_(byteReg(tmp), byte[table + offsetof(Table, nodemask8)]);

    // LuaNode* n = &h->node[slot];
    build.shl(dwordReg(tmp), kLuaNodeSizeLog2);
    build.add(node, tmp);
}

void convertNumberToIndexOrJump(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 numd, RegisterX64 numi, Label& label)
{
    LUAU_ASSERT(numi.size == SizeX64::dword);

    // Convert to integer, NaN is converted into 0x80000000
    build.vcvttsd2si(numi, numd);

    // Convert that integer back to double
    build.vcvtsi2sd(tmp, numd, numi);

    build.vucomisd(tmp, numd); // Sets ZF=1 if equal or NaN
    // We don't need non-integer values
    // But to skip the PF=1 check, we proceed with NaN because 0x80000000 index is out of bounds
    build.jcc(ConditionX64::NotZero, label);
}

void callArithHelper(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, OperandX64 b, OperandX64 c, TMS tm)
{
    IrCallWrapperX64 callWrap(regs, build);
    callWrap.addArgument(SizeX64::qword, rState);
    callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
    callWrap.addArgument(SizeX64::qword, b);
    callWrap.addArgument(SizeX64::qword, c);
    callWrap.addArgument(SizeX64::dword, tm);
    callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_doarith)]);

    emitUpdateBase(build);
}

void callLengthHelper(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, int rb)
{
    IrCallWrapperX64 callWrap(regs, build);
    callWrap.addArgument(SizeX64::qword, rState);
    callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
    callWrap.addArgument(SizeX64::qword, luauRegAddress(rb));
    callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_dolen)]);

    emitUpdateBase(build);
}

void callGetTable(IrRegAllocX64& regs, AssemblyBuilderX64& build, int rb, OperandX64 c, int ra)
{
    IrCallWrapperX64 callWrap(regs, build);
    callWrap.addArgument(SizeX64::qword, rState);
    callWrap.addArgument(SizeX64::qword, luauRegAddress(rb));
    callWrap.addArgument(SizeX64::qword, c);
    callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
    callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_gettable)]);

    emitUpdateBase(build);
}

void callSetTable(IrRegAllocX64& regs, AssemblyBuilderX64& build, int rb, OperandX64 c, int ra)
{
    IrCallWrapperX64 callWrap(regs, build);
    callWrap.addArgument(SizeX64::qword, rState);
    callWrap.addArgument(SizeX64::qword, luauRegAddress(rb));
    callWrap.addArgument(SizeX64::qword, c);
    callWrap.addArgument(SizeX64::qword, luauRegAddress(ra));
    callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_settable)]);

    emitUpdateBase(build);
}

void checkObjectBarrierConditions(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 object, IrOp ra, int ratag, Label& skip)
{
    // The barrier should have been optimized away if we know the value is not collectable; this check is kept for correctness
    if (ratag == -1 || !isGCO(ratag))
    {
        // iscollectable(ra)
        OperandX64 tag = (ra.kind == IrOpKind::VmReg) ? luauRegTag(vmRegOp(ra)) : luauConstantTag(vmConstOp(ra));
        build.cmp(tag, LUA_TSTRING);
        build.jcc(ConditionX64::Less, skip);
    }

    // isblack(obj2gco(o))
    build.test(byte[object + offsetof(GCheader, marked)], bitmask(BLACKBIT));
    build.jcc(ConditionX64::Zero, skip);

    // iswhite(gcvalue(ra))
    OperandX64 value = (ra.kind == IrOpKind::VmReg) ? luauRegValue(vmRegOp(ra)) : luauConstantValue(vmConstOp(ra));
    build.mov(tmp, value);
    build.test(byte[tmp + offsetof(GCheader, marked)], bit2mask(WHITE0BIT, WHITE1BIT));
    build.jcc(ConditionX64::Zero, skip);
}

void callBarrierObject(IrRegAllocX64& regs, AssemblyBuilderX64& build, RegisterX64 object, IrOp objectOp, IrOp ra, int ratag)
{
    Label skip;

    ScopedRegX64 tmp{regs, SizeX64::qword};
    checkObjectBarrierConditions(build, tmp.reg, object, ra, ratag, skip);

    {
        ScopedSpills spillGuard(regs);

        IrCallWrapperX64 callWrap(regs, build);
        callWrap.addArgument(SizeX64::qword, rState);
        callWrap.addArgument(SizeX64::qword, object, objectOp);
        callWrap.addArgument(SizeX64::qword, tmp);
        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaC_barrierf)]);
    }

    build.setLabel(skip);
}

void callBarrierTableFast(IrRegAllocX64& regs, AssemblyBuilderX64& build, RegisterX64 table, IrOp tableOp)
{
    Label skip;

    // isblack(obj2gco(t))
    build.test(byte[table + offsetof(GCheader, marked)], bitmask(BLACKBIT));
    build.jcc(ConditionX64::Zero, skip);

    {
        ScopedSpills spillGuard(regs);

        IrCallWrapperX64 callWrap(regs, build);
        callWrap.addArgument(SizeX64::qword, rState);
        callWrap.addArgument(SizeX64::qword, table, tableOp);
        callWrap.addArgument(SizeX64::qword, addr[table + offsetof(Table, gclist)]);
        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaC_barrierback)]);
    }

    build.setLabel(skip);
}

void callStepGc(IrRegAllocX64& regs, AssemblyBuilderX64& build)
{
    Label skip;

    {
        ScopedRegX64 tmp1{regs, SizeX64::qword};
        ScopedRegX64 tmp2{regs, SizeX64::qword};

        build.mov(tmp1.reg, qword[rState + offsetof(lua_State, global)]);
        build.mov(tmp2.reg, qword[tmp1.reg + offsetof(global_State, totalbytes)]);
        build.cmp(tmp2.reg, qword[tmp1.reg + offsetof(global_State, GCthreshold)]);
        build.jcc(ConditionX64::Below, skip);
    }

    {
        ScopedSpills spillGuard(regs);

        IrCallWrapperX64 callWrap(regs, build);
        callWrap.addArgument(SizeX64::qword, rState);
        callWrap.addArgument(SizeX64::dword, 1);
        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaC_step)]);
        emitUpdateBase(build);
    }

    build.setLabel(skip);
}

void emitClearNativeFlag(AssemblyBuilderX64& build)
{
    build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
    build.and_(dword[rax + offsetof(CallInfo, flags)], ~LUA_CALLINFO_NATIVE);
}

void emitExit(AssemblyBuilderX64& build, bool continueInVm)
{
    if (continueInVm)
        build.mov(eax, 1);
    else
        build.xor_(eax, eax);

    build.jmp(qword[rNativeContext + offsetof(NativeContext, gateExit)]);
}

void emitUpdateBase(AssemblyBuilderX64& build)
{
    build.mov(rBase, qword[rState + offsetof(lua_State, base)]);
}

void emitInterrupt(AssemblyBuilderX64& build)
{
    // rax = pcpos + 1
    // rbx = return address in native code

    // note: rbx is non-volatile so it will be saved across the interrupt call automatically

    RegisterX64 rArg1 = (build.abi == ABIX64::Windows) ? rcx : rdi;
    RegisterX64 rArg2 = (build.abi == ABIX64::Windows) ? rdx : rsi;

    Label skip;

    // Update L->ci->savedpc; required in case the interrupt handler errors
    build.mov(rcx, sCode);
    build.lea(rcx, addr[rcx + rax * sizeof(Instruction)]);
    build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
    build.mov(qword[rax + offsetof(CallInfo, savedpc)], rcx);

    // Load interrupt handler; it may be nullptr in case the update raced with the check before we got here
    build.mov(rax, qword[rState + offsetof(lua_State, global)]);
    build.mov(rax, qword[rax + offsetof(global_State, cb.interrupt)]);
    build.test(rax, rax);
    build.jcc(ConditionX64::Zero, skip);

    // Call interrupt
    build.mov(rArg1, rState);
    build.mov(dwordReg(rArg2), -1);
    build.call(rax);

    // Check if we need to exit
    build.mov(al, byte[rState + offsetof(lua_State, status)]);
    build.test(al, al);
    build.jcc(ConditionX64::Zero, skip);

    build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
    build.sub(qword[rax + offsetof(CallInfo, savedpc)], sizeof(Instruction));
    emitExit(build, /* continueInVm */ false);

    build.setLabel(skip);

    emitUpdateBase(build); // interrupt may have reallocated stack

    build.jmp(rbx);
}

void emitFallback(IrRegAllocX64& regs, AssemblyBuilderX64& build, int offset, int pcpos)
{
    // fallback(L, instruction, base, k)
    IrCallWrapperX64 callWrap(regs, build);
    callWrap.addArgument(SizeX64::qword, rState);

    RegisterX64 reg = callWrap.suggestNextArgumentRegister(SizeX64::qword);
    build.mov(reg, sCode);
    callWrap.addArgument(SizeX64::qword, addr[reg + pcpos * sizeof(Instruction)]);

    callWrap.addArgument(SizeX64::qword, rBase);
    callWrap.addArgument(SizeX64::qword, rConstants);
    callWrap.call(qword[rNativeContext + offset]);

    emitUpdateBase(build);
}

void emitUpdatePcForExit(AssemblyBuilderX64& build)
{
    // edx = pcpos * sizeof(Instruction)
    build.add(rdx, sCode);
    build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
    build.mov(qword[rax + offsetof(CallInfo, savedpc)], rdx);
}

void emitReturn(AssemblyBuilderX64& build, ModuleHelpers& helpers)
{
    // input: res in rdi, number of written values in ecx
    RegisterX64 res = rdi;
    RegisterX64 written = ecx;

    RegisterX64 ci = r8;
    RegisterX64 cip = r9;
    RegisterX64 nresults = esi;

    build.mov(ci, qword[rState + offsetof(lua_State, ci)]);
    build.lea(cip, addr[ci - sizeof(CallInfo)]);

    // nresults = ci->nresults
    build.mov(nresults, dword[ci + offsetof(CallInfo, nresults)]);

    Label skipResultCopy;

    // Fill the rest of the expected results (nresults - written) with 'nil'
    RegisterX64 counter = written;
    build.sub(counter, nresults); // counter = -(nresults - written)
    build.jcc(ConditionX64::GreaterEqual, skipResultCopy);

    Label repeatNilLoop = build.setLabel();
    build.mov(dword[res + offsetof(TValue, tt)], LUA_TNIL);
    build.add(res, sizeof(TValue));
    build.inc(counter);
    build.jcc(ConditionX64::NotZero, repeatNilLoop);

    build.setLabel(skipResultCopy);

    build.mov(qword[rState + offsetof(lua_State, ci)], cip);     // L->ci = cip
    build.mov(rBase, qword[cip + offsetof(CallInfo, base)]);     // sync base = L->base while we have a chance
    build.mov(qword[rState + offsetof(lua_State, base)], rBase); // L->base = cip->base

    Label skipFixedRetTop;
    build.test(nresults, nresults);                       // test here will set SF=1 for a negative number and it always sets OF to 0
    build.jcc(ConditionX64::Less, skipFixedRetTop);       // jl jumps if SF != OF
    build.mov(res, qword[cip + offsetof(CallInfo, top)]); // res = cip->top
    build.setLabel(skipFixedRetTop);

    build.mov(qword[rState + offsetof(lua_State, top)], res); // L->top = res

    // Unlikely, but this might be the last return from the VM
    build.test(byte[ci + offsetof(CallInfo, flags)], LUA_CALLINFO_RETURN);
    build.jcc(ConditionX64::NotZero, helpers.exitNoContinueVm);

    // Returning to the previous function is a bit tricky
    // Registers alive: r9 (cip)
    RegisterX64 proto = rcx;
    RegisterX64 execdata = rbx;

    // Change closure
    build.mov(rax, qword[cip + offsetof(CallInfo, func)]);
    build.mov(rax, qword[rax + offsetof(TValue, value.gc)]);
    build.mov(sClosure, rax);

    build.mov(proto, qword[rax + offsetof(Closure, l.p)]);

    build.mov(execdata, qword[proto + offsetof(Proto, execdata)]);

    build.test(byte[cip + offsetof(CallInfo, flags)], LUA_CALLINFO_NATIVE);
    build.jcc(ConditionX64::Zero, helpers.exitContinueVm); // Continue in the interpreter if the function has no native data

    // Change constants
    build.mov(rConstants, qword[proto + offsetof(Proto, k)]);

    // Change code
    build.mov(rdx, qword[proto + offsetof(Proto, code)]);
    build.mov(sCode, rdx);

    build.mov(rax, qword[cip + offsetof(CallInfo, savedpc)]);

    // To get the instruction index from the instruction pointer, we need to divide the byte offset by 4
    // But we will actually need to scale the instruction index by 4 back to a byte offset later, so it cancels out
    build.sub(rax, rdx);

    // Get new instruction location and jump to it
    build.mov(edx, dword[execdata + rax]);
    build.add(rdx, qword[proto + offsetof(Proto, exectarget)]);
    build.jmp(rdx);
}

} // namespace X64
} // namespace CodeGen
} // namespace Luau