bench: Implement first class support for callgrind

Since callgrind allows controlling stats collection from the guest, we
can reset the collection right before the benchmark starts.

This change exposes this to the benchmark runner and integrates
callgrind data parsing into bench.py, so that we can run bench.py with
--callgrind argument and, as long as the runner was built with callgrind
support, we get instruction counts from the run.

We convert instruction counts to seconds using a rate of 10G
instructions/second; there's no correct way to do this without
simulating the full CPU pipeline, but it results in time units on a
similar scale to real runs.
This commit is contained in:
Arseny Kapoulkine 2022-06-30 19:48:54 -07:00
parent 6467c855e8
commit 755f447859
4 changed files with 79 additions and 0 deletions

View file

@ -21,6 +21,10 @@
#include <fcntl.h>
#endif
#ifdef CALLGRIND
#include <valgrind/callgrind.h>
#endif
#include <locale.h>
LUAU_FASTFLAG(DebugLuauTimeTracing)
@ -166,6 +170,36 @@ static int lua_collectgarbage(lua_State* L)
luaL_error(L, "collectgarbage must be called with 'count' or 'collect'");
}
#ifdef CALLGRIND
// Lua binding that exposes Valgrind/Callgrind control to benchmark scripts.
// Usage from Lua:
//   callgrind("running")      -> boolean, true when executing under Valgrind
//   callgrind("zero")         -> resets the collected stats (no result)
//   callgrind("dump", name)   -> dumps stats tagged with the given name (no result)
// Raises a Lua error for any other option string.
static int lua_callgrind(lua_State* L)
{
    const char* option = luaL_checkstring(L, 1);

    if (strcmp(option, "running") == 0)
    {
        // RUNNING_ON_VALGRIND evaluates to nonzero when running under Valgrind
        lua_pushboolean(L, RUNNING_ON_VALGRIND);
        return 1;
    }
    else if (strcmp(option, "zero") == 0)
    {
        CALLGRIND_ZERO_STATS;
        return 0;
    }
    else if (strcmp(option, "dump") == 0)
    {
        const char* name = luaL_checkstring(L, 2);
        CALLGRIND_DUMP_STATS_AT(name);
        return 0;
    }

    luaL_error(L, "callgrind must be called with one of 'running', 'zero', 'dump'");
}
#endif
void setupState(lua_State* L)
{
luaL_openlibs(L);
@ -174,6 +208,9 @@ void setupState(lua_State* L)
{"loadstring", lua_loadstring},
{"require", lua_require},
{"collectgarbage", lua_collectgarbage},
#ifdef CALLGRIND
{"callgrind", lua_callgrind},
#endif
{NULL, NULL},
};

View file

@ -93,6 +93,10 @@ ifeq ($(config),fuzz)
LDFLAGS+=-fsanitize=address,fuzzer
endif
# Opt-in callgrind support: build with `make CALLGRIND=1` to compile in the
# Valgrind client-request hooks (guarded by #ifdef CALLGRIND in the sources).
ifneq ($(CALLGRIND),)
CXXFLAGS+=-DCALLGRIND=$(CALLGRIND)
endif
# target-specific flags
$(AST_OBJECTS): CXXFLAGS+=-std=c++17 -ICommon/include -IAst/include
$(COMPILER_OBJECTS): CXXFLAGS+=-std=c++17 -ICompiler/include -ICommon/include -IAst/include

View file

@ -40,6 +40,7 @@ argumentParser.add_argument('--results', dest='results',type=str,nargs='*',help=
argumentParser.add_argument('--run-test', action='store', default=None, help='Regex test filter')
argumentParser.add_argument('--extra-loops', action='store',type=int,default=0, help='Amount of times to loop over one test (one test already performs multiple runs)')
argumentParser.add_argument('--filename', action='store',type=str,default='bench', help='File name for graph and results file')
argumentParser.add_argument('--callgrind', dest='callgrind',action='store_const',const=1,default=0,help='Use callgrind to run benchmarks')
if matplotlib != None:
argumentParser.add_argument('--absolute', dest='absolute',action='store_const',const=1,default=0,help='Display absolute values instead of relative (enabled by default when benchmarking a single VM)')
@ -55,6 +56,9 @@ argumentParser.add_argument('--no-print-influx-debugging', action='store_false',
argumentParser.add_argument('--no-print-final-summary', action='store_false', dest='print_final_summary', help="Don't print a table summarizing the results after all tests are run")
# Assume 2.5 IPC on a 4 GHz CPU; this is obviously incorrect but it allows us to display simulated instruction counts using regular time units
CALLGRIND_INSN_PER_SEC = 2.5 * 4e9
def arrayRange(count):
result = []
@ -71,6 +75,21 @@ def arrayRangeOffset(count, offset):
return result
def getCallgrindOutput(lines, insnPerSec=None):
    """Parse a combined callgrind.out dump into the runner's result format.

    Each dump produced via CALLGRIND_DUMP_STATS_AT is preceded by a
    'desc: Trigger: Client Request: <name>' line and followed by a
    'summary: <instruction count>' line. Instruction counts are converted
    to simulated milliseconds at insnPerSec instructions per second
    (defaults to the module-level CALLGRIND_INSN_PER_SEC constant).

    Returns a single string of '|><|name|><|ms||_||' records, matching the
    format the regular benchmark runs emit on stdout.
    """
    if insnPerSec is None:
        insnPerSec = CALLGRIND_INSN_PER_SEC

    result = []
    name = None

    for l in lines:
        if l.startswith("desc: Trigger: Client Request: "):
            # the dump name follows the fixed 31-character prefix
            name = l[31:].strip()
        elif l.startswith("summary: ") and name is not None:
            insn = int(l[9:])
            # Note: we only run each bench once under callgrind so we only report a single
            # time per run; callgrind instruction count variance is ~0.01% so it might as
            # well be zero
            result.append("|><|" + name + "|><|" + str(insn / insnPerSec * 1000.0) + "||_||")
            name = None

    return "".join(result)
def getVmOutput(cmd):
if os.name == "nt":
try:
@ -79,6 +98,15 @@ def getVmOutput(cmd):
exit(1)
except:
return ""
elif arguments.callgrind:
try:
subprocess.check_call("valgrind --tool=callgrind --callgrind-out-file=callgrind.out --combine-dumps=yes --dump-line=no " + cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=scriptdir)
file = open(os.path.join(scriptdir, "callgrind.out"), "r")
lines = file.readlines()
return getCallgrindOutput(lines)
except e:
print(e)
return ""
else:
with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, cwd=scriptdir) as p:
# Try to lock to a single processor

View file

@ -5,6 +5,16 @@ bench.runs = 20
bench.extraRuns = 4
function bench.runCode(f, description)
-- Under Callgrind, run the test only once and measure just the execution cost
if callgrind and callgrind("running") then
if collectgarbage then collectgarbage() end
callgrind("zero")
f() -- unfortunately we can't easily separate setup cost from runtime cost in f unless it calls callgrind()
callgrind("dump", description)
return
end
local timeTable = {}
for i = 1,bench.runs + bench.extraRuns do