bench: Implement first class support for callgrind

Since callgrind allows controlling stats collection from the guest, we
can reset the collection right before the benchmark starts.

This change exposes this to the benchmark runner and integrates
callgrind data parsing into bench.py, so that we can run bench.py with
--callgrind argument and, as long as the runner was built with callgrind
support, we get instruction counts from the run.

We convert instruction counts to seconds using a rate of 10G
instructions/second; there's no correct way to do this without
simulating the full CPU pipeline, but it results in time units on a
similar scale to real runs.
This commit is contained in:
Arseny Kapoulkine 2022-06-30 19:48:54 -07:00
parent 6467c855e8
commit 755f447859
4 changed files with 79 additions and 0 deletions

View file

@ -21,6 +21,10 @@
#include <fcntl.h>
#endif
#ifdef CALLGRIND
#include <valgrind/callgrind.h>
#endif
#include <locale.h>
LUAU_FASTFLAG(DebugLuauTimeTracing)
@ -166,6 +170,36 @@ static int lua_collectgarbage(lua_State* L)
luaL_error(L, "collectgarbage must be called with 'count' or 'collect'");
}
#ifdef CALLGRIND
// Lua binding that exposes Valgrind/Callgrind control to benchmark scripts.
// Usage from Lua:
//   callgrind("running")      -> boolean, true when executing under Valgrind
//   callgrind("zero")         -> resets the collected stats (no result)
//   callgrind("dump", name)   -> dumps stats tagged with the given name (no result)
// Raises a Lua error for any other option string.
static int lua_callgrind(lua_State* L)
{
    const char* option = luaL_checkstring(L, 1);

    if (strcmp(option, "running") == 0)
    {
        // RUNNING_ON_VALGRIND evaluates to nonzero when running under Valgrind
        lua_pushboolean(L, RUNNING_ON_VALGRIND);
        return 1;
    }
    else if (strcmp(option, "zero") == 0)
    {
        CALLGRIND_ZERO_STATS;
        return 0;
    }
    else if (strcmp(option, "dump") == 0)
    {
        const char* name = luaL_checkstring(L, 2);
        CALLGRIND_DUMP_STATS_AT(name);
        return 0;
    }

    luaL_error(L, "callgrind must be called with one of 'running', 'zero', 'dump'");
}
#endif
void setupState(lua_State* L)
{
luaL_openlibs(L);
@ -174,6 +208,9 @@ void setupState(lua_State* L)
{"loadstring", lua_loadstring},
{"require", lua_require},
{"collectgarbage", lua_collectgarbage},
#ifdef CALLGRIND
{"callgrind", lua_callgrind},
#endif
{NULL, NULL},
};

View file

@ -93,6 +93,10 @@ ifeq ($(config),fuzz)
LDFLAGS+=-fsanitize=address,fuzzer
endif
# Opt-in callgrind support: build with `make CALLGRIND=1` to compile in the
# Valgrind client-request hooks (guarded by #ifdef CALLGRIND in the sources).
ifneq ($(CALLGRIND),)
CXXFLAGS+=-DCALLGRIND=$(CALLGRIND)
endif
# target-specific flags
$(AST_OBJECTS): CXXFLAGS+=-std=c++17 -ICommon/include -IAst/include
$(COMPILER_OBJECTS): CXXFLAGS+=-std=c++17 -ICompiler/include -ICommon/include -IAst/include

View file

@ -40,6 +40,7 @@ argumentParser.add_argument('--results', dest='results',type=str,nargs='*',help=
argumentParser.add_argument('--run-test', action='store', default=None, help='Regex test filter')
argumentParser.add_argument('--extra-loops', action='store',type=int,default=0, help='Amount of times to loop over one test (one test already performs multiple runs)')
argumentParser.add_argument('--filename', action='store',type=str,default='bench', help='File name for graph and results file')
argumentParser.add_argument('--callgrind', dest='callgrind',action='store_const',const=1,default=0,help='Use callgrind to run benchmarks')
if matplotlib != None:
argumentParser.add_argument('--absolute', dest='absolute',action='store_const',const=1,default=0,help='Display absolute values instead of relative (enabled by default when benchmarking a single VM)')
@ -55,6 +56,9 @@ argumentParser.add_argument('--no-print-influx-debugging', action='store_false',
argumentParser.add_argument('--no-print-final-summary', action='store_false', dest='print_final_summary', help="Don't print a table summarizing the results after all tests are run")
# Assume 2.5 IPC on a 4 GHz CPU; this is obviously incorrect but it allows us to display simulated instruction counts using regular time units
CALLGRIND_INSN_PER_SEC = 2.5 * 4e9
def arrayRange(count):
result = []
@ -71,6 +75,21 @@ def arrayRangeOffset(count, offset):
return result
def getCallgrindOutput(lines, insnPerSec=None):
    """Parse a combined callgrind.out dump into the runner's result format.

    Each dump produced via CALLGRIND_DUMP_STATS_AT is preceded by a
    'desc: Trigger: Client Request: <name>' line and followed by a
    'summary: <instruction count>' line. Instruction counts are converted
    to simulated milliseconds at insnPerSec instructions per second
    (defaults to the module-level CALLGRIND_INSN_PER_SEC constant).

    Returns a single string of '|><|name|><|ms||_||' records, matching the
    format the regular benchmark runs emit on stdout.
    """
    if insnPerSec is None:
        insnPerSec = CALLGRIND_INSN_PER_SEC

    result = []
    name = None

    for l in lines:
        if l.startswith("desc: Trigger: Client Request: "):
            # the dump name follows the fixed 31-character prefix
            name = l[31:].strip()
        elif l.startswith("summary: ") and name is not None:
            insn = int(l[9:])
            # Note: we only run each bench once under callgrind so we only report a single
            # time per run; callgrind instruction count variance is ~0.01% so it might as
            # well be zero
            result.append("|><|" + name + "|><|" + str(insn / insnPerSec * 1000.0) + "||_||")
            name = None

    return "".join(result)
def getVmOutput(cmd):
if os.name == "nt":
try:
@ -79,6 +98,15 @@ def getVmOutput(cmd):
exit(1)
except:
return ""
elif arguments.callgrind:
try:
subprocess.check_call("valgrind --tool=callgrind --callgrind-out-file=callgrind.out --combine-dumps=yes --dump-line=no " + cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=scriptdir)
file = open(os.path.join(scriptdir, "callgrind.out"), "r")
lines = file.readlines()
return getCallgrindOutput(lines)
except e:
print(e)
return ""
else:
with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, cwd=scriptdir) as p:
# Try to lock to a single processor

View file

@ -5,6 +5,16 @@ bench.runs = 20
bench.extraRuns = 4
function bench.runCode(f, description)
-- Under Callgrind, run the test only once and measure just the execution cost
if callgrind and callgrind("running") then
if collectgarbage then collectgarbage() end
callgrind("zero")
f() -- unfortunately we can't easily separate setup cost from runtime cost in f unless it calls callgrind()
callgrind("dump", description)
return
end
local timeTable = {}
for i = 1,bench.runs + bench.extraRuns do