From 755f447859d859dd5f393e98368bfb943d4f8a56 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Thu, 30 Jun 2022 19:48:54 -0700
Subject: [PATCH] bench: Implement first class support for callgrind

Since callgrind allows stats collection to be controlled from the guest,
we can reset the collection right before the benchmark starts.

This change exposes this to the benchmark runner and integrates
callgrind data parsing into bench.py, so that we can run bench.py with
the --callgrind argument and, as long as the runner was built with
callgrind support, get instruction counts from the run.

We convert instruction counts to seconds using a 10G instructions/second
rate; there's no correct way to do this without simulating the full CPU
pipeline, but it results in time units on a similar scale to real runs.
---
 CLI/Repl.cpp            | 37 +++++++++++++++++++++++++++++++++++++
 Makefile                |  4 ++++
 bench/bench.py          | 28 ++++++++++++++++++++++++++++
 bench/bench_support.lua | 10 ++++++++++
 4 files changed, 79 insertions(+)

diff --git a/CLI/Repl.cpp b/CLI/Repl.cpp
index 83060f5b..5fe12bec 100644
--- a/CLI/Repl.cpp
+++ b/CLI/Repl.cpp
@@ -21,6 +21,10 @@
 #include <fcntl.h>
 #endif
 
+#ifdef CALLGRIND
+#include <valgrind/callgrind.h>
+#endif
+
 #include <locale.h>
 
 LUAU_FASTFLAG(DebugLuauTimeTracing)
@@ -166,6 +170,36 @@ static int lua_collectgarbage(lua_State* L)
     luaL_error(L, "collectgarbage must be called with 'count' or 'collect'");
 }
 
+#ifdef CALLGRIND
+static int lua_callgrind(lua_State* L) // Lua-callable callgrind(option [, name]); dispatches on the option string
+{
+    const char* option = luaL_checkstring(L, 1);
+
+    if (strcmp(option, "running") == 0) // callgrind("running"): returns one boolean derived from RUNNING_ON_VALGRIND
+    {
+        int r = RUNNING_ON_VALGRIND;
+        lua_pushboolean(L, r);
+        return 1;
+    }
+
+    if (strcmp(option, "zero") == 0) // callgrind("zero"): issues CALLGRIND_ZERO_STATS; no values returned to Lua
+    {
+        CALLGRIND_ZERO_STATS;
+        return 0;
+    }
+
+    if (strcmp(option, "dump") == 0) // callgrind("dump", name): argument 2 must be a string; no values returned to Lua
+    {
+        const char* name = luaL_checkstring(L, 2);
+
+        CALLGRIND_DUMP_STATS_AT(name);
+        return 0;
+    }
+
+    luaL_error(L, "callgrind must be called with one of 'running', 'zero', 'dump'"); // reached only for an unrecognized option
+}
+#endif
+
 void setupState(lua_State* L)
 {
     luaL_openlibs(L);
@@ -174,6 +208,9 @@ void setupState(lua_State* L)
         {"loadstring", lua_loadstring},
         {"require", lua_require},
         {"collectgarbage", lua_collectgarbage},
+#ifdef CALLGRIND
+        {"callgrind", lua_callgrind},
+#endif
         {NULL, NULL},
     };
 
diff --git a/Makefile b/Makefile
index 1082666d..b8077897 100644
--- a/Makefile
+++ b/Makefile
@@ -93,6 +93,10 @@ ifeq ($(config),fuzz)
 	LDFLAGS+=-fsanitize=address,fuzzer
 endif
 
+ifneq ($(CALLGRIND),)
+	CXXFLAGS+=-DCALLGRIND=$(CALLGRIND)
+endif
+
 # target-specific flags
 $(AST_OBJECTS): CXXFLAGS+=-std=c++17 -ICommon/include -IAst/include
 $(COMPILER_OBJECTS): CXXFLAGS+=-std=c++17 -ICompiler/include -ICommon/include -IAst/include
diff --git a/bench/bench.py b/bench/bench.py
index 67fc8cf7..b4b1eb1d 100644
--- a/bench/bench.py
+++ b/bench/bench.py
@@ -40,6 +40,7 @@ argumentParser.add_argument('--results', dest='results',type=str,nargs='*',help=
 argumentParser.add_argument('--run-test', action='store', default=None, help='Regex test filter')
 argumentParser.add_argument('--extra-loops', action='store',type=int,default=0, help='Amount of times to loop over one test (one test already performs multiple runs)')
 argumentParser.add_argument('--filename', action='store',type=str,default='bench', help='File name for graph and results file')
+argumentParser.add_argument('--callgrind', dest='callgrind',action='store_const',const=1,default=0,help='Use callgrind to run benchmarks')
 
 if matplotlib != None:
     argumentParser.add_argument('--absolute', dest='absolute',action='store_const',const=1,default=0,help='Display absolute values instead of relative (enabled by default when benchmarking a single VM)')
@@ -55,6 +56,9 @@ argumentParser.add_argument('--no-print-influx-debugging', action='store_false',
 
 argumentParser.add_argument('--no-print-final-summary', action='store_false', dest='print_final_summary', help="Don't print a table summarizing the results after all tests are run")
 
+# Assume 2.5 IPC on a 4 GHz CPU; this is obviously incorrect but it allows us to display simulated instruction counts using regular time units
+CALLGRIND_INSN_PER_SEC = 2.5 * 4e9
+
 def arrayRange(count):
     result = []
 
@@ -71,6 +75,21 @@ def arrayRangeOffset(count, offset):
 
     return result
 
+def getCallgrindOutput(lines):
+    """Return the concatenated "|><|name|><|value||_||" records for every named dump summary found in `lines`."""
+    result = []
+    name = None
+    for l in lines:
+        if l.startswith("desc: Trigger: Client Request: "):
+            name = l[31:].strip()
+        elif l.startswith("summary: ") and name is not None:
+            insn = int(l[9:])
+            # Note: we only run each bench once under callgrind so we only report a single time per run; callgrind instruction count variance is ~0.01% so it might as well be zero
+            result.append("|><|" + name + "|><|" + str(insn / CALLGRIND_INSN_PER_SEC * 1000.0) + "||_||")
+            name = None
+
+    return "".join(result)
+
 def getVmOutput(cmd):
     if os.name == "nt":
         try:
@@ -79,6 +98,15 @@ def getVmOutput(cmd):
             exit(1)
         except:
             return ""
+    elif arguments.callgrind:
+        try:
+            subprocess.check_call("valgrind --tool=callgrind --callgrind-out-file=callgrind.out --combine-dumps=yes --dump-line=no " + cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=scriptdir)
+            # Read the dump file written by the run above; `with` closes the handle even if parsing raises
+            with open(os.path.join(scriptdir, "callgrind.out"), "r") as file:
+                return getCallgrindOutput(file.readlines())
+        except Exception as e:
+            print(e)
+            return ""
     else:
         with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, cwd=scriptdir) as p:
             # Try to lock to a single processor
diff --git a/bench/bench_support.lua b/bench/bench_support.lua
index 171b8da7..a9608ecc 100644
--- a/bench/bench_support.lua
+++ b/bench/bench_support.lua
@@ -5,6 +5,16 @@ bench.runs = 20
 bench.extraRuns = 4
 
 function bench.runCode(f, description)
+    -- Under Callgrind, run the test only once and measure just the execution cost
+    if callgrind and callgrind("running") then
+        if collectgarbage then collectgarbage() end
+
+        callgrind("zero")
+        f() -- unfortunately we can't easily separate setup cost from runtime cost in f unless it calls callgrind()
+        callgrind("dump", description)
+        return
+    end
+
     local timeTable = {}
 
     for i = 1,bench.runs + bench.extraRuns do