diff --git a/CLI/Repl.cpp b/CLI/Repl.cpp
index 83060f5b..5fe12bec 100644
--- a/CLI/Repl.cpp
+++ b/CLI/Repl.cpp
@@ -21,6 +21,10 @@
 #include <fcntl.h>
 #endif
 
+#ifdef CALLGRIND
+#include <valgrind/callgrind.h>
+#endif
+
 #include <locale.h>
 
 LUAU_FASTFLAG(DebugLuauTimeTracing)
@@ -166,6 +170,36 @@ static int lua_collectgarbage(lua_State* L)
     luaL_error(L, "collectgarbage must be called with 'count' or 'collect'");
 }
 
+#ifdef CALLGRIND
+static int lua_callgrind(lua_State* L)
+{
+    // Script-facing access to Callgrind client requests; the first argument selects the request
+    const char* command = luaL_checkstring(L, 1);
+
+    if (!strcmp(command, "running"))
+    {
+        const int underValgrind = RUNNING_ON_VALGRIND; // non-zero when executing under Valgrind
+        lua_pushboolean(L, underValgrind);
+        return 1;
+    }
+    else if (!strcmp(command, "zero"))
+    {
+        // Issue the "zero stats" client request; nothing is returned to the script
+        CALLGRIND_ZERO_STATS;
+        return 0;
+    }
+    else if (!strcmp(command, "dump"))
+    {
+        // The second argument is required and is passed to the dump request as its name
+        const char* label = luaL_checkstring(L, 2);
+        CALLGRIND_DUMP_STATS_AT(label);
+        return 0;
+    }
+
+    luaL_error(L, "callgrind must be called with one of 'running', 'zero', 'dump'");
+}
+#endif
+
 void setupState(lua_State* L)
 {
     luaL_openlibs(L);
@@ -174,6 +208,9 @@ void setupState(lua_State* L)
         {"loadstring", lua_loadstring},
         {"require", lua_require},
         {"collectgarbage", lua_collectgarbage},
+#ifdef CALLGRIND
+        {"callgrind", lua_callgrind},
+#endif
         {NULL, NULL},
     };
 
diff --git a/Makefile b/Makefile
index 1082666d..b8077897 100644
--- a/Makefile
+++ b/Makefile
@@ -93,6 +93,10 @@ ifeq ($(config),fuzz)
 	LDFLAGS+=-fsanitize=address,fuzzer
 endif
 
+ifneq ($(CALLGRIND),) # when CALLGRIND is set to a non-empty value, forward it to the compiler as the CALLGRIND preprocessor define
+	CXXFLAGS+=-DCALLGRIND=$(CALLGRIND)
+endif
+
 # target-specific flags
 $(AST_OBJECTS): CXXFLAGS+=-std=c++17 -ICommon/include -IAst/include
 $(COMPILER_OBJECTS): CXXFLAGS+=-std=c++17 -ICompiler/include -ICommon/include -IAst/include
diff --git a/bench/bench.py b/bench/bench.py
index 67fc8cf7..b4b1eb1d 100644
--- a/bench/bench.py
+++ b/bench/bench.py
@@ -40,6 +40,7 @@ argumentParser.add_argument('--results', dest='results',type=str,nargs='*',help=
 argumentParser.add_argument('--run-test', action='store', default=None, help='Regex test filter')
 argumentParser.add_argument('--extra-loops', action='store',type=int,default=0, help='Amount of times to loop over one test (one test already performs multiple runs)')
 argumentParser.add_argument('--filename', action='store',type=str,default='bench', help='File name for graph and results file')
+argumentParser.add_argument('--callgrind', dest='callgrind',action='store_const',const=1,default=0,help='Use callgrind to run benchmarks')
 
 if matplotlib != None:
     argumentParser.add_argument('--absolute', dest='absolute',action='store_const',const=1,default=0,help='Display absolute values instead of relative (enabled by default when benchmarking a single VM)')
@@ -55,6 +56,9 @@ argumentParser.add_argument('--no-print-influx-debugging', action='store_false',
 
 argumentParser.add_argument('--no-print-final-summary', action='store_false', dest='print_final_summary', help="Don't print a table summarizing the results after all tests are run")
 
+# Assume 2.5 IPC on a 4 GHz CPU; this is obviously incorrect but it allows us to display simulated instruction counts using regular time units
+CALLGRIND_INSN_PER_SEC = 2.5 * 4e9
+
 def arrayRange(count):
     result = []
 
@@ -71,6 +75,21 @@ def arrayRangeOffset(count, offset):
 
     return result
 
+def getCallgrindOutput(lines):
+    result = []
+    name = None
+
+    for l in lines:
+        if l.startswith("desc: Trigger: Client Request: "):
+            name = l[31:].strip()
+        elif l.startswith("summary: ") and name != None:
+            insn = int(l[9:])
+            # Note: we only run each bench once under callgrind so we only report a single time per run; callgrind instruction count variance is ~0.01% so it might as well be zero
+            result += "|><|" + name + "|><|" + str(insn / CALLGRIND_INSN_PER_SEC * 1000.0) + "||_||"
+            name = None
+
+    return "".join(result)
+
 def getVmOutput(cmd):
     if os.name == "nt":
         try:
@@ -79,6 +98,15 @@ def getVmOutput(cmd):
             exit(1)
         except:
             return ""
+    elif arguments.callgrind:
+        try:
+            subprocess.check_call("valgrind --tool=callgrind --callgrind-out-file=callgrind.out --combine-dumps=yes --dump-line=no " + cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=scriptdir)
+            file = open(os.path.join(scriptdir, "callgrind.out"), "r")
+            lines = file.readlines()
+            return getCallgrindOutput(lines)
+        except e:
+            print(e)
+            return ""
     else:
         with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, cwd=scriptdir) as p:
             # Try to lock to a single processor
diff --git a/bench/bench_support.lua b/bench/bench_support.lua
index 171b8da7..a9608ecc 100644
--- a/bench/bench_support.lua
+++ b/bench/bench_support.lua
@@ -5,6 +5,16 @@ bench.runs = 20
 bench.extraRuns = 4
 
 function bench.runCode(f, description)
+    -- Under Callgrind (the callgrind global may be absent, hence the nil check), run the test only once and measure just the execution cost
+    if callgrind and callgrind("running") then
+        if collectgarbage then collectgarbage() end
+        -- zero the stats right before the call so the dump below, tagged with description, covers only f()
+        callgrind("zero")
+        f() -- unfortunately we can't easily separate setup cost from runtime cost in f unless it calls callgrind()
+        callgrind("dump", description)
+        return
+    end
+
     local timeTable = {}
 
     for i = 1,bench.runs + bench.extraRuns do