mirror of
https://github.com/luau-lang/luau.git
synced 2025-01-25 12:08:11 +00:00
459 lines
21 KiB
Python
459 lines
21 KiB
Python
|
#!/usr/bin/python3
|
||
|
# This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
|
||
|
|
||
|
import argparse
|
||
|
import asyncio
|
||
|
import copy
|
||
|
import json
|
||
|
import math
|
||
|
import os
|
||
|
import platform
|
||
|
import re
|
||
|
import subprocess
|
||
|
import sys
|
||
|
import textwrap
|
||
|
from enum import Enum
|
||
|
|
||
|
def add_parser(subparsers):
    """Register the `flag-bisect` subcommand on an argparse subparsers object.

    The new subparser receives every flag-bisect argument and dispatches to
    `flag_bisect_main` through its `func` default. The subparser is returned.
    """
    parser_settings = {
        'help': help(),
        'description': help(),
        'epilog': epilog(),
        # The epilog is pre-wrapped for the terminal, so argparse must not re-flow it.
        'formatter_class': argparse.RawDescriptionHelpFormatter,
    }
    command = subparsers.add_parser('flag-bisect', **parser_settings)

    add_argument_parsers(command)
    command.set_defaults(func=flag_bisect_main)
    return command
|
||
|
|
||
|
def help():
    """Return the one-line summary used as the command's help and description."""
    summary = 'Search for a set of flags triggering the faulty behavior in unit tests'
    return summary
|
||
|
|
||
|
def get_terminal_width():
    """Return the terminal width in columns, or 80 when it cannot be queried."""
    try:
        return os.get_terminal_size().columns
    except (AttributeError, OSError, ValueError):
        # Only swallow the failures the size lookup itself can produce (no
        # terminal attached, unsupported platform). A bare `except:` would also
        # hide KeyboardInterrupt and SystemExit.
        # Return a reasonable default when a terminal is not available
        return 80
|
||
|
def wrap_text(text, width):
    """Re-flow `text` paragraph by paragraph so no line exceeds `width` columns.

    Paragraphs are separated by blank (whitespace-only) lines. The leading
    spaces of a paragraph's first line become the indent of its first wrapped
    line. The leading spaces of its second line, if there is one, become the
    indent of every following wrapped line. Each wrapped paragraph is followed
    by an empty line in the returned string.
    """
    leading_whitespace_re = re.compile('( *)')

    def get_paragraphs_and_indent(string):
        # Yields (paragraph_text, initial_indent, subsequent_indent) per paragraph.
        paragraph_lines = []
        initial_indent = ''
        subsequent_indent = ''
        for line in string.split('\n'):
            if not line.strip():
                # A blank line terminates the paragraph collected so far, if any.
                if paragraph_lines:
                    yield '\n'.join(paragraph_lines), initial_indent, subsequent_indent
                    paragraph_lines = []
                continue

            indent = leading_whitespace_re.match(line).group(1)
            if not paragraph_lines:
                initial_indent = indent
                subsequent_indent = indent
            elif len(paragraph_lines) == 1:
                subsequent_indent = indent
            paragraph_lines.append(line.strip())

        # The text may end without a trailing blank line; previously the last
        # paragraph was silently dropped in that case.
        if paragraph_lines:
            yield '\n'.join(paragraph_lines), initial_indent, subsequent_indent

    wrapped = []
    for paragraph, initial_indent, subsequent_indent in get_paragraphs_and_indent(text):
        wrapped.append(textwrap.fill(
            paragraph,
            width=width,
            initial_indent=initial_indent,
            subsequent_indent=subsequent_indent,
            break_on_hyphens=False,
        ) + '\n\n')
    return ''.join(wrapped)
|
||
|
|
||
|
def wrap_text_for_terminal(text):
    """Wrap `text` to the current terminal width, never narrower than 20 columns."""
    # This margin matches what argparse uses when formatting argument documentation
    right_margin = 2
    min_width = 20

    usable_width = get_terminal_width() - right_margin
    if usable_width < min_width:
        usable_width = min_width
    return wrap_text(text, usable_width)
|
||
|
|
||
|
def epilog():
    """Return the long usage notes shown after the argument list, wrapped for the terminal."""
    # The text is kept at column zero on purpose: the wrapper reads each
    # paragraph's leading spaces to decide how the wrapped lines are indented.
    usage_notes = '''
This tool uses the delta debugging algorithm to minimize the set of flags to the ones that are faulty in your unit tests,
and the usage is trivial. Just provide a path to the unit test and you're done, the tool will do the rest.

There are many use cases with flag-bisect. Included but not limited to:

1: If your test is failing when you omit `--fflags=true` but it works when passing `--fflags=true`, then you can
use this tool to find that set of flag requirements to see which flags are missing that will help to fix it. Ditto
for the opposite too, this tool is generalized for that case.

2: If you happen to run into a problem on production, and you're not sure which flags is the problem and you can easily
create a unit test, you can run flag-bisect on that unit test to rapidly find the set of flags.

3: If you have a flag that causes a performance regression, there's also the `--timeout=N` where `N` is in seconds.

4: If you have tests that are demonstrating flakiness behavior, you can also use `--tries=N` where `N` is the number of
attempts to run the same set of flags before moving on to the new set. This will eventually drill down to the flaky flag(s).
Generally 8 tries should be more than enough, but it depends on the rarity. The more rare it is, the higher the attempts count
needs to be. Note that this comes with a performance cost the higher you go, but certainly still faster than manual search.
This argument will disable parallel mode by default. If this is not desired, explicitly write `--parallel=on`.

5: By default flag-bisect runs in parallel mode which uses a slightly modified version of delta debugging algorithm to support
trying multiple sets of flags concurrently. This means that the number of sets the algorithm will try at once is equal to the
number of concurrent jobs. There is currently no upper bound to that, so heed this warning that your machine may slow down
significantly. In this mode, we display the number of jobs it is running in parallel. Use `--parallel=off` to disable parallel
mode.

Be aware that this introduces some level of *non-determinism*, and it is fundamental due to the interaction with flag dependencies
and the fact one job may finish faster than another job that got ran in the same cycle. However, it generally shouldn't matter
if your test is deterministic and has no implicit flag dependencies in the codebase.

The tool will try to automatically figure out which of `--pass` or `--fail` to use if you omit them or use `--auto` by applying
heuristics. For example, if the tests works using `--fflags=true` and crashes if omitting `--fflags=true`, then it knows
to use `--pass` to give you set of flags that will cause that crash. As usual, vice versa is also true. Since this is a
heuristic, if it gets that guess wrong, you can override with `--pass` or `--fail`.

You can speed this process up by scoping it to as few tests as possible, for example if you're using doctest then you'd
pass `--tc=my_test` as an argument after `--`, so `flag-bisect ./path/to/binary -- --tc=my_test`.
'''
    return wrap_text_for_terminal(usage_notes)
|
||
|
|
||
|
class InterestnessMode(Enum):
    """How the tool decides which test outcome counts as interesting."""

    # No trailing commas here: `AUTO = 0,` makes the member value the tuple
    # `(0,)` instead of the integer 0.
    AUTO = 0  # work out which of PASS or FAIL applies by probing the binary
    FAIL = 1  # selected by --fail
    PASS = 2  # selected by --pass
|
||
|
|
||
|
def add_argument_parsers(parser):
    """Declare every flag-bisect command-line argument on `parser`."""
    parser.add_argument(
        'binary_path',
        help='Path to the unit test binary that will be bisected for a set of flags',
    )

    parser.add_argument(
        '--tries', dest='attempts', type=int, default=1, metavar='N',
        help='If the tests are flaky, flag-bisect will try again with the same set by N amount of times before moving on',
    )

    # The default is the sentinel 'default' (not one of the choices) so that
    # "not specified" can be told apart from an explicit on/off.
    parser.add_argument(
        '--parallel', dest='parallel', choices=['on', 'off'], default='default',
        help='Test multiple sets of flags in parallel, useful when the test takes a while to run.',
    )

    parser.add_argument(
        '--explicit', dest='explicit', action='store_true', default=False,
        help='Explicitly set flags to false',
    )

    parser.add_argument(
        '--filter', dest='filter', default=None,
        help='Regular expression to filter for a subset of flags to test',
    )

    parser.add_argument(
        '--verbose', dest='verbose', action='store_true', default=False,
        help='Show stdout and stderr of the program being run',
    )

    # --auto, --fail, --pass and --timeout are mutually exclusive.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        '--auto', dest='mode', action='store_const', const=InterestnessMode.AUTO,
        default=InterestnessMode.AUTO,
        help='Automatically figure out which one of --pass or --fail should be used',
    )
    explicit_modes = (
        ('--fail', InterestnessMode.FAIL, 'You want this if omitting --fflags=true causes tests to fail'),
        ('--pass', InterestnessMode.PASS, 'You want this if passing --fflags=true causes tests to pass'),
    )
    for switch, mode, description in explicit_modes:
        mode_group.add_argument(switch, dest='mode', action='store_const', const=mode, help=description)
    mode_group.add_argument(
        '--timeout', dest='timeout', type=int, default=0, metavar='SECONDS',
        help='Find the flag(s) causing performance regression if time to run exceeds the timeout in seconds',
    )
|
||
|
|
||
|
class Options:
    """Settings for one bisection run, derived from the parsed arguments."""

    def __init__(self, args, other_args, sense):
        """Build the options from parsed `args`, pass-through `other_args` and `sense`.

        `sense` selects the outcome treated as interesting when timeouts are
        not being searched for: truthy means a passing run, falsy a failing one.
        """
        self.path = args.binary_path
        self.explicit = args.explicit
        self.verbose = args.verbose
        self.sense = sense
        self.attempts = args.attempts

        # A timeout of 0 means timeouts are not what is being searched for.
        self.timeout = args.timeout
        self.interested_in_timeouts = args.timeout != 0

        # Parallel mode is the default only for a single attempt; with several
        # attempts it has to be requested explicitly.
        enabling_values = ('on', 'default') if args.attempts == 1 else ('on',)
        self.parallel = args.parallel in enabling_values

        self.filter = None
        if args.filter:
            self.filter = re.compile(".*" + args.filter + ".*")

        # Useless to have -- here, discard.
        self.other_args = []
        for arg in other_args:
            if arg != '--':
                self.other_args.append(arg)

    def copy_with_sense(self, sense):
        """Return a shallow copy of these options with `sense` replaced."""
        clone = copy.copy(self)
        clone.sense = sense
        return clone
|
||
|
|
||
|
class InterestnessResult(Enum):
    """Outcome of a single run of the test binary."""

    # No trailing commas here: `FAIL = 0,` makes the member value the tuple
    # `(0,)` instead of the integer 0.
    FAIL = 0
    PASS = 1
    TIMED_OUT = 2
|
||
|
|
||
|
class Progress:
    """Single-line progress indicator that is redrawn in place on stdout."""

    def __init__(self, count, n_of_jobs=None):
        self.count = count          # number of flags currently under test
        self.steps = 0              # number of times update() has been called
        self.n_of_jobs = n_of_jobs  # job count shown in the status line, or None to omit it
        self.buffer = None          # text last written by update(), if any

    def show(self):
        """Return the status line for the current state without printing it."""
        # remaining is actually the height of the current search tree.
        height = int(math.log2(self.count))
        flag_word = 'flags' if self.count != 1 else 'flag'
        node_word = 'nodes' if height != 1 else 'node'
        jobs_suffix = '' if self.n_of_jobs is None else f', running {self.n_of_jobs} jobs'
        return f'flag bisection: testing {self.count} {flag_word} (step {self.steps}, {height} {node_word} remain{jobs_suffix})'

    def hide(self):
        """Erase the previously written status line, if there is one."""
        if not self.buffer:
            return
        # Backspace, overwrite with a space, backspace again: once per character.
        eraser = '\b \b' * len(self.buffer)
        sys.stdout.write(eraser)

    def update(self, len, n_of_jobs=None):
        """Record a new flag count and job count, then redraw the status line."""
        self.hide()
        self.count = len
        self.n_of_jobs = n_of_jobs
        self.steps += 1
        line = self.show()
        self.buffer = line
        sys.stdout.write(line)
        sys.stdout.flush()
|
||
|
|
||
|
def list_fflags(options):
    """Ask the test binary for its flag names via `--list-fflags`.

    Returns the list of flag names, narrowed by `options.filter` when one is
    set. Returns None when the binary cannot be run, exits with a non-zero
    status, does not appear to support `--list-fflags`, or no flag remains
    after filtering.
    """
    try:
        out = subprocess.check_output([options.path, '--list-fflags'], encoding='UTF-8')
    except (OSError, subprocess.SubprocessError, ValueError):
        # Launch failure, non-zero exit status or undecodable output. Anything
        # else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

    # It's unlikely that a program we're going to test has no flags.
    # So if the output doesn't start with FFlag, assume it doesn't support --list-fflags and therefore cannot be bisected.
    if not out.startswith(('FFlag', 'DFFlag', 'SFFlag')):
        return None

    # splitlines() keeps the last flag even when the output has no trailing
    # newline; blank lines are not flags and are skipped.
    flag_names = [line.strip() for line in out.splitlines() if line.strip()]

    if options.filter:
        flag_names = [flag for flag in flag_names if options.filter.match(flag) is not None]
    return flag_names if flag_names else None
|
||
|
|
||
|
def mk_flags_argument(options, flags, initial_flags):
    """Build the `--fflags=...` argument that enables exactly `flags`.

    Every flag in `flags` is emitted as `name=true`. When `options.explicit`
    is set, every flag of `initial_flags` that is not in `flags` is appended
    as `name=false` instead of being left at its default value.
    """
    lst = [flag + '=true' for flag in flags]

    # When --explicit is provided, we'd like to find the set of flags from initial_flags that's not in active flags.
    # This is so that we can provide a =false value instead of leaving them out to be the default value.
    if options.explicit:
        # Set membership keeps this linear; scanning the list for every
        # initial flag is quadratic with thousands of flags.
        active = set(flags)
        lst.extend(flag + '=false' for flag in initial_flags if flag not in active)

    return '--fflags=' + ','.join(lst)
|
||
|
|
||
|
def mk_command_line(options, flags_argument):
    """Return the argv list: binary path, pass-through arguments, then the flags argument.

    `flags_argument` is left out when it is None.
    """
    command = [options.path]
    command.extend(options.other_args)
    if flags_argument is None:
        return command
    command.append(flags_argument)
    return command
|
||
|
|
||
|
async def get_interestness(options, flags_argument):
    """Run the test binary once and classify the outcome.

    Returns InterestnessResult.PASS for exit status 0, FAIL for any other
    exit status, and TIMED_OUT when timeouts are of interest and the run
    exceeded `options.timeout` seconds.

    The child is started through asyncio so the event loop keeps running
    while it executes. A blocking `subprocess.run` here would make every
    "parallel" job run one after another and would make task cancellation
    unable to stop a child.
    """
    timeout = options.timeout if options.interested_in_timeouts else None
    cmd = mk_command_line(options, flags_argument)

    # Unless --verbose is given, capture the child's output and discard it.
    sink = None if options.verbose else asyncio.subprocess.PIPE
    process = await asyncio.create_subprocess_exec(*cmd, stdout=sink, stderr=sink)

    try:
        await asyncio.wait_for(process.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        return InterestnessResult.TIMED_OUT
    finally:
        # Reached with a live child on timeout or when this task is cancelled:
        # make sure the child does not outlive the job.
        if process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass  # the child exited on its own in the meantime
            await process.wait()

    return InterestnessResult.PASS if process.returncode == 0 else InterestnessResult.FAIL
|
||
|
|
||
|
async def is_hot(options, flags_argument, pred=any):
    """Run the binary `options.attempts` times and report whether the runs are interesting.

    A run is interesting when it timed out (if timeouts are being searched
    for), otherwise when it passed (`options.sense` truthy) or failed
    (`options.sense` falsy). `pred` combines the per-run booleans; the
    default `any` means a single interesting run is enough.
    """
    runs = [get_interestness(options, flags_argument) for _ in range(options.attempts)]
    outcomes = await asyncio.gather(*runs)

    if options.interested_in_timeouts:
        wanted = InterestnessResult.TIMED_OUT
    elif options.sense:
        wanted = InterestnessResult.PASS
    else:
        wanted = InterestnessResult.FAIL

    return pred([wanted == outcome for outcome in outcomes])
|
||
|
|
||
|
def pairwise_disjoints(flags, granularity):
    """Yield consecutive, non-overlapping slices of `flags`.

    Each slice holds `len(flags) // granularity` items, with a minimum of
    one. When the length is not a multiple of that size, a shorter final
    slice carries the remainder, so more than `granularity` slices may be
    produced. `granularity` must be a positive integer.
    """
    # Never let the slice length reach zero: with granularity > len(flags) the
    # offset would stop advancing and empty slices would be yielded forever.
    per_slice_len = max(1, len(flags) // granularity)
    for offset in range(0, len(flags), per_slice_len):
        yield flags[offset:offset + per_slice_len]
|
||
|
|
||
|
def subsets_and_complements(flags, granularity):
    """Yield `(subset, complement)` pairs for each disjoint slice of `flags`.

    `complement` holds, in their original order, the flags that are not in
    `subset`.
    """
    for chunk in pairwise_disjoints(flags, granularity):
        remainder = []
        for flag in flags:
            if flag not in chunk:
                remainder.append(flag)
        yield chunk, remainder
|
||
|
|
||
|
# https://www.cs.purdue.edu/homes/xyzhang/fall07/Papers/delta-debugging.pdf
|
||
|
async def ddmin(options, initial_flags):
    """Sequential delta debugging: shrink `initial_flags` to a smaller hot set.

    Each round splits the current set into slices. The first complement or
    slice that still triggers the interesting outcome becomes the new current
    set. If nothing in a round is hot, the split is made finer, and the search
    stops once every slice is a single flag. Returns the final flag list.
    """
    current = initial_flags
    granularity = 2

    progress = Progress(len(current))
    progress.update(len(current))

    while len(current) >= 2:
        for subset, complement in subsets_and_complements(current, granularity):
            progress.update(len(current))

            # The complement is tried before the subset itself.
            if await is_hot(options, mk_flags_argument(options, complement, initial_flags)):
                current, granularity = complement, max(granularity - 1, 2)
                break

            if await is_hot(options, mk_flags_argument(options, subset, initial_flags)):
                current, granularity = subset, 2
                break
        else:
            # Nothing was hot in this round: split finer, or give up once the
            # split is already one flag per slice.
            if granularity == len(current):
                break
            granularity = min(granularity * 2, len(current))

    progress.hide()
    return current
|
||
|
|
||
|
async def ddmin_parallel(options, initial_flags):
    """Delta debugging that schedules every slice and complement of a round as a task.

    The first task of a round to report a hot flag set wins: its set becomes
    the current one and every other task of the round is cancelled. If no task
    is hot, the split is made finer, and the search stops once every slice is
    a single flag. Returns the final flag list.
    """
    current = initial_flags
    granularity = 2

    progress = Progress(len(current))
    progress.update(len(current), granularity)

    async def probe(flags, next_granularity):
        # Carries the candidate set and its follow-up granularity along with the verdict.
        verdict = await is_hot(options, mk_flags_argument(options, flags, initial_flags))
        return (verdict, flags, next_granularity)

    while len(current) >= 2:
        changed = False
        subset_jobs = []
        complement_jobs = []

        def advance(task):
            nonlocal current, granularity, changed
            # task.cancel() calls the callback passed to add_done_callback...
            if task.cancelled():
                return
            hot, winner, next_granularity = task.result()
            if changed or not hot:
                return
            current, granularity, changed = winner, next_granularity, True
            for job in subset_jobs + complement_jobs:
                job.cancel()

        for subset, complement in subsets_and_complements(current, granularity):
            # Per pair, the subset task is created before its complement task.
            candidates = (
                (subset, 2, subset_jobs),
                (complement, max(granularity - 1, 2), complement_jobs),
            )
            for flags, next_granularity, bucket in candidates:
                job = asyncio.create_task(probe(flags, next_granularity))
                job.add_done_callback(advance)
                bucket.append(job)

        # When we cancel jobs, the asyncio.gather will be waiting pointlessly.
        # In that case, we'd like to return the control to this routine.
        await asyncio.gather(*subset_jobs, return_exceptions=True)
        if not changed:
            await asyncio.gather(*complement_jobs, return_exceptions=True)
        progress.update(len(current), granularity)

        if changed:
            continue
        if granularity == len(current):
            break
        granularity = min(granularity * 2, len(current))

    progress.hide()
    return current
|
||
|
|
||
|
def search(options, initial_flags):
    """Return the (not yet awaited) bisection coroutine matching `options.parallel`."""
    strategy = ddmin_parallel if options.parallel else ddmin
    return strategy(options, initial_flags)
|
||
|
|
||
|
async def do_work(args, other_args):
    """Run a complete flag bisection and return the process exit code.

    Returns 0 when a set of flags was found and 1 otherwise: the binary
    cannot be bisected, the command line would be too long on Windows, the
    tests appear to work already, or nothing was found.
    """
    # If --timeout isn't used, try to apply a heuristic to figure out which of --pass or --fail we want.
    use_heuristic = args.timeout == 0 and args.mode == InterestnessMode.AUTO

    if not use_heuristic:
        sense = args.mode == InterestnessMode.PASS
    else:
        probe_options = Options(args, other_args, None)

        # We aren't interested in timeout for this heuristic. It just makes no sense to assume timeouts.
        # This actually cannot happen by this point, but if we make timeout a non-exclusive switch to --auto, this will go wrong.
        probe_options.timeout = 0
        probe_options.interested_in_timeouts = False

        without_flags_argument = '--fflags=false' if probe_options.explicit else None
        probes = asyncio.gather(
            is_hot(probe_options.copy_with_sense(True), '--fflags=true', all),
            is_hot(probe_options.copy_with_sense(False), without_flags_argument, all),
        )

        # If it times out, we can print a message saying that this is still working. We intentionally want to continue doing work.
        finished, _unfinished = await asyncio.wait([probes], timeout=1.5)
        if probes not in finished:
            print("Hang on! I'm running your program to try and figure out which of --pass or --fail to use!")
            print('Need to find out faster? Cancel the work and explicitly write --pass or --fail')

        is_pass_hot, is_fail_hot = await probes

        # This is a bit counter-intuitive, but the following table tells us which of the sense we want.
        # Because when you omit --fflags=true argument and it fails, then is_fail_hot is True.
        # Consequently, you need to use --pass to find out what that set of flags is. And vice versa.
        #
        # Also, when is_pass_hot is True and is_fail_hot is False, then that program is working as expected.
        # There should be no reason to run flag bisection.
        # However, this can be ambiguous in the opposite of the aforementioned outcome!
        #
        # is_pass_hot | is_fail_hot | is ambiguous?
        #-------------|-------------|---------------
        # True        | True        | No! Pick --pass.
        # False       | False       | No! Pick --fail.
        # True        | False       | No! But this is the exact situation where you shouldn't need to flag-bisect. Raise an error.
        # False       | True        | Yes! But we'll pragmatically pick --fail here in the hope it gives the correct set of flags.

        if is_pass_hot and not is_fail_hot:
            print('The tests seems to be working fine for me. If you really need to flag-bisect, please try again with an explicit --pass or --fail', file=sys.stderr)
            return 1

        if is_fail_hot and not is_pass_hot:
            print("I couldn't quite figure out which of --pass or --fail to use, but I'll carry on anyway")

        sense = is_pass_hot
        argument = '--pass' if sense else '--fail'
        print(f"I'm bisecting flags as if {argument} was used")

    options = Options(args, other_args, sense)

    initial_flags = list_fflags(options)
    if initial_flags is None:
        complaints = (
            'I cannot bisect flags with ' + options.path,
            'These are required for me to be able to cooperate:',
            '\t--list-fflags must print a list of flags separated by newlines, including FFlag prefix',
            '\t--fflags=... to accept a comma-separated pair of flag names and their value in the form FFlagFoo=true',
        )
        for complaint in complaints:
            print(complaint, file=sys.stderr)
        return 1

    # On Windows, there is an upper bound on the numbers of characters for a command line incantation.
    # If we don't handle this ourselves, the runtime error is going to look nothing like the actual problem.
    # It'd say "file name way too long" or something to that effect. We can teed up a better error message and
    # tell the user how to work around it by using --filter.
    if platform.system() == 'Windows':
        longest_argv = mk_command_line(options, mk_flags_argument(options, initial_flags, []))
        if len(' '.join(longest_argv)) >= 8191:
            print(f'Never mind! The command line is too long because we have {len(initial_flags)} flags to test', file=sys.stderr)
            print('Consider using `--filter=<regex>` to narrow it down upfront, or use any version of WSL instead', file=sys.stderr)
            return 1

    hot_flags = await search(options, initial_flags)
    if not hot_flags:
        print('I found nothing, sorry', file=sys.stderr)
        return 1

    print('I narrowed down to these flags:')
    print(textwrap.indent('\n'.join(hot_flags), prefix='\t'))

    # If we showed the command line in explicit mode, all flags would be listed here.
    # This would pollute the terminal with 3000 flags. We don't want that. Don't show it.
    # Ditto for when the number flags we bisected are equal.
    if not options.explicit and len(hot_flags) != len(initial_flags):
        reproduction = mk_command_line(options, mk_flags_argument(options, hot_flags, initial_flags))
        print('$ ' + ' '.join(reproduction))

    return 0
|
||
|
|
||
|
def flag_bisect_main(args, other_args):
    """Synchronous entry point: run the bisection on a fresh event loop and return its exit code."""
    exit_code = asyncio.run(do_work(args, other_args))
    return exit_code
|
||
|
|
||
|
def main():
    """Parse the command line of the standalone script and run the bisection."""
    parser = argparse.ArgumentParser(
        description=help(),
        epilog=epilog(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    add_argument_parsers(parser)

    # Arguments the parser does not recognise are forwarded to the test binary.
    known_args, passthrough_args = parser.parse_known_args()
    return flag_bisect_main(known_args, passthrough_args)
|
||
|
|
||
|
# Only run the tool when this file is executed as a script, not when imported.
if __name__ == '__main__':
    exit_code = main()
    sys.exit(exit_code)
|